Imported Upstream version 0.9.0 26/303326/1 upstream upstream/0.9.0
authorJiyong <jiyong.min@samsung.com>
Tue, 26 Dec 2023 01:55:00 +0000 (10:55 +0900)
committerJiyong <jiyong.min@samsung.com>
Tue, 26 Dec 2023 02:08:07 +0000 (11:08 +0900)
Change-Id: I60862786d19d92bb65425923bfeaa8ec236d8722

934 files changed:
.bazelignore [new file with mode: 0644]
.clang-tidy
.github/PULL_REQUEST_TEMPLATE.md [new file with mode: 0644]
.github/dependabot.yml [new file with mode: 0644]
.github/workflows/build_test.yml
.github/workflows/build_test_cross.yml
.github/workflows/build_test_md.yml [new file with mode: 0644]
.github/workflows/codeql.yml [new file with mode: 0644]
.github/workflows/conformance.yml
.github/workflows/debug_ci.yml
.github/workflows/dependency-review.yml [new file with mode: 0644]
.github/workflows/fuzz.yml
.github/workflows/gitlab_mirror.yml [new file with mode: 0644]
.github/workflows/highway.patch [new file with mode: 0644]
.github/workflows/pages.yml [new file with mode: 0644]
.github/workflows/pull_request.yml
.github/workflows/release.yaml
.github/workflows/scorecard.yml [new file with mode: 0644]
.github/workflows/test_new_highway.yml [new file with mode: 0644]
.gitignore
.gitmodules
.pre-commit-config.yaml [new file with mode: 0644]
.readthedocs.yaml
AUTHORS
BUILD.bazel [new file with mode: 0644]
BUILDING.md [new file with mode: 0644]
BUILDING_Haiku.md [moved from README.Haiku.md with 94% similarity]
BUILDING_OSX.md [moved from README.OSX.md with 95% similarity]
CHANGELOG.md
CMakeLists.txt
CONTRIBUTING.md
README.md
WORKSPACE [new file with mode: 0644]
bash_test.sh
ci.sh
cmake/FindBrotli.cmake
cmake/FindHWY.cmake
debian/changelog
debian/control
debian/copyright
debian/libjxl-dev.install
debian/rules
deps.sh
doc/api.txt
doc/building_and_testing.md
doc/building_wasm.md
doc/color_management.md
doc/debugging_workflows.md [new file with mode: 0644]
doc/developing_in_debian.md
doc/developing_in_docker.md [deleted file]
doc/developing_in_windows_vcpkg.md
doc/encode_effort.md [new file with mode: 0644]
doc/release.md
doc/software_support.md
doc/sphinx/api.rst
doc/sphinx/api_butteraugli.rst [deleted file]
doc/sphinx/api_color.rst [new file with mode: 0644]
doc/sphinx/api_cpp.rst [new file with mode: 0644]
doc/sphinx/api_metadata.rst [new file with mode: 0644]
doc/tables/adobe.md [deleted file]
doc/tables/all_tables.pdf [deleted file]
doc/tables/all_tables.sh [deleted file]
doc/tables/app0.md [deleted file]
doc/tables/brn_proto.md [deleted file]
doc/tables/context_modes.md [deleted file]
doc/tables/dct_gen.md [deleted file]
doc/tables/ducky.md [deleted file]
doc/tables/freq_context.md [deleted file]
doc/tables/icc.md [deleted file]
doc/tables/is_zero_base.md [deleted file]
doc/tables/markdown-pdf.css [deleted file]
doc/tables/nonzero_buckets.md [deleted file]
doc/tables/num_nonzero_context.md [deleted file]
doc/tables/num_nonzeros_base.md [deleted file]
doc/tables/quant.md [deleted file]
doc/tables/stock_counts.md [deleted file]
doc/tables/stock_quant.md [deleted file]
doc/tables/stock_values.md [deleted file]
doc/tables/symbol_order.md [deleted file]
doc/xl_overview.md
docker/Dockerfile.jpegxl-builder [deleted file]
docker/Dockerfile.jpegxl-builder-run-aarch64 [deleted file]
docker/README.md [deleted file]
docker/build.sh [deleted file]
docker/scripts/99_norecommends [deleted file]
docker/scripts/binutils_align_fix.patch [deleted file]
docker/scripts/emsdk_install.sh [deleted file]
docker/scripts/jpegxl_builder.sh [deleted file]
docker/scripts/msan_install.sh [deleted file]
docker/scripts/qemu_install.sh [deleted file]
examples/CMakeLists.txt
examples/decode_exif_metadata.cc
examples/decode_oneshot.cc
examples/decode_progressive.cc
examples/encode_oneshot.cc
experimental/fast_lossless/fast_lossless.cc [deleted file]
experimental/fast_lossless/fast_lossless.h [deleted file]
flake.lock [new file with mode: 0644]
flake.nix [new file with mode: 0644]
lib/BUILD [new file with mode: 0644]
lib/CMakeLists.txt
lib/extras/alpha_blend.cc [new file with mode: 0644]
lib/extras/alpha_blend.h [new file with mode: 0644]
lib/extras/codec.cc
lib/extras/codec.h
lib/extras/codec_test.cc
lib/extras/common.cc [new file with mode: 0644]
lib/extras/common.h [new file with mode: 0644]
lib/extras/dec/apng.cc
lib/extras/dec/apng.h
lib/extras/dec/color_description.cc
lib/extras/dec/color_description.h
lib/extras/dec/color_description_test.cc
lib/extras/dec/color_hints.cc
lib/extras/dec/color_hints.h
lib/extras/dec/decode.cc
lib/extras/dec/decode.h
lib/extras/dec/exr.cc
lib/extras/dec/exr.h
lib/extras/dec/gif.cc
lib/extras/dec/gif.h
lib/extras/dec/jpegli.cc [new file with mode: 0644]
lib/extras/dec/jpegli.h [new file with mode: 0644]
lib/extras/dec/jpg.cc
lib/extras/dec/jpg.h
lib/extras/dec/jxl.cc
lib/extras/dec/jxl.h
lib/extras/dec/pgx.cc
lib/extras/dec/pgx.h
lib/extras/dec/pgx_test.cc
lib/extras/dec/pnm.cc
lib/extras/dec/pnm.h
lib/extras/enc/apng.cc
lib/extras/enc/encode.cc
lib/extras/enc/encode.h
lib/extras/enc/exr.cc
lib/extras/enc/jpegli.cc [new file with mode: 0644]
lib/extras/enc/jpegli.h [new file with mode: 0644]
lib/extras/enc/jpg.cc
lib/extras/enc/jxl.cc [new file with mode: 0644]
lib/extras/enc/jxl.h [new file with mode: 0644]
lib/extras/enc/npy.cc
lib/extras/enc/pgx.cc
lib/extras/enc/pnm.cc
lib/extras/enc/pnm.h
lib/extras/exif.cc
lib/extras/hlg.cc
lib/extras/jpegli_test.cc [new file with mode: 0644]
lib/extras/metrics.cc [moved from lib/jxl/enc_butteraugli_pnorm.cc with 84% similarity]
lib/extras/metrics.h [moved from lib/jxl/enc_butteraugli_pnorm.h with 74% similarity]
lib/extras/packed_image.h
lib/extras/packed_image_convert.cc
lib/extras/packed_image_convert.h
lib/extras/render_hdr.cc [deleted file]
lib/extras/render_hdr.h [deleted file]
lib/extras/size_constraints.h [moved from lib/jxl/size_constraints.h with 50% similarity]
lib/extras/time.cc
lib/extras/tone_mapping.cc
lib/extras/tone_mapping_gbench.cc
lib/include/jxl/butteraugli.h [deleted file]
lib/include/jxl/butteraugli_cxx.h [deleted file]
lib/include/jxl/cms.h [new file with mode: 0644]
lib/include/jxl/cms_interface.h
lib/include/jxl/codestream_header.h
lib/include/jxl/color_encoding.h
lib/include/jxl/decode.h
lib/include/jxl/decode_cxx.h
lib/include/jxl/encode.h
lib/include/jxl/encode_cxx.h
lib/include/jxl/parallel_runner.h
lib/include/jxl/resizable_parallel_runner.h
lib/include/jxl/resizable_parallel_runner_cxx.h
lib/include/jxl/stats.h [new file with mode: 0644]
lib/include/jxl/thread_parallel_runner.h
lib/include/jxl/thread_parallel_runner_cxx.h
lib/include/jxl/types.h
lib/jpegli.cmake [new file with mode: 0644]
lib/jpegli/README.md [new file with mode: 0644]
lib/jpegli/adaptive_quantization.cc [new file with mode: 0644]
lib/jpegli/adaptive_quantization.h [new file with mode: 0644]
lib/jpegli/bit_writer.cc [new file with mode: 0644]
lib/jpegli/bit_writer.h [new file with mode: 0644]
lib/jpegli/bitstream.cc [new file with mode: 0644]
lib/jpegli/bitstream.h [new file with mode: 0644]
lib/jpegli/color_quantize.cc [new file with mode: 0644]
lib/jpegli/color_quantize.h [new file with mode: 0644]
lib/jpegli/color_transform.cc [new file with mode: 0644]
lib/jpegli/color_transform.h [new file with mode: 0644]
lib/jpegli/common.cc [new file with mode: 0644]
lib/jpegli/common.h [new file with mode: 0644]
lib/jpegli/common_internal.h [new file with mode: 0644]
lib/jpegli/dct-inl.h [new file with mode: 0644]
lib/jpegli/decode.cc [new file with mode: 0644]
lib/jpegli/decode.h [new file with mode: 0644]
lib/jpegli/decode_api_test.cc [new file with mode: 0644]
lib/jpegli/decode_internal.h [new file with mode: 0644]
lib/jpegli/decode_marker.cc [new file with mode: 0644]
lib/jpegli/decode_marker.h [new file with mode: 0644]
lib/jpegli/decode_scan.cc [new file with mode: 0644]
lib/jpegli/decode_scan.h [new file with mode: 0644]
lib/jpegli/destination_manager.cc [new file with mode: 0644]
lib/jpegli/downsample.cc [new file with mode: 0644]
lib/jpegli/downsample.h [new file with mode: 0644]
lib/jpegli/encode.cc [new file with mode: 0644]
lib/jpegli/encode.h [new file with mode: 0644]
lib/jpegli/encode_api_test.cc [new file with mode: 0644]
lib/jpegli/encode_finish.cc [new file with mode: 0644]
lib/jpegli/encode_finish.h [new file with mode: 0644]
lib/jpegli/encode_internal.h [new file with mode: 0644]
lib/jpegli/encode_streaming.cc [new file with mode: 0644]
lib/jpegli/encode_streaming.h [new file with mode: 0644]
lib/jpegli/entropy_coding-inl.h [new file with mode: 0644]
lib/jpegli/entropy_coding.cc [new file with mode: 0644]
lib/jpegli/entropy_coding.h [new file with mode: 0644]
lib/jpegli/error.cc [new file with mode: 0644]
lib/jpegli/error.h [new file with mode: 0644]
lib/jpegli/error_handling_test.cc [new file with mode: 0644]
lib/jpegli/huffman.cc [new file with mode: 0644]
lib/jpegli/huffman.h [new file with mode: 0644]
lib/jpegli/idct.cc [new file with mode: 0644]
lib/jpegli/idct.h [new file with mode: 0644]
lib/jpegli/input.cc [new file with mode: 0644]
lib/jpegli/input.h [new file with mode: 0644]
lib/jpegli/input_suspension_test.cc [new file with mode: 0644]
lib/jpegli/jpeg.version.62 [new file with mode: 0644]
lib/jpegli/jpeg.version.8 [new file with mode: 0644]
lib/jpegli/libjpeg_test_util.cc [new file with mode: 0644]
lib/jpegli/libjpeg_test_util.h [new file with mode: 0644]
lib/jpegli/libjpeg_wrapper.cc [new file with mode: 0644]
lib/jpegli/memory_manager.cc [new file with mode: 0644]
lib/jpegli/memory_manager.h [new file with mode: 0644]
lib/jpegli/output_suspension_test.cc [new file with mode: 0644]
lib/jpegli/quant.cc [new file with mode: 0644]
lib/jpegli/quant.h [new file with mode: 0644]
lib/jpegli/render.cc [new file with mode: 0644]
lib/jpegli/render.h [new file with mode: 0644]
lib/jpegli/simd.cc [new file with mode: 0644]
lib/jpegli/simd.h [new file with mode: 0644]
lib/jpegli/source_manager.cc [new file with mode: 0644]
lib/jpegli/source_manager_test.cc [new file with mode: 0644]
lib/jpegli/streaming_test.cc [new file with mode: 0644]
lib/jpegli/test_params.h [new file with mode: 0644]
lib/jpegli/test_utils-inl.h [new file with mode: 0644]
lib/jpegli/test_utils.cc [new file with mode: 0644]
lib/jpegli/test_utils.h [new file with mode: 0644]
lib/jpegli/testing.h [new file with mode: 0644]
lib/jpegli/transcode_api_test.cc [new file with mode: 0644]
lib/jpegli/transpose-inl.h [new file with mode: 0644]
lib/jpegli/types.h [new file with mode: 0644]
lib/jpegli/upsample.cc [new file with mode: 0644]
lib/jpegli/upsample.h [new file with mode: 0644]
lib/jxl.cmake
lib/jxl/ac_strategy.cc
lib/jxl/ac_strategy.h
lib/jxl/ac_strategy_test.cc
lib/jxl/alpha.cc
lib/jxl/alpha.h
lib/jxl/alpha_test.cc
lib/jxl/ans_common.cc
lib/jxl/ans_common_test.cc
lib/jxl/ans_test.cc
lib/jxl/aux_out.cc [deleted file]
lib/jxl/aux_out.h [deleted file]
lib/jxl/aux_out_fwd.h [deleted file]
lib/jxl/base/byte_order.h
lib/jxl/base/common.h [new file with mode: 0644]
lib/jxl/base/compiler_specific.h
lib/jxl/base/data_parallel.cc [deleted file]
lib/jxl/base/data_parallel.h
lib/jxl/base/fast_math-inl.h [moved from lib/jxl/fast_math-inl.h with 94% similarity]
lib/jxl/base/file_io.h [deleted file]
lib/jxl/base/float.h [new file with mode: 0644]
lib/jxl/base/matrix_ops.h [new file with mode: 0644]
lib/jxl/base/padded_bytes.cc [deleted file]
lib/jxl/base/profiler.h [deleted file]
lib/jxl/base/random.cc [deleted file]
lib/jxl/base/random.h
lib/jxl/base/rational_polynomial-inl.h [moved from lib/jxl/rational_polynomial-inl.h with 89% similarity]
lib/jxl/base/span.h
lib/jxl/base/status.h
lib/jxl/base/thread_pool_internal.h [deleted file]
lib/jxl/bit_reader_test.cc
lib/jxl/bits_test.cc
lib/jxl/blending.cc
lib/jxl/blending_test.cc
lib/jxl/box_content_decoder.h
lib/jxl/butteraugli/butteraugli.cc
lib/jxl/butteraugli/butteraugli.h
lib/jxl/butteraugli/butteraugli_test.cc [new file with mode: 0644]
lib/jxl/butteraugli_test.cc [deleted file]
lib/jxl/butteraugli_wrapper.cc [deleted file]
lib/jxl/byte_order_test.cc
lib/jxl/cache_aligned.cc [moved from lib/jxl/base/cache_aligned.cc with 99% similarity]
lib/jxl/cache_aligned.h [moved from lib/jxl/base/cache_aligned.h with 88% similarity]
lib/jxl/chroma_from_luma.h
lib/jxl/cms/color_encoding_cms.h [new file with mode: 0644]
lib/jxl/cms/jxl_cms.cc [moved from lib/jxl/enc_color_management.cc with 76% similarity]
lib/jxl/cms/jxl_cms_internal.h [new file with mode: 0644]
lib/jxl/cms/opsin_params.h [new file with mode: 0644]
lib/jxl/cms/tone_mapping-inl.h [moved from lib/jxl/dec_tone_mapping-inl.h with 56% similarity]
lib/jxl/cms/tone_mapping.h [new file with mode: 0644]
lib/jxl/cms/tone_mapping_test.cc [new file with mode: 0644]
lib/jxl/cms/transfer_functions-inl.h [moved from lib/jxl/transfer_functions-inl.h with 78% similarity]
lib/jxl/cms/transfer_functions.h [new file with mode: 0644]
lib/jxl/cms/transfer_functions_test.cc [new file with mode: 0644]
lib/jxl/codec_in_out.h
lib/jxl/codec_y4m_testonly.cc [deleted file]
lib/jxl/codec_y4m_testonly.h [deleted file]
lib/jxl/coeff_order.cc
lib/jxl/coeff_order.h
lib/jxl/coeff_order_fwd.h
lib/jxl/coeff_order_test.cc
lib/jxl/color_encoding_internal.cc
lib/jxl/color_encoding_internal.h
lib/jxl/color_encoding_internal_test.cc
lib/jxl/color_management.cc [deleted file]
lib/jxl/color_management.h [deleted file]
lib/jxl/color_management_test.cc
lib/jxl/common.h
lib/jxl/compressed_dc.cc
lib/jxl/convolve-inl.h
lib/jxl/convolve_slow.cc
lib/jxl/convolve_symmetric5.cc
lib/jxl/convolve_test.cc
lib/jxl/data_parallel_test.cc
lib/jxl/dct-inl.h
lib/jxl/dct_for_test.h
lib/jxl/dct_test.cc
lib/jxl/dct_util.h
lib/jxl/dec_ans.cc
lib/jxl/dec_ans.h
lib/jxl/dec_bit_reader.h
lib/jxl/dec_cache.cc
lib/jxl/dec_cache.h
lib/jxl/dec_context_map.cc
lib/jxl/dec_external_image.cc
lib/jxl/dec_external_image.h
lib/jxl/dec_external_image_gbench.cc
lib/jxl/dec_frame.cc
lib/jxl/dec_frame.h
lib/jxl/dec_group.cc
lib/jxl/dec_group.h
lib/jxl/dec_group_border.h
lib/jxl/dec_modular.cc
lib/jxl/dec_modular.h
lib/jxl/dec_noise.cc
lib/jxl/dec_noise.h
lib/jxl/dec_patch_dictionary.cc
lib/jxl/dec_patch_dictionary.h
lib/jxl/dec_transforms-inl.h
lib/jxl/dec_transforms_testonly.cc
lib/jxl/dec_transforms_testonly.h
lib/jxl/dec_xyb-inl.h
lib/jxl/dec_xyb.cc
lib/jxl/dec_xyb.h
lib/jxl/decode.cc
lib/jxl/decode_test.cc
lib/jxl/decode_to_jpeg.cc
lib/jxl/decode_to_jpeg.h
lib/jxl/enc_ac_strategy.cc
lib/jxl/enc_ac_strategy.h
lib/jxl/enc_adaptive_quantization.cc
lib/jxl/enc_adaptive_quantization.h
lib/jxl/enc_ans.cc
lib/jxl/enc_ans.h
lib/jxl/enc_ar_control_field.cc
lib/jxl/enc_ar_control_field.h
lib/jxl/enc_aux_out.cc [new file with mode: 0644]
lib/jxl/enc_aux_out.h [new file with mode: 0644]
lib/jxl/enc_bit_writer.cc
lib/jxl/enc_bit_writer.h
lib/jxl/enc_butteraugli_comparator.cc
lib/jxl/enc_butteraugli_comparator.h
lib/jxl/enc_cache.cc
lib/jxl/enc_cache.h
lib/jxl/enc_chroma_from_luma.cc
lib/jxl/enc_chroma_from_luma.h
lib/jxl/enc_cluster.cc
lib/jxl/enc_cluster.h
lib/jxl/enc_coeff_order.cc
lib/jxl/enc_coeff_order.h
lib/jxl/enc_color_management.h [deleted file]
lib/jxl/enc_comparator.cc
lib/jxl/enc_comparator.h
lib/jxl/enc_context_map.cc
lib/jxl/enc_context_map.h
lib/jxl/enc_debug_image.cc [new file with mode: 0644]
lib/jxl/enc_debug_image.h [new file with mode: 0644]
lib/jxl/enc_detect_dots.cc
lib/jxl/enc_dot_dictionary.cc
lib/jxl/enc_dot_dictionary.h
lib/jxl/enc_entropy_coder.cc
lib/jxl/enc_external_image.cc
lib/jxl/enc_external_image.h
lib/jxl/enc_external_image_gbench.cc
lib/jxl/enc_external_image_test.cc
lib/jxl/enc_fast_lossless.cc [new file with mode: 0644]
lib/jxl/enc_fast_lossless.h [new file with mode: 0644]
lib/jxl/enc_fields.cc [new file with mode: 0644]
lib/jxl/enc_fields.h [new file with mode: 0644]
lib/jxl/enc_file.cc [deleted file]
lib/jxl/enc_file.h [deleted file]
lib/jxl/enc_frame.cc
lib/jxl/enc_frame.h
lib/jxl/enc_gaborish.cc [new file with mode: 0644]
lib/jxl/enc_gaborish.h [moved from lib/jxl/gaborish.h with 90% similarity]
lib/jxl/enc_gaborish_test.cc [moved from lib/jxl/gaborish_test.cc with 86% similarity]
lib/jxl/enc_gamma_correct.h
lib/jxl/enc_group.cc
lib/jxl/enc_group.h
lib/jxl/enc_heuristics.cc
lib/jxl/enc_heuristics.h
lib/jxl/enc_huffman.cc
lib/jxl/enc_huffman_tree.cc [moved from lib/jxl/huffman_tree.cc with 99% similarity]
lib/jxl/enc_huffman_tree.h [moved from lib/jxl/huffman_tree.h with 92% similarity]
lib/jxl/enc_icc_codec.cc
lib/jxl/enc_icc_codec.h
lib/jxl/enc_image_bundle.cc
lib/jxl/enc_jxl_skcms.h [deleted file]
lib/jxl/enc_linalg.cc [new file with mode: 0644]
lib/jxl/enc_linalg.h [new file with mode: 0644]
lib/jxl/enc_linalg_test.cc [new file with mode: 0644]
lib/jxl/enc_modular.cc
lib/jxl/enc_modular.h
lib/jxl/enc_noise.cc
lib/jxl/enc_noise.h
lib/jxl/enc_optimize.cc [moved from lib/jxl/optimize.cc with 99% similarity]
lib/jxl/enc_optimize.h [moved from lib/jxl/optimize.h with 99% similarity]
lib/jxl/enc_optimize_test.cc [moved from lib/jxl/optimize_test.cc with 97% similarity]
lib/jxl/enc_params.h
lib/jxl/enc_patch_dictionary.cc
lib/jxl/enc_patch_dictionary.h
lib/jxl/enc_photon_noise.cc
lib/jxl/enc_photon_noise_test.cc
lib/jxl/enc_progressive_split.cc [new file with mode: 0644]
lib/jxl/enc_progressive_split.h [moved from lib/jxl/progressive_split.h with 73% similarity]
lib/jxl/enc_quant_weights.cc
lib/jxl/enc_quant_weights.h
lib/jxl/enc_splines.cc
lib/jxl/enc_splines.h
lib/jxl/enc_toc.cc
lib/jxl/enc_toc.h
lib/jxl/enc_transforms-inl.h
lib/jxl/enc_xyb.cc
lib/jxl/enc_xyb.h
lib/jxl/encode.cc
lib/jxl/encode_internal.h
lib/jxl/encode_test.cc
lib/jxl/entropy_coder.cc
lib/jxl/entropy_coder_test.cc
lib/jxl/epf.cc
lib/jxl/exif.h
lib/jxl/fake_parallel_runner_testonly.h
lib/jxl/fast_dct-inl.h
lib/jxl/fast_dct_test.cc
lib/jxl/fast_math_test.cc
lib/jxl/field_encodings.h
lib/jxl/fields.cc
lib/jxl/fields.h
lib/jxl/fields_test.cc
lib/jxl/frame_dimensions.h [new file with mode: 0644]
lib/jxl/frame_header.cc
lib/jxl/frame_header.h
lib/jxl/gaborish.cc [deleted file]
lib/jxl/gamma_correct_test.cc
lib/jxl/gauss_blur.cc
lib/jxl/gauss_blur_test.cc
lib/jxl/gradient_test.cc
lib/jxl/headers.cc
lib/jxl/headers.h
lib/jxl/iaca_test.cc
lib/jxl/icc_codec.cc
lib/jxl/icc_codec.h
lib/jxl/icc_codec_common.cc
lib/jxl/icc_codec_common.h
lib/jxl/icc_codec_test.cc
lib/jxl/image.cc
lib/jxl/image.h
lib/jxl/image_bundle.cc
lib/jxl/image_bundle.h
lib/jxl/image_bundle_test.cc
lib/jxl/image_metadata.cc
lib/jxl/image_metadata.h
lib/jxl/image_ops.h
lib/jxl/image_ops_test.cc
lib/jxl/image_test_utils.h
lib/jxl/inverse_mtf-inl.h [new file with mode: 0644]
lib/jxl/jpeg/dec_jpeg_data.cc
lib/jxl/jpeg/dec_jpeg_data_writer.cc
lib/jxl/jpeg/dec_jpeg_data_writer.h
lib/jxl/jpeg/dec_jpeg_serialization_state.h
lib/jxl/jpeg/enc_jpeg_data.cc
lib/jxl/jpeg/enc_jpeg_data.h
lib/jxl/jpeg/enc_jpeg_data_reader.cc
lib/jxl/jpeg/jpeg_data.cc
lib/jxl/jpeg/jpeg_data.h
lib/jxl/jxl_inspection.h [deleted file]
lib/jxl/jxl_test.cc
lib/jxl/lehmer_code_test.cc
lib/jxl/libjxl.pc.in
lib/jxl/libjxl_cms.pc.in [new file with mode: 0644]
lib/jxl/linalg.cc [deleted file]
lib/jxl/linalg.h [deleted file]
lib/jxl/linalg_test.cc [deleted file]
lib/jxl/loop_filter.cc
lib/jxl/loop_filter.h
lib/jxl/luminance.cc
lib/jxl/luminance.h
lib/jxl/memory_manager_internal.h
lib/jxl/modular/encoding/context_predict.h
lib/jxl/modular/encoding/dec_ma.cc
lib/jxl/modular/encoding/enc_encoding.cc
lib/jxl/modular/encoding/enc_encoding.h
lib/jxl/modular/encoding/enc_ma.cc
lib/jxl/modular/encoding/encoding.cc
lib/jxl/modular/encoding/encoding.h
lib/jxl/modular/modular_image.cc
lib/jxl/modular/modular_image.h
lib/jxl/modular/transform/enc_palette.cc
lib/jxl/modular/transform/enc_rct.cc
lib/jxl/modular/transform/enc_squeeze.cc
lib/jxl/modular/transform/palette.cc [new file with mode: 0644]
lib/jxl/modular/transform/palette.h
lib/jxl/modular/transform/rct.h
lib/jxl/modular/transform/squeeze.cc
lib/jxl/modular/transform/squeeze.h
lib/jxl/modular_test.cc
lib/jxl/noise.h
lib/jxl/opsin_image_test.cc
lib/jxl/opsin_inverse_test.cc
lib/jxl/opsin_params.cc
lib/jxl/opsin_params.h
lib/jxl/pack_signed.h [new file with mode: 0644]
lib/jxl/padded_bytes.h [moved from lib/jxl/base/padded_bytes.h with 81% similarity]
lib/jxl/padded_bytes_test.cc
lib/jxl/passes_state.cc
lib/jxl/passes_state.h
lib/jxl/passes_test.cc
lib/jxl/patch_dictionary_test.cc
lib/jxl/preview_test.cc
lib/jxl/progressive_split.cc [deleted file]
lib/jxl/quant_weights.cc
lib/jxl/quant_weights.h
lib/jxl/quant_weights_test.cc
lib/jxl/quantizer.cc
lib/jxl/quantizer.h
lib/jxl/quantizer_test.cc
lib/jxl/rational_polynomial_test.cc
lib/jxl/render_pipeline/low_memory_render_pipeline.cc
lib/jxl/render_pipeline/render_pipeline_test.cc
lib/jxl/render_pipeline/simple_render_pipeline.cc
lib/jxl/render_pipeline/stage_blending.cc
lib/jxl/render_pipeline/stage_chroma_upsampling.cc
lib/jxl/render_pipeline/stage_chroma_upsampling.h
lib/jxl/render_pipeline/stage_cms.cc [new file with mode: 0644]
lib/jxl/render_pipeline/stage_cms.h [new file with mode: 0644]
lib/jxl/render_pipeline/stage_epf.cc
lib/jxl/render_pipeline/stage_from_linear.cc
lib/jxl/render_pipeline/stage_gaborish.cc
lib/jxl/render_pipeline/stage_gaborish.h
lib/jxl/render_pipeline/stage_noise.cc
lib/jxl/render_pipeline/stage_patches.cc
lib/jxl/render_pipeline/stage_splines.cc
lib/jxl/render_pipeline/stage_spot.cc
lib/jxl/render_pipeline/stage_to_linear.cc
lib/jxl/render_pipeline/stage_tone_mapping.cc
lib/jxl/render_pipeline/stage_tone_mapping.h
lib/jxl/render_pipeline/stage_upsampling.cc
lib/jxl/render_pipeline/stage_write.cc
lib/jxl/render_pipeline/stage_write.h
lib/jxl/render_pipeline/stage_xyb.cc
lib/jxl/render_pipeline/stage_xyb.h
lib/jxl/render_pipeline/stage_ycbcr.cc
lib/jxl/render_pipeline/stage_ycbcr.h
lib/jxl/roundtrip_test.cc
lib/jxl/simd_util.cc [new file with mode: 0644]
lib/jxl/simd_util.h [new file with mode: 0644]
lib/jxl/simd_util_test.cc
lib/jxl/speed_tier_test.cc
lib/jxl/splines.cc
lib/jxl/splines.h
lib/jxl/splines_test.cc
lib/jxl/test_image.cc [new file with mode: 0644]
lib/jxl/test_image.h
lib/jxl/test_utils.cc [new file with mode: 0644]
lib/jxl/test_utils.h
lib/jxl/testdata.h [deleted file]
lib/jxl/testing.h [new file with mode: 0644]
lib/jxl/tf_gbench.cc
lib/jxl/toc.cc
lib/jxl/toc_test.cc
lib/jxl/xorshift128plus_test.cc
lib/jxl_benchmark.cmake
lib/jxl_cms.cmake [new file with mode: 0644]
lib/jxl_extras.cmake
lib/jxl_lists.bzl [new file with mode: 0644]
lib/jxl_lists.cmake [new file with mode: 0644]
lib/jxl_profiler.cmake [deleted file]
lib/jxl_tests.cmake
lib/jxl_threads.cmake
lib/jxl_vars.bzl [new file with mode: 0644]
lib/lib.gni [changed from file to symlink]
lib/profiler/profiler.cc [deleted file]
lib/profiler/profiler.h [deleted file]
lib/profiler/tsc_timer.h [deleted file]
lib/threads/libjxl_threads.pc.in
lib/threads/resizable_parallel_runner.cc
lib/threads/thread_parallel_runner.cc
lib/threads/thread_parallel_runner_internal.cc
lib/threads/thread_parallel_runner_internal.h
lib/threads/thread_parallel_runner_test.cc
plugins/gdk-pixbuf/CMakeLists.txt
plugins/gdk-pixbuf/README.md
plugins/gdk-pixbuf/pixbufloader-jxl.c
plugins/gimp/common.h
plugins/gimp/file-jxl-load.cc
plugins/gimp/file-jxl-load.h
plugins/gimp/file-jxl-save.cc
plugins/gimp/file-jxl-save.h
plugins/mime/README.md
third_party/CMakeLists.txt
third_party/highway/.github/workflows/build_test.yml [deleted file]
third_party/highway/BUILD [deleted file]
third_party/highway/CMakeLists.txt [deleted file]
third_party/highway/CMakeLists.txt.in [deleted file]
third_party/highway/CONTRIBUTING [deleted file]
third_party/highway/LICENSE [deleted file]
third_party/highway/README.md [deleted file]
third_party/highway/WORKSPACE [deleted file]
third_party/highway/debian/changelog [deleted file]
third_party/highway/debian/compat [deleted file]
third_party/highway/debian/control [deleted file]
third_party/highway/debian/copyright [deleted file]
third_party/highway/debian/rules [deleted file]
third_party/highway/debian/source/format [deleted file]
third_party/highway/g3doc/design_philosophy.md [deleted file]
third_party/highway/g3doc/highway_intro.pdf [deleted file]
third_party/highway/g3doc/impl_details.md [deleted file]
third_party/highway/g3doc/instruction_matrix.pdf [deleted file]
third_party/highway/g3doc/quick_reference.md [deleted file]
third_party/highway/g3doc/release_testing_process.md [deleted file]
third_party/highway/hwy/aligned_allocator.cc [deleted file]
third_party/highway/hwy/aligned_allocator.h [deleted file]
third_party/highway/hwy/aligned_allocator_test.cc [deleted file]
third_party/highway/hwy/base.h [deleted file]
third_party/highway/hwy/base_test.cc [deleted file]
third_party/highway/hwy/cache_control.h [deleted file]
third_party/highway/hwy/contrib/algo/copy-inl.h [deleted file]
third_party/highway/hwy/contrib/algo/copy_test.cc [deleted file]
third_party/highway/hwy/contrib/algo/find-inl.h [deleted file]
third_party/highway/hwy/contrib/algo/find_test.cc [deleted file]
third_party/highway/hwy/contrib/algo/transform-inl.h [deleted file]
third_party/highway/hwy/contrib/algo/transform_test.cc [deleted file]
third_party/highway/hwy/contrib/dot/dot-inl.h [deleted file]
third_party/highway/hwy/contrib/dot/dot_test.cc [deleted file]
third_party/highway/hwy/contrib/image/image.cc [deleted file]
third_party/highway/hwy/contrib/image/image.h [deleted file]
third_party/highway/hwy/contrib/image/image_test.cc [deleted file]
third_party/highway/hwy/contrib/math/math-inl.h [deleted file]
third_party/highway/hwy/contrib/math/math_test.cc [deleted file]
third_party/highway/hwy/contrib/sort/BUILD [deleted file]
third_party/highway/hwy/contrib/sort/README.md [deleted file]
third_party/highway/hwy/contrib/sort/algo-inl.h [deleted file]
third_party/highway/hwy/contrib/sort/bench_parallel.cc [deleted file]
third_party/highway/hwy/contrib/sort/bench_sort.cc [deleted file]
third_party/highway/hwy/contrib/sort/print_network.cc [deleted file]
third_party/highway/hwy/contrib/sort/result-inl.h [deleted file]
third_party/highway/hwy/contrib/sort/shared-inl.h [deleted file]
third_party/highway/hwy/contrib/sort/sort_test.cc [deleted file]
third_party/highway/hwy/contrib/sort/sorting_networks-inl.h [deleted file]
third_party/highway/hwy/contrib/sort/traits-inl.h [deleted file]
third_party/highway/hwy/contrib/sort/traits128-inl.h [deleted file]
third_party/highway/hwy/contrib/sort/vqsort-inl.h [deleted file]
third_party/highway/hwy/contrib/sort/vqsort.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort.h [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_128a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_128d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_f32a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_f32d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_f64a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_f64d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_i16a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_i16d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_i32a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_i32d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_i64a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_i64d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_u16a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_u16d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_u32a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_u32d.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_u64a.cc [deleted file]
third_party/highway/hwy/contrib/sort/vqsort_u64d.cc [deleted file]
third_party/highway/hwy/detect_compiler_arch.h [deleted file]
third_party/highway/hwy/detect_targets.h [deleted file]
third_party/highway/hwy/examples/benchmark.cc [deleted file]
third_party/highway/hwy/examples/skeleton-inl.h [deleted file]
third_party/highway/hwy/examples/skeleton.cc [deleted file]
third_party/highway/hwy/examples/skeleton.h [deleted file]
third_party/highway/hwy/examples/skeleton_test.cc [deleted file]
third_party/highway/hwy/foreach_target.h [deleted file]
third_party/highway/hwy/highway.h [deleted file]
third_party/highway/hwy/highway_export.h [deleted file]
third_party/highway/hwy/highway_test.cc [deleted file]
third_party/highway/hwy/hwy.version [deleted file]
third_party/highway/hwy/nanobenchmark.cc [deleted file]
third_party/highway/hwy/nanobenchmark.h [deleted file]
third_party/highway/hwy/nanobenchmark_test.cc [deleted file]
third_party/highway/hwy/ops/arm_neon-inl.h [deleted file]
third_party/highway/hwy/ops/arm_sve-inl.h [deleted file]
third_party/highway/hwy/ops/emu128-inl.h [deleted file]
third_party/highway/hwy/ops/generic_ops-inl.h [deleted file]
third_party/highway/hwy/ops/rvv-inl.h [deleted file]
third_party/highway/hwy/ops/scalar-inl.h [deleted file]
third_party/highway/hwy/ops/set_macros-inl.h [deleted file]
third_party/highway/hwy/ops/shared-inl.h [deleted file]
third_party/highway/hwy/ops/wasm_128-inl.h [deleted file]
third_party/highway/hwy/ops/wasm_256-inl.h [deleted file]
third_party/highway/hwy/ops/x86_128-inl.h [deleted file]
third_party/highway/hwy/ops/x86_256-inl.h [deleted file]
third_party/highway/hwy/ops/x86_512-inl.h [deleted file]
third_party/highway/hwy/per_target.cc [deleted file]
third_party/highway/hwy/per_target.h [deleted file]
third_party/highway/hwy/print-inl.h [deleted file]
third_party/highway/hwy/print.cc [deleted file]
third_party/highway/hwy/print.h [deleted file]
third_party/highway/hwy/targets.cc [deleted file]
third_party/highway/hwy/targets.h [deleted file]
third_party/highway/hwy/targets_test.cc [deleted file]
third_party/highway/hwy/tests/arithmetic_test.cc [deleted file]
third_party/highway/hwy/tests/blockwise_shift_test.cc [deleted file]
third_party/highway/hwy/tests/blockwise_test.cc [deleted file]
third_party/highway/hwy/tests/combine_test.cc [deleted file]
third_party/highway/hwy/tests/compare_test.cc [deleted file]
third_party/highway/hwy/tests/compress_test.cc [deleted file]
third_party/highway/hwy/tests/convert_test.cc [deleted file]
third_party/highway/hwy/tests/crypto_test.cc [deleted file]
third_party/highway/hwy/tests/demote_test.cc [deleted file]
third_party/highway/hwy/tests/float_test.cc [deleted file]
third_party/highway/hwy/tests/hwy_gtest.h [deleted file]
third_party/highway/hwy/tests/if_test.cc [deleted file]
third_party/highway/hwy/tests/interleaved_test.cc [deleted file]
third_party/highway/hwy/tests/list_targets.cc [deleted file]
third_party/highway/hwy/tests/logical_test.cc [deleted file]
third_party/highway/hwy/tests/mask_mem_test.cc [deleted file]
third_party/highway/hwy/tests/mask_test.cc [deleted file]
third_party/highway/hwy/tests/memory_test.cc [deleted file]
third_party/highway/hwy/tests/mul_test.cc [deleted file]
third_party/highway/hwy/tests/reduction_test.cc [deleted file]
third_party/highway/hwy/tests/reverse_test.cc [deleted file]
third_party/highway/hwy/tests/shift_test.cc [deleted file]
third_party/highway/hwy/tests/swizzle_test.cc [deleted file]
third_party/highway/hwy/tests/test_util-inl.h [deleted file]
third_party/highway/hwy/tests/test_util.cc [deleted file]
third_party/highway/hwy/tests/test_util.h [deleted file]
third_party/highway/hwy/tests/test_util_test.cc [deleted file]
third_party/highway/libhwy-contrib.pc.in [deleted file]
third_party/highway/libhwy-test.pc.in [deleted file]
third_party/highway/libhwy.pc.in [deleted file]
third_party/highway/preamble.js.lds [deleted file]
third_party/highway/run_tests.bat [deleted file]
third_party/highway/run_tests.sh [deleted file]
third_party/lcms2.cmake
third_party/skcms.cmake
third_party/testing.cmake
tools/BUILD [new file with mode: 0644]
tools/CMakeLists.txt
tools/README.cjpeg_hdr.md [deleted file]
tools/args.h
tools/benchmark/benchmark_args.cc
tools/benchmark/benchmark_args.h
tools/benchmark/benchmark_codec.cc
tools/benchmark/benchmark_codec.h
tools/benchmark/benchmark_codec_avif.cc
tools/benchmark/benchmark_codec_avif.h
tools/benchmark/benchmark_codec_custom.cc
tools/benchmark/benchmark_codec_custom.h
tools/benchmark/benchmark_codec_jpeg.cc
tools/benchmark/benchmark_codec_jpeg.h
tools/benchmark/benchmark_codec_jxl.cc
tools/benchmark/benchmark_codec_jxl.h
tools/benchmark/benchmark_codec_png.cc
tools/benchmark/benchmark_codec_png.h
tools/benchmark/benchmark_codec_webp.cc
tools/benchmark/benchmark_codec_webp.h
tools/benchmark/benchmark_file_io.cc
tools/benchmark/benchmark_file_io.h
tools/benchmark/benchmark_stats.cc
tools/benchmark/benchmark_stats.h
tools/benchmark/benchmark_utils.cc
tools/benchmark/benchmark_utils.h
tools/benchmark/benchmark_xl.cc
tools/box/CMakeLists.txt [deleted file]
tools/box/box.cc [deleted file]
tools/box/box.h [deleted file]
tools/box/box_list_main.cc [deleted file]
tools/box/box_test.cc [deleted file]
tools/build_cleaner.py [deleted file]
tools/butteraugli_main.cc
tools/cjpeg_hdr.cc [deleted file]
tools/cjpegli.cc [new file with mode: 0644]
tools/cjxl_fuzzer.cc
tools/cjxl_main.cc
tools/cmdline.cc
tools/cmdline.h
tools/codec_config.h
tools/color_encoding_fuzzer.cc
tools/comparison_viewer/CMakeLists.txt
tools/comparison_viewer/codec_comparison_window.cc
tools/comparison_viewer/codec_comparison_window.h
tools/comparison_viewer/codec_comparison_window.ui
tools/comparison_viewer/compare_codecs.cc
tools/comparison_viewer/compare_images.cc
tools/comparison_viewer/image_loading.cc
tools/comparison_viewer/image_loading.h
tools/comparison_viewer/settings.cc
tools/comparison_viewer/settings.h
tools/comparison_viewer/split_image_renderer.cc
tools/comparison_viewer/split_image_renderer.h
tools/comparison_viewer/split_image_view.cc
tools/comparison_viewer/split_image_view.h
tools/comparison_viewer/split_image_view.ui
tools/conformance/conformance.py
tools/conformance/lcms2.py
tools/decode_and_encode.cc
tools/decode_basic_info_fuzzer.cc
tools/djpegli.cc [new file with mode: 0644]
tools/djxl_fuzzer.cc
tools/djxl_fuzzer_corpus.cc [moved from tools/fuzzer_corpus.cc with 87% similarity]
tools/djxl_fuzzer_test.cc
tools/djxl_main.cc
tools/fast_lossless/.gitignore [moved from experimental/fast_lossless/.gitignore with 100% similarity]
tools/fast_lossless/README.md [new file with mode: 0644]
tools/fast_lossless/build-android.sh [moved from experimental/fast_lossless/build-android.sh with 88% similarity]
tools/fast_lossless/build.sh [moved from experimental/fast_lossless/build.sh with 79% similarity]
tools/fast_lossless/cross_compile_aarch64.sh [new file with mode: 0755]
tools/fast_lossless/fast_lossless_main.cc [moved from experimental/fast_lossless/fast_lossless_main.cc with 61% similarity]
tools/fast_lossless/pam-input.h [moved from experimental/fast_lossless/pam-input.h with 98% similarity]
tools/fields_fuzzer.cc
tools/file_io.cc [deleted file]
tools/file_io.h
tools/flicker_test/CMakeLists.txt
tools/flicker_test/main.cc
tools/flicker_test/parameters.cc
tools/flicker_test/parameters.h
tools/flicker_test/setup.cc
tools/flicker_test/setup.h
tools/flicker_test/setup.ui
tools/flicker_test/split_view.cc
tools/flicker_test/split_view.h
tools/flicker_test/test_window.cc
tools/flicker_test/test_window.h
tools/flicker_test/test_window.ui
tools/fuzzer_stub.cc
tools/hdr/README.md
tools/hdr/display_to_hlg.cc
tools/hdr/exr_to_pq.cc [new file with mode: 0644]
tools/hdr/generate_lut_template.cc
tools/hdr/image_utils.h [new file with mode: 0644]
tools/hdr/local_tone_map.cc [new file with mode: 0644]
tools/hdr/pq_to_hlg.cc
tools/hdr/render_hlg.cc
tools/hdr/texture_to_cube.cc
tools/hdr/tone_map.cc
tools/icc_codec_fuzzer.cc
tools/icc_detect/icc_detect.h
tools/icc_detect/icc_detect_empty.cc
tools/icc_detect/icc_detect_win32.cc
tools/icc_detect/icc_detect_x11.cc
tools/jni/org/jpeg/jpegxl/wrapper/Decoder.java
tools/jni/org/jpeg/jpegxl/wrapper/decoder_jni.cc
tools/jpegli_dec_fuzzer.cc [new file with mode: 0644]
tools/jpegli_dec_fuzzer_corpus.cc [new file with mode: 0644]
tools/jxl_from_tree.cc
tools/jxlinfo.c
tools/libjxl_test.c
tools/optimizer/apply_simplex.py [new file with mode: 0755]
tools/optimizer/simplex_fork.py
tools/optimizer/update_jpegli_global_scale.py [new file with mode: 0755]
tools/rans_fuzzer.cc
tools/scripts/bisector [moved from tools/bisector with 98% similarity]
tools/scripts/build_cleaner.py [new file with mode: 0755]
tools/scripts/build_stats.py [moved from tools/build_stats.py with 92% similarity]
tools/scripts/check_author.py [moved from tools/check_author.py with 93% similarity]
tools/scripts/cjxl_bisect_bpp [moved from tools/cjxl_bisect_bpp with 89% similarity]
tools/scripts/cjxl_bisect_size [moved from tools/cjxl_bisect_size with 86% similarity]
tools/scripts/demo_progressive_saliency_encoding.py [moved from tools/demo_progressive_saliency_encoding.py with 100% similarity]
tools/scripts/jpegli_tools_test.sh [new file with mode: 0644]
tools/scripts/jxl-eval.sh [new file with mode: 0755]
tools/scripts/ossfuzz-build.sh [moved from tools/ossfuzz-build.sh with 98% similarity]
tools/scripts/progressive_saliency.conf [moved from tools/progressive_saliency.conf with 100% similarity]
tools/scripts/progressive_sizes.sh [moved from tools/progressive_sizes.sh with 84% similarity]
tools/scripts/reference_zip.sh [moved from tools/reference_zip.sh with 100% similarity]
tools/scripts/roundtrip_test.sh [moved from tools/roundtrip_test.sh with 51% similarity]
tools/scripts/test_cost-arm64-lowprecision.zip [new file with mode: 0644]
tools/scripts/test_cost-arm64.zip [new file with mode: 0644]
tools/scripts/test_cost-armhf.zip [new file with mode: 0644]
tools/scripts/test_cost-i386.zip [new file with mode: 0644]
tools/scripts/transform_sources_list.py [new file with mode: 0644]
tools/set_from_bytes_fuzzer.cc
tools/speed_stats.cc
tools/ssimulacra2.cc [new file with mode: 0644]
tools/ssimulacra2.h [new file with mode: 0644]
tools/ssimulacra2_main.cc [new file with mode: 0644]
tools/ssimulacra_main.cc
tools/thread_pool_internal.h [new file with mode: 0644]
tools/transforms_fuzzer.cc
tools/upscaling_coefficients/upscaler_demo.py
tools/viewer/CMakeLists.txt
tools/viewer/load_jxl.cc
tools/viewer/load_jxl.h
tools/viewer/main.cc
tools/viewer/viewer_window.cc
tools/viewer/viewer_window.h
tools/wasm_demo/CMakeLists.txt [new file with mode: 0644]
tools/wasm_demo/README.md [new file with mode: 0644]
tools/wasm_demo/build_site.py [new file with mode: 0644]
tools/wasm_demo/client_worker.js [new file with mode: 0644]
tools/wasm_demo/jxl_decoder.cc [moved from tools/jxl_emcc.cc with 64% similarity]
tools/wasm_demo/jxl_decoder.h [new file with mode: 0644]
tools/wasm_demo/jxl_decoder_test.js [new file with mode: 0644]
tools/wasm_demo/jxl_decompressor.cc [new file with mode: 0644]
tools/wasm_demo/jxl_decompressor.h [new file with mode: 0644]
tools/wasm_demo/manual_decode_demo.html [new file with mode: 0644]
tools/wasm_demo/netlify.toml [new file with mode: 0644]
tools/wasm_demo/netlify/edge-functions/precompressed.ts [new file with mode: 0644]
tools/wasm_demo/no_png.cc [new file with mode: 0644]
tools/wasm_demo/no_png.h [new file with mode: 0644]
tools/wasm_demo/one_line_demo.html [new file with mode: 0644]
tools/wasm_demo/one_line_demo_with_console.html [new file with mode: 0644]
tools/wasm_demo/service_worker.js [new file with mode: 0644]
tools/xyb_range.cc

diff --git a/.bazelignore b/.bazelignore
new file mode 100644 (file)
index 0000000..912eacc
--- /dev/null
@@ -0,0 +1 @@
+third_party
index abccf4e..06c9875 100644 (file)
@@ -1,6 +1,4 @@
 # Disabled checks:
-# - google-readability-todo: We don't use the google TODO format.
-#
 # - modernize-deprecated-headers: We don't use std:: versions of the standard
 #   types and functions like size_t or printf, so we should include <stdio.h>
 #   instead <cstdio>.
@@ -27,7 +25,6 @@ Checks: >-
   modernize-*,
   performance-*,
   readability-*,
-  -google-readability-todo,
   -modernize-deprecated-headers,
   -modernize-return-braced-init-list,
   -modernize-use-auto,
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644 (file)
index 0000000..6ccfc33
--- /dev/null
@@ -0,0 +1,14 @@
+<!-- Thank you for considering a contribution to `libjxl`! -->
+
+### Description
+
+<!-- Please provide a brief description of the changes in this PR and any additional context (e.g., why these changes were made, related issues, etc.). -->
+
+### Pull Request Checklist
+
+- [ ] **CLA Signed**: Have you signed the [Contributor License Agreement](https://code.google.com/legal/individual-cla-v1.0.html) (individual or corporate, as appropriate)? Only contributions from signed contributors can be accepted.
+- [ ] **Authors**: Have you considered adding your name to the [AUTHORS](AUTHORS) file?
+- [ ] **Code Style**: Have you ensured your code adheres to the project's coding style guidelines? You can use `./ci.sh lint` for automatic code formatting.
+
+
+Please review the full [contributing guidelines](https://github.com/libjxl/libjxl/blob/main/CONTRIBUTING.md) for more details.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644 (file)
index 0000000..9756cee
--- /dev/null
@@ -0,0 +1,21 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "weekly"
+
+  - package-ecosystem: pip
+    directory: /doc/sphinx
+    schedule:
+      interval: daily
index e8b28b9..88b7dc6 100644 (file)
@@ -7,6 +7,7 @@
 
 name: Build/Test
 on:
+  merge_group:
   push:
     branches:
       - main
@@ -14,29 +15,40 @@ on:
   pull_request:
     types: [opened, reopened, labeled, synchronize]
 
+permissions:
+  contents: read
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
   ubuntu_build:
-    name: Ubuntu Build ${{ matrix.name }}
+    name: ${{ startsWith(matrix.os, 'macos-') && 'MacOS' || 'Ubuntu' }} Build ${{ matrix.name }}
     runs-on: ${{ matrix.os || 'ubuntu-latest' }}
     strategy:
+      fail-fast: false
       matrix:
         # We have one job per "name" in the matrix. Attributes are set on the
         # specific job names.
         name: [release, debug, asan, msan, scalar]
         include:
           - name: release
+            mode: release
+            run_bench: true
             test_in_pr: true
+            cmake_args: >-
+              -DJPEGXL_TEST_TOOLS=ON
+              -DJPEGLI_LIBJPEG_LIBRARY_VERSION="8.2.2"
+              -DJPEGLI_LIBJPEG_LIBRARY_SOVERSION="8"
             # Track static stack size on build and check it doesn't exceed 3 kB.
             env_stack_size: 1
-            max_stack: 3000
+            max_stack: 2400
             # Conformance tooling test requires numpy.
-            apt_pkgs: graphviz python3-numpy
+            apt_pkgs: doxygen graphviz python3-numpy
           - name: lowprecision
             mode: release
+            run_bench: true
             test_in_pr: true
             cmake_args: -DCMAKE_CXX_FLAGS=-DJXL_HIGH_PRECISION=0
           - name: debug
@@ -46,9 +58,10 @@ jobs:
           # Build scalar-only hwy instructions.
           - name: scalar
             mode: release
-            cxxflags: -DHWY_COMPILE_ONLY_SCALAR
+            cxxflags: -DHWY_COMPILE_ONLY_SCALAR -DFJXL_ENABLE_AVX2=0 -DFJXL_ENABLE_AVX512=0
           # Disabling optional features to speed up msan build a little bit.
           - name: msan
+            os: ubuntu-20.04
             skip_install: true
             cmake_args: >-
               -DJPEGXL_ENABLE_DEVTOOLS=OFF -DJPEGXL_ENABLE_PLUGINS=OFF
@@ -56,9 +69,8 @@ jobs:
           - name: asan
             skip_install: true
           - name: coverage
-            apt_pkgs: gcovr
-            # Coverage builds require a bit more RAM.
             env_test_stack_size: 2048
+            skip_install: true
           # Build with support for decoding to JPEG bytes disabled. Produces a
           # smaller build if only decoding to pixels is needed.
           - name: release-nojpeg
@@ -68,39 +80,81 @@ jobs:
               -DJPEGXL_ENABLE_TRANSCODE_JPEG=OFF
               -DJPEGXL_ENABLE_PLUGINS=OFF
               -DJPEGXL_ENABLE_VIEWERS=OFF
+          # Build with jxl_cms based on lcms2 library.
+          - name: release-lcms2
+            mode: release
+            cmake_args: >-
+              -DJPEGXL_ENABLE_SKCMS=OFF
+          - name: release-system-lcms2
+            mode: release
+            cmake_args: >-
+              -DJPEGXL_ENABLE_SKCMS=OFF
+              -DJPEGXL_FORCE_SYSTEM_LCMS2=ON
+            apt_pkgs: liblcms2-dev
+            # static build is impossible
+            skip_install: true
+          # Build optimized for binary size, all features not needed for
+          # reconstructing pixels is disabled.
+          - name: release:minimal
+            mode: release
+            cxxflags: -DJXL_DEBUG_ON_ABORT=0
+            cmake_args: >-
+              -DJPEGXL_ENABLE_TRANSCODE_JPEG=OFF
+              -DJPEGXL_ENABLE_BOXES=OFF
+              -DJPEGXL_ENABLE_PLUGINS=OFF
+              -DJPEGXL_ENABLE_VIEWERS=OFF
           # Builds with gcc in release mode
           - name: release:gcc8
+            os: ubuntu-20.04
             mode: release
             apt_pkgs: gcc-8 g++-8
             cmake_args: >-
               -DCMAKE_C_COMPILER=gcc-8 -DCMAKE_CXX_COMPILER=g++-8
-          # Builds with clang-5 in release mode
-          - name: release:clang-5
-            os: ubuntu-18.04
+          # Builds with clang-7 in release mode
+          - name: release:clang-7
+            os: ubuntu-20.04
+            mode: release
+            skip_install: true
+            apt_pkgs: clang-7
+            cc: clang-7
+            cxx: clang++-7
+          - name: release:osx
+            os: macos-latest
             mode: release
-            # TODO(eustas): investigate, why static brotli library is not found.
             skip_install: true
-            apt_pkgs: clang-5.0
             cmake_args: >-
-              -DCMAKE_C_COMPILER=clang-5.0 -DCMAKE_CXX_COMPILER=clang++-5.0
-              -DJPEGXL_ENABLE_PLUGINS=OFF
+              -DCMAKE_FIND_FRAMEWORK=NEVER
 
     env:
       CCACHE_DIR: ${{ github.workspace }}/.ccache
       # Whether we track the stack size.
       STACK_SIZE: ${{ matrix.env_stack_size }}
       TEST_STACK_LIMIT: ${{ matrix.env_test_stack_size }}
-      WILL_RUN_TESTS: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && (matrix.test_in_pr || contains(github.event.pull_request.labels.*.name, 'CI:full'))) }}
+      WILL_TEST: ${{  github.event_name == 'push' || (github.event_name == 'pull_request' && matrix.name != 'coverage' && (matrix.test_in_pr || contains(github.event.pull_request.labels.*.name, 'CI:full'))) }}
+      WILL_BUILD: ${{ github.event_name == 'push' || (github.event_name == 'pull_request' && matrix.name != 'coverage') }}
+      WILL_BENCH: ${{ github.event_name != 'merge_group' && matrix.run_bench }}
+      WILL_DOC: ${{ github.event_name != 'merge_group' && matrix.name == 'release' }}
+      WILL_COV: ${{ github.event_name == 'push' && matrix.name == 'coverage' }}
+      JPEGXL_OPT_DBG: true
+      FASTER_MSAN_BUILD: 1
 
     steps:
-    - name: Install build deps
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
+    - name: Install build deps Ubuntu
+      if: startsWith(matrix.os, 'macos-') == false
       run: |
+        sudo rm -f /var/lib/man-db/auto-update
         sudo apt update
         sudo apt install -y \
           ccache \
-          clang-7 \
+          clang \
           cmake \
-          doxygen \
+          graphviz \
+          imagemagick \
           libbenchmark-dev \
           libbenchmark-tools \
           libbrotli-dev \
@@ -109,6 +163,7 @@ jobs:
           libgtest-dev \
           libgtk2.0-dev  \
           libjpeg-dev \
+          libjpeg-turbo-progs \
           libopenexr-dev \
           libpng-dev \
           libwebp-dev \
@@ -117,29 +172,51 @@ jobs:
           xvfb \
           ${{ matrix.apt_pkgs }} \
         #
-        echo "CC=clang-7" >> $GITHUB_ENV
-        echo "CXX=clang++-7" >> $GITHUB_ENV
+        echo "CC=${{ matrix.cc || 'clang' }}" >> $GITHUB_ENV
+        echo "CXX=${{ matrix.cxx || 'clang++' }}" >> $GITHUB_ENV
+    - name: Install build deps MacOS
+      if: startsWith(matrix.os, 'macos-')
+      run: |
+        # Should be already installed:
+        #  brew install brotli giflib jpeg-turbo libpng zlib
+        # Not required, since we skip building documentation
+        #  brew install doxygen
+        brew install binutils ccache coreutils google-benchmark googletest ninja sdl2
+
     - name: Checkout the source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: true
         fetch-depth: 2
 
+    - name: Setup the Homebrew prefixes
+      if: startsWith(matrix.os, 'macos-')
+      run: |
+          CMAKE_PREFIX_PATH=`brew --prefix brotli`:`brew --prefix giflib`:`brew --prefix google-benchmark`:`brew --prefix jpeg-turbo`:`brew --prefix libpng`:`brew --prefix sdl2`:`brew --prefix zlib`
+          echo "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" >> $GITHUB_ENV
+
+    - name: Suppress doxygen target
+      if: matrix.name != 'release'
+      run: |
+        echo "TARGETS=all" >> $GITHUB_ENV
+
     - name: Setup the LLVM source path
       if: matrix.name == 'msan'
       run: |
         LLVM_ROOT=${GITHUB_WORKSPACE}/llvm_root
         mkdir -p ${LLVM_ROOT}
         echo "LLVM_ROOT=${LLVM_ROOT}" >> $GITHUB_ENV
+
     - name: Cache LLVM sources
       if: matrix.name == 'msan'
-      uses: actions/cache@v2
+      uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84 # v3.3.2
       with:
         path: ${{ env.LLVM_ROOT }}
         key: llvm
+
     - name: Checkout the LLVM source
       if: matrix.name == 'msan'
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: false
         repository: llvm/llvm-project
@@ -148,16 +225,22 @@ jobs:
 
     - name: Sphinx dependencies
       # Dependencies for sphinx HTML documentation
-      if: matrix.name == 'release'
+      if: env.WILL_DOC == 'true'
       run: |
         pip3 install -r doc/sphinx/requirements.txt
+
+    - name: Install gcovr
+      if: env.WILL_COV == 'true'
+      run: pip install gcovr
+
     - name: Git environment
       id: git-env
       run: |
-        echo "::set-output name=parent::$(git rev-parse ${{ github.sha }}^)"
+        echo "parent=$(git rev-parse ${{ github.sha }}^)" >> $GITHUB_OUTPUT
       shell: bash
+
     - name: ccache
-      uses: actions/cache@v2
+      uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84 # v3.3.2
       with:
         path: ${{ env.CCACHE_DIR }}
         # When the cache hits the key it is not updated, so if this is a rebuild
@@ -167,13 +250,14 @@ jobs:
         key: build-${{ runner.os }}-${{ github.sha }}-${{ matrix.name }}
         restore-keys: |
           build-${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.name }}
+
     - name: Build
-      if: matrix.name != 'coverage' || env.WILL_RUN_TESTS == 'true'
+      if: env.WILL_BUILD == 'true'
       run: |
         mkdir -p ${CCACHE_DIR}
         echo "max_size = 200M" > ${CCACHE_DIR}/ccache.conf
         mode="${{ matrix.mode }}"
-        build_tests=$([ "$WILL_RUN_TESTS" == "true" ] && echo "ON" || echo "OFF")
+        build_tests=$([ "$WILL_TEST" == "true" ] && echo "ON" || echo "OFF")
         [[ -n "${mode}" ]] || mode="${{ matrix.name }}"
         ./ci.sh ${mode} -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
@@ -183,81 +267,83 @@ jobs:
       env:
         SKIP_TEST: 1
         CMAKE_CXX_FLAGS: ${{ matrix.cxxflags }}
+
     - name: Build stats
+      if: env.WILL_BUILD == 'true'
       run: |
         awk '!/^#/ {total[$4]+=($2-$1);cntr[$4]+=1} END {for (key in total) print total[key]/cntr[key] " " key}' build/.ninja_log | sort -n | tail -n 25
+
     - name: ccache stats
       run: ccache --show-stats
+
     - name: Build stats ${{ matrix.name }}
-      if: matrix.mode == 'release' || matrix.name == 'release'
+      if: env.WILL_BUILD == 'true' && matrix.mode == 'release'
       run: |
-        tools/build_stats.py --save build/stats.json \
-          --max-stack ${{ matrix.max_stack || '0' }} \
-          cjxl djxl libjxl.so libjxl_dec.so
+        SHARED_LIB_EXT="${{ startsWith(matrix.os, 'macos-') && 'dylib' || 'so' }}"
+        SELECT_BINUTILS="${{ startsWith(matrix.os, 'macos-') && '--binutils `brew --prefix binutils`/bin/' || '' }}"
+        tools/scripts/build_stats.py --save build/stats.json \
+          --max-stack ${{ matrix.max_stack || '0' }} ${SELECT_BINUTILS} \
+          cjxl djxl libjxl.${SHARED_LIB_EXT} libjxl_dec.${SHARED_LIB_EXT}
+
     # Check that we can build the example project against the installed libs.
     - name: Install and build examples
-      if: |
-        (matrix.mode == 'release' || matrix.name == 'release') &&
-        !matrix.skip_install
+      if: env.WILL_BUILD == 'true' && matrix.mode == 'release' && !matrix.skip_install
       run: |
         set -x
         sudo cmake --build build -- install
         cmake -Bbuild-example -Hexamples -G Ninja
         cmake --build build-example
-        if ldd build-example/decode_oneshot_static | grep libjxl; then
-          echo "decode_oneshot_static is not using the static lib" >&2
-          exit 1
-        fi
         # Test that the built binaries run.
         echo -e -n "PF\n1 1\n-1.0\n\0\0\x80\x3f\0\0\x80\x3f\0\0\x80\x3f" > test.pfm
         build-example/encode_oneshot test.pfm test.jxl
-        build-example/encode_oneshot_static test.pfm test-static.jxl
         build-example/decode_oneshot test.jxl dec.pfm dec.icc
-        build-example/decode_oneshot_static test.jxl dec-static.pfm dec-static.icc
+
     # Run the tests on push and when requested in pull_request.
     - name: Test ${{ matrix.mode }}
-      if: env.WILL_RUN_TESTS == 'true'
+      if: env.WILL_TEST == 'true'
       run: |
         ./ci.sh test ${{ matrix.ctest_args }}
+
     # Print the running time summary for the slowest tests.
     - name: Test runtime stats
+      if: env.WILL_TEST == 'true'
       run: |
         sort build/Testing/Temporary/CTestCostData.txt -k 3 -n | tail -n 20 || true
+
     - name: Build HTML documentation (sphinx/readthetdocs)
-      if: matrix.name == 'release'
+      if: env.WILL_DOC == 'true'
       run: |
         cmake --build build -- rtd-html
+
     - name: Coverage report
-      if: github.event_name == 'push' && matrix.name == 'coverage'
+      if: env.WILL_COV == 'true'
       run: |
         ./ci.sh coverage_report
+
     - name: Coverage upload to Codecov
-      if: github.event_name == 'push' && matrix.name == 'coverage'
-      uses: codecov/codecov-action@v2
+      if: env.WILL_COV == 'true'
+      uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d # v3.1.4
       with:
         flags: unittests
         files: build/coverage.xml
+
     - name: Fast benchmark ${{ matrix.mode }}
-      if: |
-        matrix.name != 'coverage' && (github.event_name == 'push' ||
-        (github.event_name == 'pull_request' && (
-         matrix.test_in_pr ||
-         contains(github.event.pull_request.labels.*.name, 'CI:full'))))
+      if: env.WILL_BENCH == 'true'
       run: |
         STORE_IMAGES=0 ./ci.sh fast_benchmark
+
     # Run gbench once, just to make sure it runs, not for actual benchmarking.
     # This doesn't work on msan because we use gbench library from the system
     # which is not instrumented by msan.
     - name: gbench check
-      if: |
-        matrix.name == 'release' || (
-          github.event_name == 'push' && matrix.name != 'msan')
+      if: env.WILL_BENCH == 'true'
       run: |
         ./ci.sh gbench --benchmark_min_time=0
 
   windows_msys:
     name: Windows MSYS2 / ${{ matrix.msystem }}
     runs-on: windows-latest
+    continue-on-error: ${{ matrix.faulty || false }}
     strategy:
       fail-fast: false
       matrix:
@@ -265,22 +351,26 @@ jobs:
          - msystem: mingw64
          - msystem: clang64
          - msystem: mingw32
-           # TODO(eustas): investigate HWY Mul failures
-           disable_tests: HwyMulTestGroup/HwyMulTest\.TestAllMulHigh/EMU128|HwyMulTestGroup/HwyMulTest\.TestAllMulFixedPoint15/EMU128
+           disable_tests:
+             - ButteraugliTest.Lossless
+             - ButteraugliTest.Distmap
          - msystem: clang32
-           # TODO(eustas): investigate HWY Sort and JXL ANS failures
-           disable_tests: SortTestGroup/SortTest\.TestAllSort/.*|ANSTest\.RandomUnbalancedStreamRoundtrip3|ANSTest\.RandomUnbalancedStreamRoundtripBig
 
     defaults:
       run:
         shell: msys2 {0}
     steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
       - name: Checkout the source
-        uses: actions/checkout@v2
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
         with:
           submodules: true
           fetch-depth: 1
-      - uses: msys2/setup-msys2@v2
+      - uses: msys2/setup-msys2@07aeda7763550b267746a772dcea5e5ac3340b36 # v2
         with:
           msystem: ${{ matrix.msystem }}
           update: true
@@ -317,7 +407,7 @@ jobs:
           github.event_name == 'push' ||
           (github.event_name == 'pull_request' &&
            contains(github.event.pull_request.labels.*.name, 'CI:full'))
-        run: ctest --test-dir build --parallel 2 --output-on-failure -E "${{ matrix.disable_tests }}"
+        run: ctest --test-dir build --parallel 2 --output-on-failure -E "${{ join(matrix.disable_tests, '|') }}"
 
   wasm32_build:
     name: WASM wasm32/${{ matrix.variant }}
@@ -325,17 +415,23 @@ jobs:
     env:
       CCACHE_DIR: ${{ github.workspace }}/.ccache
       BUILD_TARGET: wasm32
-      EM_VERSION: 3.1.1
+      EM_VERSION: 3.1.50
       NODE_VERSION: 18
 
     strategy:
       matrix:
         include:
           - variant: scalar
-          - variant: simd
+          - variant: simd-128
+          - variant: simd-256
 
     steps:
-    - uses: actions/checkout@v2
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: true
         fetch-depth: 1
@@ -343,12 +439,14 @@ jobs:
       shell: bash
       run: |
         set -x
+        sudo rm -f /var/lib/man-db/auto-update
         sudo apt update
         pkgs=(
           # Build dependencies
           ccache
           cmake
           doxygen
+          graphviz
           ninja-build
           pkg-config
         )
@@ -357,10 +455,10 @@ jobs:
     - name: Git environment
       id: git-env
       run: |
-        echo "::set-output name=parent::$(git rev-parse ${{ github.sha }}^)"
+        echo "parent=$(git rev-parse ${{ github.sha }}^)" >> $GITHUB_OUTPUT
       shell: bash
     - name: ccache
-      uses: actions/cache@v2
+      uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84 # v3.3.2
       with:
         path: ${{ env.CCACHE_DIR }}
         key: build-wasm-${{ runner.os }}-${{ github.sha }}-${{ matrix.variant }}
@@ -368,7 +466,7 @@ jobs:
           build-wasm-${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.variant }}
 
     - name: Install node
-      uses: actions/setup-node@v3
+      uses: actions/setup-node@8f152de45cc393bb48ce5d89d36b731f54556e65 # v4.0.0
       with:
         node-version: ${{env.NODE_VERSION}}
 
@@ -376,7 +474,7 @@ jobs:
       run: which node >> $HOME/.base_node_path
 
     - name: Install emsdk
-      uses: mymindstorm/setup-emsdk@v11
+      uses: mymindstorm/setup-emsdk@ab889da2abbcbb280f91ec4c215d3bb4f3a8f775 # v12
       # TODO(deymo): We could cache this action but it doesn't work when running
       # in a matrix.
       with:
@@ -385,7 +483,7 @@ jobs:
 
     - name: Set EMSDK node version
       run: |
-        echo "NODE_JS='$(cat $HOME/.base_node_path)'" >> $EM_CONFIG
+        echo "NODE_JS='$(cat $HOME/.base_node_path)'" >> $EMSDK/.emscripten
         emsdk construct_env
 
     # TODO(deymo): Build and install other dependencies like libpng, libjpeg,
@@ -394,14 +492,27 @@ jobs:
       run: |
         mkdir -p ${CCACHE_DIR}
         echo "max_size = 200M" > ${CCACHE_DIR}/ccache.conf
-        if [[ "${{ matrix.variant }}" == "simd" ]]; then
+        if [[ "${{ matrix.variant }}" == "simd-128" ]]; then
           export ENABLE_WASM_SIMD=1
         fi
+        if [[ "${{ matrix.variant }}" == "simd-256" ]]; then
+          export ENABLE_WASM_SIMD=2
+        fi
         ./ci.sh release \
           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-          -DCMAKE_C_COMPILER_LAUNCHER=ccache
+          -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+          -DJPEGXL_ENABLE_BENCHMARK=OFF \
+          -DJPEGXL_ENABLE_DEVTOOLS=OFF \
+          -DJPEGXL_ENABLE_DOXYGEN=OFF \
+          -DJPEGXL_ENABLE_EXAMPLES=OFF \
+          -DJPEGXL_ENABLE_JNI=OFF \
+          -DJPEGXL_ENABLE_MANPAGES=OFF \
+          -DJPEGXL_ENABLE_PLUGINS=OFF \
+          -DJPEGXL_ENABLE_TOOLS=OFF \
+          -DJPEGXL_ENABLE_VIEWERS=OFF
       env:
         SKIP_TEST: 1
+        TARGETS: all
     - name: ccache stats
       run: ccache --show-stats
 
@@ -412,3 +523,36 @@ jobs:
          contains(github.event.pull_request.labels.*.name, 'CI:full'))
       run: |
         ./ci.sh test
+
+  bazel:
+    name: Bazel
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
+      - name: Checkout the source
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Patch
+        run: |
+          cd third_party/highway
+          git fetch origin 31fbbd7ce1e4179a32d86688cd67316556f582bf
+          git checkout 31fbbd7ce1e4179a32d86688cd67316556f582bf
+          git apply ${{ github.workspace }}/.github/workflows/highway.patch
+      - name: Build
+        run: bazel build -c opt ...:all
+      - name: Test
+        if: |
+          github.event_name == 'push' ||
+          (github.event_name == 'pull_request' &&
+           contains(github.event.pull_request.labels.*.name, 'CI:full'))
+        run: bazel test -c opt --test_output=errors ...:all
index 5b53720..84e97f5 100644 (file)
@@ -7,6 +7,7 @@
 
 name: Build/Test Cross
 on:
+  merge_group:
   push:
     branches:
       - main
@@ -14,84 +15,91 @@ on:
   pull_request:
     types: [opened, reopened, labeled, synchronize]
 
+permissions:
+  contents: read
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
-  cross_compile_ubuntu:
-    name: Cross-compiling ${{ matrix.build_target }} ${{ matrix.variant }} 
+  compile:
+    name: Cross-compiling ${{ matrix.identifier }}
     runs-on: [ubuntu-22.04]
     container:
-      image: debian:bullseye
+      image: debian:bookworm
     strategy:
       fail-fast: false
       matrix:
+        identifier: [arm64, arm64-sve, arm64-lowprecision, armhf, i386]
         include:
           - arch: arm64
+            identifier: arm64
             build_target: aarch64-linux-gnu
             cmake_args:
              - -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-aarch64-static
 
           - arch: arm64
-            variant: SVE
+            identifier: arm64-sve
             build_target: aarch64-linux-gnu
             cmake_args:
              - -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-aarch64-static
              - -DJPEGXL_ENABLE_OPENEXR=off
              - -DJPEGXL_ENABLE_SIZELESS_VECTORS=on
+             - -DJPEGXL_WARNINGS_AS_ERRORS=off
             cmake_flags: -march=armv8-a+sve
             c_compiler: aarch64-linux-gnu-gcc
             cxx_compiler: aarch64-linux-gnu-g++
             disable_tests: true
 
           - arch: arm64
-            variant: lowprecision
+            identifier: arm64-lowprecision
             build_target: aarch64-linux-gnu
             cmake_args:
              - -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-aarch64-static
              - -DCMAKE_CXX_FLAGS=-DJXL_HIGH_PRECISION=0
 
           - arch: armhf
+            identifier: armhf
             build_target: arm-linux-gnueabihf
             cmake_args: [-DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-arm-static]
 
           - arch: i386
+            identifier: i386
             test_in_pr: true
             build_target: i686-linux-gnu
 
     env:
       BUILD_DIR: build
-      WILL_RUN_TESTS: ${{ (github.event_name == 'push' || (github.event_name == 'pull_request' && (matrix.test_in_pr || contains(github.event.pull_request.labels.*.name, 'CI:full')))) && !matrix.disable_tests }}
+      WILL_RUN_TESTS: ${{ (github.event_name == 'push' || (github.event_name == 'pull_request' && (matrix.test_in_pr || contains(github.event.pull_request.labels.*.name, 'CI:full')))) }}
 
     steps:
-    - name: Setup apt
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
+    - name: Warmup apt
       shell: bash
       run: |
         set -x
+        rm -f /var/lib/man-db/auto-update
         apt-get update -y
-        apt-get install -y ca-certificates debian-ports-archive-keyring
-
-        dpkg --add-architecture "${{ matrix.arch }}"
+        apt-get install -y ca-certificates debian-ports-archive-keyring git python3
 
-        # Update the sources.list with the split of supported architectures.
-        bkplist="/etc/apt/sources.list.bkp"
-        mv /etc/apt/sources.list "${bkplist}"
-
-        newlist="/etc/apt/sources.list"
-        rm -f "${newlist}"
+    - name: Checkout the source
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      with:
+        submodules: true
+        fetch-depth: 1
 
-        main_list="amd64,${{ matrix.arch }}"
-        port_list=""
-        if [[ "${{ matrix.arch }}" == "i386" ]]; then
-          main_list="amd64,i386"
-        else
-          port_list="${{ matrix.arch }}"
-        fi
+    - name: Setup apt
+      shell: bash
+      run: |
+        set -x
+        dpkg --add-architecture "${{ matrix.arch }}"
 
-        grep -v -E '^#' "${bkplist}" |
-          sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
-          | tee -a "${newlist}"
+        python3 ./tools/scripts/transform_sources_list.py "amd64,${{ matrix.arch }}"
 
     - name: Install build deps
       shell: bash
@@ -102,20 +110,20 @@ jobs:
           # Build dependencies
           cmake
           doxygen
-          git
           graphviz
           ninja-build
           pkg-config
           qemu-user-static
+          unzip
           xdg-utils
           xvfb
 
           # Toolchain for cross-compiling.
-          clang-11
+          clang-14
           g++-aarch64-linux-gnu
           libc6-dev-${{ matrix.arch }}-cross
-          libstdc++-10-dev-${{ matrix.arch }}-cross
-          libstdc++-10-dev:${{ matrix.arch }}
+          libstdc++-12-dev-${{ matrix.arch }}-cross
+          libstdc++-12-dev:${{ matrix.arch }}
 
           # Dependencies
           libbrotli-dev:${{ matrix.arch }}
@@ -131,10 +139,6 @@ jobs:
           # GTK plugins
           libgdk-pixbuf2.0-dev:${{ matrix.arch }}
           libgtk2.0-dev:${{ matrix.arch }}
-
-          # QT
-          libqt5x11extras5-dev:${{ matrix.arch }}
-          qtbase5-dev:${{ matrix.arch }}
         )
         if [[ "${{ matrix.build_target }}" != "x86_64-linux-gnu" ]]; then
           pkgs+=(
@@ -152,13 +156,9 @@ jobs:
           )
         fi
         DEBIAN_FRONTEND=noninteractive apt install -y "${pkgs[@]}"
-        echo "CC=${{ matrix.c_compiler || 'clang-11' }}" >> $GITHUB_ENV
-        echo "CXX=${{ matrix.cxx_compiler || 'clang++-11' }}" >> $GITHUB_ENV
-    - name: Checkout the source
-      uses: actions/checkout@v2
-      with:
-        submodules: true
-        fetch-depth: 1
+        echo "CC=${{ matrix.c_compiler || 'clang-14' }}" >> $GITHUB_ENV
+        echo "CXX=${{ matrix.cxx_compiler || 'clang++-14' }}" >> $GITHUB_ENV
+
     - name: Build
       run: |
         CMAKE_FLAGS="${{ matrix.cmake_flags }}" ./ci.sh release \
@@ -168,16 +168,147 @@ jobs:
       env:
         SKIP_TEST: 1
         BUILD_TARGET: ${{ matrix.build_target }}
-    - name: Build stats ${{ matrix.build_target }}
+        TARGETS: ${{ env.WILL_RUN_TESTS == 'true' && 'all_tests cjxl djxl libjxl.so libjxl_dec.so' || 'all' }}
+
+    - name: Build stats
       run: |
-        tools/build_stats.py --save build/stats.json \
+        tools/scripts/build_stats.py --save build/stats.json \
           --binutils ${{ matrix.build_target }}- \
           --max-stack ${{ matrix.max_stack || '0' }} \
           cjxl djxl libjxl.so libjxl_dec.so
-    # Run the tests on push and when requested in pull_request.
-    - name: Test
+
+    - name: Prepare artefacts
       if: env.WILL_RUN_TESTS == 'true'
       run: |
-        ./ci.sh test
-      env:
-        BUILD_TARGET: ${{ matrix.build_target }}
+        find ./build -regextype egrep -type f -regex '.*\.(a|h|jar|log|o)'
+        find ./build -type f -executable > executable.lst
+        cp /etc/apt/sources.list.d/debian.sources ./
+
+    - name: Test ranging
+      if: ${{ !matrix.disable_tests }}
+      run: |
+          mkdir -p ./build/Testing/Temporary
+          unzip ./tools/scripts/test_cost-${{ matrix.identifier }}.zip -d ./build/Testing/Temporary
+
+    - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
+      if: env.WILL_RUN_TESTS == 'true'
+      with:
+        name: cross_binary-${{ matrix.identifier }}
+        path: |
+          build/
+          ci.sh
+          debian.sources
+          executable.lst
+          testdata/
+        retention-days: 1
+
+  test:
+    name: Testing ${{ matrix.identifier }} shard ${{ matrix.shard_number }}
+    needs: compile
+    runs-on: [ubuntu-22.04]
+    container:
+      image: debian:bookworm
+    strategy:
+      fail-fast: false
+      matrix:
+        shard_number: [0, 1, 2, 3, 4, 5, 6, 7]
+        identifier: [arm64, arm64-lowprecision, armhf, i386]
+        include:
+          - arch: arm64
+          - identifier: arm64
+            last_shard: 8
+
+          #- arch: arm64
+          #- identifier: arm64-sve
+          #  last_shard: 8
+
+          - arch: arm64
+            identifier: arm64-lowprecision
+            last_shard: 8
+
+          - arch: armhf
+            identifier: armhf
+            last_shard: 8
+
+          - arch: i386
+            identifier: i386
+            test_in_pr: true
+            last_shard: 4
+
+    env:
+      BUILD_DIR: build
+      UPLOAD_TEST_COST: false
+      LAST_SHARD: ${{ false && 1 || matrix.last_shard}}
+      # Run the tests on push and when requested in pull_request.
+      WILL_RUN_TESTS: ${{ (github.event_name == 'push' || (github.event_name == 'pull_request' && (matrix.test_in_pr || contains(github.event.pull_request.labels.*.name, 'CI:full')))) }}
+
+    steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # v3.0.2
+      if: (matrix.shard_number < env.LAST_SHARD) && (env.WILL_RUN_TESTS == 'true')
+      with:
+        name: cross_binary-${{ matrix.identifier }}
+
+    - name: Setup apt
+      if: (matrix.shard_number < env.LAST_SHARD) && (env.WILL_RUN_TESTS == 'true')
+      shell: bash
+      run: |
+        set -x
+        rm -f /var/lib/man-db/auto-update
+        apt-get update -y
+        apt-get install -y ca-certificates debian-ports-archive-keyring
+
+        dpkg --add-architecture "${{ matrix.arch }}"
+
+        cp ./debian.sources /etc/apt/sources.list.d/
+
+    - name: Install build deps
+      if: (matrix.shard_number < env.LAST_SHARD) && (env.WILL_RUN_TESTS == 'true')
+      shell: bash
+      run: |
+        set -x
+        apt update
+        pkgs=(
+          # Build dependencies
+          cmake
+          qemu-user-static
+
+          # Dependencies
+          libbrotli-dev:${{ matrix.arch }}
+          libgif-dev:${{ matrix.arch }}
+          libjpeg-dev:${{ matrix.arch }}
+          libpng-dev:${{ matrix.arch }}
+          libwebp-dev:${{ matrix.arch }}
+
+          # For OpenEXR:
+          libilmbase-dev:${{ matrix.arch }}
+          libopenexr-dev:${{ matrix.arch }}
+        )
+        DEBIAN_FRONTEND=noninteractive apt install -y "${pkgs[@]}"
+
+    - name: Prepare
+      if: (env.UPLOAD_TEST_COST == 'true') && (matrix.shard_number == 0) && (env.WILL_RUN_TESTS == 'true')
+      run: |
+        rm build/Testing/Temporary/CTestCostData.txt
+
+    - name: Test
+      if: (matrix.shard_number < env.LAST_SHARD) && (env.WILL_RUN_TESTS == 'true')
+      run: |
+        chmod +x ./ci.sh
+        chmod +x `cat executable.lst`
+        ./ci.sh test \
+          -I ${{ matrix.shard_number }},,${{ env.LAST_SHARD }} \
+          -E '(bash_test|conformance_tooling_test|test_jxl_jni_wrapper)'
+
+    - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
+      name: Upload test cost
+      if: (env.UPLOAD_TEST_COST == 'true') && (matrix.shard_number == 0) && (env.WILL_RUN_TESTS == 'true')
+      with:
+        name: test_cost-${{ matrix.identifier }}
+        path: |
+          build/Testing/Temporary/CTestCostData.txt
+        retention-days: 1
diff --git a/.github/workflows/build_test_md.yml b/.github/workflows/build_test_md.yml
new file mode 100644 (file)
index 0000000..e109523
--- /dev/null
@@ -0,0 +1,60 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for building and running tests.
+
+name: Build/Test
+on:
+  pull_request:
+    types: [opened, reopened, labeled, synchronize]
+    paths:
+      - '**.md'
+
+permissions:
+  contents: read
+
+concurrency: 
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  ubuntu-build:
+    name: Ubuntu Build ${{ matrix.name }}
+    # Include all names of required jobs here
+    strategy:
+      matrix:
+        include:
+          - name: release
+          - name: debug
+          - name: scalar
+          - name: asan
+          - name: release-nojpeg
+          - name: release-lcms2
+          - name: release:gcc8
+    runs-on: ubuntu-latest
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
+      - run: 'echo "markdown only changes: no build required"'
+
+  windows_msys:
+    name: Windows MSYS2 / ${{ matrix.msystem }}
+    # Include all msystem of required jobs here
+    strategy:
+      matrix:
+          include:
+           - msystem: clang64
+           - msystem: clang32
+    runs-on: ubuntu-latest
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
+      - run: 'echo "markdown only changes: no build required"'
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644 (file)
index 0000000..49821fe
--- /dev/null
@@ -0,0 +1,116 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# For most projects, this workflow file will not need changing; you simply need
+# to commit it to your repository.
+#
+# You may wish to alter this file to override the set of languages analyzed,
+# or to provide custom queries or build logic.
+#
+# ******** NOTE ********
+# We have attempted to detect the languages in your repository. Please check
+# the `language` matrix defined below to confirm you have the correct set of
+# supported CodeQL languages.
+#
+name: "CodeQL"
+
+on:
+  push:
+    branches: ["main"]
+  pull_request:
+    # The branches below must be a subset of the branches above
+    branches: ["main"]
+  schedule:
+    - cron: "0 0 * * 1"
+
+permissions:
+  contents: read
+
+concurrency: 
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  analyze:
+    name: Analyze
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        language: ["cpp"]
+        # CodeQL supports [ $supported-codeql-languages ]
+        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
+      - name: Checkout repository
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+      # Initializes the CodeQL tools for scanning.
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@74483a38d39275f33fcff5f35b679b5ca4a26a99 # v2.22.5
+        with:
+          languages: ${{ matrix.language }}
+          # If you wish to specify custom queries, you can do so here or in a config file.
+          # By default, queries listed here will override any specified in a config file.
+          # Prefix the list here with "+" to use these queries and those in the config file.
+
+      - name: Install build deps
+        run: |
+          sudo rm -f /var/lib/man-db/auto-update
+          sudo apt update
+          sudo apt install -y \
+            ccache \
+            clang \
+            cmake \
+            doxygen \
+            graphviz \
+            imagemagick \
+            libbenchmark-dev \
+            libbenchmark-tools \
+            libbrotli-dev \
+            libgdk-pixbuf2.0-dev \
+            libgif-dev \
+            libgtest-dev \
+            libgtk2.0-dev  \
+            libjpeg-dev \
+            libjpeg-turbo-progs \
+            libopenexr-dev \
+            libpng-dev \
+            libwebp-dev \
+            ninja-build \
+            pkg-config \
+            xvfb \
+            ${{ matrix.apt_pkgs }} \
+          #
+          echo "CC=${{ matrix.cc || 'clang' }}" >> $GITHUB_ENV
+          echo "CXX=${{ matrix.cxx || 'clang++' }}" >> $GITHUB_ENV
+      - name: Checkout the source
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          submodules: true
+          fetch-depth: 2
+
+
+      - name: Build
+        run: |
+          ./ci.sh opt -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+            -DBUILD_TESTING=OFF
+        env:
+          SKIP_TEST: 1
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@74483a38d39275f33fcff5f35b679b5ca4a26a99 # v2.22.5
+        with:
+          category: "/language:${{matrix.language}}"
index c59dc89..5ce1e3b 100644 (file)
@@ -7,6 +7,7 @@
 
 name: Conformance
 on:
+  merge_group:
   push:
     branches:
       - main
@@ -14,24 +15,36 @@ on:
   pull_request:
     types: [opened, reopened, labeled, synchronize]
 
+permissions:
+  contents: read
+
+env:
+  CONFORMANCE_REPO_HASH: ee6008ef151489a5330cd886b422af8f6ed58881
+  LIBJXL_VERSION: 0.9.0
+  LIBJXL_ABI_VERSION: 0.9
+
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
-  warmup: # If necessary, fetch files just once, before tests are run. 
+  warmup: # If necessary, fetch files just once, before tests are run.
     name: Warmup caches
     runs-on: ubuntu-latest
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Checkout the conformance source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         repository: libjxl/conformance
-        # TODO(eustas): move ref to a global variable / file?
-        ref: a6a44bbbd69830e1dc862174599ce5738a0a414f
+        ref: ${{ env.CONFORMANCE_REPO_HASH }}
         path: conformance
     - name: Cache
-      uses: actions/cache@v2
+      uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84 # v3.3.2
       with:
         path: ${{ github.workspace }}/conformance/.objects
         key: conformance-refs
@@ -64,14 +77,21 @@ jobs:
     env:
       CCACHE_DIR: ${{ github.workspace }}/.ccache
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Install build deps
       run: |
+        sudo rm -f /var/lib/man-db/auto-update
         sudo apt update
         sudo apt install -y \
           ccache \
-          clang-7 \
+          clang \
           cmake \
           doxygen \
+          graphviz \
           libbenchmark-dev \
           libbenchmark-tools \
           libbrotli-dev \
@@ -88,20 +108,20 @@ jobs:
           xvfb \
           ${{ matrix.apt_pkgs }} \
         #
-        echo "CC=clang-7" >> $GITHUB_ENV
-        echo "CXX=clang++-7" >> $GITHUB_ENV
+        echo "CC=clang" >> $GITHUB_ENV
+        echo "CXX=clang++" >> $GITHUB_ENV
     - name: Checkout the jxl source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: true
         fetch-depth: 2
     - name: Git environment
       id: git-env
       run: |
-        echo "::set-output name=parent::$(git rev-parse ${{ github.sha }}^)"
+        echo "parent=$(git rev-parse ${{ github.sha }}^)" >> $GITHUB_OUTPUT
       shell: bash
     - name: ccache
-      uses: actions/cache@v2
+      uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84 # v3.3.2
       with:
         path: ${{ env.CCACHE_DIR }}
         # When the cache hits the key it is not updated, so if this is a rebuild
@@ -116,6 +136,7 @@ jobs:
         mkdir -p ${CCACHE_DIR}
         echo "max_size = 200M" > ${CCACHE_DIR}/ccache.conf
         CMAKE_FLAGS="${{ matrix.cflags }}" \
+        TARGETS="tools/djxl" \
         ./ci.sh ${{ matrix.build_type || 'release' }} -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
           -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
           -DCMAKE_C_COMPILER_LAUNCHER=ccache \
@@ -124,19 +145,23 @@ jobs:
         cp tools/conformance/conformance.py build/tools/conformance
         cp tools/conformance/lcms2.py build/tools/conformance
         cp build/tools/djxl build/tools/conformance
-        cp build/libjxl.so.0.7.0 build/tools/conformance
-        cp build/libjxl_threads.so.0.7.0 build/tools/conformance
+        cp build/lib/libjxl.so.${{ env.LIBJXL_VERSION }} build/tools/conformance
+        cp build/lib/libjxl_cms.so.${{ env.LIBJXL_VERSION }} build/tools/conformance
+        cp build/lib/libjxl_threads.so.${{ env.LIBJXL_VERSION }} build/tools/conformance
+        cp build/lib/libjxl_extras_codec.so.${{ env.LIBJXL_VERSION }} build/tools/conformance
       env:
         SKIP_TEST: 1
-    - uses: actions/upload-artifact@v2
+    - uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
       with:
         name: conformance_binary-${{ matrix.name }}
         path: |
           build/tools/conformance/conformance.py
           build/tools/conformance/lcms2.py
           build/tools/conformance/djxl
-          build/tools/conformance/libjxl.so.0.7.0
-          build/tools/conformance/libjxl_threads.so.0.7.0
+          build/tools/conformance/libjxl.so.${{ env.LIBJXL_VERSION }}
+          build/tools/conformance/libjxl_cms.so.${{ env.LIBJXL_VERSION }}
+          build/tools/conformance/libjxl_threads.so.${{ env.LIBJXL_VERSION }}
+          build/tools/conformance/libjxl_extras_codec.so.${{ env.LIBJXL_VERSION }}
     - name: ccache stats
       run: ccache --show-stats
 
@@ -150,31 +175,38 @@ jobs:
         name: [main_level5, main_level10]
         target: [AVX3, AVX2, SSE4, SSSE3, EMU128, SCALAR, SCALAR_ASAN]
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Install deps
       run: |
         pip install numpy
     - name: Checkout the conformance source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         repository: libjxl/conformance
-        ref: a6a44bbbd69830e1dc862174599ce5738a0a414f
+        ref: ${{ env.CONFORMANCE_REPO_HASH }}
         path: conformance
     - name: Cache
-      uses: actions/cache@v2
+      uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84 # v3.3.2
       with:
         path: ${{ github.workspace }}/conformance/.objects
         key: conformance-refs
     - name: Download and link conformance files
       run: |
         ${{ github.workspace }}/conformance/scripts/download_and_symlink.sh
-    - uses: actions/download-artifact@v2
+    - uses: actions/download-artifact@9bc31d5ccc31df68ecc42ccf4149144866c47d8a # v3.0.2
       with:
         name: conformance_binary-${{ matrix.target }}
     - name: Run conformance tests
       run: |
         chmod +x djxl
-        ln -s libjxl.so.0.7.0 libjxl.so.0.7
-        ln -s libjxl_threads.so.0.7.0 libjxl_threads.so.0.7
+        ln -s libjxl.so.${{ env.LIBJXL_VERSION }} libjxl.so.${{ env.LIBJXL_ABI_VERSION }}
+        ln -s libjxl_cms.so.${{ env.LIBJXL_VERSION }} libjxl_cms.so.${{ env.LIBJXL_ABI_VERSION }}
+        ln -s libjxl_threads.so.${{ env.LIBJXL_VERSION }} libjxl_threads.so.${{ env.LIBJXL_ABI_VERSION }}
+        ln -s libjxl_extras_codec.so.${{ env.LIBJXL_VERSION }} libjxl_extras_codec.so.${{ env.LIBJXL_ABI_VERSION }}
         export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:`pwd`
         python conformance.py \
           --decoder=`pwd`/djxl \
index fb3522e..d7f6295 100644 (file)
@@ -11,49 +11,134 @@ on:
     branches:
       - ci-*-debug
 
+permissions:
+  contents: read
+
 jobs:
-  ubuntu_build:
-    name: Ubuntu Build and SSH
-    runs-on: [ubuntu-latest]
+  cross_compile_ubuntu:
+    name: Cross-compiling ${{ matrix.build_target }} ${{ matrix.variant }}
+    runs-on: [ubuntu-22.04]
+    container:
+      image: debian:bookworm
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - arch: i386
+            build_target: i686-linux-gnu
+
+    env:
+      BUILD_DIR: build
 
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
+    - name: Setup apt
+      shell: bash
+      run: |
+        set -x
+        rm -f /var/lib/man-db/auto-update
+        apt-get update -y
+        apt-get install -y ca-certificates debian-ports-archive-keyring
+
+        dpkg --add-architecture "${{ matrix.arch }}"
+
+        # Update the sources.list with the split of supported architectures.
+        bkplist="/etc/apt/sources.list.bkp"
+        mv /etc/apt/sources.list "${bkplist}"
+
+        newlist="/etc/apt/sources.list"
+        rm -f "${newlist}"
+
+        main_list="amd64,${{ matrix.arch }}"
+        port_list=""
+        if [[ "${{ matrix.arch }}" == "i386" ]]; then
+          main_list="amd64,i386"
+        else
+          port_list="${{ matrix.arch }}"
+        fi
+
+        grep -v -E '^#' "${bkplist}" |
+          sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
+          | tee -a "${newlist}"
+
     - name: Install build deps
+      shell: bash
       run: |
-        sudo apt update
-        sudo apt install -y \
-          ccache \
-          clang-7 \
-          cmake \
-          doxygen \
-          libbrotli-dev \
-          libgdk-pixbuf2.0-dev \
-          libgif-dev \
-          libgtest-dev \
-          libgtk2.0-dev  \
-          libjpeg-dev \
-          libopenexr-dev \
-          libpng-dev \
-          libwebp-dev \
-          ninja-build \
-          pkg-config \
-          xvfb \
-          ${{ matrix.apt_pkgs }} \
-        #
-        echo "CC=clang-7" >> $GITHUB_ENV
-        echo "CXX=clang++-7" >> $GITHUB_ENV
+        set -x
+        rm -f /var/lib/man-db/auto-update
+        apt update
+        pkgs=(
+          # Build dependencies
+          cmake
+          doxygen
+          git
+          graphviz
+          ninja-build
+          pkg-config
+          qemu-user-static
+          xdg-utils
+          xvfb
+
+          # Toolchain for cross-compiling.
+          clang-11
+          g++-aarch64-linux-gnu
+          libc6-dev-${{ matrix.arch }}-cross
+          libstdc++-10-dev-${{ matrix.arch }}-cross
+          libstdc++-10-dev:${{ matrix.arch }}
+
+          # Dependencies
+          libbrotli-dev:${{ matrix.arch }}
+          libgif-dev:${{ matrix.arch }}
+          libjpeg-dev:${{ matrix.arch }}
+          libpng-dev:${{ matrix.arch }}
+          libwebp-dev:${{ matrix.arch }}
+
+          # For OpenEXR:
+          libilmbase-dev:${{ matrix.arch }}
+          libopenexr-dev:${{ matrix.arch }}
+
+          # GTK plugins
+          libgdk-pixbuf2.0-dev:${{ matrix.arch }}
+          libgtk2.0-dev:${{ matrix.arch }}
+        )
+        if [[ "${{ matrix.build_target }}" != "x86_64-linux-gnu" ]]; then
+          pkgs+=(
+            binutils-${{ matrix.build_target }}
+            gcc-${{ matrix.build_target }}
+          )
+        fi
+        if [[ "${{ matrix.arch }}" != "i386" ]]; then
+          pkgs+=(
+            # TCMalloc
+            libgoogle-perftools-dev:${{ matrix.arch }}
+            libgoogle-perftools4:${{ matrix.arch }}
+            libtcmalloc-minimal4:${{ matrix.arch }}
+            libunwind-dev:${{ matrix.arch }}
+          )
+        fi
+        DEBIAN_FRONTEND=noninteractive apt install -y "${pkgs[@]}"
+        echo "CC=${{ matrix.c_compiler || 'clang-11' }}" >> $GITHUB_ENV
+        echo "CXX=${{ matrix.cxx_compiler || 'clang++-11' }}" >> $GITHUB_ENV
     - name: Checkout the source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: true
-        fetch-depth: 2
-    - name: Build
+        fetch-depth: 1
+    - name: Configure
       run: |
-        ./ci.sh $(echo ${{ github.ref }} | sed 's_refs/heads/ci-\([a-z_]*\)-debug_\1_') \
-          -DJPEGXL_FORCE_SYSTEM_BROTLI=ON
+        CMAKE_FLAGS="${{ matrix.cmake_flags }}" ./ci.sh release \
+          -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+          -DJPEGXL_ENABLE_JNI=OFF \
+          ${{ join(matrix.cmake_args, ' ') }}
       env:
-        SKIP_TEST: 1
+        SKIP_BUILD: 1
+        BUILD_TARGET: ${{ matrix.build_target }}
     - name: Setup tmate session
-      uses: mxschmitt/action-tmate@v3
+      uses: mxschmitt/action-tmate@a283f9441d2d96eb62436dc46d7014f5d357ac22 # v3.17
 
 
 
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
new file mode 100644 (file)
index 0000000..46417a7
--- /dev/null
@@ -0,0 +1,36 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Dependency Review Action
+#
+# This Action will scan dependency manifest files that change as part of a Pull Request,
+# surfacing known-vulnerable versions of the packages declared or updated in the PR.
+# Once installed, if the workflow run is marked as required,
+# PRs introducing known-vulnerable packages will be blocked from merging.
+#
+# Source repository: https://github.com/actions/dependency-review-action
+name: 'Dependency Review'
+on: [pull_request]
+
+permissions:
+  contents: read
+
+concurrency: 
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  dependency-review:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
+      - name: 'Checkout Repository'
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      - name: 'Dependency Review'
+        uses: actions/dependency-review-action@fde92acd0840415674c16b39c7d703fc28bc511e # v3.1.2
index 188a4c7..eaa04cd 100644 (file)
@@ -9,6 +9,7 @@
 
 name: CIFuzz
 on:
+  merge_group:
   pull_request:
     types: [opened, reopened, synchronize]
     paths:
@@ -19,7 +20,10 @@ on:
       - '**CMakeLists.txt'
       - .github/workflows/fuzz.yml
 
-concurrency: 
+permissions:
+  contents: read
+
+concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
   cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
@@ -27,8 +31,13 @@ jobs:
   fuzzing:
     runs-on: ubuntu-latest
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Checkout source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       id: checkout
       with:
         # The build_fuzzers action checks out the code to the storage/libjxl
@@ -38,18 +47,18 @@ jobs:
         submodules: true
     - name: Build Fuzzers
       id: build
-      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@master
+      uses: google/oss-fuzz/infra/cifuzz/actions/build_fuzzers@71ecd5d4e4bf9a6edc19c9fa6d2422fb528bca4f # master
       with:
         oss-fuzz-project-name: 'libjxl'
         language: c++
     - name: Run Fuzzers
-      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@master
+      uses: google/oss-fuzz/infra/cifuzz/actions/run_fuzzers@71ecd5d4e4bf9a6edc19c9fa6d2422fb528bca4f # master
       with:
         oss-fuzz-project-name: 'libjxl'
         language: c++
         fuzz-seconds: 600
     - name: Upload Crash
-      uses: actions/upload-artifact@v1
+      uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
       if: failure() && steps.build.outcome == 'success'
       with:
         name: artifacts
diff --git a/.github/workflows/gitlab_mirror.yml b/.github/workflows/gitlab_mirror.yml
new file mode 100644 (file)
index 0000000..72ce463
--- /dev/null
@@ -0,0 +1,53 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for building and running tests.
+
+name: Mirror to GitLab
+
+on:
+  push:
+    branches:
+      - main
+      - v*.*.x
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+
+permissions:
+  contents: read
+
+jobs:
+  mirror:
+    permissions:
+      contents: write  # for Git to git push
+    if: github.repository_owner == 'libjxl'
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
+    - name: Checkout source
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+      with:
+        fetch-depth: 0  # Disable shallow clone
+
+    - name: Set up SSH
+      run: |
+        mkdir -p ~/.ssh/
+        chmod 700 ~/.ssh/
+        echo "${{ secrets.GITLAB_DEPLOY_KEY }}" > ~/.ssh/id_ed25519
+        chmod 400 ~/.ssh/id_ed25519
+        ssh-keyscan gitlab.com >> ~/.ssh/known_hosts
+
+    - name: Push to GitLab
+      env:
+        GIT_SSH_COMMAND: ssh -v -i ~/.ssh/id_ed25519 -o IdentitiesOnly=yes -o StrictHostKeyChecking=no
+      run: |
+        git remote add gitlab git@gitlab.com:wg1/jpeg-xl.git
+        git push gitlab $BRANCH_NAME:$BRANCH_NAME
diff --git a/.github/workflows/highway.patch b/.github/workflows/highway.patch
new file mode 100644 (file)
index 0000000..a63dbbf
--- /dev/null
@@ -0,0 +1,12 @@
+diff --git a/BUILD b/BUILD
+index 438b671..d2777b2 100644
+--- a/BUILD
++++ b/BUILD
+@@ -153,6 +153,7 @@ cc_library(
+         "hwy/detect_compiler_arch.h",  # private
+         "hwy/print.h",
+     ],
++    includes = ["."],
+     compatible_with = [],
+     copts = COPTS,
+     defines = DEFINES,
diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
new file mode 100644 (file)
index 0000000..f4338d2
--- /dev/null
@@ -0,0 +1,69 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Simple workflow for deploying static content to GitHub Pages
+name: Deploy static content to Pages
+
+on:
+  # Runs on pushes targeting the default branch
+  push:
+    branches: ["main"]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  # Single deploy job since we're just deploying
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          submodules: true
+          fetch-depth: 2
+      - name: Install build deps
+        run: |
+          sudo rm -f /var/lib/man-db/auto-update
+          sudo apt update
+          sudo apt install -y \
+            cmake \
+            doxygen \
+            graphviz
+      - name: Sphinx dependencies
+        run: |
+          pip3 install -r doc/sphinx/requirements.txt
+      - name: Build
+        run: |
+          cmake -B build . \
+            -DCMAKE_BUILD_TYPE=RelWithDebInfo \
+            -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+            -DBUILD_TESTING=OFF
+          cmake --build build -- rtd-html
+      - name: Setup Pages
+        uses: actions/configure-pages@v4
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v2
+        with:
+          # Upload entire repository
+          path: './build/rtd'
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v3
index b1214e1..29f4c5c 100644 (file)
@@ -7,36 +7,59 @@
 
 name: PR
 on:
+  merge_group:
   pull_request:
     types: [opened, reopened, synchronize]
 
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
 jobs:
   # Checks that the AUTHORS files is updated with new contributors.
   authors:
     runs-on: [ubuntu-latest]
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Checkout the source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
     - name: Check AUTHORS file
+      # This is an optional check
+      continue-on-error: True
       run:
         ./ci.sh authors
 
   format:
     runs-on: [ubuntu-latest]
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Install build deps
       run: |
+        sudo rm -f /var/lib/man-db/auto-update
         sudo apt update
         sudo apt install -y \
           clang-format \
-          clang-format-7 \
-          clang-format-8 \
-          clang-format-9 \
-          clang-format-10 \
-          clang-format-11 \
+          clang-format-14 \
+          clang-format-15 \
         #
     - name: Checkout the source
-      uses: actions/checkout@v2
-    - name: clang-format
-      run:
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+    - name: Install buildifier
+      run: |
+        eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
+        brew install buildifier
+    - name: lint
+      run: |
+        eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
         ./ci.sh lint >&2
index 4222266..515d2c8 100644 (file)
 
 name: Release build / deploy
 on:
+  merge_group:
   push:
     branches:
       - main
       - v*.*.x
+  pull_request:
+    types: [opened, reopened, labeled, synchronize]
+    paths-ignore:
+      - '**.md'
+      - 'AUTHORS'
   release:
     types: [ published ]
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
 
 jobs:
   ubuntu_static_x86_64:
     name: Release linux x86_64 static
     runs-on: [ubuntu-latest]
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Install build deps
       run: |
+        sudo rm -f /var/lib/man-db/auto-update
         sudo apt update
         sudo apt install -y \
           asciidoc \
           clang \
           cmake \
           doxygen \
+          graphviz \
           libbrotli-dev \
           libgdk-pixbuf2.0-dev \
           libgif-dev \
@@ -49,7 +70,7 @@ jobs:
         echo "CXX=clang++" >> $GITHUB_ENV
 
     - name: Checkout the source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: true
         fetch-depth: 1
@@ -60,29 +81,31 @@ jobs:
       run: |
         ./ci.sh release \
           -DJPEGXL_DEP_LICENSE_DIR=/usr/share/doc \
-          -DJPEGXL_STATIC=ON \
+          -DBUILD_SHARED_LIBS=OFF \
           -DBUILD_TESTING=OFF \
+          -DJPEGXL_ENABLE_JPEGLI_LIBJPEG=OFF \
           -DJPEGXL_ENABLE_VIEWERS=OFF \
           -DJPEGXL_ENABLE_PLUGINS=OFF \
           -DJPEGXL_ENABLE_OPENEXR=OFF \
+          -DJPEGXL_ENABLE_DEVTOOLS=ON \
 
     - name: Package release tarball
       run: |
         cd build
         tar -zcvf ${{ runner.workspace }}/release_file.tar.gz \
-          LICENSE* tools/{cjxl,djxl,benchmark_xl}
+          LICENSE* tools/{cjxl,djxl,benchmark_xl,cjpegli,djpegli,jxlinfo,butteraugli_main,ssimulacra2}
         ln -s ${{ runner.workspace }}/release_file.tar.gz \
           ${{ runner.workspace }}/jxl-linux-x86_64-static-${{ github.event.release.tag_name }}.tar.gz
 
     - name: Upload artifacts
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
       with:
         name: jxl-linux-x86_64-static
         path: ${{ runner.workspace }}/release_file.tar.gz
 
     - name: Upload binaries to release
       if: github.event_name == 'release'
-      uses: AButler/upload-release-assets@v2.0
+      uses: AButler/upload-release-assets@c94805dc72e4b20745f543da0f62eaee7722df7a # v2.0.2
       with:
         files: ${{ runner.workspace }}/jxl-linux-x86_64-static-${{ github.event.release.tag_name }}.tar.gz
         repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -96,76 +119,48 @@ jobs:
       fail-fast: false
       matrix:
         os:
+        - ubuntu:22.04
         - ubuntu:20.04
-        - ubuntu:18.04
-        - debian:buster
         - debian:bullseye
         - debian:bookworm
+        - debian:trixie
         - debian:sid
 
     container:
       image: ${{ matrix.os }}
 
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Set env
       shell: 'bash'
       id: 'env'
       run: |
         artifact_name="jxl-debs-amd64-${matrix_os/:/-}"
         echo ${artifact_name}
-        echo "::set-output name=artifact_name::${artifact_name}"
+        echo "artifact_name=${artifact_name}" >> $GITHUB_OUTPUT
       env:
         matrix_os: ${{ matrix.os }}
 
     - name: Install build deps
       run: |
+        rm -f /var/lib/man-db/auto-update
         apt update
         DEBIAN_FRONTEND=noninteractive apt install -y \
           build-essential \
           devscripts \
         #
 
-    - name: Install git (only 18.04)
-      if: matrix.os == 'ubuntu:18.04'
-        # Ubuntu 18.04 ships with git 2.17 but we need 2.18 or newer for
-        # actions/checkout@v2 to work
-      shell: 'bash'
-      run: |
-        apt install -y \
-          libcurl4-openssl-dev \
-          libexpat1-dev \
-          libssl-dev \
-          wget \
-          zlib1g-dev \
-        #
-        git_version="2.32.0"
-        wget -nv \
-          "https://github.com/git/git/archive/refs/tags/v${git_version}.tar.gz"
-        tar -zxf "v${git_version}.tar.gz"
-        cd "git-${git_version}"
-        make prefix=/usr -j4 install
-
-    - name: Install gcc-8 (only 18.04)
-      if: matrix.os == 'ubuntu:18.04'
-        # Compiler bug workaround: install and use gcc-8
-      shell: 'bash'
-      run: |
-        apt install -y \
-          gcc-8 \
-          g++-8 \
-        #
-        update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100
-        update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100
-        update-alternatives --set g++ /usr/bin/g++-8
-        update-alternatives --set gcc /usr/bin/gcc-8
-
     - name: Set git safe dir
       run: |
         export GIT_CEILING_DIRECTORIES=/__w # only work before git v2.35.2
         git config --global --add safe.directory /__w/libjxl/libjxl
 
     - name: Checkout the source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: true
         fetch-depth: 1
@@ -197,20 +192,6 @@ jobs:
           dch -M --distribution unstable --release ''
         fi
 
-    - name: Install gtest (only 18.04)
-      if: matrix.os == 'ubuntu:18.04'
-        # In Ubuntu 18.04 no package installed the libgtest.a. libgtest-dev
-        # installs the source files only.
-      run: |
-        apt install -y libgtest-dev cmake
-        for prj in googletest googlemock; do
-          (cd /usr/src/googletest/${prj}/ &&
-           cmake CMakeLists.txt -DCMAKE_INSTALL_PREFIX=/usr &&
-           make all install)
-        done
-        # Remove libgmock-dev dependency in Ubuntu 18.04. It doesn't exist there.
-        sed '/libgmock-dev,/d' -i debian/control
-
     - name: Install gmock-dev (debian:sid)
       # gtest-dev cmake depends on gmock-dev, but it is not installed by the
       # package.
@@ -218,11 +199,10 @@ jobs:
       run: |
         apt install -y libgmock-dev
 
-    - name: Remove libjxl-gimp-plugin package (only 18.04)
-      if: matrix.os == 'ubuntu:18.04'
+    - name: Configure hwy
+      if: ${{ github.event_name == 'schedule' || github.event_name == 'release' || contains(github.event.pull_request.labels.*.name, 'CI:full') }}
       run: |
-        # Gimp 2.8 is not supported.
-        sed -i '/Package: libjxl-gimp-plugin/,/^$/d' debian/control
+        echo "HWY_PKG_OPTIONS=" >> $GITHUB_ENV
 
     - name: Build hwy
       run: |
@@ -240,7 +220,7 @@ jobs:
         ./ci.sh debian_stats
 
     - name: Upload artifacts
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
       with:
         name: ${{ steps.env.outputs.artifact_name }}
         path: |
@@ -256,7 +236,7 @@ jobs:
 
     - name: Upload binaries to release
       if: github.event_name == 'release'
-      uses: AButler/upload-release-assets@v2.0
+      uses: AButler/upload-release-assets@c94805dc72e4b20745f543da0f62eaee7722df7a # v2.0.2
       with:
         files: ${{ steps.env.outputs.artifact_name }}-${{ github.event.release.tag_name }}.tar.gz
         repo-token: ${{ secrets.GITHUB_TOKEN }}
@@ -280,13 +260,18 @@ jobs:
       VCPKG_DISABLE_METRICS: 1
 
     steps:
+    - name: Harden Runner
+      uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+      with:
+        egress-policy: audit
+
     - name: Checkout the source
-      uses: actions/checkout@v2
+      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
       with:
         submodules: true
         fetch-depth: 2
 
-    - uses: actions/cache@v2
+    - uses: actions/cache@704facf57e6136b1bc63b828d79edcd491f0ee84 # v3.3.2
       id: cache-vcpkg
       with:
         path: vcpkg
@@ -327,14 +312,17 @@ jobs:
         set -x
         mkdir build
         cmake -Bbuild -H. ${{ matrix.arch }} \
+          -DBUILD_SHARED_LIBS=OFF \
           -DBUILD_TESTING=OFF \
           -DCMAKE_BUILD_TYPE=Release \
           -DCMAKE_INSTALL_PREFIX=`pwd`/prefix \
           -DCMAKE_TOOLCHAIN_FILE=${VCPKG_ROOT}/scripts/buildsystems/vcpkg.cmake \
+          -DJPEGXL_ENABLE_JPEGLI_LIBJPEG=OFF \
           -DJPEGXL_ENABLE_OPENEXR=OFF \
           -DJPEGXL_ENABLE_PLUGINS=OFF \
           -DJPEGXL_ENABLE_TCMALLOC=OFF \
           -DJPEGXL_ENABLE_VIEWERS=OFF \
+          -DJPEGXL_ENABLE_DEVTOOLS=ON \
           -DVCPKG_TARGET_TRIPLET=${{ matrix.triplet }} \
         #
     - name: Build
@@ -357,7 +345,7 @@ jobs:
         cp third_party/brotli/LICENSE prefix/bin/LICENSE.brotli
         cp LICENSE prefix/bin/LICENSE.libjxl
     - name: Upload artifacts
-      uses: actions/upload-artifact@v2
+      uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
       with:
         name: jxl-${{matrix.triplet}}
         path: |
@@ -372,7 +360,7 @@ jobs:
 
     - name: Upload binaries to release
       if: github.event_name == 'release'
-      uses: AButler/upload-release-assets@v2.0
+      uses: AButler/upload-release-assets@c94805dc72e4b20745f543da0f62eaee7722df7a # v2.0.2
       with:
         files: jxl-${{matrix.triplet}}.zip
         repo-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
new file mode 100644 (file)
index 0000000..f719f01
--- /dev/null
@@ -0,0 +1,75 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file
+
+# This workflow uses actions that are not certified by GitHub. They are provided
+# by a third-party and are governed by separate terms of service, privacy
+# policy, and support documentation.
+
+name: Scorecard supply-chain security
+on:
+  # For Branch-Protection check. Only the default branch is supported. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
+  branch_protection_rule:
+  # To guarantee Maintained check is occasionally updated. See
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
+  schedule:
+    - cron: '13 2 * * 3'
+  push:
+    branches: [ "main" ]
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: Scorecard analysis
+    runs-on: ubuntu-latest
+    permissions:
+      # Needed to upload the results to code-scanning dashboard.
+      security-events: write
+      # Needed to publish results and get a badge (see publish_results below).
+      id-token: write
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
+      - name: "Checkout code"
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          persist-credentials: false
+
+      - name: "Run analysis"
+        uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1
+        with:
+          results_file: results.sarif
+          results_format: sarif
+          # (Optional) "write" PAT token. Uncomment the `repo_token` line below if:
+          # you want to enable the Branch-Protection check on a *public* repository
+          # To create the PAT, follow the steps in
+          # https://github.com/ossf/scorecard-action#authentication-with-fine-grained-pat-optional
+          repo_token: ${{ secrets.SCORECARD_TOKEN }}
+
+          # - Publish results to OpenSSF REST API for easy access by consumers
+          # - Allows the repository to include the Scorecard badge.
+          # - See https://github.com/ossf/scorecard-action#publishing-results.
+          publish_results: true
+
+      # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
+      # format to the repository Actions tab.
+      - name: "Upload artifact"
+        uses: actions/upload-artifact@a8a3f3ad30e3422c9c7b888a15615d19a852ae32 # v3.1.3
+        with:
+          name: SARIF file
+          path: results.sarif
+          retention-days: 5
+
+      # Upload the results to GitHub's code scanning dashboard.
+      - name: "Upload to code-scanning"
+        uses: github/codeql-action/upload-sarif@74483a38d39275f33fcff5f35b679b5ca4a26a99 # v2.22.5
+        with:
+          sarif_file: results.sarif
diff --git a/.github/workflows/test_new_highway.yml b/.github/workflows/test_new_highway.yml
new file mode 100644 (file)
index 0000000..81669da
--- /dev/null
@@ -0,0 +1,56 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Workflow for building and running tests.
+
+name: Update branch with updated submodule on libjxl
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '37 2 * * *' # Daily on 02:37 UTC
+
+permissions:
+  contents: read
+
+jobs:
+  update:
+    if: github.repository_owner == 'libjxl'
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@1b05615854632b887b69ae1be8cbefe72d3ae423 # v2.6.0
+        with:
+          egress-policy: audit
+
+      - name: 'Cloning libjxl'
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          submodules: recursive
+          persist-credentials: false # otherwise, the wrong auhtentication is used in the push
+          fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
+      - name: Pull & update highway
+        working-directory: ./third_party/highway
+        run: |
+          git fetch origin
+          git checkout origin/master
+      - name: Update deps.sh
+        run: |
+          NEWHASH=`git submodule status third_party/highway | cut -d' ' -f1 | cut -c2-`
+          sed -i "s/\(THIRD_PARTY_HIGHWAY=\"\)[^\"]*/\1$NEWHASH/" deps.sh
+      - name: Commit
+        run: |
+          git config user.email "firsching@google.com"
+          git config user.name "GitHub Actions - update submodules"
+          git add --all
+          git commit -m "Update highway submodule" || echo "No changes to commit"
+
+      - name: Push changes
+        uses: ad-m/github-push-action@d91a481090679876dfc4178fef17f286781251df # v0.8.0
+        with:
+          github_token: ${{ secrets.TOKEN }}
+          branch: 'refs/heads/test_highway'
+          force: true
index 58fea2d..381100d 100644 (file)
@@ -1,7 +1,7 @@
 # Build output directories
 /build
 /build*
-/docker/*.log
+/bazel*
 
 # The downloaded corpora files for benchmark.
 /third_party/corpora
index bd008a6..99b500f 100644 (file)
@@ -25,3 +25,6 @@
 [submodule "third_party/testdata"]
        path = testdata
        url = https://github.com/libjxl/testdata
+[submodule "third_party/libjpeg-turbo"]
+       path = third_party/libjpeg-turbo
+       url = https://github.com/libjpeg-turbo/libjpeg-turbo.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644 (file)
index 0000000..96b5d7f
--- /dev/null
@@ -0,0 +1,30 @@
+repos:
+- repo: https://github.com/gherynos/pre-commit-java
+  rev: v0.2.4
+  hooks:
+  - id: Checkstyle
+- repo: https://github.com/gitleaks/gitleaks
+  rev: v8.16.3
+  hooks:
+  - id: gitleaks
+- repo: https://github.com/jumanjihouse/pre-commit-hooks
+  rev: 3.0.0
+  hooks:
+  - id: shellcheck
+- repo: https://github.com/pocc/pre-commit-hooks
+  rev: v1.3.5
+  hooks:
+  - id: cpplint
+- repo: https://github.com/pre-commit/mirrors-eslint
+  rev: v8.38.0
+  hooks:
+  - id: eslint
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.4.0
+  hooks:
+  - id: end-of-file-fixer
+  - id: trailing-whitespace
+- repo: https://github.com/pylint-dev/pylint
+  rev: v2.17.2
+  hooks:
+  - id: pylint
index 6d714ba..ee25fed 100644 (file)
@@ -11,7 +11,14 @@ version: 2
 sphinx:
    configuration: doc/sphinx/conf.py
 
+build:
+   os: ubuntu-22.04
+   tools:
+      python: "3"
+   apt_packages:
+      - doxygen
+      - graphviz
+
 python:
-   version: "3.7"
    install:
    - requirements: doc/sphinx/requirements.txt
diff --git a/AUTHORS b/AUTHORS
index c8522b8..75486ae 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,36 +1,66 @@
-# List of the project authors for copyright purposes. When contributing to the
-# project add your name or your organization's name to this list. See
-# CONTRIBUTING.md for details.
+# List of the project authors.
+# When contributing you can add your name to this list.
+# For a complete list of contributions made after the move
+# from gitlab to github, see
+# https://github.com/libjxl/libjxl/graphs/contributors.
+# See CONTRIBUTING.md for details.
 #
-# For organizations:
-#   Organization <email pattern: *@domain>
-#
-# For individuals:
-#   Name <email address>
 #
 # Please keep each list sorted. If you wish to change your email address please
 # send a pull request.
 
 # Organizations:
-Cloudinary Ltd. <*@cloudinary.com>
-Google LLC <*@google.com>
+
+# - Cloudinary Ltd.:
+Jon Sneyers <jon@cloudinary.com>
+
+# - Google:
+Evgenii Kliuchnikov <eustas@google.com>
+Iulia Comșa <iuliacomsa@google.com>
+Jan Wassenberg <janwas@google.com>
+Jyrki Alakuijala <jyrki@google.com>
+Lode Vandevenne <lode@google.com>
+Luca Versari <veluca@google.com>
+Marcin Kowalczyk <qrczak@google.com>
+Martin Bruse <zond@google.com>
+Moritz Firsching <firsching@google.com>
+Sami Boukortt <sboukortt@google.com>
+Sebastian Gomez <sggonzalez@google.com>
+Thomas Fischbacher <tfish@google.com>
+Zoltan Szabadka <szabadka@google.com>
 
 # Individuals:
+a-shvedov
+Aditya Patadia <adityapatadia@users.noreply.github.com>
 Alex Xu (Hello71) <alex_y_xu@yahoo.ca>
 Alexander Sago <cagelight@gmail.com>
+Alistair Barrow
 Andrius Lukas Narbutas <andrius4669@gmail.com>
 Aous Naman <aous@unsw.edu.au>
 Artem Selishchev
 Biswapriyo Nath <nathbappai@gmail.com>
 CanadianBaconBoi <beamconnor@gmail.com>
+Damiano Albani <damiano.albani@gmail.com>
 Daniel Novomeský <dnovomesky@gmail.com>
 David Burnett <vargolsoft@gmail.com>
+dependabot[bot]
+Diego Pino <dpino@igalia.com>
 Dirk Lemstra <dirk@lemstra.org>
+Dmitry Baryshev <dima8w@gmail.com>
 Don Olmstead <don.j.olmstead@gmail.com>
+Dong Xu <xdong181@gmail.com>
+estrogently <41487185+estrogently@users.noreply.github.com>
 Even Rouault <even.rouault@spatialys.com>
+Fred Brennan <copypaste@kittens.ph>
+gi-man
+Gilles Devillers (GilDev) <gildev@gmail.com>
 Heiko Becker <heirecka@exherbo.org>
-Jon Sneyers <jon@cloudinary.com>
+Ivan Kokorev
+Jim Robinson <jimbo2150@gmail.com>
+Jonathan Brown (Jonnyawsom3) <jonathanbr30@gmail.com>
+Joshua Root <jmr@macports.org>
 Kai Hollberg <Schweinepriester@users.noreply.github.com>
+Kerry Su <me@sshockwave.net>
 Kleis Auke Wolthuizen <github@kleisauke.nl>
 L. E. Segovia
 Leo Izen <leo.izen@gmail.com>
@@ -41,14 +71,22 @@ Martin Strunz
 Mathieu Malaterre <mathieu.malaterre@gmail.com>
 Mikk Leini <mikk.leini@krakul.eu>
 Misaki Kasumi <misakikasumi@outlook.com>
+Moonchild Straver <moonchild@palemoon.org>
+Nicholas Hayes <0xC0000054@users.noreply.github.com>
+Nigel Tao <nigeltao@golang.org>
 Petr Diblík
 Pieter Wuille
 roland-rollo
 Samuel Leong <wvvwvvvvwvvw@gmail.com>
 Sandro <sandro.jaeckel@gmail.com>
+Sergey Fedorov <vital.had@gmail.com>
 Stephan T. Lavavej <stl@nuwen.net>
+StepSecurity Bot <bot@stepsecurity.io>
+Sylvestre Ledru <sylvestre@debian.org>
 Thomas Bonfort <thomas.bonfort@airbus.com>
+tmkk <tmkkmac@gmail.com>
 Vincent Torri <vincent.torri@gmail.com>
 xiota
 Yonatan Nebenzhal <yonatan.nebenzhl@gmail.com>
 Ziemowit Zabawa <ziemek.zabawa@outlook.com>
+源文雨 <41315874+fumiama@users.noreply.github.com>
diff --git a/BUILD.bazel b/BUILD.bazel
new file mode 100644 (file)
index 0000000..0b81fc7
--- /dev/null
@@ -0,0 +1,22 @@
+package(default_visibility = ["//:__subpackages__"])
+
+filegroup(
+    name = "testdata",
+    srcs = glob([
+        "testdata/**/*.icc",
+        "testdata/**/*.pam",
+        "testdata/**/*.pfm",
+        "testdata/**/*.pgm",
+        "testdata/**/*.pnm",
+        "testdata/**/*.ppm",
+        "testdata/**/*.png",
+        "testdata/**/*.jpg",
+        "testdata/**/*.jxl",
+        "testdata/**/*.gif",
+        "testdata/**/*.y4m",
+        "testdata/**/*.jxl",
+        "testdata/**/*.png",
+        "testdata/**/*.jpg",
+        "testdata/position_encoding/*.txt",
+    ]),
+)
diff --git a/BUILDING.md b/BUILDING.md
new file mode 100644 (file)
index 0000000..7e9bc2a
--- /dev/null
@@ -0,0 +1,85 @@
+# Compilation
+
+For more details and other workflows see the "Advanced guide" below.
+
+## Checking out the code
+
+```bash
+git clone https://github.com/libjxl/libjxl.git --recursive --shallow-submodules
+```
+
+This repository uses git submodules to handle some third party dependencies
+under `third_party`, that's why it is important to pass `--recursive`. If you
+didn't check out with `--recursive`, or any submodule has changed, run:
+
+```bash
+git submodule update --init --recursive --depth 1 --recommend-shallow
+```
+
+The `--shallow-submodules` and `--depth 1 --recommend-shallow` options create
+shallow clones which only downloads the commits requested, and is all that is
+needed to build `libjxl`. Should full clones be necessary, you could always run:
+
+```bash
+git submodule foreach git fetch --unshallow
+git submodule update --init --recursive
+```
+
+which pulls the rest of the commits in the submodules.
+
+Important: If you downloaded a zip file or tarball from the web interface you
+won't get the needed submodules and the code will not compile. You can download
+these external dependencies from source running `./deps.sh`. The git workflow
+described above is recommended instead.
+
+## Installing dependencies
+
+Required dependencies for compiling the code, in a Debian/Ubuntu based
+distribution run:
+
+```bash
+sudo apt install cmake pkg-config libbrotli-dev
+```
+
+Optional dependencies for supporting other formats in the `cjxl`/`djxl` tools,
+in a Debian/Ubuntu based distribution run:
+
+```bash
+sudo apt install libgif-dev libjpeg-dev libopenexr-dev libpng-dev libwebp-dev
+```
+
+We recommend using a recent Clang compiler (version 7 or newer), for that
+install clang and set `CC` and `CXX` variables.
+
+```bash
+sudo apt install clang
+export CC=clang CXX=clang++
+```
+
+## Building
+
+```bash
+cd libjxl
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF ..
+cmake --build . -- -j$(nproc)
+```
+
+The encoder/decoder tools will be available in the `build/tools` directory.
+
+## <a name="installing"></a> Installing
+
+```bash
+sudo cmake --install .
+```
+
+
+## Building JPEG XL for developers
+
+For experienced developers, we provide build instructions for several other environments:
+
+*   [Building on Debian](doc/developing_in_debian.md)
+*   Building on Windows with [vcpkg](doc/developing_in_windows_vcpkg.md) (Visual Studio 2019)
+*   Building on Windows with [MSYS2](doc/developing_in_windows_msys.md)
+*   [Cross Compiling for Windows with Crossroad](doc/developing_with_crossroad.md)
similarity index 94%
rename from README.Haiku.md
rename to BUILDING_Haiku.md
index 20111c5..1ffca14 100644 (file)
@@ -4,7 +4,7 @@ Haiku builds are not officially supported, i.e. the build might not work at all,
 some tests may fail and some sub-projects are excluded from build.
 
 This manual outlines Haiku-specific setup. For general building and testing
-instructions see "[README](README.md)" and
+instructions see "[BUILDING](BUILDING.md)" and
 "[Building and Testing changes](doc/building_and_testing.md)".
 
 ## Dependencies
similarity index 95%
rename from README.OSX.md
rename to BUILDING_OSX.md
index 8c6dc5a..b5f5e34 100644 (file)
@@ -4,7 +4,7 @@ OSX builds have "best effort" support, i.e. build might not work at all, some
 tests may fail and some sub-projects are excluded from build.
 
 This manual outlines OSX specific setup. For general building and testing
-instructions see "[README](README.md)" and
+instructions see "[BUILDING](BUILDING.md)" and
 "[Building and Testing changes](doc/building_and_testing.md)".
 
 [Homebrew](https://brew.sh/) is a popular package manager. JPEG XL library and
index cf68400..044f433 100644 (file)
@@ -5,7 +5,74 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## Unreleased
+## [0.9.0] - 2023-12-22
+
+### Added
+ - encoder API: add `JxlEncoderSetExtraChannelDistance` to adjust the quality
+   of extra channels (like alpha) separately.
+ - encoder API: new api functions for streaming encoding:
+  - `JxlEncoderSetOutputProcessor`
+  - `JxlEncoderFlushInput`
+  - `JxlEncoderOutputProcessor` struct
+  - `JxlEncoderSetOutputCallback`
+  - `JxlChunkedFrameInputSource` struct
+  - `JxlEncoderAddChunkedFrame`
+ - encoder API: new options for more fine-grained control over metadata
+   preservation when using `JxlEncoderAddJPEGFrame`:
+  - `JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF`
+  - `JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP`
+  - `JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF`
+ - encoder API: new function `JxlEncoderSetUpsamplingMode` to change the upsampling
+   method, e.g. to use nearest-neighbor upsampling for pixel art
+ - decoder API: implemented `JxlDecoderSetOutputColorProfile` and
+   `JxlDecoderSetCms` to enable decoding to desired colorspace.
+ - cjxl can now be used to explicitly add/update/strip Exif/XMP/JUMBF metadata using
+   the decoder-hints syntax, e.g. `cjxl input.ppm -x exif=input.exif output.jxl`
+ - djxl can now be used to extract Exif/XMP/JUMBF metadata
+ - encoder API: new function `JxlEncoderDistanceFromQuality` for convenience to
+   calculate a `distance` given a `quality`
+
+### Removed
+ - API: the Butteraugli API (`jxl/butteraugli.h`) was removed.
+ - encoder and decoder API: all deprecated functions were removed:
+   `JxlDecoderDefaultPixelFormat`, `JxlEncoderOptionsSetLossless`,
+   `JxlEncoderOptionsSetEffort`, `JxlEncoderOptionsSetDecodingSpeed`,
+   `JxlEncoderOptionsSetDistance`, `JxlEncoderOptionsCreate`, as well as
+   the deprecated enumerator values `JXL_DEC_EXTENSIONS`, `JXL_ENC_NOT_SUPPORTED`,
+   `JXL_TYPE_BOOLEAN`, `JXL_TYPE_UINT32`, and deprecated type `JxlEncoderOptions`.
+ - decoder API: the signature of `JxlDecoderGetColorAsEncodedProfile`,
+   `JxlDecoderGetICCProfileSize`, and `JxlDecoderGetColorAsICCProfile`
+   changed: a deprecated unused argument was removed.
+
+### Changed / clarified
+ - changed the name of the cjxl flag `photon_noise` to `photon_noise_iso`
+ - fixed how large boxes are decoded (#2958)
+ - fixed encoding files with unreadable patches (#3042, #3046)
+
+## [0.8.0] - 2023-01-18
+
+### Added
+ - decoder API: new function `JxlDecoderSetImageBitDepth` to set the bit depth
+   of the output buffer.
+ - decoder API proposal: add `JxlDecoderSetOutputColorProfile` and
+   `JxlDecoderSetCms` to enable decoding to desired colorspace; NB: not
+   implemented yet.
+ - encoder API: new function `JxlEncoderSetFrameBitDepth` to set the bit depth
+   of the input buffer.
+ - encoder API: add an effort 10 option for lossless compression; using this
+   setting requires calling `JxlEncoderAllowExpertOptions`.
+ - encoder API: new `JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES` enum value to
+   allow explicit control of metadata compression
+
+### Removed
+ - common API: removed `JxlIntrinsicSizeHeader`
+ - decoder API: removed deprecated `JXL_DEC_NEED_DC_OUT_BUFFER` and
+   `JXL_DEC_DC_IMAGE` events, `JxlDecoderDCOutBufferSize` and
+   `JxlDecoderSetDCOutBuffer` functions
+
+### Changed / clarified
+ - encoder API: `JxlEncoderProcessOutput` requires at least 32 bytes of output
+   space to proceed and guarantees that at least one byte will be written
 
 ## [0.7] - 2022-07-21
 
index 533815d..12c3330 100644 (file)
@@ -3,41 +3,21 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-# Ubuntu bionic ships with cmake 3.10.
-cmake_minimum_required(VERSION 3.10)
+# Ubuntu focal ships with cmake 3.16.
+cmake_minimum_required(VERSION 3.16...3.27)
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
-# Honor VISIBILITY_INLINES_HIDDEN on all types of targets.
-if(POLICY CMP0063)
-  cmake_policy(SET CMP0063 NEW)
-endif()
-# Pass CMAKE_EXE_LINKER_FLAGS to CC and CXX compilers when testing if they work.
-if(POLICY CMP0065)
-  cmake_policy(SET CMP0065 NEW)
-endif()
-
-# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
-if(POLICY CMP0083)
-  cmake_policy(SET CMP0083 NEW)
-endif()
-
 project(LIBJXL LANGUAGES C CXX)
 
-include(CheckCXXSourceCompiles)
-check_cxx_source_compiles(
-   "int main() {
-      #if !defined(__EMSCRIPTEN__)
-      static_assert(false, \"__EMSCRIPTEN__ is not defined\");
-      #endif
-      return 0;
-    }"
-  JPEGXL_EMSCRIPTEN
-)
+# TODO(sboukortt): remove once oss-fuzz passes -DBUILD_SHARED_LIBS=OFF
+if(JPEGXL_ENABLE_FUZZERS)
+  message(INFO "Fuzzer build detected, building static libs")
+  set(BUILD_SHARED_LIBS OFF)
+endif()
 
 message(STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
 include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-fsanitize=fuzzer-no-link" CXX_FUZZERS_SUPPORTED)
-check_cxx_compiler_flag("-Xclang -mconstructor-aliases" CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
 check_cxx_compiler_flag("-fmacro-prefix-map=OLD=NEW" CXX_MACRO_PREFIX_MAP)
 check_cxx_compiler_flag("-fno-rtti" CXX_NO_RTTI_SUPPORTED)
 
@@ -50,6 +30,19 @@ if(CHECK_PIE_SUPPORTED)
   endif()
 endif()
 
+if(PROVISION_DEPENDENCIES)
+  # Run script to provision dependencies.
+  find_program (BASH_PROGRAM bash)
+  if(BASH_PROGRAM)
+    execute_process(
+      COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/deps.sh
+      RESULT_VARIABLE PROVISION_DEPENDENCIES_RESULT)
+  endif()
+  if(NOT PROVISION_DEPENDENCIES_RESULT EQUAL "0")
+    message(FATAL_ERROR "${CMAKE_CURRENT_SOURCE_DIR}/deps.sh failed with ${PROVISION_DEPENDENCIES_RESULT}")
+  endif()
+endif()
+
 ### Project build options:
 if(CXX_FUZZERS_SUPPORTED)
   # Enabled by default except on arm64, Windows and Apple builds.
@@ -84,7 +77,7 @@ check_cxx_source_compiles(
 
 set(WARNINGS_AS_ERRORS_DEFAULT false)
 
-if((SANITIZER STREQUAL "msan") OR JPEGXL_EMSCRIPTEN)
+if((SANITIZER STREQUAL "msan") OR EMSCRIPTEN)
   set(BUNDLE_LIBPNG_DEFAULT YES)
 else()
   set(BUNDLE_LIBPNG_DEFAULT NO)
@@ -100,6 +93,16 @@ set(JPEGXL_ENABLE_DEVTOOLS false CACHE BOOL
     "Build JPEGXL developer tools.")
 set(JPEGXL_ENABLE_TOOLS true CACHE BOOL
     "Build JPEGXL user tools: cjxl and djxl.")
+set(JPEGXL_ENABLE_JPEGLI true CACHE BOOL
+    "Build jpegli library.")
+set(JPEGXL_ENABLE_JPEGLI_LIBJPEG true CACHE BOOL
+    "Build libjpeg.so shared library based on jpegli.")
+set(JPEGXL_INSTALL_JPEGLI_LIBJPEG false CACHE BOOL
+    "Install jpegli version of libjpeg.so system-wide.")
+set(JPEGLI_LIBJPEG_LIBRARY_VERSION "62.3.0" CACHE STRING
+    "Library version of the libjpeg.so shared library that we build.")
+set(JPEGLI_LIBJPEG_LIBRARY_SOVERSION "62" CACHE STRING
+    "Library so-version of the libjpeg.so shared library that we build.")
 set(JPEGXL_ENABLE_DOXYGEN true CACHE BOOL
     "Generate C API documentation using Doxygen.")
 set(JPEGXL_ENABLE_MANPAGES true CACHE BOOL
@@ -118,8 +121,6 @@ set(JPEGXL_ENABLE_OPENEXR true CACHE BOOL
     "Build JPEGXL with support for OpenEXR if available.")
 set(JPEGXL_ENABLE_SKCMS true CACHE BOOL
     "Build with skcms instead of lcms2.")
-set(JPEGXL_BUNDLE_SKCMS true CACHE BOOL
-    "When building with skcms, bundle it into libjxl.a.")
 set(JPEGXL_ENABLE_VIEWERS false CACHE BOOL
     "Build JPEGXL viewer tools for evaluation.")
 set(JPEGXL_ENABLE_TCMALLOC ${ENABLE_TCMALLOC_DEFAULT} CACHE BOOL
@@ -128,23 +129,30 @@ set(JPEGXL_ENABLE_PLUGINS false CACHE BOOL
     "Build third-party plugins to support JPEG XL in other applications.")
 set(JPEGXL_ENABLE_COVERAGE false CACHE BOOL
     "Enable code coverage tracking for libjxl. This also enables debug and disables optimizations.")
-set(JPEGXL_ENABLE_PROFILER false CACHE BOOL
-    "Builds in support for profiling (printed by tools if extra flags given)")
 set(JPEGXL_ENABLE_SIZELESS_VECTORS false CACHE BOOL
     "Builds in support for SVE/RVV vectorization")
 set(JPEGXL_ENABLE_TRANSCODE_JPEG true CACHE BOOL
     "Builds in support for decoding transcoded JXL files back to JPEG,\
  disabling it makes the decoder reject JXL_DEC_JPEG_RECONSTRUCTION events,\
  (default enabled)")
-set(JPEGXL_STATIC false CACHE BOOL
-    "Build tools as static binaries.")
+set(JPEGXL_ENABLE_BOXES true CACHE BOOL
+    "Builds in support for decoding boxes in JXL files,\
+ disabling it makes the decoder reject JXL_DEC_BOX events,\
+ (default enabled)")
 set(JPEGXL_WARNINGS_AS_ERRORS ${WARNINGS_AS_ERRORS_DEFAULT} CACHE BOOL
     "Treat warnings as errors during compilation.")
 set(JPEGXL_DEP_LICENSE_DIR "" CACHE STRING
     "Directory where to search for system dependencies \"copyright\" files.")
 set(JPEGXL_FORCE_NEON false CACHE BOOL
     "Set flags to enable NEON in arm if not enabled by your toolchain.")
-
+set(JPEGXL_TEST_TOOLS false CACHE BOOL
+    "Run scripts that test the encoding / decoding tools.")
+set(JPEGXL_ENABLE_AVX512 false CACHE BOOL
+    "Build with AVX512 support (faster on CPUs that support it, but larger binary size).")
+set(JPEGXL_ENABLE_AVX512_SPR false CACHE BOOL
+    "Build with AVX-512FP16 support (faster on CPUs that support it, but larger binary size).")
+set(JPEGXL_ENABLE_AVX512_ZEN4 false CACHE BOOL
+"Build with Zen4-optimized AVX512 support (faster on CPUs that support it, but larger binary size).")
 
 # Force system dependencies.
 set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
@@ -162,40 +170,33 @@ if (NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID)
   message(FATAL_ERROR "Different C/C++ compilers set: "
           "${CMAKE_C_COMPILER_ID} vs ${CMAKE_CXX_COMPILER_ID}")
 endif()
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  # Android NDK's toolchain.cmake fakes the clang version in
-  # CMAKE_CXX_COMPILER_VERSION with an incorrect number, so ignore this.
-  if (NOT CMAKE_ANDROID_NDK_TOOLCHAIN_VERSION MATCHES "clang"
-      AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5)
-    message(FATAL_ERROR
-      "Minimum Clang version required is Clang 5, please update.")
-  endif()
-elseif (CMAKE_CXX_COMPILER_ID MATCHES "GNU")
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7)
-    message(FATAL_ERROR
-      "Minimum GCC version required is 7, please update.")
-  endif()
-endif()
 
 message(STATUS
     "Compiled IDs C:${CMAKE_C_COMPILER_ID}, C++:${CMAKE_CXX_COMPILER_ID}")
 
+set(JXL_HWY_INCLUDE_DIRS "$<BUILD_INTERFACE:$<TARGET_PROPERTY:$<IF:$<TARGET_EXISTS:hwy::hwy>,hwy::hwy,hwy>,INTERFACE_INCLUDE_DIRECTORIES>>")
+# Always disable SSSE3 since it is rare to have SSSE3 but not SSE4
+set(HWY_DISABLED_TARGETS "HWY_SSSE3")
+if (NOT JPEGXL_ENABLE_AVX512)
+  message(STATUS "Disabled AVX512 (set JPEGXL_ENABLE_AVX512 to enable it)")
+  set(HWY_DISABLED_TARGETS "${HWY_DISABLED_TARGETS}|HWY_AVX3")
+  add_definitions(-DFJXL_ENABLE_AVX512=0)
+endif()
+if (NOT JPEGXL_ENABLE_AVX512_SPR)
+  message(STATUS "Disabled AVX512_SPR (set JPEGXL_ENABLE_AVX512_SPR to enable it)")
+  set(HWY_DISABLED_TARGETS "${HWY_DISABLED_TARGETS}|HWY_AVX3_SPR")
+endif()
+if (NOT JPEGXL_ENABLE_AVX512_ZEN4)
+  message(STATUS "Disabled AVX512_ZEN4 (set JPEGXL_ENABLE_AVX512_ZEN4 to enable it)")
+  set(HWY_DISABLED_TARGETS "${HWY_DISABLED_TARGETS}|HWY_AVX3_ZEN4")
+endif()
+
+
+
 # CMAKE_EXPORT_COMPILE_COMMANDS is used to generate the compilation database
 # used by clang-tidy.
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-if(JPEGXL_STATIC)
-  set(BUILD_SHARED_LIBS 0)
-  # Clang developers say that in case to use "static" we have to build stdlib
-  # ourselves; for real use case we don't care about stdlib, as it is "granted",
-  # so just linking all other libraries is fine.
-  if (NOT MSVC AND NOT APPLE)
-    set(CMAKE_FIND_LIBRARY_SUFFIXES .a)
-    set(CMAKE_EXE_LINKER_FLAGS
-        "${CMAKE_EXE_LINKER_FLAGS} -static -static-libgcc -static-libstdc++")
-  endif()
-endif()  # JPEGXL_STATIC
-
 # Threads
 set(THREADS_PREFER_PTHREAD_FLAG YES)
 find_package(Threads REQUIRED)
@@ -209,41 +210,7 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES)
 # Atomics
 find_package(Atomics REQUIRED)
 
-if(JPEGXL_STATIC)
-  if (MINGW)
-    # In MINGW libstdc++ uses pthreads directly. When building statically a
-    # program (regardless of whether the source code uses pthread or not) the
-    # toolchain will add stdc++ and pthread to the linking step but stdc++ will
-    # be linked statically while pthread will be linked dynamically.
-    # To avoid this and have pthread statically linked with need to pass it in
-    # the command line with "-Wl,-Bstatic -lpthread -Wl,-Bdynamic" but the
-    # linker will discard it if not used by anything else up to that point in
-    # the linker command line. If the program or any dependency don't use
-    # pthread directly -lpthread is discarded and libstdc++ (added by the
-    # toolchain later) will then use the dynamic version. For this we also need
-    # to pass -lstdc++ explicitly before -lpthread. For pure C programs -lstdc++
-    # will be discarded anyway.
-    # This adds these flags as dependencies for *all* targets. Adding this to
-    # CMAKE_EXE_LINKER_FLAGS instead would cause them to be included before any
-    # object files and therefore discarded. This should be set in the
-    # INTERFACE_LINK_LIBRARIES of Threads::Threads but some third_part targets
-    # don't depend on it.
-    link_libraries(-Wl,-Bstatic -lstdc++ -lpthread -Wl,-Bdynamic)
-  elseif(CMAKE_USE_PTHREADS_INIT)
-    # "whole-archive" is not supported on OSX.
-    if (NOT APPLE)
-      # Set pthreads as a whole-archive, otherwise weak symbols in the static
-      # libraries will discard pthreads symbols leading to segmentation fault at
-      # runtime.
-      message(STATUS "Using -lpthread as --whole-archive")
-      set_target_properties(Threads::Threads PROPERTIES
-        INTERFACE_LINK_LIBRARIES
-            "-Wl,--whole-archive;-lpthread;-Wl,--no-whole-archive")
-    endif()
-  endif()
-endif()  # JPEGXL_STATIC
-
-if (JPEGXL_EMSCRIPTEN)
+if (EMSCRIPTEN)
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pthread")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
@@ -257,73 +224,76 @@ if (CXX_NO_RTTI_SUPPORTED)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti")
 endif()
 
-if (MSVC)
-# TODO(janwas): add flags
-else ()
+# Internal flags for coverage builds:
+set(JPEGXL_COVERAGE_FLAGS)
+set(JPEGXL_COVERAGE_LINK_FLAGS)
 
-# Global compiler flags for all targets here and in subdirectories.
-add_definitions(
-  # Avoid changing the binary based on the current time and date.
-  -D__DATE__="redacted"
-  -D__TIMESTAMP__="redacted"
-  -D__TIME__="redacted"
-)
-
-# Avoid log spam from fopen etc.
-if(MSVC)
+if (MSVC)
+  # TODO(janwas): add flags
   add_definitions(-D_CRT_SECURE_NO_WARNINGS)
-endif()
-
-# TODO(eustas): JXL currently compiles, but does not pass tests...
-if (NOT JXL_HWY_DISABLED_TARGETS_FORCED AND NOT JPEGXL_ENABLE_SIZELESS_VECTORS)
-  add_definitions(-DHWY_DISABLED_TARGETS=\(HWY_SVE|HWY_SVE2|HWY_SVE_256|HWY_SVE2_128|HWY_RVV\))
-  message("Warning: HWY_SVE, HWY_SVE2, HWY_SVE_256, HWY_SVE2_128 and HWY_RVV CPU targets are disabled")
-endif()
-
-# In CMake before 3.12 it is problematic to pass repeated flags like -Xclang.
-# For this reason we place them in CMAKE_CXX_FLAGS instead.
-# See https://gitlab.kitware.com/cmake/cmake/issues/15826
+else ()
+  # Global compiler flags for all targets here and in subdirectories.
+  add_definitions(
+    # Avoid changing the binary based on the current time and date.
+    -D__DATE__="redacted"
+    -D__TIMESTAMP__="redacted"
+    -D__TIME__="redacted"
+  )
 
-# Machine flags.
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funwind-tables")
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mrelax-all")
-endif()
-if (CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -mconstructor-aliases")
-endif()
+  # TODO(eustas): JXL currently compiles, but does not pass tests...
+  if (NOT JXL_HWY_DISABLED_TARGETS_FORCED)
+    if (NOT JPEGXL_ENABLE_SIZELESS_VECTORS)
+      set(HWY_DISABLED_TARGETS "${HWY_DISABLED_TARGETS}|HWY_SVE|HWY_SVE2|HWY_SVE_256|HWY_SVE2_128|HWY_RVV")
+    endif()
+    add_definitions(-DHWY_DISABLED_TARGETS=\(${HWY_DISABLED_TARGETS}\))
+  endif()
 
-if(WIN32)
-# Not supported by clang-cl, but frame pointers are default on Windows
-else()
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
-endif()
+  # Machine flags.
+  add_compile_options(-funwind-tables)
+  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    add_compile_options("SHELL:-Xclang -mrelax-all")
+  endif()
+  if (CXX_CONSTRUCTOR_ALIASES_SUPPORTED)
+    add_compile_options("SHELL:-Xclang -mconstructor-aliases")
+  endif()
 
-# CPU flags - remove once we have NEON dynamic dispatch
+  if(WIN32)
+    # Not supported by clang-cl, but frame pointers are default on Windows
+  else()
+    add_compile_options(-fno-omit-frame-pointer)
+  endif()
 
-# TODO(janwas): this also matches M1, but only ARMv7 is intended/needed.
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
-if(JPEGXL_FORCE_NEON)
-# GCC requires these flags, otherwise __ARM_NEON is undefined.
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-   -mfpu=neon-vfpv4 -mfloat-abi=hard")
-endif()
-endif()
+  # CPU flags - remove once we have NEON dynamic dispatch
 
-# Force build with optimizations in release mode.
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+  # TODO(janwas): this also matches M1, but only ARMv7 is intended/needed.
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
+    if(JPEGXL_FORCE_NEON)
+      # GCC requires these flags, otherwise __ARM_NEON is undefined.
+      add_compile_options(-mfpu=neon-vfpv4 -mfloat-abi=hard)
+    endif()
+  endif()
 
-add_compile_options(
-  # Ignore this to allow redefining __DATE__ and others.
-  -Wno-builtin-macro-redefined
+  add_compile_options(
+    # Ignore this to allow redefining __DATE__ and others.
+    -Wno-builtin-macro-redefined
 
-  # Global warning settings.
-  -Wall
-)
+    # Global warning settings.
+    -Wall
+  )
 
-if (JPEGXL_WARNINGS_AS_ERRORS)
-add_compile_options(-Werror)
-endif ()
+  if (JPEGXL_WARNINGS_AS_ERRORS)
+    add_compile_options(-Werror)
+  endif ()
+
+  if(JPEGXL_ENABLE_COVERAGE)
+    set(JPEGXL_COVERAGE_FLAGS
+        -g -O0 -fprofile-arcs -ftest-coverage
+        -DJXL_ENABLE_ASSERT=0 -DJXL_ENABLE_CHECK=0
+    )
+    set(JPEGXL_COVERAGE_LINK_FLAGS
+        --coverage
+    )
+  endif()  # JPEGXL_ENABLE_COVERAGE
 endif ()  # !MSVC
 
 include(GNUInstallDirs)
@@ -348,125 +318,146 @@ endif()
 add_subdirectory(lib)
 
 if(BUILD_TESTING)
-# Script to run tests over the source code in bash.
-find_program (BASH_PROGRAM bash)
-if(BASH_PROGRAM)
-  add_test(
-    NAME bash_test
-    COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/bash_test.sh)
-endif()
+  # Script to run tests over the source code in bash.
+  find_program (BASH_PROGRAM bash)
+  if(BASH_PROGRAM)
+    add_test(
+      NAME bash_test
+      COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/bash_test.sh)
+  endif()
 endif() # BUILD_TESTING
 
 # Documentation generated by Doxygen
 if(JPEGXL_ENABLE_DOXYGEN)
-find_package(Doxygen)
-if(DOXYGEN_FOUND)
-set(DOXYGEN_GENERATE_HTML "YES")
-set(DOXYGEN_GENERATE_XML "YES")
-set(DOXYGEN_STRIP_FROM_PATH "${CMAKE_CURRENT_SOURCE_DIR}/lib/include")
-set(DOXYGEN_USE_MDFILE_AS_MAINPAGE "README.md")
-if(JPEGXL_WARNINGS_AS_ERRORS)
-set(DOXYGEN_WARN_AS_ERROR "YES")
-endif()
-set(DOXYGEN_QUIET "YES")
-doxygen_add_docs(doc
-  "${CMAKE_CURRENT_SOURCE_DIR}/lib/include"
-  "${CMAKE_CURRENT_SOURCE_DIR}/doc/api.txt"
-  WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
-  COMMENT "Generating C API documentation")
-
-# Add sphinx doc build step for readthedocs.io (requires doxygen too).
-find_program(SPHINX_BUILD_PROGRAM sphinx-build)
-if(SPHINX_BUILD_PROGRAM)
-  add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/rtd/nonexistent"
-    COMMENT "Generating readthedocs.io output on ${CMAKE_CURRENT_BINARY_DIR}/rtd"
-    COMMAND ${SPHINX_BUILD_PROGRAM} -q -W -b html -j auto
-      ${CMAKE_SOURCE_DIR}/doc/sphinx
-      ${CMAKE_CURRENT_BINARY_DIR}/rtd
-    DEPENDS doc
-  )
-  # This command runs the documentation generation every time since the output
-  # target file doesn't exist.
-  add_custom_target(rtd-html
-    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/rtd/nonexistent
-  )
-else() # SPHINX_BUILD_PROGRAM\
-  message(WARNING "sphinx-build not found, skipping rtd documentation")
-endif() # SPHINX_BUILD_PROGRAM
+  find_package(Doxygen)
+  if(DOXYGEN_FOUND)
+    set(DOXYGEN_GENERATE_HTML "YES")
+    set(DOXYGEN_GENERATE_XML "YES")
+    set(DOXYGEN_STRIP_FROM_PATH "${CMAKE_CURRENT_SOURCE_DIR}/lib/include")
+    if(JPEGXL_WARNINGS_AS_ERRORS)
+      set(DOXYGEN_WARN_AS_ERROR "YES")
+    endif()
+    set(DOXYGEN_QUIET "YES")
+    doxygen_add_docs(doc
+      "${CMAKE_CURRENT_SOURCE_DIR}/lib/include"
+      "${CMAKE_CURRENT_SOURCE_DIR}/doc/api.txt"
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+      COMMENT "Generating C API documentation")
+
+    # Add sphinx doc build step for readthedocs.io (requires doxygen too).
+    find_program(SPHINX_BUILD_PROGRAM sphinx-build)
+    if(SPHINX_BUILD_PROGRAM)
+      add_custom_command(
+        OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/rtd/nonexistent"
+        COMMENT "Generating readthedocs.io output on ${CMAKE_CURRENT_BINARY_DIR}/rtd"
+        COMMAND ${SPHINX_BUILD_PROGRAM} -q -W -b html -j auto
+          ${CMAKE_SOURCE_DIR}/doc/sphinx
+          ${CMAKE_CURRENT_BINARY_DIR}/rtd
+        DEPENDS doc
+      )
+      # This command runs the documentation generation every time since the output
+      # target file doesn't exist.
+      add_custom_target(rtd-html
+        DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/rtd/nonexistent
+      )
+    else() # SPHINX_BUILD_PROGRAM\
+      message(WARNING "sphinx-build not found, skipping rtd documentation")
+    endif() # SPHINX_BUILD_PROGRAM
 
-else()
-# Create a "doc" target for compatibility since "doc" is not otherwise added to
-# the build when doxygen is not installed.
-add_custom_target(doc false
-  COMMENT "Error: Can't generate doc since Doxygen not installed.")
-endif() # DOXYGEN_FOUND
+  else()
+    # Create a "doc" target for compatibility since "doc" is not otherwise added to
+    # the build when doxygen is not installed.
+    add_custom_target(doc false
+      COMMENT "Error: Can't generate doc since Doxygen not installed.")
+  endif() # DOXYGEN_FOUND
 endif() # JPEGXL_ENABLE_DOXYGEN
 
 if(JPEGXL_ENABLE_MANPAGES)
-find_program(ASCIIDOC a2x)
-if(ASCIIDOC)
-file(STRINGS "${ASCIIDOC}" ASCIIDOC_SHEBANG LIMIT_COUNT 1)
-if(ASCIIDOC_SHEBANG MATCHES "/sh|/bash")
-  set(ASCIIDOC_PY_FOUND ON)
-  # Run the program directly and set ASCIIDOC as empty.
-  set(ASCIIDOC_PY "${ASCIIDOC}")
-  set(ASCIIDOC "")
-elseif(ASCIIDOC_SHEBANG MATCHES "python2")
-  find_package(Python2 COMPONENTS Interpreter)
-  set(ASCIIDOC_PY_FOUND "${Python2_Interpreter_FOUND}")
-  set(ASCIIDOC_PY Python2::Interpreter)
-elseif(ASCIIDOC_SHEBANG MATCHES "python3")
-  find_package(Python3 COMPONENTS Interpreter)
-  set(ASCIIDOC_PY_FOUND "${Python3_Interpreter_FOUND}")
-  set(ASCIIDOC_PY Python3::Interpreter)
-else()
-  find_package(Python COMPONENTS Interpreter QUIET)
-  if(NOT Python_Interpreter_FOUND)
-    find_program(ASCIIDOC_PY python)
-    if(ASCIIDOC_PY)
+  find_program(ASCIIDOC a2x)
+  if(ASCIIDOC)
+    file(STRINGS "${ASCIIDOC}" ASCIIDOC_SHEBANG LIMIT_COUNT 1)
+    if(ASCIIDOC_SHEBANG MATCHES "/sh|/bash" OR MINGW)
       set(ASCIIDOC_PY_FOUND ON)
+      # Run the program directly and set ASCIIDOC as empty.
+      set(ASCIIDOC_PY "${ASCIIDOC}")
+      set(ASCIIDOC "")
+    elseif(ASCIIDOC_SHEBANG MATCHES "python2")
+      find_package(Python2 COMPONENTS Interpreter)
+      set(ASCIIDOC_PY_FOUND "${Python2_Interpreter_FOUND}")
+      set(ASCIIDOC_PY Python2::Interpreter)
+    elseif(ASCIIDOC_SHEBANG MATCHES "python3")
+      find_package(Python3 COMPONENTS Interpreter)
+      set(ASCIIDOC_PY_FOUND "${Python3_Interpreter_FOUND}")
+      set(ASCIIDOC_PY Python3::Interpreter)
+    else()
+      find_package(Python COMPONENTS Interpreter QUIET)
+      if(NOT Python_Interpreter_FOUND)
+        find_program(ASCIIDOC_PY python)
+        if(ASCIIDOC_PY)
+          set(ASCIIDOC_PY_FOUND ON)
+        endif()
+      else()
+        set(ASCIIDOC_PY_FOUND "${Python_Interpreter_FOUND}")
+        set(ASCIIDOC_PY Python::Interpreter)
+      endif()
     endif()
-  else()
-    set(ASCIIDOC_PY_FOUND "${Python_Interpreter_FOUND}")
-    set(ASCIIDOC_PY Python::Interpreter)
-  endif()
-endif()
 
-if (ASCIIDOC_PY_FOUND)
-  set(MANPAGE_FILES "")
-  set(MANPAGES "")
-  foreach(PAGE IN ITEMS cjxl djxl)
-    # Invoking the Python interpreter ourselves instead of running the a2x binary
-    # directly is necessary on MSYS2, otherwise it is run through cmd.exe which
-    # does not recognize it.
-    add_custom_command(
-      OUTPUT "${PAGE}.1"
-      COMMAND "${ASCIIDOC_PY}"
-      ARGS ${ASCIIDOC}
-        --format manpage --destination-dir="${CMAKE_CURRENT_BINARY_DIR}"
-        "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt"
-      MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt")
-    list(APPEND MANPAGE_FILES "${CMAKE_CURRENT_BINARY_DIR}/${PAGE}.1")
-    list(APPEND MANPAGES "${PAGE}.1")
-  endforeach()
-  add_custom_target(manpages ALL DEPENDS ${MANPAGES})
-  install(FILES ${MANPAGE_FILES} DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
-endif()  # ASCIIDOC_PY_FOUND
-else()
-  message(WARNING "asciidoc was not found, the man pages will not be installed.")
-endif()  # ASCIIDOC
+    if (ASCIIDOC_PY_FOUND)
+      set(MANPAGE_FILES "")
+      set(MANPAGES "")
+      foreach(PAGE IN ITEMS cjxl djxl)
+        # Invoking the Python interpreter ourselves instead of running the a2x binary
+        # directly is necessary on MSYS2, otherwise it is run through cmd.exe which
+        # does not recognize it.
+        add_custom_command(
+          OUTPUT "${PAGE}.1"
+          COMMAND "${ASCIIDOC_PY}"
+          ARGS ${ASCIIDOC}
+            --format manpage --destination-dir="${CMAKE_CURRENT_BINARY_DIR}"
+            "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt"
+          MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/doc/man/${PAGE}.txt")
+        list(APPEND MANPAGE_FILES "${CMAKE_CURRENT_BINARY_DIR}/${PAGE}.1")
+        list(APPEND MANPAGES "${PAGE}.1")
+      endforeach()
+      add_custom_target(manpages ALL DEPENDS ${MANPAGES})
+      install(FILES ${MANPAGE_FILES} DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
+    endif()  # ASCIIDOC_PY_FOUND
+  else()
+    message(WARNING "asciidoc was not found, the man pages will not be installed.")
+  endif()  # ASCIIDOC
 endif()  # JPEGXL_ENABLE_MANPAGES
 
 # Example usage code.
 if (JPEGXL_ENABLE_EXAMPLES)
-include(examples/examples.cmake)
+  include(examples/examples.cmake)
 endif ()
 
 # Plugins for third-party software
 if (JPEGXL_ENABLE_PLUGINS)
-add_subdirectory(plugins)
+  add_subdirectory(plugins)
 endif ()
 
 # Binary tools
 add_subdirectory(tools)
+
+
+macro(list_test_targets out dir)
+  get_property(dir_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS)
+  foreach(target ${dir_targets})
+    if (target MATCHES ".*_test")
+      list(APPEND ${out} ${target})
+    endif()
+  endforeach()
+  get_property(subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES)
+  foreach(subdir ${subdirectories})
+    list_test_targets(${out} ${subdir})
+  endforeach()
+endmacro()
+
+set(all_tests_list)
+list_test_targets(all_tests_list ${CMAKE_CURRENT_SOURCE_DIR})
+
+if(all_tests_list)
+  add_custom_target(all_tests)
+  add_dependencies(all_tests ${all_tests_list})
+endif()
index cb64597..1a255ba 100644 (file)
@@ -75,7 +75,9 @@ information on using pull requests.
   functionality. Reviewers may ask you to split a Pull Request and it is
   easier to create a smaller change from the beginning.
 
-  * Describe your commits. Add a meaningful description to your commit message, explain what you are changing if it is not trivially obvious, but more importantly explain *why* you are making those changes. For example "Fix
+  * Describe your commits. Add a meaningful description to your commit message,
+  explain what you are changing if it is not trivially obvious, but more
+  importantly explain *why* you are making those changes. For example "Fix
   build" is not a good commit message, describe what build and if it makes sense
   why is this fixing it or why was it failing without this. It is very likely
   that people far in the future without any context you have right now will be
@@ -113,8 +115,7 @@ information on using pull requests.
 
   * Sign the CLA (only needed once per user, see above).
 
-  * AUTHORS: If this is your first contribution, add your name or your
-  company name to the [AUTHORS](AUTHORS) file for copyright tracking purposes.
+  * AUTHORS: You can add your name to the [AUTHORS](AUTHORS) file.
 
   * Style guide. Check `./ci.sh lint`.
 
index b0f2e3b..bcea13f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -12,6 +12,10 @@ https://github.com/libjxl/libjxl/actions/workflows/fuzz.yml)
 https://github.com/libjxl/libjxl/actions/workflows/release.yaml)
 [![Doc](https://readthedocs.org/projects/libjxl/badge/?version=latest)](
 https://libjxl.readthedocs.io/en/latest/?badge=latest)
+[![OpenSSF Best Practices](https://www.bestpractices.dev/projects/7845/badge)](
+https://www.bestpractices.dev/projects/7845)
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/libjxl/libjxl/badge)](
+https://securityscorecards.dev/viewer/?uri=github.com/libjxl/libjxl)
 [![codecov](https://codecov.io/gh/libjxl/libjxl/branch/main/graph/badge.svg)](
 https://codecov.io/gh/libjxl/libjxl)
 
@@ -21,109 +25,59 @@ This repository contains a reference implementation of JPEG XL (encoder and
 decoder), called `libjxl`. This software library is
 [used by many applications that support JPEG XL](doc/software_support.md).
 
-JPEG XL is in the final stages of standardization and its codestream and file format
-are frozen.
+JPEG XL was standardized in 2022 as [ISO/IEC 18181](https://jpeg.org/jpegxl/workplan.html).
+The [core codestream](doc/format_overview.md#codestream-features) is specified in 18181-1,
+the [file format](doc/format_overview.md#file-format-features) in 18181-2.
+[Decoder conformance](https://github.com/libjxl/conformance) is defined in 18181-3,
+and 18181-4 is the [reference software](https://github.com/libjxl/libjxl).
 
 The library API, command line options, and tools in this repository are subject
-to change, however files encoded with `cjxl` conform to the JPEG XL format
-specification and can be decoded with current and future `djxl` decoders or
-`libjxl` decoding library.
+to change, however files encoded with `cjxl` conform to the JPEG XL specification
+and can be decoded with current and future `djxl` decoders or the `libjxl` decoding library.
 
-## Quick start guide
+## Installation
 
-For more details and other workflows see the "Advanced guide" below.
+In most Linux distributions, installing `libjxl` is just a matter of using the package management system.
+For example in Debian-based distributions: `apt install libjxl-tools` will install `cjxl` and `djxl`
+and other tools like `benchmark_xl` are available in the package `libjxl-devtools`.
+On MacOS, you can use [Homebrew](https://brew.sh/): `brew install jpeg-xl`.
 
-### Checking out the code
+[![libjxl packaging status](https://repology.org/badge/vertical-allrepos/libjxl.svg?exclude_unsupported=1&columns=3&exclude_sources=modules,site&header=libjxl%20packaging%20status)](https://repology.org/project/libjxl/versions)
 
-```bash
-git clone https://github.com/libjxl/libjxl.git --recursive --shallow-submodules
-```
-
-This repository uses git submodules to handle some third party dependencies
-under `third_party`, that's why is important to pass `--recursive`. If you
-didn't check out with `--recursive`, or any submodule has changed, run:
-
-```bash
-git submodule update --init --recursive --depth 1 --recommend-shallow
-```
-
-The `--shallow-submodules` and `--depth 1 --recommend-shallow` options create
-shallow clones which only downloads the commits requested, and is all that is
-needed to build `libjxl`. Should full clones be necessary, you could always run:
-
-```bash
-git submodule foreach git fetch --unshallow
-git submodule update --init --recursive
-```
+From the [releases page](https://github.com/libjxl/libjxl/releases/) the following can be downloaded:
+ - Windows binaries 
+ - Debian and Ubuntu .deb packages 
 
-which pulls the rest of the commits in the submodules.
+Of course you can also [build libjxl from sources](BUILDING.md).
 
-Important: If you downloaded a zip file or tarball from the web interface you
-won't get the needed submodules and the code will not compile. You can download
-these external dependencies from source running `./deps.sh`. The git workflow
-described above is recommended instead.
-
-### Installing dependencies
-
-Required dependencies for compiling the code, in a Debian/Ubuntu based
-distribution run:
-
-```bash
-sudo apt install cmake pkg-config libbrotli-dev
-```
-
-Optional dependencies for supporting other formats in the `cjxl`/`djxl` tools,
-in a Debian/Ubuntu based distribution run:
-
-```bash
-sudo apt install libgif-dev libjpeg-dev libopenexr-dev libpng-dev libwebp-dev
-```
 
-We recommend using a recent Clang compiler (version 7 or newer), for that
-install clang and set `CC` and `CXX` variables.
-
-```bash
-sudo apt install clang
-export CC=clang CXX=clang++
-```
-
-### Building
-
-```bash
-cd libjxl
-mkdir build
-cd build
-cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF ..
-cmake --build . -- -j$(nproc)
-```
-
-The encoder/decoder tools will be available in the `build/tools` directory.
-
-### <a name="installing"></a> Installing
-
-```bash
-sudo cmake --install .
-```
-
-### Basic encoder/decoder
+## Usage
 
 To encode a source image to JPEG XL with default settings:
 
 ```bash
-build/tools/cjxl input.png output.jxl
+cjxl input.png output.jxl
 ```
 
-For more settings run `build/tools/cjxl --help` or for a full list of options
-run `build/tools/cjxl -v -v --help`.
+The desired visual fidelity can be selected using the `--distance` parameter
+(in units of just-noticeable difference, where 0 is lossless and the most useful lossy range is 0.5 .. 3.0),
+or using `--quality` (on a scale from 0 to 100, roughly matching libjpeg).
+The [encode effort](doc/encode_effort.md) can be selected using the `--effort` parameter.
+
+For more settings run `cjxl --help` or for a full list of options
+run `cjxl -v -v --help`.
 
 To decode a JPEG XL file run:
 
 ```bash
-build/tools/djxl input.jxl output.png
+djxl input.jxl output.png
 ```
 
 When possible `cjxl`/`djxl` are able to read/write the following
 image formats: .exr, .gif, .jpeg/.jpg, .pfm, .pgm/.ppm, .pgx, .png.
+Specifically for JPEG files, the default `cjxl` behavior is to apply lossless
+recompression and the default `djxl` behavior is to reconstruct the original
+JPEG file (when the extension of the output file is .jpg).
 
 ### Benchmarking
 
@@ -135,26 +89,12 @@ benchmarking purposes.
 For more comprehensive benchmarking options, see the
 [benchmarking guide](doc/benchmarking.md).
 
-## Advanced guide
-
-### Building with Docker
-
-We build a common environment based on Debian/Ubuntu using Docker. Other
-systems may have different combinations of versions and dependencies that
-have not been tested and may not work. For those cases we recommend using the
-Docker container as explained in the
-[step by step guide](doc/developing_in_docker.md).
-
-### Building JPEG XL for developers
-
-For experienced developers, we provide build instructions for several other environments:
+### Library API
 
-*   [Building on Debian](doc/developing_in_debian.md)
-*   Building on Windows with [vcpkg](doc/developing_in_windows_vcpkg.md) (Visual Studio 2019)
-*   Building on Windows with [MSYS2](doc/developing_in_windows_msys.md)
-*   [Cross Compiling for Windows with Crossroad](doc/developing_with_crossroad.md)
+Besides the `libjxl` library [API documentation](https://libjxl.readthedocs.io/en/latest/),
+there are [example applications](examples/) and [plugins](plugins/) that can be used as a reference or
+starting point for developers who wish to integrate `libjxl` in their project.
 
-If you encounter any difficulties, please use Docker instead.
 
 ## License
 
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644 (file)
index 0000000..417c9cd
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,771 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository", "new_git_repository")
+
+workspace(name = "libjxl")
+
+http_archive(
+    name = "bazel_skylib",
+    sha256 = "74d544d96f4a5bb630d465ca8bbcfe231e3594e5aae57e1edbf17a6eb3ca2506",
+    urls = [
+        "https://mirror.bazel.build/github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+        "https://github.com/bazelbuild/bazel-skylib/releases/download/1.3.0/bazel-skylib-1.3.0.tar.gz",
+    ],
+)
+
+load("@bazel_skylib//:workspace.bzl", "bazel_skylib_workspace")
+
+bazel_skylib_workspace()
+
+local_repository(
+    name = "highway",
+    path = "third_party/highway",
+)
+
+local_repository(
+    name = "brotli",
+    path = "third_party/brotli",
+)
+
+new_local_repository(
+    name = "googletest",
+    build_file = "@libjxl//:third_party/googletest/BUILD.bazel",
+    path = "third_party/googletest",
+)
+
+new_local_repository(
+    name = "skcms",
+    build_file_content = """
+cc_library(
+    name = "skcms",
+    srcs = [
+        "skcms.cc",
+        "src/skcms_internals.h",
+        "src/skcms_Transform.h",
+        "src/Transform_inl.h",
+    ],
+    hdrs = ["skcms.h"],
+    visibility = ["//visibility:public"],
+)
+    """,
+    path = "third_party/skcms",
+)
+
+new_git_repository(
+    name = "zlib",
+    build_file_content = """
+cc_library(
+    name = "zlib",
+    defines = ["HAVE_UNISTD_H"],
+    srcs = [
+        "adler32.c",
+        "compress.c",
+        "crc32.c",
+        "crc32.h",
+        "deflate.c",
+        "deflate.h",
+        "gzclose.c",
+        "gzguts.h",
+        "gzlib.c",
+        "gzread.c",
+        "gzwrite.c",
+        "infback.c",
+        "inffast.c",
+        "inffast.h",
+        "inffixed.h",
+        "inflate.c",
+        "inflate.h",
+        "inftrees.c",
+        "inftrees.h",
+        "trees.c",
+        "trees.h",
+        "uncompr.c",
+        "zconf.h",
+        "zutil.c",
+        "zutil.h",
+    ],
+    hdrs = ["zlib.h"],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
+    """,
+    remote = "https://github.com/madler/zlib",
+    tag = "v1.2.13",
+)
+
+new_local_repository(
+    name = "png",
+    build_file_content = """
+genrule(
+    name = "pnglibconf",
+    srcs = ["scripts/pnglibconf.h.prebuilt"],
+    outs = ["pnglibconf.h"],
+    cmd = "cp -f $< $@",
+)
+cc_library(
+    name = "png",
+    srcs = [
+        "png.c",
+        "pngconf.h",
+        "pngdebug.h",
+        "pngerror.c",
+        "pngget.c",
+        "pnginfo.h",
+        ":pnglibconf",
+        "pngmem.c",
+        "pngpread.c",
+        "pngpriv.h",
+        "pngread.c",
+        "pngrio.c",
+        "pngrtran.c",
+        "pngrutil.c",
+        "pngset.c",
+        "pngstruct.h",
+        "pngtrans.c",
+        "pngwio.c",
+        "pngwrite.c",
+        "pngwtran.c",
+        "pngwutil.c",
+    ],
+    hdrs = ["png.h"],
+    includes = ["."],
+    linkopts = ["-lm"],
+    visibility = ["//visibility:public"],
+    deps = ["@zlib//:zlib"],
+)
+    """,
+    path = "third_party/libpng",
+)
+
+new_git_repository(
+    name = "libjpeg_turbo",
+    build_file_content = """
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+SUBSTITUTIONS = {
+    "@BUILD@" : "20230208",
+    "@CMAKE_PROJECT_NAME@" : "libjpeg-turbo",
+    "@COPYRIGHT_YEAR@" : "2023",
+    "@INLINE@" : "__inline__",
+    "@JPEG_LIB_VERSION@" : "62",
+    "@LIBJPEG_TURBO_VERSION_NUMBER@" : "2001091",
+    "@SIZE_T@" : "8",
+    "@THREAD_LOCAL@" : "__thread",
+    "@VERSION@" : "2.1.91",
+}
+YES_DEFINES = [
+    "C_ARITH_CODING_SUPPORTED", "D_ARITH_CODING_SUPPORTED",
+    "HAVE_BUILTIN_CTZL", "MEM_SRCDST_SUPPORTED"
+]
+NO_DEFINES = [
+    "WITH_SIMD", "RIGHT_SHIFT_IS_UNSIGNED", "HAVE_INTRIN_H"
+]
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "#define " + key for key in YES_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "// #define " + key for key in NO_DEFINES
+})
+[
+    expand_template(
+        name = "expand_" + src,
+        template = src + ".in",
+        out = src,
+        substitutions = SUBSTITUTIONS,
+        visibility = ["//visibility:public"],
+    ) for src in ["jconfig.h", "jconfigint.h", "jversion.h"]
+]
+JPEG16_SOURCES = [
+    "jccolor.c",
+    "jcdiffct.c",
+    "jclossls.c",
+    "jcmainct.c",
+    "jcprepct.c",
+    "jcsample.c",
+    "jdcolor.c",
+    "jddiffct.c",
+    "jdlossls.c",
+    "jdmainct.c",
+    "jdmerge.c",
+    "jdpostct.c",
+    "jdsample.c",
+    "jquant1.c",
+    "jquant2.c",
+    "jutils.c",
+]
+JPEG12_SOURCES = JPEG16_SOURCES + [
+    "jccoefct.c",
+    "jcdctmgr.c",
+    "jdcoefct.c",
+    "jddctmgr.c",
+    "jfdctfst.c",
+    "jfdctint.c",
+    "jidctflt.c",
+    "jidctfst.c",
+    "jidctint.c",
+    "jidctred.c",
+]
+JPEG_SOURCES = JPEG12_SOURCES + [
+    "jaricom.c",
+    "jcapimin.c",
+    "jcapistd.c",
+    "jcarith.c",
+    "jchuff.c",
+    "jcicc.c",
+    "jcinit.c",
+    "jclhuff.c",
+    "jcmarker.c",
+    "jcmaster.c",
+    "jcomapi.c",
+    "jcparam.c",
+    "jcphuff.c",
+    "jdapimin.c",
+    "jdapistd.c",
+    "jdarith.c",
+    "jdatadst.c",
+    "jdatasrc.c",
+    "jdhuff.c",
+    "jdicc.c",
+    "jdinput.c",
+    "jdlhuff.c",
+    "jdmarker.c",
+    "jdmaster.c",
+    "jdphuff.c",
+    "jdtrans.c",
+    "jerror.c",
+    "jfdctflt.c",
+    "jmemmgr.c",
+    "jmemnobs.c",
+]
+JPEG_HEADERS = [
+    "jccolext.c",
+    "jchuff.h",
+    "jcmaster.h",
+    "jconfig.h",
+    "jconfigint.h",
+    "jdcoefct.h",
+    "jdcol565.c",
+    "jdcolext.c",
+    "jdct.h",
+    "jdhuff.h",
+    "jdmainct.h",
+    "jdmaster.h",
+    "jdmerge.h",
+    "jdmrg565.c",
+    "jdmrgext.c",
+    "jdsample.h",
+    "jerror.h",
+    "jinclude.h",
+    "jlossls.h",
+    "jmemsys.h",
+    "jmorecfg.h",
+    "jpeg_nbits_table.h",
+    "jpegapicomp.h",
+    "jpegint.h",
+    "jpeglib.h",
+    "jsamplecomp.h",
+    "jsimd.h",
+    "jsimddct.h",
+    "jstdhuff.c",
+    "jversion.h",
+]
+cc_library(
+    name = "jpeg16",
+    srcs = JPEG16_SOURCES,
+    hdrs = JPEG_HEADERS,
+    local_defines = ["BITS_IN_JSAMPLE=16"],
+    visibility = ["//visibility:public"],
+)
+cc_library(
+    name = "jpeg12",
+    srcs = JPEG12_SOURCES,
+    hdrs = JPEG_HEADERS,
+    local_defines = ["BITS_IN_JSAMPLE=12"],
+    visibility = ["//visibility:public"],
+)
+cc_library(
+    name = "jpeg",
+    srcs = JPEG_SOURCES,
+    hdrs = JPEG_HEADERS,
+    deps = [":jpeg16", ":jpeg12"],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
+exports_files([
+    "jmorecfg.h",
+    "jpeglib.h",
+])
+    """,
+    remote = "https://github.com/libjpeg-turbo/libjpeg-turbo.git",
+    tag = "2.1.91",
+)
+
+http_archive(
+    name = "gif",
+    build_file_content = """
+cc_library(
+    name = "gif",
+    srcs = [
+        "dgif_lib.c", "egif_lib.c", "gifalloc.c", "gif_err.c", "gif_font.c",
+        "gif_hash.c", "openbsd-reallocarray.c", "gif_hash.h",
+        "gif_lib_private.h"
+    ],
+    hdrs = ["gif_lib.h"],
+    includes = ["."],
+    visibility = ["//visibility:public"],
+)
+    """,
+    sha256 = "31da5562f44c5f15d63340a09a4fd62b48c45620cd302f77a6d9acf0077879bd",
+    strip_prefix = "giflib-5.2.1",
+    url = "https://netcologne.dl.sourceforge.net/project/giflib/giflib-5.2.1.tar.gz",
+)
+
+new_git_repository(
+    name = "imath",
+    build_file_content = """
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+SUBSTITUTIONS = {
+    "@IMATH_INTERNAL_NAMESPACE@": "Imath_3_1",
+    "@IMATH_LIB_VERSION@": "3.1.4",
+    "@IMATH_NAMESPACE_CUSTOM@": "0",
+    "@IMATH_NAMESPACE@": "Imath",
+    "@IMATH_PACKAGE_NAME@": "Imath 3.1.4",
+    "@IMATH_VERSION_MAJOR@": "3",
+    "@IMATH_VERSION_MINOR@": "1",
+    "@IMATH_VERSION_PATCH@": "4",
+    "@IMATH_VERSION@": "3.1.4",
+}
+YES_DEFINES = [
+    "IMATH_HALF_USE_LOOKUP_TABLE", "IMATH_ENABLE_API_VISIBILITY",
+]
+NO_DEFINES = [
+    "IMATH_HAVE_LARGE_STACK",
+]
+ONE_DEFINES = [
+    "IMATH_USE_NOEXCEPT",
+]
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "#define " + key for key in YES_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "// #define " + key for key in NO_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine01 " + key : "#define " + key + " 1" for key in ONE_DEFINES
+})
+expand_template(
+    name = "expand_ImathConfig",
+    template = "config/ImathConfig.h.in",
+    out = "src/Imath/ImathConfig.h",
+    substitutions = SUBSTITUTIONS,
+)
+cc_library(
+    name = "Imath",
+    srcs = [
+        "src/Imath/ImathColorAlgo.cpp",
+        ":src/Imath/ImathConfig.h",
+        "src/Imath/ImathFun.cpp",
+        "src/Imath/ImathMatrixAlgo.cpp",
+        "src/Imath/ImathRandom.cpp",
+        "src/Imath/half.cpp",
+        "src/Imath/toFloat.h",
+    ],
+    hdrs = [
+        "src/Imath/ImathBox.h",
+        "src/Imath/ImathBoxAlgo.h",
+        "src/Imath/ImathColor.h",
+        "src/Imath/ImathColorAlgo.h",
+        "src/Imath/ImathEuler.h",
+        "src/Imath/ImathExport.h",
+        "src/Imath/ImathForward.h",
+        "src/Imath/ImathFrame.h",
+        "src/Imath/ImathFrustum.h",
+        "src/Imath/ImathFrustumTest.h",
+        "src/Imath/ImathFun.h",
+        "src/Imath/ImathGL.h",
+        "src/Imath/ImathGLU.h",
+        "src/Imath/ImathInt64.h",
+        "src/Imath/ImathInterval.h",
+        "src/Imath/ImathLine.h",
+        "src/Imath/ImathLineAlgo.h",
+        "src/Imath/ImathMath.h",
+        "src/Imath/ImathMatrix.h",
+        "src/Imath/ImathMatrixAlgo.h",
+        "src/Imath/ImathNamespace.h",
+        "src/Imath/ImathPlane.h",
+        "src/Imath/ImathPlatform.h",
+        "src/Imath/ImathQuat.h",
+        "src/Imath/ImathRandom.h",
+        "src/Imath/ImathRoots.h",
+        "src/Imath/ImathShear.h",
+        "src/Imath/ImathSphere.h",
+        "src/Imath/ImathTypeTraits.h",
+        "src/Imath/ImathVec.h",
+        "src/Imath/ImathVecAlgo.h",
+        "src/Imath/half.h",
+        "src/Imath/halfFunction.h",
+        "src/Imath/halfLimits.h",
+    ],
+    includes = ["src/Imath"],
+    visibility = ["//visibility:public"],
+)
+""",
+    remote = "https://github.com/AcademySoftwareFoundation/imath",
+    tag = "v3.1.6",
+)
+
+new_git_repository(
+    name = "openexr",
+    build_file_content = """
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+SUBSTITUTIONS = {
+    "@IEX_INTERNAL_NAMESPACE@": "Iex_3_0",
+    "@IEX_NAMESPACE_CUSTOM@": "0",
+    "@IEX_NAMESPACE@": "Iex",
+    "@ILMTHREAD_INTERNAL_NAMESPACE@": "IlmThread_3_0",
+    "@ILMTHREAD_NAMESPACE_CUSTOM@": "0",
+    "@ILMTHREAD_NAMESPACE@": "IlmThread",
+    "@OPENEXR_IMF_NAMESPACE@": "Imf",
+    "@OPENEXR_INTERNAL_IMF_NAMESPACE@": "Imf_3_0",
+    "@OPENEXR_LIB_VERSION@": "3.0.4",
+    "@OPENEXR_NAMESPACE_CUSTOM@": "0",
+    "@OPENEXR_PACKAGE_NAME@": "OpenEXR 3.0.4",
+    "@OPENEXR_VERSION_EXTRA@": "",
+    "@OPENEXR_VERSION_MAJOR@": "3",
+    "@OPENEXR_VERSION_MINOR@": "0",
+    "@OPENEXR_VERSION_PATCH@": "4",
+    "@OPENEXR_VERSION@": "3.0.4",
+}
+YES_DEFINES = [
+    "OPENEXR_ENABLE_API_VISIBILITY", "OPENEXR_IMF_HAVE_COMPLETE_IOMANIP",
+    "OPENEXR_HAVE_LARGE_STACK",
+]
+NO_DEFINES = [
+    "HAVE_UCONTEXT_H", "IEX_HAVE_CONTROL_REGISTER_SUPPORT",
+    "IEX_HAVE_SIGCONTEXT_CONTROL_REGISTER_SUPPORT", "OPENEXR_IMF_HAVE_DARWIN",
+    "OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX", "OPENEXR_IMF_HAVE_LINUX_PROCFS",
+    "OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN",
+]
+ONE_DEFINES = [
+    "ILMTHREAD_THREADING_ENABLED",
+]
+ZERO_DEFINES = [
+    "ILMTHREAD_HAVE_POSIX_SEMAPHORES",
+]
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "#define " + key for key in YES_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine " + key : "// #define " + key for key in NO_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine01 " + key : "#define " + key + " 1" for key in ONE_DEFINES
+})
+SUBSTITUTIONS.update({
+    "#cmakedefine01 " + key : "#define " + key + " 0" for key in ZERO_DEFINES
+})
+[
+    expand_template(
+        name = "expand_" + item,
+        template = "cmake/" + item + ".h.in",
+        out = "src/lib/Iex/" + item + ".h",
+        substitutions = SUBSTITUTIONS,
+    ) for item in ["IexConfig", "IexConfigInternal"]
+]
+[
+expand_template(
+        name = "expand_" + item,
+        template = "cmake/" + item + ".h.in",
+        out = "src/lib/IlmThread/" + item + ".h",
+        substitutions = SUBSTITUTIONS,
+    ) for item in ["IlmThreadConfig"]
+]
+[
+expand_template(
+        name = "expand_" + item,
+        template = "cmake/" + item + ".h.in",
+        out = "src/lib/OpenEXR/" + item + ".h",
+        substitutions = SUBSTITUTIONS,
+    ) for item in ["OpenEXRConfig", "OpenEXRConfigInternal"]
+]
+cc_library(
+    name = "Iex",
+    srcs = [
+        "src/lib/Iex/IexBaseExc.cpp",
+        "src/lib/Iex/IexMathFloatExc.cpp",
+        "src/lib/Iex/IexMathFpu.cpp",
+        "src/lib/Iex/IexThrowErrnoExc.cpp",
+    ],
+    hdrs = [
+        "src/lib/Iex/Iex.h",
+        "src/lib/Iex/IexBaseExc.h",
+        ":src/lib/Iex/IexConfig.h",
+        ":src/lib/Iex/IexConfigInternal.h",
+        "src/lib/Iex/IexErrnoExc.h",
+        "src/lib/Iex/IexExport.h",
+        "src/lib/Iex/IexForward.h",
+        "src/lib/Iex/IexMacros.h",
+        "src/lib/Iex/IexMathExc.h",
+        "src/lib/Iex/IexMathFloatExc.h",
+        "src/lib/Iex/IexMathFpu.h",
+        "src/lib/Iex/IexMathIeeeExc.h",
+        "src/lib/Iex/IexNamespace.h",
+        "src/lib/Iex/IexThrowErrnoExc.h",
+        ":src/lib/OpenEXR/OpenEXRConfig.h",
+    ],
+    includes = [
+        "src/lib/Iex",
+        "src/lib/OpenEXR",
+    ],
+)
+
+cc_library(
+    name = "IlmThread",
+    srcs = [
+        "src/lib/IlmThread/IlmThread.cpp",
+        "src/lib/IlmThread/IlmThreadPool.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphore.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphoreOSX.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphorePosix.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphorePosixCompat.cpp",
+        "src/lib/IlmThread/IlmThreadSemaphoreWin32.cpp",
+    ],
+    hdrs = [
+        "src/lib/IlmThread/IlmThread.h",
+        ":src/lib/IlmThread/IlmThreadConfig.h",
+        "src/lib/IlmThread/IlmThreadExport.h",
+        "src/lib/IlmThread/IlmThreadForward.h",
+        "src/lib/IlmThread/IlmThreadMutex.h",
+        "src/lib/IlmThread/IlmThreadNamespace.h",
+        "src/lib/IlmThread/IlmThreadPool.h",
+        "src/lib/IlmThread/IlmThreadSemaphore.h",
+    ],
+    includes = ["src/lib/IlmThread"],
+    deps = [":Iex"],
+)
+cc_library(
+    name = "OpenEXR",
+    srcs = [
+        "src/lib/OpenEXR/ImfAcesFile.cpp",
+        "src/lib/OpenEXR/ImfAttribute.cpp",
+        "src/lib/OpenEXR/ImfB44Compressor.cpp",
+        "src/lib/OpenEXR/ImfBoxAttribute.cpp",
+        "src/lib/OpenEXR/ImfCRgbaFile.cpp",
+        "src/lib/OpenEXR/ImfChannelList.cpp",
+        "src/lib/OpenEXR/ImfChannelListAttribute.cpp",
+        "src/lib/OpenEXR/ImfChromaticities.cpp",
+        "src/lib/OpenEXR/ImfChromaticitiesAttribute.cpp",
+        "src/lib/OpenEXR/ImfCompositeDeepScanLine.cpp",
+        "src/lib/OpenEXR/ImfCompressionAttribute.cpp",
+        "src/lib/OpenEXR/ImfCompressor.cpp",
+        "src/lib/OpenEXR/ImfConvert.cpp",
+        "src/lib/OpenEXR/ImfDeepCompositing.cpp",
+        "src/lib/OpenEXR/ImfDeepFrameBuffer.cpp",
+        "src/lib/OpenEXR/ImfDeepImageStateAttribute.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineInputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineInputPart.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputPart.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledInputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledInputPart.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledOutputFile.cpp",
+        "src/lib/OpenEXR/ImfDeepTiledOutputPart.cpp",
+        "src/lib/OpenEXR/ImfDoubleAttribute.cpp",
+        "src/lib/OpenEXR/ImfDwaCompressor.cpp",
+        "src/lib/OpenEXR/ImfEnvmap.cpp",
+        "src/lib/OpenEXR/ImfEnvmapAttribute.cpp",
+        "src/lib/OpenEXR/ImfFastHuf.cpp",
+        "src/lib/OpenEXR/ImfFloatAttribute.cpp",
+        "src/lib/OpenEXR/ImfFloatVectorAttribute.cpp",
+        "src/lib/OpenEXR/ImfFrameBuffer.cpp",
+        "src/lib/OpenEXR/ImfFramesPerSecond.cpp",
+        "src/lib/OpenEXR/ImfGenericInputFile.cpp",
+        "src/lib/OpenEXR/ImfGenericOutputFile.cpp",
+        "src/lib/OpenEXR/ImfHeader.cpp",
+        "src/lib/OpenEXR/ImfHuf.cpp",
+        "src/lib/OpenEXR/ImfIDManifest.cpp",
+        "src/lib/OpenEXR/ImfIDManifestAttribute.cpp",
+        "src/lib/OpenEXR/ImfIO.cpp",
+        "src/lib/OpenEXR/ImfInputFile.cpp",
+        "src/lib/OpenEXR/ImfInputPart.cpp",
+        "src/lib/OpenEXR/ImfInputPartData.cpp",
+        "src/lib/OpenEXR/ImfIntAttribute.cpp",
+        "src/lib/OpenEXR/ImfKeyCode.cpp",
+        "src/lib/OpenEXR/ImfKeyCodeAttribute.cpp",
+        "src/lib/OpenEXR/ImfLineOrderAttribute.cpp",
+        "src/lib/OpenEXR/ImfLut.cpp",
+        "src/lib/OpenEXR/ImfMatrixAttribute.cpp",
+        "src/lib/OpenEXR/ImfMisc.cpp",
+        "src/lib/OpenEXR/ImfMultiPartInputFile.cpp",
+        "src/lib/OpenEXR/ImfMultiPartOutputFile.cpp",
+        "src/lib/OpenEXR/ImfMultiView.cpp",
+        "src/lib/OpenEXR/ImfOpaqueAttribute.cpp",
+        "src/lib/OpenEXR/ImfOutputFile.cpp",
+        "src/lib/OpenEXR/ImfOutputPart.cpp",
+        "src/lib/OpenEXR/ImfOutputPartData.cpp",
+        "src/lib/OpenEXR/ImfPartType.cpp",
+        "src/lib/OpenEXR/ImfPizCompressor.cpp",
+        "src/lib/OpenEXR/ImfPreviewImage.cpp",
+        "src/lib/OpenEXR/ImfPreviewImageAttribute.cpp",
+        "src/lib/OpenEXR/ImfPxr24Compressor.cpp",
+        "src/lib/OpenEXR/ImfRational.cpp",
+        "src/lib/OpenEXR/ImfRationalAttribute.cpp",
+        "src/lib/OpenEXR/ImfRgbaFile.cpp",
+        "src/lib/OpenEXR/ImfRgbaYca.cpp",
+        "src/lib/OpenEXR/ImfRle.cpp",
+        "src/lib/OpenEXR/ImfRleCompressor.cpp",
+        "src/lib/OpenEXR/ImfScanLineInputFile.cpp",
+        "src/lib/OpenEXR/ImfStandardAttributes.cpp",
+        "src/lib/OpenEXR/ImfStdIO.cpp",
+        "src/lib/OpenEXR/ImfStringAttribute.cpp",
+        "src/lib/OpenEXR/ImfStringVectorAttribute.cpp",
+        "src/lib/OpenEXR/ImfSystemSpecific.cpp",
+        "src/lib/OpenEXR/ImfTestFile.cpp",
+        "src/lib/OpenEXR/ImfThreading.cpp",
+        "src/lib/OpenEXR/ImfTileDescriptionAttribute.cpp",
+        "src/lib/OpenEXR/ImfTileOffsets.cpp",
+        "src/lib/OpenEXR/ImfTiledInputFile.cpp",
+        "src/lib/OpenEXR/ImfTiledInputPart.cpp",
+        "src/lib/OpenEXR/ImfTiledMisc.cpp",
+        "src/lib/OpenEXR/ImfTiledOutputFile.cpp",
+        "src/lib/OpenEXR/ImfTiledOutputPart.cpp",
+        "src/lib/OpenEXR/ImfTiledRgbaFile.cpp",
+        "src/lib/OpenEXR/ImfTimeCode.cpp",
+        "src/lib/OpenEXR/ImfTimeCodeAttribute.cpp",
+        "src/lib/OpenEXR/ImfVecAttribute.cpp",
+        "src/lib/OpenEXR/ImfVersion.cpp",
+        "src/lib/OpenEXR/ImfWav.cpp",
+        "src/lib/OpenEXR/ImfZip.cpp",
+        "src/lib/OpenEXR/ImfZipCompressor.cpp",
+        "src/lib/OpenEXR/b44ExpLogTable.h",
+        "src/lib/OpenEXR/dwaLookups.h",
+    ],
+    hdrs = [
+        ":src/lib/Iex/IexConfig.h",
+        ":src/lib/Iex/IexConfigInternal.h",
+        ":src/lib/IlmThread/IlmThreadConfig.h",
+        "src/lib/OpenEXR/ImfAcesFile.h",
+        "src/lib/OpenEXR/ImfArray.h",
+        "src/lib/OpenEXR/ImfAttribute.h",
+        "src/lib/OpenEXR/ImfAutoArray.h",
+        "src/lib/OpenEXR/ImfB44Compressor.h",
+        "src/lib/OpenEXR/ImfBoxAttribute.h",
+        "src/lib/OpenEXR/ImfCRgbaFile.h",
+        "src/lib/OpenEXR/ImfChannelList.h",
+        "src/lib/OpenEXR/ImfChannelListAttribute.h",
+        "src/lib/OpenEXR/ImfCheckedArithmetic.h",
+        "src/lib/OpenEXR/ImfChromaticities.h",
+        "src/lib/OpenEXR/ImfChromaticitiesAttribute.h",
+        "src/lib/OpenEXR/ImfCompositeDeepScanLine.h",
+        "src/lib/OpenEXR/ImfCompression.h",
+        "src/lib/OpenEXR/ImfCompressionAttribute.h",
+        "src/lib/OpenEXR/ImfCompressor.h",
+        "src/lib/OpenEXR/ImfConvert.h",
+        "src/lib/OpenEXR/ImfDeepCompositing.h",
+        "src/lib/OpenEXR/ImfDeepFrameBuffer.h",
+        "src/lib/OpenEXR/ImfDeepImageState.h",
+        "src/lib/OpenEXR/ImfDeepImageStateAttribute.h",
+        "src/lib/OpenEXR/ImfDeepScanLineInputFile.h",
+        "src/lib/OpenEXR/ImfDeepScanLineInputPart.h",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputFile.h",
+        "src/lib/OpenEXR/ImfDeepScanLineOutputPart.h",
+        "src/lib/OpenEXR/ImfDeepTiledInputFile.h",
+        "src/lib/OpenEXR/ImfDeepTiledInputPart.h",
+        "src/lib/OpenEXR/ImfDeepTiledOutputFile.h",
+        "src/lib/OpenEXR/ImfDeepTiledOutputPart.h",
+        "src/lib/OpenEXR/ImfDoubleAttribute.h",
+        "src/lib/OpenEXR/ImfDwaCompressor.h",
+        "src/lib/OpenEXR/ImfDwaCompressorSimd.h",
+        "src/lib/OpenEXR/ImfEnvmap.h",
+        "src/lib/OpenEXR/ImfEnvmapAttribute.h",
+        "src/lib/OpenEXR/ImfExport.h",
+        "src/lib/OpenEXR/ImfFastHuf.h",
+        "src/lib/OpenEXR/ImfFloatAttribute.h",
+        "src/lib/OpenEXR/ImfFloatVectorAttribute.h",
+        "src/lib/OpenEXR/ImfForward.h",
+        "src/lib/OpenEXR/ImfFrameBuffer.h",
+        "src/lib/OpenEXR/ImfFramesPerSecond.h",
+        "src/lib/OpenEXR/ImfGenericInputFile.h",
+        "src/lib/OpenEXR/ImfGenericOutputFile.h",
+        "src/lib/OpenEXR/ImfHeader.h",
+        "src/lib/OpenEXR/ImfHuf.h",
+        "src/lib/OpenEXR/ImfIDManifest.h",
+        "src/lib/OpenEXR/ImfIDManifestAttribute.h",
+        "src/lib/OpenEXR/ImfIO.h",
+        "src/lib/OpenEXR/ImfInputFile.h",
+        "src/lib/OpenEXR/ImfInputPart.h",
+        "src/lib/OpenEXR/ImfInputPartData.h",
+        "src/lib/OpenEXR/ImfInputStreamMutex.h",
+        "src/lib/OpenEXR/ImfInt64.h",
+        "src/lib/OpenEXR/ImfIntAttribute.h",
+        "src/lib/OpenEXR/ImfKeyCode.h",
+        "src/lib/OpenEXR/ImfKeyCodeAttribute.h",
+        "src/lib/OpenEXR/ImfLineOrder.h",
+        "src/lib/OpenEXR/ImfLineOrderAttribute.h",
+        "src/lib/OpenEXR/ImfLut.h",
+        "src/lib/OpenEXR/ImfMatrixAttribute.h",
+        "src/lib/OpenEXR/ImfMisc.h",
+        "src/lib/OpenEXR/ImfMultiPartInputFile.h",
+        "src/lib/OpenEXR/ImfMultiPartOutputFile.h",
+        "src/lib/OpenEXR/ImfMultiView.h",
+        "src/lib/OpenEXR/ImfName.h",
+        "src/lib/OpenEXR/ImfNamespace.h",
+        "src/lib/OpenEXR/ImfOpaqueAttribute.h",
+        "src/lib/OpenEXR/ImfOptimizedPixelReading.h",
+        "src/lib/OpenEXR/ImfOutputFile.h",
+        "src/lib/OpenEXR/ImfOutputPart.h",
+        "src/lib/OpenEXR/ImfOutputPartData.h",
+        "src/lib/OpenEXR/ImfOutputStreamMutex.h",
+        "src/lib/OpenEXR/ImfPartHelper.h",
+        "src/lib/OpenEXR/ImfPartType.h",
+        "src/lib/OpenEXR/ImfPixelType.h",
+        "src/lib/OpenEXR/ImfPizCompressor.h",
+        "src/lib/OpenEXR/ImfPreviewImage.h",
+        "src/lib/OpenEXR/ImfPreviewImageAttribute.h",
+        "src/lib/OpenEXR/ImfPxr24Compressor.h",
+        "src/lib/OpenEXR/ImfRational.h",
+        "src/lib/OpenEXR/ImfRationalAttribute.h",
+        "src/lib/OpenEXR/ImfRgba.h",
+        "src/lib/OpenEXR/ImfRgbaFile.h",
+        "src/lib/OpenEXR/ImfRgbaYca.h",
+        "src/lib/OpenEXR/ImfRle.h",
+        "src/lib/OpenEXR/ImfRleCompressor.h",
+        "src/lib/OpenEXR/ImfScanLineInputFile.h",
+        "src/lib/OpenEXR/ImfSimd.h",
+        "src/lib/OpenEXR/ImfStandardAttributes.h",
+        "src/lib/OpenEXR/ImfStdIO.h",
+        "src/lib/OpenEXR/ImfStringAttribute.h",
+        "src/lib/OpenEXR/ImfStringVectorAttribute.h",
+        "src/lib/OpenEXR/ImfSystemSpecific.h",
+        "src/lib/OpenEXR/ImfTestFile.h",
+        "src/lib/OpenEXR/ImfThreading.h",
+        "src/lib/OpenEXR/ImfTileDescription.h",
+        "src/lib/OpenEXR/ImfTileDescriptionAttribute.h",
+        "src/lib/OpenEXR/ImfTileOffsets.h",
+        "src/lib/OpenEXR/ImfTiledInputFile.h",
+        "src/lib/OpenEXR/ImfTiledInputPart.h",
+        "src/lib/OpenEXR/ImfTiledMisc.h",
+        "src/lib/OpenEXR/ImfTiledOutputFile.h",
+        "src/lib/OpenEXR/ImfTiledOutputPart.h",
+        "src/lib/OpenEXR/ImfTiledRgbaFile.h",
+        "src/lib/OpenEXR/ImfTimeCode.h",
+        "src/lib/OpenEXR/ImfTimeCodeAttribute.h",
+        "src/lib/OpenEXR/ImfVecAttribute.h",
+        "src/lib/OpenEXR/ImfVersion.h",
+        "src/lib/OpenEXR/ImfWav.h",
+        "src/lib/OpenEXR/ImfXdr.h",
+        "src/lib/OpenEXR/ImfZip.h",
+        "src/lib/OpenEXR/ImfZipCompressor.h",
+        ":src/lib/OpenEXR/OpenEXRConfig.h",
+        ":src/lib/OpenEXR/OpenEXRConfigInternal.h",
+    ],
+    includes = ["src/lib/OpenEXR"],
+    deps = [
+        ":IlmThread",
+        "@imath//:Imath",
+        "@zlib//:zlib",
+    ],
+    visibility = ["//visibility:public"],
+)
+""",
+    remote = "https://github.com/AcademySoftwareFoundation/openexr",
+    tag = "v3.1.6",
+)
index 675026a..9bd28f4 100755 (executable)
@@ -18,14 +18,11 @@ test_includes() {
     if [ ! -e "$f" ]; then
       continue
     fi
-    # Check that the public files (in lib/include/ directory) don't use the full
-    # path to the public header since users of the library will include the
-    # library as: #include "jxl/foobar.h".
-    if [[ "${f#lib/include/}" != "${f}" ]]; then
-      if grep -i -H -n -E '#include\s*[<"]lib/include/jxl' "$f" >&2; then
-        echo "Don't add \"include/\" to the include path of public headers." >&2
-        ret=1
-      fi
+    # Check that the full paths to the public headers are not used, since users
+    # of the library will include the library as: #include "jxl/foobar.h".
+    if grep -i -H -n -E '#include\s*[<"]lib/include/jxl' "$f" >&2; then
+      echo "Don't add \"include/\" to the include path of public headers." >&2
+      ret=1
     fi
 
     if [[ "${f#third_party/}" == "$f" ]]; then
@@ -103,9 +100,15 @@ test_printf_size_t() {
     ret=1
   fi
 
+  if grep -n -E '[^_]gtest\.h' \
+      $(git ls-files | grep -E '(\.c|\.cc|\.cpp|\.h)$' | grep -v -F /testing.h); then
+    echo "Don't include gtest directly, instead include 'testing.h'. " >&2
+    ret=1
+  fi
+
   if grep -n -E 'gmock\.h' \
-      $(git ls-files | grep -E '(\.c|\.cc|\.cpp|\.h)$' | grep -v -F /test_utils.h); then
-    echo "Don't include gmock directly, instead include 'test_utils.h'. " >&2
+      $(git ls-files | grep -E '(\.c|\.cc|\.cpp|\.h)$' | grep -v -F /testing.h); then
+    echo "Don't include gmock directly, instead include 'testing.h'. " >&2
     ret=1
   fi
 
@@ -122,7 +125,7 @@ test_printf_size_t() {
     fi
   done
 
-  for f in $(git ls-files | grep -E "\.h$" | grep -v -E '(printf_macros\.h|test_utils\.h)' |
+  for f in $(git ls-files | grep -E "\.h$" | grep -v -E '(printf_macros\.h|testing\.h)' |
       xargs grep -n 'PRI[udx]S'); do
     # Having PRIuS / PRIdS in a header file means that printf_macros.h may
     # be included before a system header, in particular before gtest headers.
diff --git a/ci.sh b/ci.sh
index 45d5218..342eac0 100755 (executable)
--- a/ci.sh
+++ b/ci.sh
@@ -5,8 +5,8 @@
 # license that can be found in the LICENSE file.
 
 # Continuous integration helper module. This module is meant to be called from
-# the .gitlab-ci.yml file during the continuous integration build, as well as
-# from the command line for developers.
+# workflows during the continuous integration build, as well as from the
+# command line for developers.
 
 set -eu
 
@@ -21,7 +21,10 @@ CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:-}
 CMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER:-}
 CMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER:-}
 CMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM:-}
+SKIP_BUILD="${SKIP_BUILD:-0}"
 SKIP_TEST="${SKIP_TEST:-0}"
+FASTER_MSAN_BUILD="${FASTER_MSAN_BUILD:-0}"
+TARGETS="${TARGETS:-all doc}"
 TEST_SELECTOR="${TEST_SELECTOR:-}"
 BUILD_TARGET="${BUILD_TARGET:-}"
 ENABLE_WASM_SIMD="${ENABLE_WASM_SIMD:-0}"
@@ -32,6 +35,8 @@ else
 fi
 # Whether we should post a message in the MR when the build fails.
 POST_MESSAGE_ON_ERROR="${POST_MESSAGE_ON_ERROR:-1}"
+# By default, do a lightweight debian HWY package build.
+HWY_PKG_OPTIONS="${HWY_PKG_OPTIONS:---set-envvar=HWY_EXTRA_CONFIG=-DBUILD_TESTING=OFF -DHWY_ENABLE_EXAMPLES=OFF -DHWY_ENABLE_CONTRIB=OFF}"
 
 # Set default compilers to clang if not already set
 export CC=${CC:-clang}
@@ -69,13 +74,18 @@ if [[ "${ENABLE_WASM_SIMD}" -ne "0" ]]; then
   CMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS} -msimd128"
 fi
 
+if [[ "${ENABLE_WASM_SIMD}" -eq "2" ]]; then
+  CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_WANT_WASM2"
+  CMAKE_C_FLAGS="${CMAKE_C_FLAGS} -DHWY_WANT_WASM2"
+fi
+
 if [[ ! -z "${HWY_BASELINE_TARGETS}" ]]; then
   CMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS} -DHWY_BASELINE_TARGETS=${HWY_BASELINE_TARGETS}"
 fi
 
 # Version inferred from the CI variables.
-CI_COMMIT_SHA=${CI_COMMIT_SHA:-${GITHUB_SHA:-}}
-JPEGXL_VERSION=${JPEGXL_VERSION:-${CI_COMMIT_SHA:0:8}}
+CI_COMMIT_SHA=${GITHUB_SHA:-}
+JPEGXL_VERSION=${JPEGXL_VERSION:-}
 
 # Benchmark parameters
 STORE_IMAGES=${STORE_IMAGES:-1}
@@ -139,6 +149,7 @@ detect_clang_version() {
   fi
   local clang_version=$("${CC:-clang}" --version | head -n1)
   clang_version=${clang_version#"Debian "}
+  clang_version=${clang_version#"Ubuntu "}
   local llvm_tag
   case "${clang_version}" in
     "clang version 6."*)
@@ -171,27 +182,6 @@ on_exit() {
   local retcode="$1"
   # Always cleanup the CLEANUP_FILES.
   cleanup
-
-  # Post a message in the MR when requested with POST_MESSAGE_ON_ERROR but only
-  # if the run failed and we are not running from a MR pipeline.
-  if [[ ${retcode} -ne 0 && -n "${CI_BUILD_NAME:-}" &&
-        -n "${POST_MESSAGE_ON_ERROR}" && -z "${CI_MERGE_REQUEST_ID:-}" &&
-        "${CI_BUILD_REF_NAME}" = "master" ]]; then
-    load_mr_vars_from_commit
-    { set +xeu; } 2>/dev/null
-    local message="**Run ${CI_BUILD_NAME} @ ${CI_COMMIT_SHORT_SHA} failed.**
-
-Check the output of the job at ${CI_JOB_URL:-} to see if this was your problem.
-If it was, please rollback this change or fix the problem ASAP, broken builds
-slow down development. Check if the error already existed in the previous build
-as well.
-
-Pipeline: ${CI_PIPELINE_URL}
-
-Previous build commit: ${CI_COMMIT_BEFORE_SHA}
-"
-    cmd_post_mr_comment "${message}"
-  fi
 }
 
 trap 'retcode=$?; { set +x; } 2>/dev/null; on_exit ${retcode}' INT TERM EXIT
@@ -203,7 +193,7 @@ trap 'retcode=$?; { set +x; } 2>/dev/null; on_exit ${retcode}' INT TERM EXIT
 # running from a merge request pipeline).
 MR_HEAD_SHA=""
 # The common ancestor between the current commit and the tracked branch, such
-# as master. This includes a list
+# as main. This includes a list
 MR_ANCESTOR_SHA=""
 
 # Populate MR_HEAD_SHA and MR_ANCESTOR_SHA.
@@ -216,30 +206,23 @@ merge_request_commits() {
     # changes on the Pull Request if needed. This fetches 10 more commits which
     # should be enough given that PR normally should have 1 commit.
     git -C "${MYDIR}" fetch -q origin "${GITHUB_SHA}" --depth 10
-    MR_HEAD_SHA="$(git rev-parse "FETCH_HEAD^2" 2>/dev/null ||
+    if [ "${GITHUB_EVENT_NAME}" = "pull_request" ]; then
+      MR_HEAD_SHA="$(git rev-parse "FETCH_HEAD^2" 2>/dev/null ||
                    echo "${GITHUB_SHA}")"
+    else
+      MR_HEAD_SHA="${GITHUB_SHA}"
+    fi
   else
-    # CI_BUILD_REF is the reference currently being build in the CI workflow.
-    MR_HEAD_SHA=$(git -C "${MYDIR}" rev-parse -q "${CI_BUILD_REF:-HEAD}")
+    MR_HEAD_SHA=$(git -C "${MYDIR}" rev-parse -q "HEAD")
   fi
 
-  if [[ -n "${CI_MERGE_REQUEST_IID:-}" ]]; then
-    # Merge request pipeline in CI. In this case the upstream is called "origin"
-    # but it refers to the forked project that's the source of the merge
-    # request. We need to get the target of the merge request, for which we need
-    # to query that repository using our CI_JOB_TOKEN.
-    echo "machine gitlab.com login gitlab-ci-token password ${CI_JOB_TOKEN}" \
-      >> "${HOME}/.netrc"
-    git -C "${MYDIR}" fetch "${CI_MERGE_REQUEST_PROJECT_URL}" \
-      "${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}"
-    MR_ANCESTOR_SHA=$(git -C "${MYDIR}" rev-parse -q FETCH_HEAD)
-  elif [[ -n "${GITHUB_BASE_REF:-}" ]]; then
+  if [[ -n "${GITHUB_BASE_REF:-}" ]]; then
     # Pull request workflow in GitHub Actions. GitHub checkout action uses
     # "origin" as the remote for the git checkout.
     git -C "${MYDIR}" fetch -q origin "${GITHUB_BASE_REF}"
     MR_ANCESTOR_SHA=$(git -C "${MYDIR}" rev-parse -q FETCH_HEAD)
   else
-    # We are in a local branch, not a merge request.
+    # We are in a local branch, not a pull request workflow.
     MR_ANCESTOR_SHA=$(git -C "${MYDIR}" rev-parse -q HEAD@{upstream} || true)
   fi
 
@@ -258,40 +241,6 @@ merge_request_commits() {
   set -x
 }
 
-# Load the MR iid from the landed commit message when running not from a
-# merge request workflow. This is useful to post back results at the merge
-# request when running pipelines from master.
-load_mr_vars_from_commit() {
-  { set +x; } 2>/dev/null
-  if [[ -z "${CI_MERGE_REQUEST_IID:-}" ]]; then
-    local mr_iid=$(git rev-list --format=%B --max-count=1 HEAD |
-      grep -F "${CI_PROJECT_URL}" | grep -F "/merge_requests" | head -n 1)
-    # mr_iid contains a string like this if it matched:
-    #  Part-of: <https://gitlab.com/wg1/jpeg-xlm/merge_requests/123456>
-    if [[ -n "${mr_iid}" ]]; then
-      mr_iid=$(echo "${mr_iid}" |
-        sed -E 's,^.*merge_requests/([0-9]+)>.*$,\1,')
-      CI_MERGE_REQUEST_IID="${mr_iid}"
-      CI_MERGE_REQUEST_PROJECT_ID=${CI_PROJECT_ID}
-    fi
-  fi
-  set -x
-}
-
-# Posts a comment to the current merge request.
-cmd_post_mr_comment() {
-  { set +x; } 2>/dev/null
-  local comment="$1"
-  if [[ -n "${BOT_TOKEN:-}" && -n "${CI_MERGE_REQUEST_IID:-}" ]]; then
-    local url="${CI_API_V4_URL}/projects/${CI_MERGE_REQUEST_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}/notes"
-    curl -X POST -g \
-      -H "PRIVATE-TOKEN: ${BOT_TOKEN}" \
-      --data-urlencode "body=${comment}" \
-      --output /dev/null \
-      "${url}"
-  fi
-  set -x
-}
 
 # Set up and export the environment variables needed by the child processes.
 export_env() {
@@ -452,9 +401,12 @@ cmake_configure() {
 }
 
 cmake_build_and_test() {
+  if [[ "${SKIP_BUILD}" -eq "1" ]]; then
+      return 0
+  fi
   # gtest_discover_tests() runs the test binaries to discover the list of tests
   # at build time, which fails under qemu.
-  ASAN_OPTIONS=detect_leaks=0 cmake --build "${BUILD_DIR}" -- all doc
+  ASAN_OPTIONS=detect_leaks=0 cmake --build "${BUILD_DIR}" -- $TARGETS
   # Pack test binaries if requested.
   if [[ "${PACK_TEST:-}" == "1" ]]; then
     (cd "${BUILD_DIR}"
@@ -547,6 +499,7 @@ cmd_coverage_report() {
     # Only print coverage information for the libjxl directories. The rest
     # is not part of the code under test.
     --filter '.*jxl/.*'
+    --exclude '.*_gbench.cc'
     --exclude '.*_test.cc'
     --exclude '.*_testonly..*'
     --exclude '.*_debug.*'
@@ -576,7 +529,7 @@ cmd_test() {
   (cd "${BUILD_DIR}"
    export UBSAN_OPTIONS=print_stacktrace=1
    [[ "${TEST_STACK_LIMIT}" == "none" ]] || ulimit -s "${TEST_STACK_LIMIT}"
-   ctest -j $(nproc --all || echo 1) --output-on-failure "$@")
+   ctest -j $(nproc --all || echo 1) ${TEST_SELECTOR} --output-on-failure "$@")
 }
 
 cmd_gbench() {
@@ -652,7 +605,6 @@ cmd_msan() {
   local msan_c_flags=(
     -fsanitize=memory
     -fno-omit-frame-pointer
-    -fsanitize-memory-track-origins
 
     -DJXL_ENABLE_ASSERT=1
     -g
@@ -661,6 +613,13 @@ cmd_msan() {
     # Force gtest to not use the cxxbai.
     -DGTEST_HAS_CXXABI_H_=0
   )
+  if [[ "${FASTER_MSAN_BUILD}" -ne "1" ]]; then
+    msan_c_flags=(
+      "${msan_c_flags[@]}"
+      -fsanitize-memory-track-origins
+    )
+  fi
+
   local msan_cxx_flags=(
     "${msan_c_flags[@]}"
 
@@ -724,6 +683,15 @@ cmd_msan_install() {
   local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
   rm -rf "${msan_prefix}"
 
+  local TARGET_OPTS=""
+  if [[ -n "${BUILD_TARGET}" ]]; then
+    TARGET_OPTS=" \
+      -DCMAKE_C_COMPILER_TARGET=\"${BUILD_TARGET}\" \
+      -DCMAKE_CXX_COMPILER_TARGET=\"${BUILD_TARGET}\" \
+      -DCMAKE_SYSTEM_PROCESSOR=\"${BUILD_TARGET%%-*}\" \
+    "
+  fi
+
   declare -A CMAKE_EXTRAS
   CMAKE_EXTRAS[libcxx]="\
     -DLIBCXX_CXX_ABI=libstdc++ \
@@ -745,6 +713,7 @@ cmd_msan_install() {
       -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}" \
       -DCMAKE_SHARED_LINKER_FLAGS="${CMAKE_SHARED_LINKER_FLAGS}" \
       -DCMAKE_INSTALL_PREFIX="${msan_prefix}" \
+      ${TARGET_OPTS} \
       ${CMAKE_EXTRAS[${project}]}
     cmake --build "${proj_build}"
     ninja -C "${proj_build}" install
@@ -788,7 +757,7 @@ _cmd_ossfuzz() {
     -e MSAN_LIBS_PATH="/work/msan" \
     -e JPEGXL_EXTRA_ARGS="${jpegxl_extra_args}" \
     -v "${MYDIR}":/src/libjxl \
-    -v "${MYDIR}/tools/ossfuzz-build.sh":/src/build.sh \
+    -v "${MYDIR}/tools/scripts/ossfuzz-build.sh":/src/build.sh \
     -v "${real_build_dir}":/work \
     gcr.io/oss-fuzz/libjxl
 }
@@ -902,7 +871,7 @@ run_benchmark() {
     --input "${src_img_dir}/*.png"
     --codec=jpeg:yuv420:q85,webp:q80,jxl:d1:6,jxl:d1:6:downsampling=8,jxl:d5:6,jxl:d5:6:downsampling=8,jxl:m:d0:2,jxl:m:d0:3,jxl:m:d2:2
     --output_dir "${output_dir}"
-    --noprofiler --show_progress
+    --show_progress
     --num_threads="${num_threads}"
   )
   if [[ "${STORE_IMAGES}" == "1" ]]; then
@@ -917,15 +886,7 @@ run_benchmark() {
     return ${PIPESTATUS[0]}
   )
 
-  if [[ -n "${CI_BUILD_NAME:-}" ]]; then
-    { set +x; } 2>/dev/null
-    local message="Results for ${CI_BUILD_NAME} @ ${CI_COMMIT_SHORT_SHA} (job ${CI_JOB_URL:-}):
 
-$(cat "${output_dir}/results.txt")
-"
-    cmd_post_mr_comment "${message}"
-    set -x
-  fi
 }
 
 # Helper function to wait for the CPU temperature to cool down on ARM.
@@ -1153,18 +1114,6 @@ cmd_arm_benchmark() {
   cmd_cpuset "${RUNNER_CPU_ALL:-}"
   cat "${runs_file}"
 
-  if [[ -n "${CI_BUILD_NAME:-}" ]]; then
-    load_mr_vars_from_commit
-    { set +x; } 2>/dev/null
-    local message="Results for ${CI_BUILD_NAME} @ ${CI_COMMIT_SHORT_SHA} (job ${CI_JOB_URL:-}):
-
-\`\`\`
-$(column -t -s "       " "${runs_file}")
-\`\`\`
-"
-    cmd_post_mr_comment "${message}"
-    set -x
-  fi
 }
 
 # Generate a corpus and run the fuzzer on that corpus.
@@ -1184,22 +1133,38 @@ cmd_fuzz() {
   )
 }
 
-# Runs the linter (clang-format) on the pending CLs.
+# Runs the linters (clang-format, build_cleaner, buildirier) on the pending CLs.
 cmd_lint() {
   merge_request_commits
   { set +x; } 2>/dev/null
-  local versions=(${1:-6.0 7 8 9 10 11})
+  local versions=(${1:-16 15 14 13 12 11 10 9 8 7 6.0})
   local clang_format_bins=("${versions[@]/#/clang-format-}" clang-format)
   local tmpdir=$(mktemp -d)
   CLEANUP_FILES+=("${tmpdir}")
 
   local ret=0
   local build_patch="${tmpdir}/build_cleaner.patch"
-  if ! "${MYDIR}/tools/build_cleaner.py" >"${build_patch}"; then
+  if ! "${MYDIR}/tools/scripts/build_cleaner.py" >"${build_patch}"; then
     ret=1
     echo "build_cleaner.py findings:" >&2
     "${COLORDIFF_BIN}" <"${build_patch}"
-    echo "Run \`tools/build_cleaner.py --update\` to apply them" >&2
+    echo "Run \`tools/scripts/build_cleaner.py --update\` to apply them" >&2
+  fi
+
+  # It is ok, if buildifier is not installed.
+  if which buildifier >/dev/null; then
+    local buildifier_patch="${tmpdir}/buildifier.patch"
+    local bazel_files=`git -C ${MYDIR} ls-files | grep -E "/BUILD$|WORKSPACE|.bzl$"`
+    set -x
+    buildifier -d ${bazel_files} >"${buildifier_patch}"|| true
+    { set +x; } 2>/dev/null
+    if [ -s "${buildifier_patch}" ]; then
+      ret=1
+      echo 'buildifier have found some problems in Bazel build files:' >&2
+      "${COLORDIFF_BIN}" <"${buildifier_patch}"
+      echo 'To fix them run (from the base directory):' >&2
+      echo '  buildifier `git ls-files | grep -E "/BUILD$|WORKSPACE|.bzl$"`' >&2
+    fi
   fi
 
   local installed=()
@@ -1218,7 +1183,7 @@ cmd_lint() {
     git -C "${MYDIR}" "${clang_format}" --binary "${clang_format}" \
       --style=file --diff "${MR_ANCESTOR_SHA}" -- >"${tmppatch}" || true
     { set +x; } 2>/dev/null
-    if grep -E '^--- ' "${tmppatch}">/dev/null; then
+    if grep -E '^--- ' "${tmppatch}" | grep -v 'a/third_party' >/dev/null; then
       if [[ -n "${LINT_OUTPUT:-}" ]]; then
         cp "${tmppatch}" "${LINT_OUTPUT}"
       fi
@@ -1326,6 +1291,7 @@ cmd_debian_stats() {
 build_debian_pkg() {
   local srcdir="$1"
   local srcpkg="$2"
+  local options="${3:-}"
 
   local debsdir="${BUILD_DIR}/debs"
   local builddir="${debsdir}/${srcpkg}"
@@ -1341,7 +1307,7 @@ build_debian_pkg() {
   done
   (
     cd "${builddir}"
-    debuild -b -uc -us
+    debuild "${options}" -b -uc -us
   )
 }
 
@@ -1353,7 +1319,7 @@ cmd_debian_build() {
       build_debian_pkg "${MYDIR}" "jpeg-xl"
       ;;
     highway)
-      build_debian_pkg "${MYDIR}/third_party/highway" "highway"
+      build_debian_pkg "${MYDIR}/third_party/highway" "highway" "${HWY_PKG_OPTIONS}"
       ;;
     *)
       echo "ERROR: Must pass a valid source package name to build." >&2
@@ -1374,7 +1340,7 @@ cmd_bump_version() {
   local newver="${1:-}"
 
   if ! which dch >/dev/null; then
-    echo "Run:\n  sudo apt install debhelper"
+    echo "Missing dch\nTo install it run:\n  sudo apt install devscripts"
     exit 1
   fi
 
@@ -1402,9 +1368,13 @@ cmd_bump_version() {
     -e "s/(set\\(JPEGXL_MINOR_VERSION) [0-9]+\\)/\\1 ${minor})/" \
     -e "s/(set\\(JPEGXL_PATCH_VERSION) [0-9]+\\)/\\1 ${patch})/" \
     -i lib/CMakeLists.txt
+  sed -E \
+    -e "s/(LIBJXL_VERSION: )[0-9\\.]+/\\1 ${major}.${minor}.${patch}/" \
+    -e "s/(LIBJXL_ABI_VERSION: )[0-9\\.]+/\\1 ${major}.${minor}/" \
+    -i .github/workflows/conformance.yml
 
   # Update lib.gni
-  tools/build_cleaner.py --update
+  tools/scripts/build_cleaner.py --update
 
   # Mark the previous version as "unstable".
   DEBCHANGE_RELEASE_HEURISTIC=log dch -M --distribution unstable --release ''
@@ -1418,11 +1388,11 @@ cmd_authors() {
   merge_request_commits
   local emails
   local names
-  readarray -t emails < <(git log --format='%ae' "${MR_HEAD_SHA}...${MR_ANCESTOR_SHA}")
-  readarray -t names < <(git log --format='%an' "${MR_HEAD_SHA}...${MR_ANCESTOR_SHA}")
+  readarray -t emails < <(git log --format='%ae' "${MR_ANCESTOR_SHA}..${MR_HEAD_SHA}")
+  readarray -t names < <(git log --format='%an' "${MR_ANCESTOR_SHA}..${MR_HEAD_SHA}")
   for i in "${!names[@]}"; do
     echo "Checking name '${names[$i]}' with email '${emails[$i]}' ..."
-    "${MYDIR}"/tools/check_author.py "${emails[$i]}" "${names[$i]}"
+    "${MYDIR}"/tools/scripts/check_author.py "${emails[$i]}" "${names[$i]}"
   done
 }
 
@@ -1450,7 +1420,7 @@ Where cmd is one of:
  benchmark Run the benchmark over the default corpus.
  fast_benchmark Run the benchmark over the small corpus.
 
- coverage  Buils and run tests with coverage support. Runs coverage_report as
+ coverage  Build and run tests with coverage support. Runs coverage_report as
            well.
  coverage_report Generate HTML, XML and text coverage report after a coverage
            run.
@@ -1483,6 +1453,7 @@ You can pass some optional environment variables as well:
  - FUZZER_MAX_TIME: "fuzz" command fuzzer running timeout in seconds.
  - LINT_OUTPUT: Path to the output patch from the "lint" command.
  - SKIP_CPUSET=1: Skip modifying the cpuset in the arm_benchmark.
+ - SKIP_BUILD=1: Skip the build stage, cmake configure only.
  - SKIP_TEST=1: Skip the test stage.
  - STORE_IMAGES=0: Makes the benchmark discard the computed images.
  - TEST_STACK_LIMIT: Stack size limit (ulimit -s) during tests, in KiB.
index 5c6cb09..9fb78e4 100644 (file)
@@ -31,11 +31,6 @@ foreach(brlib IN ITEMS ${brlibs})
       set_property(TARGET ${brlib} PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${BROTLI_INCLUDE_DIR})
       target_link_libraries(${brlib} INTERFACE ${${BRPREFIX}_LIBRARY})
       set_property(TARGET ${brlib} PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_${BRPREFIX}_CFLAGS_OTHER})
-
-      add_library(${brlib}-static INTERFACE IMPORTED GLOBAL)
-      set_property(TARGET ${brlib}-static PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${BROTLI_INCLUDE_DIR})
-      target_link_libraries(${brlib}-static INTERFACE ${${BRPREFIX}_LIBRARY})
-      set_property(TARGET ${brlib}-static PROPERTY INTERFACE_COMPILE_OPTIONS ${PC_${BRPREFIX}_CFLAGS_OTHER})
     else()
     add_library(${brlib} INTERFACE IMPORTED GLOBAL)
       target_include_directories(${brlib}
@@ -46,11 +41,6 @@ foreach(brlib IN ITEMS ${brlibs})
         INTERFACE ${PC_${BRPREFIX}_LDFLAGS_OTHER})
       target_compile_options(${brlib}
         INTERFACE ${PC_${BRPREFIX}_CFLAGS_OTHER})
-
-      # TODO(deymo): Remove the -static library versions, this target is
-      # currently needed by brunsli.cmake. When importing it this way, the
-      # brotli*-static target is just an alias.
-      add_library(${brlib}-static ALIAS ${brlib})
     endif()
   endif()
 endforeach()
index c1deb9b..c5a90fb 100644 (file)
@@ -23,13 +23,13 @@ if (HWY_INCLUDE_DIR AND NOT HWY_VERSION)
   if (EXISTS "${HWY_INCLUDE_DIR}/hwy/highway.h")
     file(READ "${HWY_INCLUDE_DIR}/hwy/highway.h" HWY_VERSION_CONTENT)
 
-    string(REGEX MATCH "#define HWY_MAJOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
+    string(REGEX MATCH "#define HWY_MAJOR +([0-9]+)" _sink "${HWY_VERSION_CONTENT}")
     set(HWY_VERSION_MAJOR "${CMAKE_MATCH_1}")
 
-    string(REGEX MATCH "#define +HWY_MINOR +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
+    string(REGEX MATCH "#define +HWY_MINOR +([0-9]+)" _sink "${HWY_VERSION_CONTENT}")
     set(HWY_VERSION_MINOR "${CMAKE_MATCH_1}")
 
-    string(REGEX MATCH "#define +HWY_PATCH +([0-9]+)" _dummy "${HWY_VERSION_CONTENT}")
+    string(REGEX MATCH "#define +HWY_PATCH +([0-9]+)" _sink "${HWY_VERSION_CONTENT}")
     set(HWY_VERSION_PATCH "${CMAKE_MATCH_1}")
 
     set(HWY_VERSION "${HWY_VERSION_MAJOR}.${HWY_VERSION_MINOR}.${HWY_VERSION_PATCH}")
index a63607e..dde1130 100644 (file)
@@ -1,4 +1,16 @@
-jpeg-xl (0.7) UNRELEASED; urgency=medium
+jpeg-xl (0.9.0) unstable; urgency=medium
+
+  * Bump JPEG XL version to 0.9.0.
+
+ -- JPEG XL Maintainers <jpegxl@google.com>  Wed, 11 Jan 2023 16:12:35 +0000
+
+jpeg-xl (0.8) unstable; urgency=medium
+
+  * Bump JPEG XL version to 0.8.
+
+ -- JPEG XL Maintainers <jpegxl@google.com>  Wed, 11 Jan 2023 16:12:34 +0000
+
+jpeg-xl (0.7) unstable; urgency=medium
 
   * Bump JPEG XL version to 0.7.
 
index 7a3c502..f5dc5ce 100644 (file)
@@ -14,7 +14,7 @@ Build-Depends:
  libgmock-dev,
  libgoogle-perftools-dev,
  libgtest-dev,
- libhwy-dev (>= 0.15.0),
+ libhwy-dev (>= 1.0.0),
  libjpeg-dev,
  libopenexr-dev,
  libpng-dev,
index 20225a9..7786a87 100644 (file)
@@ -5,6 +5,11 @@ Files: *
 Copyright: 2020 the JPEG XL Project
 License: BSD-3-clause
 
+Files: third_party/libjpeg-turbo/*
+Copyright (C)2009-2023 D. R. Commander. All Rights Reserved.
+Copyright (C)2015 Viktor Szathmáry. All Rights Reserved.
+License: BSD-3-clause
+
 Files: third_party/sjpeg/*
 Copyright: 2017 Google, Inc
 License: Apache-2.0
index b735ec2..ebe4ac4 100644 (file)
@@ -1,4 +1,3 @@
 usr/include/jxl/*.h
-usr/lib/*/*.a
 usr/lib/*/*.so
 usr/lib/*/pkgconfig/*.pc
index efed75d..6259dbf 100755 (executable)
@@ -8,10 +8,14 @@ include /usr/share/dpkg/pkg-info.mk
 override_dh_auto_configure:
        # TODO(deymo): Remove the DCMAKE_BUILD_TYPE once builds without NDEBUG
        # are as useful as Release builds.
+        # TODO(szabadka) Re-enable jpegli after tests are fixed on Ubuntu 20.04,
+        # and debian:buster
        dh_auto_configure -- \
          -DJPEGXL_VERSION=$(DEB_VERSION) \
          -DCMAKE_BUILD_TYPE=RelWithDebInfo \
          -DJPEGXL_FORCE_SYSTEM_GTEST=ON \
          -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
          -DJPEGXL_FORCE_SYSTEM_HWY=ON \
-         -DJPEGXL_ENABLE_PLUGINS=ON 
+         -DJPEGXL_ENABLE_JPEGLI=OFF \
+         -DJPEGXL_ENABLE_JPEGLI_LIBJPEG=OFF \
+         -DJPEGXL_ENABLE_PLUGINS=ON
diff --git a/deps.sh b/deps.sh
index 9aaabba..78987c3 100755 (executable)
--- a/deps.sh
+++ b/deps.sh
@@ -5,7 +5,7 @@
 # license that can be found in the LICENSE file.
 
 # This file downloads the dependencies needed to build JPEG XL into third_party.
-# These dependencies are normally pulled by gtest.
+# These dependencies are normally pulled by git.
 
 set -eu
 
@@ -13,12 +13,13 @@ MYDIR=$(dirname $(realpath "$0"))
 
 # Git revisions we use for the given submodules. Update these whenever you
 # update a git submodule.
-THIRD_PARTY_BROTLI="35ef5c554d888bef217d449346067de05e269b30"
-THIRD_PARTY_HIGHWAY="22e3d7276f4157d4a47586ba9fd91dd6303f441a"
-THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da"
-THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840"
+THIRD_PARTY_BROTLI="36533a866ed1ca4b75cf049f4521e4ec5fe24727"
+THIRD_PARTY_HIGHWAY="ba0900a4957b929390ab73827235557959234fea"
+THIRD_PARTY_SKCMS="42030a771244ba67f86b1c1c76a6493f873c5f91"
+THIRD_PARTY_SJPEG="e5ab13008bb214deb66d5f3e17ca2f8dbff150bf"
 THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"
-THIRD_PARTY_LIBPNG="a40189cf881e9f0db80511c382292a5604c3c3d1"
+THIRD_PARTY_LIBPNG="f135775ad4e5d4408d2e12ffcc71bb36e6b48551" # v1.6.40
+THIRD_PARTY_LIBJPEG_TURBO="8ecba3647edb6dd940463fedf38ca33a8e2a73d1" # 2.1.5.1
 
 # Download the target revision from GitHub.
 download_github() {
@@ -26,7 +27,7 @@ download_github() {
   local project="$2"
 
   local varname=`echo "$path" | tr '[:lower:]' '[:upper:]'`
-  varname="${varname/\//_}"
+  varname="${varname/[\/-]/_}"
   local sha
   eval "sha=\${${varname}}"
 
@@ -57,9 +58,16 @@ download_github() {
   mv "${local_fn}.tmp" "${local_fn}"
 }
 
+is_git_repository() {
+    local dir="$1"
+    local toplevel=$(git rev-parse --show-toplevel)
+
+    [[ "${dir}" == "${toplevel}" ]]
+}
+
 
 main() {
-  if git -C "${MYDIR}" rev-parse; then
+  if is_git_repository "${MYDIR}"; then
     cat >&2 <<EOF
 Current directory is a git repository, downloading dependencies via git:
 
@@ -78,6 +86,7 @@ EOF
     "https://skia.googlesource.com/skcms/+archive/"
   download_github third_party/zlib madler/zlib
   download_github third_party/libpng glennrp/libpng
+  download_github third_party/libjpeg-turbo libjpeg-turbo/libjpeg-turbo
   echo "Done."
 }
 
index ed3f939..7d20e32 100644 (file)
 @addtogroup libjxl
 @{
 
-@defgroup libjxl_decoder JPEG XL Decoder
+@defgroup libjxl_decoder JPEG XL Decoder API
 
-@defgroup libjxl_encoder JPEG XL Encoder
+@defgroup libjxl_encoder JPEG XL Encoder API
 
-@defgroup libjxl_common JPEG XL common definitions
+@defgroup libjxl_common Common API concepts
 
-@defgroup libjxl_butteraugli Butteraugli metric
+@defgroup libjxl_metadata Image and frame metadata
+
+@defgroup libjxl_color Color encoding and conversion
+
+@defgroup libjxl_cpp C++ helpers
 
 @}
 
index 0a1a84e..38b6c90 100644 (file)
@@ -1,8 +1,7 @@
 # Building and Testing
 
 This file describes the building and testing facilities provided by the `ci.sh`
-script. It assumes you already have the build environment set up, preferably
-Docker (see [instructions](developing_in_docker.md)).
+script. It assumes you already have the build environment set up.
 
 ## Basic building
 
@@ -62,11 +61,7 @@ environment variable in `./ci.sh`. For some targets such the Windows targets
 `ci.sh` sets up extra environment variables that are needed for testing.
 
 This assumes that you already have a cross-compiling environment set up and the
-library dependencies are already installed for the target architecture as well
-which is tricky to do in some cases. For this reason we provide a [jpegxl docker
-container](developing_in_docker.md) already configured to cross-compile and run
-for other architectures which is also used in our continuous integration
-pipeline.
+library dependencies are already installed for the target architecture as well.
 
 For example, to compile for the `aarch64-linux-gnu` target triplet you can run:
 
@@ -92,9 +87,9 @@ Linter checks will verify that the format of your patch conforms to the project
 style. For this, we run clang-format only on the lines that were changed by
 your commits.
 
-If your local git branch is tracking `origin/master` and you landed a few
+If your local git branch is tracking `origin/main` and you landed a few
 commits in your branch, running this lint command will check all the changes
-made from the common ancestor with `origin/master` to the latest changes,
+made from the common ancestor with `origin/main` to the latest changes,
 including uncommitted changes. The output of the program will show the patch
 that should be applied to fix your commits. You can apply these changes with the
 following command from the base directory of the git checkout:
index a4a52ee..8d15bc4 100644 (file)
@@ -4,15 +4,12 @@ This file describes the building and testing of JPEG XL
 [Web Assembly](https://webassembly.org/) bundles and wrappers.
 
 These instructions assume an up-to-date Debian/Ubuntu system.
-For other platforms, or if you encounter any difficulties,
-please instead use the [Docker container](developing_in_docker.md).
 
 For the sake of simplicity, it is considered, that the following environment
 variables are set:
 
  * `OPT` - path to the directory containing additional software;
-   the `emsdk` directory with the Emscripten SDK should reside there;
-   in the Docker container (mentioned above) this should be `/opt`
+   the `emsdk` directory with the Emscripten SDK should reside there.
 
 ## Requirements
 
@@ -40,28 +37,6 @@ cd emsdk
 ./emsdk activate latest
 ```
 
-[v8](https://v8.dev/) is a JavaScript engine used for running tests.
-v8 has better WASM SIMD support than NodeJS 14.
-To install it use [JSVU](https://github.com/GoogleChromeLabs/jsvu):
-
-```bash
-# Fix some v8 version know to work well.
-export v8_version="8.5.133"
-
-# Install JSVU
-npm install jsvu -g
-
-# Trick JSVU to install to specific location instead of user "home".
-# Note: "os" flag should match the host OS.
-HOME=$OPT jsvu --os=linux64 "v8@${v8_version}"
-
-# Link v8 binary to version-indepentent path.
-ln -s "$OPT/.jsvu/v8-${v8_version}" "$OPT/.jsvu/v8"
-```
-
-In [Docker container](developing_in_docker.md)
-CMake, Emscripten SDK and V8 are pre-installed.
-
 ## Building and testing the project
 
 ```bash
@@ -69,15 +44,19 @@ CMake, Emscripten SDK and V8 are pre-installed.
 # $OPT/emsdk.
 source $OPT/emsdk/emsdk_env.sh
 
-# Specify JS engine binary
-export V8=$OPT/.jsvu/v8
-
-# If building using the jpegxl-builder docker container prefix the following commands with:
-# CMAKE_FLAGS=-I/usr/wasm32/include
-# ex. CMAKE_FLAGS=-I/usr/wasm32/include BUILD_TARGET=wasm32 emconfigure ./ci.sh release
+# This should set the $EMSDK variable.
+# If your node version is <16.4.0, you might need to update to a newer version or override
+# the node binary with a version which supports SIMD:
+echo "NODE_JS='/path/to/node_binary'" >> $EMSDK/.emscripten
 
-# Either build with regular WASM:
+# Assuming you are in the root level of the cloned libjxl repo,
+# either build with regular WASM:
 BUILD_TARGET=wasm32 emconfigure ./ci.sh release
 # or with SIMD WASM:
 BUILD_TARGET=wasm32 ENABLE_WASM_SIMD=1 emconfigure ./ci.sh release
 ```
+
+## Example site
+
+Once you have build the wasm binary, you can give it a try by building a site
+that decodes jxl images, see [wasm_demo](../tools/wasm_demo/README.md).
index 56f4a28..88a7b60 100644 (file)
@@ -4,7 +4,7 @@
 
 <!--*
 # Document freshness: For more information, see go/fresh-source.
-freshness: { owner: 'janwas' reviewed: '2019-02-01' }
+freshness: { owner: 'sboukortt' reviewed: '2022-09-27' }
 *-->
 
 ## Why
diff --git a/doc/debugging_workflows.md b/doc/debugging_workflows.md
new file mode 100644 (file)
index 0000000..e66c5ca
--- /dev/null
@@ -0,0 +1,70 @@
+### Reasoning 
+
+Given the differences in compilers / environment it is not always clear why some
+build / test fails in workflows. In this document we gather practices that
+would help debugging workflows.
+
+### Debugging workflows on GitHub
+
+To connect to real workflow on GitHub one can use "tmate" plugin. To do that,
+add the following snippet in workflow .yml:
+
+```
+ - name: Setup tmate session
+   # Or other condition that pin-points a single strategy matrix item
+   if: failure()
+   uses: mxschmitt/action-tmate@a283f9441d2d96eb62436dc46d7014f5d357ac22 # v3.17
+```
+
+When the plugin is executed it dumps to log a command to "ssh" to that instance.
+
+NB: since session is wrapped in tmux, scrolling might be very inconvenient.
+
+### Debugging build_test_cross.yml locally
+
+"cross" workflows are executed in container, so those are easy to reproduce
+locally. Here is a snippet that reflects how setup / compilation are (currently)
+done in the workflow:
+
+```
+docker run -it -v`pwd`:/libjxl debian:bookworm bash
+
+cd /libjxl
+
+export ARCH=i386 # arm64 armhf
+export MAIN_LIST="amd64,${ARCH}"
+export BUILD_DIR=build
+export CC=clang-14
+export CXX=clang++-14
+export BUILD_TARGET=i686-linux-gnu # aarch64-linux-gnu arm-linux-gnueabihf
+
+rm -f /var/lib/man-db/auto-update
+apt-get update -y
+apt-get install -y ca-certificates debian-ports-archive-keyring python3
+
+dpkg --add-architecture ${ARCH}
+python3 ./tools/scripts/transform_sources_list.py "${MAIN_LIST}"
+apt update
+
+apt-get install -y \
+  clang-14 cmake doxygen g++-aarch64-linux-gnu graphviz libbrotli-dev:${ARCH} \
+  libc6-dev-${ARCH}-cross libgdk-pixbuf2.0-dev:${ARCH} libgif-dev:${ARCH} \
+  libgtk2.0-dev:${ARCH} libilmbase-dev:${ARCH} libjpeg-dev:${ARCH} \
+  libopenexr-dev:${ARCH} libpng-dev:${ARCH} libstdc++-12-dev-${ARCH}-cross \
+  libstdc++-12-dev:${ARCH} libwebp-dev:${ARCH} ninja-build pkg-config \
+  qemu-user-static unzip xdg-utils xvfb
+
+#apt-get install -y binutils-${BUILD_TARGET} gcc-${BUILD_TARGET}
+#apt-get install -y \
+#  libgoogle-perftools-dev:${ARCH} libgoogle-perftools4:${ARCH} \
+#  libtcmalloc-minimal4:${ARCH} libunwind-dev:${ARCH}
+#export CMAKE_FLAGS="-march=armv8-a+sve"
+
+SKIP_TEST=1 ./ci.sh release \
+  -DJPEGXL_FORCE_SYSTEM_BROTLI=ON \
+  -DJPEGXL_ENABLE_JNI=OFF
+#  -DCMAKE_CROSSCOMPILING_EMULATOR=/usr/bin/qemu-aarch64-static
+#  -DJPEGXL_ENABLE_OPENEXR=off
+#  -DJPEGXL_ENABLE_SIZELESS_VECTORS=on
+#  -DCMAKE_CXX_FLAGS=-DJXL_HIGH_PRECISION=0
+```
index a88b682..5b2bbd3 100644 (file)
@@ -3,7 +3,6 @@
 These instructions assume an up-to-date Debian/Ubuntu system.
 For other platforms, please instead use the following:
 
-* [Developing in Docker](developing_in_docker.md).
 * [Cross Compiling for Windows with Crossroad](developing_with_crossroad.md).
 
 ## Minimum build dependencies
@@ -34,8 +33,8 @@ Optionally, to compile some of the extra tool support and tests you can install
 the following packages:
 
 ```bash
-sudo apt install qtbase5-dev libqt5x11extras5-dev libwebp-dev libgimp2.0-dev \
-  libopenexr-dev libgtest-dev libgmock-dev libbenchmark-dev libbenchmark-tools
+sudo apt install qt6-base-dev libwebp-dev libgimp2.0-dev libopenexr-dev \
+  libgtest-dev libgmock-dev libbenchmark-dev libbenchmark-tools
 ```
 
 For the lint/coverage commands, you will also need additional packages:
diff --git a/doc/developing_in_docker.md b/doc/developing_in_docker.md
deleted file mode 100644 (file)
index f104a44..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-# Developing in Docker
-
-Docker allows software to be run in a packaged container, isolated from the
-host system. This allows code to be run in a standard environment instead
-of dealing with different build environments during development.  It also
-simplifies resolving external dependencies by including them in the automated
-setup of the container environment.
-
-## Set up the container
-
-You can read installation instructions and download Docker for your
-operating system at [Get Docker](https://docs.docker.com/get-docker/).
-
-The image used by our builders is an Ubuntu Bionic image with all the
-required dependencies and build tools installed. You can pull this image
-from `gcr.io/jpegxl/jpegxl-builder` using the following command:
-
-```bash
-sudo docker pull gcr.io/jpegxl/jpegxl-builder
-```
-
-To use the Docker image you can run the following command:
-
-```bash
-sudo docker run -it --rm \
-  --user $(id -u):$(id -g) \
-  -v $HOME/jpeg-xl:/jpeg-xl -w /jpeg-xl \
-  gcr.io/jpegxl/jpegxl-builder bash
-```
-
-This creates and runs a container that will be deleted after you exit the
-terminal (`--rm` flag).
-
-The `-v` flag is to map the directory containing your jpeg-xl checkout in your
-host (assumed to be at `$HOME/jpeg-xl`) to a directory inside the container at
-/jpeg-xl. Since the container is accessing the host folder directly,
-changes made on the host will will be seen immediately in the container,
-and vice versa.
-
-On OSX, the path must be one of those shared and whitelisted with Docker. $HOME
-(which is a subdirectory of /Users/) is known to work with the factory-default
-settings of Docker.
-
-On OSX, you may ignore the warning that Docker "cannot find name for group ID".
-This warning may also appear on some Linux computers.
-
-On Windows, you can run the following from the jpeg-xl directory obtained from
-Gitlab:
-
-```bash
-docker run -u root:root -it --rm -v %cd%:/jpeg-xl -w /jpeg-xl \
-  gcr.io/jpegxl/jpegxl-builder
-```
-
-## Basic building
-
-Inside the Docker container, you can compile everything and run unit tests.
-We need to specify `clang-7` because the default `clang` compiler is
-not installed on the image.
-
-```bash
-CC=clang-7 CXX=clang++-7 ./ci.sh opt
-```
-
-This writes binaries to `/jpeg-xl/build/tools` and runs unit tests.
-More information on [build modes and testing](building_and_testing.md) is
-available.
-
-If a `build` directory already exists and was configured for a different
-compiler, `cmake` will complain. This can be avoided by renaming or removing
-the existing `build` directory or setting the `BUILD_DIR` environment variable.
-
-## Cross-compiling environments (optional)
-
-We have installed the required cross-compiling tools in the main Docker image
-`jpegxl-builder`. This allows compiling for other architectures, such as arm.
-Tests will be emulated under `qemu`.
-
-The Docker container has several `qemu-*-static` binaries (such as
-`qemu-aarch64-static`) that emulate other architectures on x86_64. These
-binaries are automatically used when running foreign architecture programs
-in the container only if `binfmt` is installed and configured on the *host*
-to use binaries from `/usr/bin` . This is the default location on Ubuntu/Debian.
-
-You need to install both `binfmt-support` and `qemu-user-static` on the host,
-since `binfmt-support` configures only `binfmt` signatures of architectures
-that are installed.  If these are configured elsewhere on other distributions,
-you can symlink them to `/usr/bin/qemu-*-static` inside the Docker container.
-
-To install binfmt support in your Ubuntu host run *outside* the container:
-
-```bash
-sudo apt install binfmt-support qemu-user-static
-```
-
-Then to cross-compile and run unit tests execute the following commands:
-
-```bash
-export BUILD_TARGET=aarch64-linux-gnu CC=clang-7 CXX=clang++-7
-./ci.sh release
-```
-
-The `BUILD_TARGET=aarch64-linux-gnu` environment variable tells the `ci.sh`
-script to cross-compile for that target. This also changes the default
-`BUILD_DIR` to `build-aarch64` since you never want to mix them with the `build`
-of your host. You can also explicitly set a `BUILD_DIR` environment variable
-that will be used instead. The list of supported `BUILD_TARGET` values for this
-container is:
-
-*    *the empty string* (for native x86_64 support)
-*    aarch64-linux-gnu
-*    arm-linux-gnueabihf
-*    i686-linux-gnu
-*    x86_64-w64-mingw32 (for Windows builds)
index 332e451..a897be2 100644 (file)
@@ -4,7 +4,6 @@ These instructions assume an up-to-date Windows 10 (e.g. build 19041.928) with
 **Microsoft Visual Studio 2019** (e.g. Version 16.9.0 Preview 4.0) installed. If
 unavailable, please use another build environment:
 
-* [Docker container](developing_in_docker.md)
 * [MSYS2 on Windows](developing_in_windows_msys.md)
 * [Crossroad on Linux](developing_with_crossroad.md) (cross compilation for Windows)
 
diff --git a/doc/encode_effort.md b/doc/encode_effort.md
new file mode 100644 (file)
index 0000000..221b2bf
--- /dev/null
@@ -0,0 +1,32 @@
+# Encode effort settings
+
+Various trade-offs between encode speed and compression performance can be selected in libjxl. In `cjxl`, this is done via the `--effort` (`-e`) option.
+Higher effort means slower encoding; generally the higher the effort, the more coding tools are used, computationally more expensive heuristics are used,
+and more exhaustive search is performed. 
+Generally efforts range between `1` and `9`, but there is also `e10` you pass the flag `--allow_expert_options` (in combination with "lossless", i.e. `-d 0`). It is considered an expert option because it can be extremely slow.
+
+
+For lossy compression, higher effort results in better visual quality at a given filesize, and also better
+encoder consistency, i.e. less image-dependent variation in the actual visual quality that is achieved. This means that for lossy compression,
+higher effort does not necessarily mean smaller filesizes for every image — some images may be somewhat lower quality than desired when using
+lower effort heuristics, and to improve consistency, higher effort heuristics may decide to use more bytes for them.
+
+For lossless compression, higher effort should result in smaller filesizes, although this is not guaranteed;
+in particular, e2 can be better than e3 for non-photographic images, and e3 can be better than e4 for photographic images.
+
+The following table describes what the various effort settings do:
+
+|Effort | Modular (lossless) | VarDCT (lossy) |
+|-------|--------------------|----------------|
+| e1 | fast-lossless, fixed YCoCg RCT, fixed ClampedGradient predictor, simple palette detection, no MA tree (one context for everything), Huffman, simple rle-only lz77 | |
+| e2 | global channel palette, fixed MA tree (context based on Gradient-error), ANS, otherwise same as e1 | |
+| e3 | same as e2 but fixed Weighted predictor and fixed MA tree with context based on WP-error | only 8x8, basically XYB jpeg with ANS |
+| e4 | try both ClampedGradient and Weighted predictor, learned MA tree, global palette | simple variable blocks heuristics, adaptive quantization, coefficient reordering |
+| e5 | e4 + patches, local palette / local channel palette, different local RCTs | e4 + gabor-like transform, chroma from luma |
+| e6 | e5 + more RCTs and MA tree properties | e5 + error diffusion, full variable blocks heuristics |
+| e7 | e6 + more RCTs and MA tree properties | e6 + patches (including dots) |
+| e8 | e7 + more RCTs, MA tree properties and Weighted predictor parameters | e7 + Butteraugli iterations for adaptive quantization |
+| e9 | e8 + more RCTs, MA tree properties and Weighted predictor parameters, try all predictors | e8 + more Butteraugli iterations |
+| e10 | e9 + previous-channel MA tree properties, different group dimensions, exhaustively try various e9 options | |
+
+For the entropy coding (context clustering, lz77 search, hybriduint configuration): slower/more exhaustive search as effort goes up.
index 70f1278..eadd828 100644 (file)
@@ -29,7 +29,7 @@ number. Released tags don't each one have their own release branch, all releases
 from the same MAJOR.MINOR version will share the same branch. The first commit
 after the branch-off points between the main branch and the release branch
 should be tagged with the suffix `-snapshot` and the name of the next
-MAJOR.MINOR version, in order to get meaningful ouput for `git --describe`.
+MAJOR.MINOR version, in order to get meaningful output for `git describe`.
 
 The main purpose of the release branch is to stabilize the code before a
 release. This involves including fixes to existing bugs but **not** including
@@ -128,13 +128,15 @@ To help update it, run this helper command (in a Debian-based system):
 This will update the version in the following files:
 
  * `lib/CMakeLists.txt`
- * `lib/lib.gni`, automatically updated with `tools/build_cleaner.py --update`.
+ * `lib/lib.gni`, automatically updated with
+   `tools/scripts/build_cleaner.py --update`.
  * `debian/changelog` to create the Debian package release with the new version.
    Debian changelog shouldn't repeat the library changelog, instead it should
    include changes to the packaging scripts.
-If there were incompatible API/ABI changes, make sure to also adapt the 
-corresponding section in 
+ * `.github/workflows/conformance.yml`
+
+If there were incompatible API/ABI changes, make sure to also adapt the
+corresponding section in
 [CMakeLists.txt](https://github.com/libjxl/libjxl/blob/main/lib/CMakeLists.txt#L12).
 
 ## Cherry-pick fixes to a release
@@ -180,6 +182,15 @@ commits you need to cherry-pick, ideally in the same order they were merged on
 the `main` branch. At the end you will have a local branch with multiple commits
 on top of the release branch.
 
+To update the version number, for example from v0.8.0 to v0.8.1 run this helper
+command (in a Debian-based system):
+
+```bash
+./ci.sh bump_version 0.8.1
+```
+
+as described above and commit the changes.
+
 Finally, upload your changes to *your fork* like normal, except that when
 creating a pull request select the desired release branch as a target:
 
@@ -265,3 +276,39 @@ instructions:
    to see the results.
 
  * Finally click "Publish release" and go celebrate with the team. 🎉
+
+ * Make sure to manually push the commit of the release also to https://gitlab.com/wg1/jpeg-xl.
+
+### How to build downstream projects
+
+```bash
+docker run -it debian:bookworm /bin/bash
+
+apt update
+apt install -y clang cmake git libbrotli-dev nasm pkg-config ninja-build
+export CC=clang
+export CXX=clang++
+
+git clone --recurse-submodules --depth 1 -b v0.7.x \
+  https://github.com/libjxl/libjxl.git
+git clone --recurse-submodules --depth 1 \
+  https://github.com/ImageMagick/ImageMagick.git
+git clone --recurse-submodules --depth 1 \
+  https://github.com/FFmpeg/FFmpeg.git
+
+cd ~/libjxl
+git checkout v0.7.x
+cmake -B build -G Ninja .
+cmake --build build
+cmake --install build
+
+cd ~/ImageMagick
+./configure --with-jxl=yes
+# check for "JPEG XL --with-jxl=yes yes"
+make -j 80
+
+cd ~/FFmpeg
+./configure --enable-libjxl
+# check for libjxl decoder/encoder support
+make -j 80
+```
index 9dddad7..7605a33 100644 (file)
@@ -12,12 +12,16 @@ Please add missing software to this list.
 
 ## Browsers
 
-- Chromium: behind a flag since version 91, [tracking bug](https://bugs.chromium.org/p/chromium/issues/detail?id=1178058)
+- Chromium: behind a flag from version 91 to 109, [tracking bug](https://bugs.chromium.org/p/chromium/issues/detail?id=1178058)
 - Firefox: behind a flag since version 90, [tracking bug](https://bugzilla.mozilla.org/show_bug.cgi?id=1539075)
-- Safari: not supported, [tracking bug](https://bugs.webkit.org/show_bug.cgi?id=208235)
+- Safari: supported since version 17 beta [release notes](https://developer.apple.com/documentation/safari-release-notes/safari-17-release-notes), [tracking bug](https://bugs.webkit.org/show_bug.cgi?id=208235)
 - Edge: behind a flag since version 91, start with `.\msedge.exe --enable-features=JXL`
 - Opera: behind a flag since version 77.
-- For all browsers and to track browsers progress see [Can I Use](https://caniuse.com/jpegxl).
+- Basilisk: supported since version v2023.01.07, [release notes](https://www.basilisk-browser.org/releasenotes.shtml)
+- Pale Moon: supported since version 31.4.0, [release notes](https://www.palemoon.org/releasenotes-archived.shtml#v31.4.0)
+- Waterfox: [enabled by default](https://github.com/WaterfoxCo/Waterfox/pull/2936)
+
+For all browsers and to track browsers progress see [Can I Use](https://caniuse.com/jpegxl).
 
 ## Image libraries
 
@@ -26,6 +30,8 @@ Please add missing software to this list.
 - [Imlib2](https://github.com/alistair7/imlib2-jxl)
 - [FFmpeg](https://github.com/FFmpeg/FFmpeg/search?q=jpeg-xl&type=commits)
 - [GDAL](https://gdal.org/drivers/raster/jpegxl.html): supported since 3.4.0 as a TIFF codec, and 3.6.0 as standalone format
+- [GraphicsMagick](http://www.graphicsmagick.org/NEWS.html#march-26-2022): supported since 1.3.38
+- [SAIL](https://sail.software): supported since 0.9.0
 
 ## OS-level support / UI frameworks / file browser plugins
 
@@ -41,24 +47,35 @@ Please add missing software to this list.
 
 ## Image editors
 
+- [Adobe Camera Raw (since version 15)](https://helpx.adobe.com/camera-raw/using/hdr-output.html)
+- [Affinity (since V2)](https://affinity.serif.com/en-gb/whats-new/)
+- [darktable (since 4.2)](https://github.com/darktable-org/darktable/releases/tag/release-4.2.0)
 - [GIMP (since 2.99.8)](https://www.gimp.org/news/2021/10/20/gimp-2-99-8-released/); plugin for older versions available in libjxl repo
+- [Graphic Converter (since 11.5)](https://www.lemkesoft.de/en/products/graphicconverter/)
 - [Krita](https://invent.kde.org/graphics/krita/-/commit/13e5d2e5b9f0eac5c8064b7767f0b62264a0797b)
+- [Paint.NET](https://www.getpaint.net/index.html); supported since 4.3.12 - requires a [plugin](https://github.com/0xC0000054/pdn-jpegxl) to be downloaded and installed.
 - Photoshop: no plugin available yet, no official support yet
 
 ## Image viewers
 
-- [XnView](https://www.xnview.com/en/)
 - [ImageGlass](https://imageglass.org/)
 - [IrfanView](https://www.irfanview.com/); supported since 4.59 - requires a [plugin](https://www.irfanview.com/plugins.htm) to be downloaded and enabled.
+- [jpegview](https://github.com/sylikc/jpegview/releases)
+- [Swayimg](https://github.com/artemsen/swayimg)
 - [Tachiyomi](https://github.com/tachiyomiorg/tachiyomi/releases/tag/v0.12.1)
+- [XnView](https://www.xnview.com/en/)
 - Any viewer based on Qt, KDE, GDK-pixbuf, EFL, ImageMagick, libvips or imlib2 (see above)
   - Qt viewers: gwenview, digiKam, KolourPaint, KPhotoAlbum, LXImage-Qt, qimgv, qView, nomacs, VookiImageViewer, PhotoQt
   - GTK viewers: Eye of Gnome (eog), gThumb, Geeqie
   - EFL viewers: entice, ephoto
-- [Swayimg](https://github.com/artemsen/swayimg)
+## Duplicate image finders
+
+- [AntiDupl.NET](https://github.com/ermig1979/AntiDupl/releases)
 
 ## Online tools
 
+- [Gumlet](https://www.gumlet.com/blog/jpeg-xl/)
 - [Squoosh](https://squoosh.app/)
 - [Cloudinary](https://cloudinary.com/blog/cloudinary_supports_jpeg_xl)
 - [MConverter](https://mconverter.eu/)
index 56fca09..1d79041 100644 (file)
@@ -1,5 +1,5 @@
-API reference
-=============
+libjxl API reference
+====================
 
 ``libjxl`` exposes a C API for encoding and decoding JPEG XL files with some
 C++ header-only helpers for C++ users.
@@ -11,5 +11,7 @@ C++ header-only helpers for C++ users.
    api_decoder
    api_encoder
    api_common
-   api_butteraugli
+   api_metadata
+   api_color
    api_threads
+   api_cpp
diff --git a/doc/sphinx/api_butteraugli.rst b/doc/sphinx/api_butteraugli.rst
deleted file mode 100644 (file)
index 4aae44a..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-Butteraugli API - ``jxl/butteraugli.h``
-=======================================
-
-.. doxygengroup:: libjxl_butteraugli
-   :members:
-   :private-members:
diff --git a/doc/sphinx/api_color.rst b/doc/sphinx/api_color.rst
new file mode 100644 (file)
index 0000000..6e40d48
--- /dev/null
@@ -0,0 +1,6 @@
+Color encoding and conversion
+=============================
+
+.. doxygengroup:: libjxl_color
+   :members:
+   :private-members:
diff --git a/doc/sphinx/api_cpp.rst b/doc/sphinx/api_cpp.rst
new file mode 100644 (file)
index 0000000..cc5e985
--- /dev/null
@@ -0,0 +1,6 @@
+C++ helpers
+===========
+
+.. doxygengroup:: libjxl_cpp
+   :members:
+   :private-members:
diff --git a/doc/sphinx/api_metadata.rst b/doc/sphinx/api_metadata.rst
new file mode 100644 (file)
index 0000000..ddc5b03
--- /dev/null
@@ -0,0 +1,6 @@
+Image and frame metadata
+========================
+
+.. doxygengroup:: libjxl_metadata
+   :members:
+   :private-members:
diff --git a/doc/tables/adobe.md b/doc/tables/adobe.md
deleted file mode 100644 (file)
index f3beef7..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#### Table M.8 – "Adobe" marker template
-
-```
-0xEE, 0x00, 0x0E, 0x41, 0x64, 0x6F, 0x62, 0x65, 0x00, 0x64, 0x00, 0x00, 0x00, 0x00, 0x01
-```
-
diff --git a/doc/tables/all_tables.pdf b/doc/tables/all_tables.pdf
deleted file mode 100644 (file)
index b02c5b2..0000000
Binary files a/doc/tables/all_tables.pdf and /dev/null differ
diff --git a/doc/tables/all_tables.sh b/doc/tables/all_tables.sh
deleted file mode 100755 (executable)
index 6fc98eb..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-cat dct_gen.md \
-    is_zero_base.md num_nonzeros_base.md brn_proto.md app0.md icc.md ducky.md \
-    adobe.md stock_counts.md stock_values.md symbol_order.md stock_quant.md \
-    quant.md freq_context.md num_nonzero_context.md nonzero_buckets.md \
-    context_modes.md > all_tables.md
diff --git a/doc/tables/app0.md b/doc/tables/app0.md
deleted file mode 100644 (file)
index 266f210..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#### Table M.4 – APP0 template
-
-```
-0xE0, 0x00, 0x10, 0x4A, 0x46, 0x49, 0x46, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x00, 0x01, 0x00, 0x00
-```
-
diff --git a/doc/tables/brn_proto.md b/doc/tables/brn_proto.md
deleted file mode 100644 (file)
index b5f80b6..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#### Table M.3 – Protocol Buffer descriptor of top-level structure of losslessly compressed JPEG stream
-
-```protobuf
-message Header {
-  optional uint64 width = 1;
-  optional uint64 height = 2;
-  required uint64 version_and_component_count_code = 3;
-  optional uint64 subsampling_code = 4;
-}
-
-message Jpeg {
-  required bytes signature = 1;
-  required Header header = 2;
-  optional bytes meta_data = 3;
-  optional bytes jpeg1_internals = 4;
-  optional bytes quant_data = 5;
-  optional bytes histogram_data = 6;
-  optional bytes dc_data = 7;
-  optional bytes ac_data = 8;
-  optional bytes original_jpg = 9;
-}
-```
-
diff --git a/doc/tables/context_modes.md b/doc/tables/context_modes.md
deleted file mode 100644 (file)
index 59bff36..0000000
+++ /dev/null
@@ -1,13 +0,0 @@
-#### Table M.29 – context_modes table
-
-```
-0, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
-0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
-0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
-```
-
-```
-0, 1, 1, 1, 1, 0, 0, 0, 2, 3, 1, 1, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
-0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-```
diff --git a/doc/tables/dct_gen.md b/doc/tables/dct_gen.md
deleted file mode 100644 (file)
index f3b59e2..0000000
+++ /dev/null
@@ -1,241 +0,0 @@
-#### Electronic Insert I.1 – DCT-II / DCT-III code generator
-
-```python
-#######################################################################
-# DCT-II / DCT-III generator
-#
-# Based on:
-#  "A low multiplicative complexity fast recursive DCT-2 algorithm"
-#  by Maxim Vashkevich and Alexander Petrovsky / arXiv / 20 Jul 2012
-#######################################################################
-
-import math
-import sys
-N = 8
-
-#######################################################################
-# Base transforms / generators
-#######################################################################
-
-CNTR = 0
-def makeTmp():
-  global CNTR
-  result = "t{:02d}".format(CNTR)
-  CNTR = CNTR + 1
-  return result
-
-def makeVar(i):
-  return "i{:02d}".format(i)
-
-def add(x, y):
-  tmp = makeTmp()
-  print(tmp + " = " + x + " + " + y + ";")
-  return tmp
-
-def sub(x, y):
-  tmp = makeTmp()
-  print(tmp + " = " + x + " - " + y + ";")
-  return tmp
-
-def mul(x, c):
-  tmp = makeTmp()
-  print(tmp + " = " + x + " * " + c + ";")
-  return tmp
-
-# 2.0 * math.cos((a + 0.0) / (b + 0.0) * math.pi)
-def C2(a, b):
-  return "c_c2_" + str(a) + "_" + str(b)
-
-# 1.0 / C2(a, b)
-def iC2(a, b):
-  return "c_ic2_" + str(a) + "_" + str(b)
-
-#######################################################################
-# Utilities
-#######################################################################
-
-# Generate identity matrix. Usually this matrix is passed to
-# DCT algorithm to generate "basis" vectors of the transform.
-def makeVars():
-  return [makeVar(i) for i in range(N)]
-
-# Split list of variables info halves.
-def split(x):
-  m = len(x)
-  m2 = m // 2
-  return (x[0 : m2], x[m2 : m])
-
-# Make a list of variables in a reverse order.
-def reverse(varz):
-  m = len(varz)
-  result = [0] * m
-  for i in range(m):
-    result[i] = varz[m - 1 - i]
-  return result
-
-# Apply permutation
-def permute(x, p):
- return [x[p[i]] for i in range(len(p))]
-
-def transposePermutation(p):
-  n = len(p)
-  result = [0] * n
-  for i in range(n):
-    result[p[i]] = i
-  return result
-
-# See paper. Split even-odd elements.
-def P(n):
-  if n == 1:
-    return [0]
-  n2 = n // 2
-  return [2 * i for i in range(n2)] + [2 * i + 1 for i in range(n2)]
-
-# See paper. Interleave first and second half.
-def Pt(n):
-  return transposePermutation(P(n))
-
-#######################################################################
-# Scheme
-#######################################################################
-
-def B2(x):
-  n = len(x)
-  n2 = n // 2
-  if n == 1:
-    raise "ooops"
-  (top, bottom) = split(x)
-  bottom = reverse(bottom)
-  t = [add(top[i], bottom[i]) for i in range(n2)]
-  b = [sub(top[i], bottom[i]) for i in range(n2)]
-  return t + b
-
-def iB2(x):
-  n = len(x)
-  n2 = n // 2
-  if n == 1:
-    raise "ooops"
-  (top, bottom) = split(x)
-  t = [add(top[i], bottom[i]) for i in range(n2)]
-  b = [sub(top[i], bottom[i]) for i in range(n2)]
-  return t + reverse(b)
-
-def B4(x, rn):
-  n = len(x)
-  n2 = n // 2
-  if n == 1:
-    raise "ooops"
-  (top, bottom) = split(x)
-  rbottom = reverse(bottom)
-  t = [sub(top[i], rbottom[i]) for i in range(n2)]
-  b = [mul(bottom[i], C2(rn, 2 * N)) for i in range(n2)]
-  top = [add(t[i], b[i]) for i in range(n2)]
-  bottom = [sub(t[i], b[i]) for i in range(n2)]
-  return top + bottom
-
-def iB4(x, rn):
-  n = len(x)
-  n2 = n // 2
-  if n == 1:
-    raise "ooops"
-  (top, bottom) = split(x)
-  t = [add(top[i], bottom[i]) for i in range(n2)]
-  b = [sub(top[i], bottom[i]) for i in range(n2)]
-  bottom = [mul(b[i], iC2(rn, 2 * N)) for i in range(n2)]
-  rbottom = reverse(bottom)
-  top = [add(t[i], rbottom[i]) for i in range(n2)]
-  return top + bottom
-
-def P4(n):
-  if n == 1:
-    return [0]
-  if n == 2:
-    return [0, 1]
-  n2 = n // 2
-  result = [0] * n
-  tc = 0
-  bc = 0
-  i = 0
-  result[i] = tc; tc = tc + 1; i = i + 1
-  turn = True
-  while i < n - 1:
-    if turn:
-      result[i] = n2 + bc; bc = bc + 1; i = i + 1
-      result[i] = n2 + bc; bc = bc + 1; i = i + 1
-    else:
-      result[i] = tc; tc = tc + 1; i = i + 1
-      result[i] = tc; tc = tc + 1; i = i + 1
-    turn = not turn
-  result[i] = tc; tc = tc + 1; i = i + 1
-  return result
-
-def iP4(n):
-  return transposePermutation(P4(n))
-
-def d2n(x):
-  n = len(x)
-  if n == 1:
-    return x
-  y = B2(x)
-  (top, bottom) = split(y)
-  return permute(d2n(top) + d4n(bottom, N // 2), Pt(n))
-
-def id2n(x):
-  n = len(x)
-  if n == 1:
-    return x
-  (top, bottom) = split(permute(x, P(n)))
-  return iB2(id2n(top) + id4n(bottom, N // 2))
-
-def d4n(x, rn):
-  n = len(x)
-  if n == 1:
-    return x
-  y = B4(x, rn)
-  (top, bottom) = split(y)
-  rn2 = rn // 2
-  return permute(d4n(top, rn2) + d4n(bottom, N - rn2), P4(n))
-
-def id4n(x, rn):
-  n = len(x)
-  if n == 1:
-    return x
-  (top, bottom) = split(permute(x, iP4(n)))
-  rn2 = rn // 2
-  y = id4n(top, rn2) + id4n(bottom, N -rn2)
-  return iB4(y, rn)
-
-#######################################################################
-# Main.
-#######################################################################
-
-def help():
-  print("Usage: %s [N [T]]" % sys.argv[0])
-  print("  N should be the power of 2, default is 8")
-  print("  T is one of {2, 3}, default is 2")
-  sys.exit()
-
-def parseInt(s):
-  try:
-    return int(s)
-  except ValueError:
-    help()
-
-if __name__ == "__main__":
-  if len(sys.argv) < 1 or len(sys.argv) > 3: help()
-  if len(sys.argv) >= 2:
-    N = parseInt(sys.argv[1])
-    if (N & (N - 1)) != 0: help()
-  type = 0
-  if len(sys.argv) >= 3:
-    typeOption = sys.argv[2]
-    if len(typeOption) != 1: help()
-    type = "23".index(typeOption)
-    if type == -1: help()
-  if type == 0:
-    vars = d2n(makeVars())
-  else:  # type == 1
-    vars = id2n(makeVars())
-  print("Output vector: " + str(vars))
-```
-
diff --git a/doc/tables/ducky.md b/doc/tables/ducky.md
deleted file mode 100644 (file)
index 307f688..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#### Table M.7 – "Ducky" marker template
-
-```
-0xEC, 0x00, 0x11, 0x44, 0x75, 0x63, 0x6B, 0x79, 0x00, 0x01, 0x00, 0x04, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00
-```
-
diff --git a/doc/tables/freq_context.md b/doc/tables/freq_context.md
deleted file mode 100644 (file)
index 3e218fb..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-#### Table M.15 – freq_context
-
-`scheme == 0`:
-```
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-```
-
-`scheme == 1`:
-```
-0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
-```
-
-`scheme == 2`:
-```
-0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1
-```
-
-`scheme == 3`:
-```
-0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
-6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 2, 2, 2
-```
-
-`scheme == 4`:
-```
- 0,  1,  2,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  8,  8,  9,  9,
- 9,  9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13,
-13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
-15, 15, 15, 15, 15, 15, 15, 15, 15, 15
-```
-
-`scheme == 5`:
-```
- 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 16,
-17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 24, 24,
-25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 28, 28, 28, 28, 29, 29,
-29, 29, 30, 30, 30, 30, 31, 31, 31, 31
-```
-
-`scheme == 6`:
-```
- 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
-18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
-36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
-54, 55, 56, 57, 58, 59, 60, 61, 62, 63
-```
-
diff --git a/doc/tables/icc.md b/doc/tables/icc.md
deleted file mode 100644 (file)
index 1f3b4cd..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#### Table M.6 – common ICC profile template
-
-```
-0xE2, 0x0C, 0x58, 0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00, 0x01, 0x01, 0x00, 0x00, 0x0C, 0x48, 0x4C, 0x69, 0x6E, 0x6F, 0x02, 0x10, 0x00, 0x00, 0x6D, 0x6E, 0x74, 0x72, 0x52, 0x47, 0x42, 0x20, 0x58, 0x59, 0x5A, 0x20, 0x07, 0xCE, 0x00, 0x02, 0x00, 0x09, 0x00, 0x06, 0x00, 0x31, 0x00, 0x00, 0x61, 0x63, 0x73, 0x70, 0x4D, 0x53, 0x46, 0x54, 0x00, 0x00, 0x00, 0x00, 0x49, 0x45, 0x43, 0x20, 0x73, 0x52, 0x47, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0xF6, 0xD6, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0xD3, 0x2D, 0x48, 0x50, 0x20, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x63, 0x70, 0x72, 0x74, 0x00, 0x00, 0x01, 0x50, 0x00, 0x00, 0x00, 0x33, 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x01, 0x84, 0x00, 0x00, 0x00, 0x6C, 0x77, 0x74, 0x70, 0x74, 0x00, 0x00, 0x01, 0xF0, 0x00, 0x00, 0x00, 0x14, 0x62, 0x6B, 0x70, 0x74, 0x00, 0x00, 0x02, 0x04, 0x00, 0x00, 0x00, 0x14, 0x72, 0x58, 0x59, 0x5A, 0x00, 0x00, 0x02, 0x18, 0x00, 0x00, 0x00, 0x14, 0x67, 0x58, 0x59, 0x5A, 0x00, 0x00, 0x02, 0x2C, 0x00, 0x00, 0x00, 0x14, 0x62, 0x58, 0x59, 0x5A, 0x00, 0x00, 0x02, 0x40, 0x00, 0x00, 0x00, 0x14, 0x64, 0x6D, 0x6E, 0x64, 0x00, 0x00, 0x02, 0x54, 0x00, 0x00, 0x00, 0x70, 0x64, 0x6D, 0x64, 0x64, 0x00, 0x00, 0x02, 0xC4, 0x00, 0x00, 0x00, 0x88, 0x76, 0x75, 0x65, 0x64, 0x00, 0x00, 0x03, 0x4C, 0x00, 0x00, 0x00, 0x86, 0x76, 0x69, 0x65, 0x77, 0x00, 0x00, 0x03, 0xD4, 0x00, 0x00, 0x00, 0x24, 0x6C, 0x75, 0x6D, 0x69, 0x00, 0x00, 0x03, 0xF8, 0x00, 0x00, 0x00, 0x14, 0x6D, 0x65, 0x61, 0x73, 0x00, 0x00, 0x04, 0x0C, 0x00, 0x00, 0x00, 0x24, 0x74, 0x65, 0x63, 0x68, 0x00, 0x00, 0x04, 0x30, 0x00, 0x00, 0x00, 0x0C, 0x72, 0x54, 0x52, 0x43, 0x00, 0x00, 0x04, 0x3C, 0x00, 0x00, 0x08, 0x0C, 0x67, 0x54, 0x52, 0x43, 0x00, 0x00, 0x04, 0x3C, 0x00, 0x00, 0x08, 0x0C, 0x62, 0x54, 0x52, 0x43, 0x00, 0x00, 0x04, 0x3C, 0x00, 0x00, 0x08, 0x0C, 0x74, 0x65, 0x78, 0x74, 0x00, 0x00, 0x00, 0x00, 0x43, 0x6F, 0x70, 0x79, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, 0x28, 0x63, 0x29, 0x20, 0x31, 0x39, 0x39, 0x38, 0x20, 0x48, 0x65, 0x77, 0x6C, 0x65, 0x74, 0x74, 0x2D, 0x50, 0x61, 0x63, 0x6B, 0x61, 0x72, 0x64, 0x20, 0x43, 0x6F, 0x6D, 0x70, 0x61, 0x6E, 0x79, 0x00, 0x00, 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x73, 0x52, 0x47, 0x42, 0x20, 0x49, 0x45, 0x43, 0x36, 0x31, 0x39, 0x36, 0x36, 0x2D, 0x32, 0x2E, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x73, 0x52, 0x47, 0x42, 0x20, 0x49, 0x45, 0x43, 0x36, 0x31, 0x39, 0x36, 0x36, 0x2D, 0x32, 0x2E, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x58, 0x59, 0x5A, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF3, 0x51, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x16, 0xCC, 0x58, 0x59, 0x5A, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x58, 0x59, 0x5A, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x6F, 0xA2, 0x00, 0x00, 0x38, 0xF5, 0x00, 0x00, 0x03, 0x90, 0x58, 0x59, 0x5A, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x62, 0x99, 0x00, 0x00, 0xB7, 0x85, 0x00, 0x00, 0x18, 0xDA, 0x58, 0x59, 0x5A, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x24, 0xA0, 0x00, 0x00, 0x0F, 0x84, 0x00, 0x00, 0xB6, 0xCF, 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0x49, 0x45, 0x43, 0x20, 0x68, 0x74, 0x74, 0x70, 0x3A, 0x2F, 0x2F, 0x77, 0x77, 0x77, 0x2E, 0x69, 0x65, 0x63, 0x2E, 0x63, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x16, 0x49, 0x45, 0x43, 0x20, 0x68, 0x74, 0x74, 0x70, 0x3A, 0x2F, 0x2F, 0x77, 0x77, 0x77, 0x2E, 0x69, 0x65, 0x63, 0x2E, 0x63, 0x68, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2E, 0x49, 0x45, 0x43, 0x20, 0x36, 0x31, 0x39, 0x36, 0x36, 0x2D, 0x32, 0x2E, 0x31, 0x20, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6C, 0x74, 0x20, 0x52, 0x47, 0x42, 0x20, 0x63, 0x6F, 0x6C, 0x6F, 0x75, 0x72, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x2D, 0x20, 0x73, 0x52, 0x47, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2E, 0x49, 0x45, 0x43, 0x20, 0x36, 0x31, 0x39, 0x36, 0x36, 0x2D, 0x32, 0x2E, 0x31, 0x20, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6C, 0x74, 0x20, 0x52, 0x47, 0x42, 0x20, 0x63, 0x6F, 0x6C, 0x6F, 0x75, 0x72, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x2D, 0x20, 0x73, 0x52, 0x47, 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x63, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2C, 0x52, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6E, 0x63, 0x65, 0x20, 0x56, 0x69, 0x65, 0x77, 0x69, 0x6E, 0x67, 0x20, 0x43, 0x6F, 0x6E, 0x64, 0x69, 0x74, 0x69, 0x6F, 0x6E, 0x20, 0x69, 0x6E, 0x20, 0x49, 0x45, 0x43, 0x36, 0x31, 0x39, 0x36, 0x36, 0x2D, 0x32, 0x2E, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2C, 0x52, 0x65, 0x66, 0x65, 0x72, 0x65, 0x6E, 0x63, 0x65, 0x20, 0x56, 0x69, 0x65, 0x77, 0x69, 0x6E, 0x67, 0x20, 0x43, 0x6F, 0x6E, 0x64, 0x69, 0x74, 0x69, 0x6F, 0x6E, 0x20, 0x69, 0x6E, 0x20, 0x49, 0x45, 0x43, 0x36, 0x31, 0x39, 0x36, 0x36, 0x2D, 0x32, 0x2E, 0x31, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x76, 0x69, 0x65, 0x77, 0x00, 0x00, 0x00, 0x00, 0x00, 0x13, 0xA4, 0xFE, 0x00, 0x14, 0x5F, 0x2E, 0x00, 0x10, 0xCF, 0x14, 0x00, 0x03, 0xED, 0xCC, 0x00, 0x04, 0x13, 0x0B, 0x00, 0x03, 0x5C, 0x9E, 0x00, 0x00, 0x00, 0x01, 0x58, 0x59, 0x5A, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x09, 0x56, 0x00, 0x50, 0x00, 0x00, 0x00, 0x57, 0x1F, 0xE7, 0x6D, 0x65, 0x61, 0x73, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x8F, 0x00, 0x00, 0x00, 0x02, 0x73, 0x69, 0x67, 0x20, 0x00, 0x00, 0x00, 0x00, 0x43, 0x52, 0x54, 0x20, 0x63, 0x75, 0x72, 0x76, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x0A, 0x00, 0x0F, 0x00, 0x14, 0x00, 0x19, 0x00, 0x1E, 0x00, 0x23, 0x00, 0x28, 0x00, 0x2D, 0x00, 0x32, 0x00, 0x37, 0x00, 0x3B, 0x00, 0x40, 0x00, 0x45, 0x00, 0x4A, 0x00, 0x4F, 0x00, 0x54, 0x00, 0x59, 0x00, 0x5E, 0x00, 0x63, 0x00, 0x68, 0x00, 0x6D, 0x00, 0x72, 0x00, 0x77, 0x00, 0x7C, 0x00, 0x81, 0x00, 0x86, 0x00, 0x8B, 0x00, 0x90, 0x00, 0x95, 0x00, 0x9A, 0x00, 0x9F, 0x00, 0xA4, 0x00, 0xA9, 0x00, 0xAE, 0x00, 0xB2, 0x00, 0xB7, 0x00, 0xBC, 0x00, 0xC1, 0x00, 0xC6, 0x00, 0xCB, 0x00, 0xD0, 0x00, 0xD5, 0x00, 0xDB, 0x00, 0xE0, 0x00, 0xE5, 0x00, 0xEB, 0x00, 0xF0, 0x00, 0xF6, 0x00, 0xFB, 0x01, 0x01, 0x01, 0x07, 0x01, 0x0D, 0x01, 0x13, 0x01, 0x19, 0x01, 0x1F, 0x01, 0x25, 0x01, 0x2B, 0x01, 0x32, 0x01, 0x38, 0x01, 0x3E, 0x01, 0x45, 0x01, 0x4C, 0x01, 0x52, 0x01, 0x59, 0x01, 0x60, 0x01, 0x67, 0x01, 0x6E, 0x01, 0x75, 0x01, 0x7C, 0x01, 0x83, 0x01, 0x8B, 0x01, 0x92, 0x01, 0x9A, 0x01, 0xA1, 0x01, 0xA9, 0x01, 0xB1, 0x01, 0xB9, 0x01, 0xC1, 0x01, 0xC9, 0x01, 0xD1, 0x01, 0xD9, 0x01, 0xE1, 0x01, 0xE9, 0x01, 0xF2, 0x01, 0xFA, 0x02, 0x03, 0x02, 0x0C, 0x02, 0x14, 0x02, 0x1D, 0x02, 0x26, 0x02, 0x2F, 0x02, 0x38, 0x02, 0x41, 0x02, 0x4B, 0x02, 0x54, 0x02, 0x5D, 0x02, 0x67, 0x02, 0x71, 0x02, 0x7A, 0x02, 0x84, 0x02, 0x8E, 0x02, 0x98, 0x02, 0xA2, 0x02, 0xAC, 0x02, 0xB6, 0x02, 0xC1, 0x02, 0xCB, 0x02, 0xD5, 0x02, 0xE0, 0x02, 0xEB, 0x02, 0xF5, 0x03, 0x00, 0x03, 0x0B, 0x03, 0x16, 0x03, 0x21, 0x03, 0x2D, 0x03, 0x38, 0x03, 0x43, 0x03, 0x4F, 0x03, 0x5A, 0x03, 0x66, 0x03, 0x72, 0x03, 0x7E, 0x03, 0x8A, 0x03, 0x96, 0x03, 0xA2, 0x03, 0xAE, 0x03, 0xBA, 0x03, 0xC7, 0x03, 0xD3, 0x03, 0xE0, 0x03, 0xEC, 0x03, 0xF9, 0x04, 0x06, 0x04, 0x13, 0x04, 0x20, 0x04, 0x2D, 0x04, 0x3B, 0x04, 0x48, 0x04, 0x55, 0x04, 0x63, 0x04, 0x71, 0x04, 0x7E, 0x04, 0x8C, 0x04, 0x9A, 0x04, 0xA8, 0x04, 0xB6, 0x04, 0xC4, 0x04, 0xD3, 0x04, 0xE1, 0x04, 0xF0, 0x04, 0xFE, 0x05, 0x0D, 0x05, 0x1C, 0x05, 0x2B, 0x05, 0x3A, 0x05, 0x49, 0x05, 0x58, 0x05, 0x67, 0x05, 0x77, 0x05, 0x86, 0x05, 0x96, 0x05, 0xA6, 0x05, 0xB5, 0x05, 0xC5, 0x05, 0xD5, 0x05, 0xE5, 0x05, 0xF6, 0x06, 0x06, 0x06, 0x16, 0x06, 0x27, 0x06, 0x37, 0x06, 0x48, 0x06, 0x59, 0x06, 0x6A, 0x06, 0x7B, 0x06, 0x8C, 0x06, 0x9D, 0x06, 0xAF, 0x06, 0xC0, 0x06, 0xD1, 0x06, 0xE3, 0x06, 0xF5, 0x07, 0x07, 0x07, 0x19, 0x07, 0x2B, 0x07, 0x3D, 0x07, 0x4F, 0x07, 0x61, 0x07, 0x74, 0x07, 0x86, 0x07, 0x99, 0x07, 0xAC, 0x07, 0xBF, 0x07, 0xD2, 0x07, 0xE5, 0x07, 0xF8, 0x08, 0x0B, 0x08, 0x1F, 0x08, 0x32, 0x08, 0x46, 0x08, 0x5A, 0x08, 0x6E, 0x08, 0x82, 0x08, 0x96, 0x08, 0xAA, 0x08, 0xBE, 0x08, 0xD2, 0x08, 0xE7, 0x08, 0xFB, 0x09, 0x10, 0x09, 0x25, 0x09, 0x3A, 0x09, 0x4F, 0x09, 0x64, 0x09, 0x79, 0x09, 0x8F, 0x09, 0xA4, 0x09, 0xBA, 0x09, 0xCF, 0x09, 0xE5, 0x09, 0xFB, 0x0A, 0x11, 0x0A, 0x27, 0x0A, 0x3D, 0x0A, 0x54, 0x0A, 0x6A, 0x0A, 0x81, 0x0A, 0x98, 0x0A, 0xAE, 0x0A, 0xC5, 0x0A, 0xDC, 0x0A, 0xF3, 0x0B, 0x0B, 0x0B, 0x22, 0x0B, 0x39, 0x0B, 0x51, 0x0B, 0x69, 0x0B, 0x80, 0x0B, 0x98, 0x0B, 0xB0, 0x0B, 0xC8, 0x0B, 0xE1, 0x0B, 0xF9, 0x0C, 0x12, 0x0C, 0x2A, 0x0C, 0x43, 0x0C, 0x5C, 0x0C, 0x75, 0x0C, 0x8E, 0x0C, 0xA7, 0x0C, 0xC0, 0x0C, 0xD9, 0x0C, 0xF3, 0x0D, 0x0D, 0x0D, 0x26, 0x0D, 0x40, 0x0D, 0x5A, 0x0D, 0x74, 0x0D, 0x8E, 0x0D, 0xA9, 0x0D, 0xC3, 0x0D, 0xDE, 0x0D, 0xF8, 0x0E, 0x13, 0x0E, 0x2E, 0x0E, 0x49, 0x0E, 0x64, 0x0E, 0x7F, 0x0E, 0x9B, 0x0E, 0xB6, 0x0E, 0xD2, 0x0E, 0xEE, 0x0F, 0x09, 0x0F, 0x25, 0x0F, 0x41, 0x0F, 0x5E, 0x0F, 0x7A, 0x0F, 0x96, 0x0F, 0xB3, 0x0F, 0xCF, 0x0F, 0xEC, 0x10, 0x09, 0x10, 0x26, 0x10, 0x43, 0x10, 0x61, 0x10, 0x7E, 0x10, 0x9B, 0x10, 0xB9, 0x10, 0xD7, 0x10, 0xF5, 0x11, 0x13, 0x11, 0x31, 0x11, 0x4F, 0x11, 0x6D, 0x11, 0x8C, 0x11, 0xAA, 0x11, 0xC9, 0x11, 0xE8, 0x12, 0x07, 0x12, 0x26, 0x12, 0x45, 0x12, 0x64, 0x12, 0x84, 0x12, 0xA3, 0x12, 0xC3, 0x12, 0xE3, 0x13, 0x03, 0x13, 0x23, 0x13, 0x43, 0x13, 0x63, 0x13, 0x83, 0x13, 0xA4, 0x13, 0xC5, 0x13, 0xE5, 0x14, 0x06, 0x14, 0x27, 0x14, 0x49, 0x14, 0x6A, 0x14, 0x8B, 0x14, 0xAD, 0x14, 0xCE, 0x14, 0xF0, 0x15, 0x12, 0x15, 0x34, 0x15, 0x56, 0x15, 0x78, 0x15, 0x9B, 0x15, 0xBD, 0x15, 0xE0, 0x16, 0x03, 0x16, 0x26, 0x16, 0x49, 0x16, 0x6C, 0x16, 0x8F, 0x16, 0xB2, 0x16, 0xD6, 0x16, 0xFA, 0x17, 0x1D, 0x17, 0x41, 0x17, 0x65, 0x17, 0x89, 0x17, 0xAE, 0x17, 0xD2, 0x17, 0xF7, 0x18, 0x1B, 0x18, 0x40, 0x18, 0x65, 0x18, 0x8A, 0x18, 0xAF, 0x18, 0xD5, 0x18, 0xFA, 0x19, 0x20, 0x19, 0x45, 0x19, 0x6B, 0x19, 0x91, 0x19, 0xB7, 0x19, 0xDD, 0x1A, 0x04, 0x1A, 0x2A, 0x1A, 0x51, 0x1A, 0x77, 0x1A, 0x9E, 0x1A, 0xC5, 0x1A, 0xEC, 0x1B, 0x14, 0x1B, 0x3B, 0x1B, 0x63, 0x1B, 0x8A, 0x1B, 0xB2, 0x1B, 0xDA, 0x1C, 0x02, 0x1C, 0x2A, 0x1C, 0x52, 0x1C, 0x7B, 0x1C, 0xA3, 0x1C, 0xCC, 0x1C, 0xF5, 0x1D, 0x1E, 0x1D, 0x47, 0x1D, 0x70, 0x1D, 0x99, 0x1D, 0xC3, 0x1D, 0xEC, 0x1E, 0x16, 0x1E, 0x40, 0x1E, 0x6A, 0x1E, 0x94, 0x1E, 0xBE, 0x1E, 0xE9, 0x1F, 0x13, 0x1F, 0x3E, 0x1F, 0x69, 0x1F, 0x94, 0x1F, 0xBF, 0x1F, 0xEA, 0x20, 0x15, 0x20, 0x41, 0x20, 0x6C, 0x20, 0x98, 0x20, 0xC4, 0x20, 0xF0, 0x21, 0x1C, 0x21, 0x48, 0x21, 0x75, 0x21, 0xA1, 0x21, 0xCE, 0x21, 0xFB, 0x22, 0x27, 0x22, 0x55, 0x22, 0x82, 0x22, 0xAF, 0x22, 0xDD, 0x23, 0x0A, 0x23, 0x38, 0x23, 0x66, 0x23, 0x94, 0x23, 0xC2, 0x23, 0xF0, 0x24, 0x1F, 0x24, 0x4D, 0x24, 0x7C, 0x24, 0xAB, 0x24, 0xDA, 0x25, 0x09, 0x25, 0x38, 0x25, 0x68, 0x25, 0x97, 0x25, 0xC7, 0x25, 0xF7, 0x26, 0x27, 0x26, 0x57, 0x26, 0x87, 0x26, 0xB7, 0x26, 0xE8, 0x27, 0x18, 0x27, 0x49, 0x27, 0x7A, 0x27, 0xAB, 0x27, 0xDC, 0x28, 0x0D, 0x28, 0x3F, 0x28, 0x71, 0x28, 0xA2, 0x28, 0xD4, 0x29, 0x06, 0x29, 0x38, 0x29, 0x6B, 0x29, 0x9D, 0x29, 0xD0, 0x2A, 0x02, 0x2A, 0x35, 0x2A, 0x68, 0x2A, 0x9B, 0x2A, 0xCF, 0x2B, 0x02, 0x2B, 0x36, 0x2B, 0x69, 0x2B, 0x9D, 0x2B, 0xD1, 0x2C, 0x05, 0x2C, 0x39, 0x2C, 0x6E, 0x2C, 0xA2, 0x2C, 0xD7, 0x2D, 0x0C, 0x2D, 0x41, 0x2D, 0x76, 0x2D, 0xAB, 0x2D, 0xE1, 0x2E, 0x16, 0x2E, 0x4C, 0x2E, 0x82, 0x2E, 0xB7, 0x2E, 0xEE, 0x2F, 0x24, 0x2F, 0x5A, 0x2F, 0x91, 0x2F, 0xC7, 0x2F, 0xFE, 0x30, 0x35, 0x30, 0x6C, 0x30, 0xA4, 0x30, 0xDB, 0x31, 0x12, 0x31, 0x4A, 0x31, 0x82, 0x31, 0xBA, 0x31, 0xF2, 0x32, 0x2A, 0x32, 0x63, 0x32, 0x9B, 0x32, 0xD4, 0x33, 0x0D, 0x33, 0x46, 0x33, 0x7F, 0x33, 0xB8, 0x33, 0xF1, 0x34, 0x2B, 0x34, 0x65, 0x34, 0x9E, 0x34, 0xD8, 0x35, 0x13, 0x35, 0x4D, 0x35, 0x87, 0x35, 0xC2, 0x35, 0xFD, 0x36, 0x37, 0x36, 0x72, 0x36, 0xAE, 0x36, 0xE9, 0x37, 0x24, 0x37, 0x60, 0x37, 0x9C, 0x37, 0xD7, 0x38, 0x14, 0x38, 0x50, 0x38, 0x8C, 0x38, 0xC8, 0x39, 0x05, 0x39, 0x42, 0x39, 0x7F, 0x39, 0xBC, 0x39, 0xF9, 0x3A, 0x36, 0x3A, 0x74, 0x3A, 0xB2, 0x3A, 0xEF, 0x3B, 0x2D, 0x3B, 0x6B, 0x3B, 0xAA, 0x3B, 0xE8, 0x3C, 0x27, 0x3C, 0x65, 0x3C, 0xA4, 0x3C, 0xE3, 0x3D, 0x22, 0x3D, 0x61, 0x3D, 0xA1, 0x3D, 0xE0, 0x3E, 0x20, 0x3E, 0x60, 0x3E, 0xA0, 0x3E, 0xE0, 0x3F, 0x21, 0x3F, 0x61, 0x3F, 0xA2, 0x3F, 0xE2, 0x40, 0x23, 0x40, 0x64, 0x40, 0xA6, 0x40, 0xE7, 0x41, 0x29, 0x41, 0x6A, 0x41, 0xAC, 0x41, 0xEE, 0x42, 0x30, 0x42, 0x72, 0x42, 0xB5, 0x42, 0xF7, 0x43, 0x3A, 0x43, 0x7D, 0x43, 0xC0, 0x44, 0x03, 0x44, 0x47, 0x44, 0x8A, 0x44, 0xCE, 0x45, 0x12, 0x45, 0x55, 0x45, 0x9A, 0x45, 0xDE, 0x46, 0x22, 0x46, 0x67, 0x46, 0xAB, 0x46, 0xF0, 0x47, 0x35, 0x47, 0x7B, 0x47, 0xC0, 0x48, 0x05, 0x48, 0x4B, 0x48, 0x91, 0x48, 0xD7, 0x49, 0x1D, 0x49, 0x63, 0x49, 0xA9, 0x49, 0xF0, 0x4A, 0x37, 0x4A, 0x7D, 0x4A, 0xC4, 0x4B, 0x0C, 0x4B, 0x53, 0x4B, 0x9A, 0x4B, 0xE2, 0x4C, 0x2A, 0x4C, 0x72, 0x4C, 0xBA, 0x4D, 0x02, 0x4D, 0x4A, 0x4D, 0x93, 0x4D, 0xDC, 0x4E, 0x25, 0x4E, 0x6E, 0x4E, 0xB7, 0x4F, 0x00, 0x4F, 0x49, 0x4F, 0x93, 0x4F, 0xDD, 0x50, 0x27, 0x50, 0x71, 0x50, 0xBB, 0x51, 0x06, 0x51, 0x50, 0x51, 0x9B, 0x51, 0xE6, 0x52, 0x31, 0x52, 0x7C, 0x52, 0xC7, 0x53, 0x13, 0x53, 0x5F, 0x53, 0xAA, 0x53, 0xF6, 0x54, 0x42, 0x54, 0x8F, 0x54, 0xDB, 0x55, 0x28, 0x55, 0x75, 0x55, 0xC2, 0x56, 0x0F, 0x56, 0x5C, 0x56, 0xA9, 0x56, 0xF7, 0x57, 0x44, 0x57, 0x92, 0x57, 0xE0, 0x58, 0x2F, 0x58, 0x7D, 0x58, 0xCB, 0x59, 0x1A, 0x59, 0x69, 0x59, 0xB8, 0x5A, 0x07, 0x5A, 0x56, 0x5A, 0xA6, 0x5A, 0xF5, 0x5B, 0x45, 0x5B, 0x95, 0x5B, 0xE5, 0x5C, 0x35, 0x5C, 0x86, 0x5C, 0xD6, 0x5D, 0x27, 0x5D, 0x78, 0x5D, 0xC9, 0x5E, 0x1A, 0x5E, 0x6C, 0x5E, 0xBD, 0x5F, 0x0F, 0x5F, 0x61, 0x5F, 0xB3, 0x60, 0x05, 0x60, 0x57, 0x60, 0xAA, 0x60, 0xFC, 0x61, 0x4F, 0x61, 0xA2, 0x61, 0xF5, 0x62, 0x49, 0x62, 0x9C, 0x62, 0xF0, 0x63, 0x43, 0x63, 0x97, 0x63, 0xEB, 0x64, 0x40, 0x64, 0x94, 0x64, 0xE9, 0x65, 0x3D, 0x65, 0x92, 0x65, 0xE7, 0x66, 0x3D, 0x66, 0x92, 0x66, 0xE8, 0x67, 0x3D, 0x67, 0x93, 0x67, 0xE9, 0x68, 0x3F, 0x68, 0x96, 0x68, 0xEC, 0x69, 0x43, 0x69, 0x9A, 0x69, 0xF1, 0x6A, 0x48, 0x6A, 0x9F, 0x6A, 0xF7, 0x6B, 0x4F, 0x6B, 0xA7, 0x6B, 0xFF, 0x6C, 0x57, 0x6C, 0xAF, 0x6D, 0x08, 0x6D, 0x60, 0x6D, 0xB9, 0x6E, 0x12, 0x6E, 0x6B, 0x6E, 0xC4, 0x6F, 0x1E, 0x6F, 0x78, 0x6F, 0xD1, 0x70, 0x2B, 0x70, 0x86, 0x70, 0xE0, 0x71, 0x3A, 0x71, 0x95, 0x71, 0xF0, 0x72, 0x4B, 0x72, 0xA6, 0x73, 0x01, 0x73, 0x5D, 0x73, 0xB8, 0x74, 0x14, 0x74, 0x70, 0x74, 0xCC, 0x75, 0x28, 0x75, 0x85, 0x75, 0xE1, 0x76, 0x3E, 0x76, 0x9B, 0x76, 0xF8, 0x77, 0x56, 0x77, 0xB3, 0x78, 0x11, 0x78, 0x6E, 0x78, 0xCC, 0x79, 0x2A, 0x79, 0x89, 0x79, 0xE7, 0x7A, 0x46, 0x7A, 0xA5, 0x7B, 0x04, 0x7B, 0x63, 0x7B, 0xC2, 0x7C, 0x21, 0x7C, 0x81, 0x7C, 0xE1, 0x7D, 0x41, 0x7D, 0xA1, 0x7E, 0x01, 0x7E, 0x62, 0x7E, 0xC2, 0x7F, 0x23, 0x7F, 0x84, 0x7F, 0xE5, 0x80, 0x47, 0x80, 0xA8, 0x81, 0x0A, 0x81, 0x6B, 0x81, 0xCD, 0x82, 0x30, 0x82, 0x92, 0x82, 0xF4, 0x83, 0x57, 0x83, 0xBA, 0x84, 0x1D, 0x84, 0x80, 0x84, 0xE3, 0x85, 0x47, 0x85, 0xAB, 0x86, 0x0E, 0x86, 0x72, 0x86, 0xD7, 0x87, 0x3B, 0x87, 0x9F, 0x88, 0x04, 0x88, 0x69, 0x88, 0xCE, 0x89, 0x33, 0x89, 0x99, 0x89, 0xFE, 0x8A, 0x64, 0x8A, 0xCA, 0x8B, 0x30, 0x8B, 0x96, 0x8B, 0xFC, 0x8C, 0x63, 0x8C, 0xCA, 0x8D, 0x31, 0x8D, 0x98, 0x8D, 0xFF, 0x8E, 0x66, 0x8E, 0xCE, 0x8F, 0x36, 0x8F, 0x9E, 0x90, 0x06, 0x90, 0x6E, 0x90, 0xD6, 0x91, 0x3F, 0x91, 0xA8, 0x92, 0x11, 0x92, 0x7A, 0x92, 0xE3, 0x93, 0x4D, 0x93, 0xB6, 0x94, 0x20, 0x94, 0x8A, 0x94, 0xF4, 0x95, 0x5F, 0x95, 0xC9, 0x96, 0x34, 0x96, 0x9F, 0x97, 0x0A, 0x97, 0x75, 0x97, 0xE0, 0x98, 0x4C, 0x98, 0xB8, 0x99, 0x24, 0x99, 0x90, 0x99, 0xFC, 0x9A, 0x68, 0x9A, 0xD5, 0x9B, 0x42, 0x9B, 0xAF, 0x9C, 0x1C, 0x9C, 0x89, 0x9C, 0xF7, 0x9D, 0x64, 0x9D, 0xD2, 0x9E, 0x40, 0x9E, 0xAE, 0x9F, 0x1D, 0x9F, 0x8B, 0x9F, 0xFA, 0xA0, 0x69, 0xA0, 0xD8, 0xA1, 0x47, 0xA1, 0xB6, 0xA2, 0x26, 0xA2, 0x96, 0xA3, 0x06, 0xA3, 0x76, 0xA3, 0xE6, 0xA4, 0x56, 0xA4, 0xC7, 0xA5, 0x38, 0xA5, 0xA9, 0xA6, 0x1A, 0xA6, 0x8B, 0xA6, 0xFD, 0xA7, 0x6E, 0xA7, 0xE0, 0xA8, 0x52, 0xA8, 0xC4, 0xA9, 0x37, 0xA9, 0xA9, 0xAA, 0x1C, 0xAA, 0x8F, 0xAB, 0x02, 0xAB, 0x75, 0xAB, 0xE9, 0xAC, 0x5C, 0xAC, 0xD0, 0xAD, 0x44, 0xAD, 0xB8, 0xAE, 0x2D, 0xAE, 0xA1, 0xAF, 0x16, 0xAF, 0x8B, 0xB0, 0x00, 0xB0, 0x75, 0xB0, 0xEA, 0xB1, 0x60, 0xB1, 0xD6, 0xB2, 0x4B, 0xB2, 0xC2, 0xB3, 0x38, 0xB3, 0xAE, 0xB4, 0x25, 0xB4, 0x9C, 0xB5, 0x13, 0xB5, 0x8A, 0xB6, 0x01, 0xB6, 0x79, 0xB6, 0xF0, 0xB7, 0x68, 0xB7, 0xE0, 0xB8, 0x59, 0xB8, 0xD1, 0xB9, 0x4A, 0xB9, 0xC2, 0xBA, 0x3B, 0xBA, 0xB5, 0xBB, 0x2E, 0xBB, 0xA7, 0xBC, 0x21, 0xBC, 0x9B, 0xBD, 0x15, 0xBD, 0x8F, 0xBE, 0x0A, 0xBE, 0x84, 0xBE, 0xFF, 0xBF, 0x7A, 0xBF, 0xF5, 0xC0, 0x70, 0xC0, 0xEC, 0xC1, 0x67, 0xC1, 0xE3, 0xC2, 0x5F, 0xC2, 0xDB, 0xC3, 0x58, 0xC3, 0xD4, 0xC4, 0x51, 0xC4, 0xCE, 0xC5, 0x4B, 0xC5, 0xC8, 0xC6, 0x46, 0xC6, 0xC3, 0xC7, 0x41, 0xC7, 0xBF, 0xC8, 0x3D, 0xC8, 0xBC, 0xC9, 0x3A, 0xC9, 0xB9, 0xCA, 0x38, 0xCA, 0xB7, 0xCB, 0x36, 0xCB, 0xB6, 0xCC, 0x35, 0xCC, 0xB5, 0xCD, 0x35, 0xCD, 0xB5, 0xCE, 0x36, 0xCE, 0xB6, 0xCF, 0x37, 0xCF, 0xB8, 0xD0, 0x39, 0xD0, 0xBA, 0xD1, 0x3C, 0xD1, 0xBE, 0xD2, 0x3F, 0xD2, 0xC1, 0xD3, 0x44, 0xD3, 0xC6, 0xD4, 0x49, 0xD4, 0xCB, 0xD5, 0x4E, 0xD5, 0xD1, 0xD6, 0x55, 0xD6, 0xD8, 0xD7, 0x5C, 0xD7, 0xE0, 0xD8, 0x64, 0xD8, 0xE8, 0xD9, 0x6C, 0xD9, 0xF1, 0xDA, 0x76, 0xDA, 0xFB, 0xDB, 0x80, 0xDC, 0x05, 0xDC, 0x8A, 0xDD, 0x10, 0xDD, 0x96, 0xDE, 0x1C, 0xDE, 0xA2, 0xDF, 0x29, 0xDF, 0xAF, 0xE0, 0x36, 0xE0, 0xBD, 0xE1, 0x44, 0xE1, 0xCC, 0xE2, 0x53, 0xE2, 0xDB, 0xE3, 0x63, 0xE3, 0xEB, 0xE4, 0x73, 0xE4, 0xFC, 0xE5, 0x84, 0xE6, 0x0D, 0xE6, 0x96, 0xE7, 0x1F, 0xE7, 0xA9, 0xE8, 0x32, 0xE8, 0xBC, 0xE9, 0x46, 0xE9, 0xD0, 0xEA, 0x5B, 0xEA, 0xE5, 0xEB, 0x70, 0xEB, 0xFB, 0xEC, 0x86, 0xED, 0x11, 0xED, 0x9C, 0xEE, 0x28, 0xEE, 0xB4, 0xEF, 0x40, 0xEF, 0xCC, 0xF0, 0x58, 0xF0, 0xE5, 0xF1, 0x72, 0xF1, 0xFF, 0xF2, 0x8C, 0xF3, 0x19, 0xF3, 0xA7, 0xF4, 0x34, 0xF4, 0xC2, 0xF5, 0x50, 0xF5, 0xDE, 0xF6, 0x6D, 0xF6, 0xFB, 0xF7, 0x8A, 0xF8, 0x19, 0xF8, 0xA8, 0xF9, 0x38, 0xF9, 0xC7, 0xFA, 0x57, 0xFA, 0xE7, 0xFB, 0x77, 0xFC, 0x07, 0xFC, 0x98, 0xFD, 0x29, 0xFD, 0xBA, 0xFE, 0x4B, 0xFE, 0xDC, 0xFF, 0x6D, 0xFF, 0xFF
-```
-
diff --git a/doc/tables/is_zero_base.md b/doc/tables/is_zero_base.md
deleted file mode 100644 (file)
index 7e2d081..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#### Table M.1 – is_zero_base table
-
-```
-228, 216, 216, 195, 192, 189, 182, 184, 179, 176, 171, 168, 166, 159,
-156, 151, 151, 150, 150, 146, 144, 138, 138, 137, 135, 131, 127, 126,
-124, 123, 124, 123, 122, 121, 118, 117, 114, 115, 116, 116, 115, 115,
-114, 111, 111, 111, 112, 111, 110, 110, 110, 111, 111, 114, 110, 111,
-112, 113, 116, 120, 126, 131, 147, 160
-```
diff --git a/doc/tables/markdown-pdf.css b/doc/tables/markdown-pdf.css
deleted file mode 100644 (file)
index c1efc1c..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- settings.json:
-    "markdown-pdf.styles": ["markdown-pdf.css",],
-    "markdown-pdf.format": "Letter",
-    "markdown-pdf.margin.top": "1in",
-    "markdown-pdf.margin.bottom": "1in",
-    "markdown-pdf.margin.left": "1in",
-    "markdown-pdf.margin.right": "1in",
-    "markdown-pdf.stylesRelativePathFile" : true,
-    "markdown-pdf.displayHeaderFooter": false,
- */
-
-body {
-  font-family: "Times";
-  font-size: 10pt;
-  padding: 0;
-}
-
-h4 {
-  font-family: "Times New Roman";
-  font-size: 10pt;
-  font-weight: bold;
-}
-
-code {
-  font-family: Consolas, "Source Code Pro";
-  font-size: 10pt;
-}
-
-pre.hljs code > div {
-  padding: 0px;
-}
-
-:not(pre):not(.hljs) > code {
-  color: #4d4d4c;
-}
diff --git a/doc/tables/nonzero_buckets.md b/doc/tables/nonzero_buckets.md
deleted file mode 100644 (file)
index 77a5a39..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#### Table M.17 – nonzero_buckets
-
-```
-  0,  1,  2,  3,  4,  4,  5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,
-  7,  7,  7,  7,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,
-  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 10, 10,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
-```
-
diff --git a/doc/tables/num_nonzero_context.md b/doc/tables/num_nonzero_context.md
deleted file mode 100644 (file)
index b73d48c..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-#### Table M.16 – num_nonzero_context
-
-`scheme == 0`:
-```
-0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
-6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
-7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
-```
-
-`scheme == 1`:
-```
- 0,  2,  2,  4,  4,  4,  6,  6,  6,  6,  8,  8,  8,  8,  8,  8, 10, 10,
-10, 10, 10, 10, 10, 10, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
-12, 12, 12, 12, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
-14, 14, 14, 14, 14, 14, 14, 14, 14, 14
-```
-
-`scheme == 2`:
-```
- 0,  4,  4,  8,  8,  8, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 20, 20,
-20, 20, 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
-24, 24, 24, 24, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
-28, 28, 28, 28, 28, 28, 28, 28, 28, 28
-```
-
-`scheme == 3`:
-```
- 0,  8,  8, 16, 16, 16, 24, 24, 24, 24, 32, 32, 32, 32, 32, 32, 40, 40,
-40, 40, 40, 40, 40, 40, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
-48, 48, 48, 48, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55,
-55, 55, 55, 55, 55, 55, 55, 55, 55, 55
-```
-
-`scheme == 4`:
-```
-  0,  16,  16,  32,  32,  32,  48,  48,  48,  48,  64,  64,  64,  64,
- 64,  64,  80,  80,  80,  80,  80,  80,  80,  80,  95,  95,  95,  95,
- 95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95,  95, 109, 109,
-109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109,
-109, 109, 109, 109, 109, 109, 109, 109
-```
-
-`scheme == 5`:
-```
-  0,  32,  32,  64,  64,  64,  96,  96,  96,  96, 127, 127, 127, 127,
-127, 127, 157, 157, 157, 157, 157, 157, 157, 157, 185, 185, 185, 185,
-185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 185, 211, 211,
-211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211, 211,
-211, 211, 211, 211, 211, 211, 211, 211
-```
-
-`scheme == 6`:
-```
-  0,  64,  64, 127, 127, 127, 188, 188, 188, 188, 246, 246, 246, 246,
-246, 246, 300, 300, 300, 300, 300, 300, 300, 300, 348, 348, 348, 348,
-348, 348, 348, 348, 348, 348, 348, 348, 348, 348, 348, 348, 388, 388,
-388, 388, 388, 388, 388, 388, 388, 388, 388, 388, 388, 388, 388, 388,
-388, 388, 388, 388, 388, 388, 388, 388
-```
-
diff --git a/doc/tables/num_nonzeros_base.md b/doc/tables/num_nonzeros_base.md
deleted file mode 100644 (file)
index 165c738..0000000
+++ /dev/null
@@ -1,258 +0,0 @@
-#### Table M.2 – num_nonzeros_base table
-
-```
-251, 252, 117, 249, 161, 136,  83, 238, 184, 126, 137, 129, 140, 119,
- 70, 213, 160, 175, 174, 130, 166, 134, 122, 125, 131, 144, 136, 133,
-139, 123,  79, 216, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-254, 252, 174, 232, 189, 155, 122, 177, 204, 173, 146, 149, 141, 133,
-103, 109, 167, 187, 168, 142, 154, 147, 125, 139, 144, 138, 138, 153,
-141, 133,  90, 121, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-251, 240, 197, 176, 184, 177, 114,  89, 194, 165, 153, 161, 158, 136,
- 92,  95, 123, 171, 160, 140, 148, 136, 129, 139, 145, 136, 143, 134,
-138, 124,  92, 154, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-247, 220, 201, 110, 194, 176, 147,  59, 175, 171, 156, 157, 152, 146,
-115, 114,  88, 151, 164, 141, 153, 135, 141, 131, 146, 139, 140, 145,
-138, 137, 112, 184, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-238, 179, 203,  63, 194, 173, 149,  71, 139, 169, 154, 159, 150, 146,
-117, 143,  78, 122, 152, 137, 149, 138, 138, 133, 134, 142, 142, 142,
-148, 128, 118, 199, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-227, 127, 200,  44, 192, 170, 148, 100, 102, 161, 156, 153, 148, 149,
-124, 160,  88, 101, 134, 132, 149, 145, 134, 134, 136, 141, 138, 142,
-144, 137, 116, 208, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-214,  86, 195,  44, 187, 163, 148, 126,  81, 147, 156, 152, 150, 144,
-121, 172,  96,  95, 117, 122, 145, 152, 136, 133, 135, 135, 131, 142,
-141, 135, 114, 217, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-198,  56, 191,  54, 171, 162, 147, 144,  74, 128, 152, 149, 150, 142,
-119, 177, 101, 100, 106, 111, 135, 154, 136, 137, 136, 132, 133, 142,
-144, 130, 117, 222, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-176,  40, 189,  73, 147, 159, 148, 152,  79, 106, 147, 149, 151, 139,
-123, 188, 108, 110, 106,  97, 125, 151, 137, 138, 135, 135, 134, 136,
-140, 131, 116, 221, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-148,  33, 185,  88, 117, 158, 145, 163,  95,  91, 137, 146, 150, 140,
-120, 197, 115, 116, 114,  92, 114, 144, 130, 133, 132, 133, 129, 140,
-138, 130, 111, 224, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-117,  31, 180, 104,  93, 150, 143, 166,  99,  85, 124, 139, 148, 142,
-118, 201, 105, 120, 120,  90, 107, 135, 127, 130, 131, 131, 132, 140,
-142, 133, 114, 229, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
- 87,  35, 170, 110,  78, 141, 144, 176, 106,  90, 112, 132, 143, 138,
-119, 204, 111, 121, 125,  90, 105, 131, 124, 122, 129, 128, 129, 137,
-138, 133, 114, 227, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
- 63,  42, 159, 123,  73, 127, 142, 191, 105,  91, 105, 123, 139, 137,
-120, 209, 117, 110, 122,  98, 110, 125, 115, 123, 122, 126, 128, 134,
-141, 129, 113, 229, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
- 45,  53, 146, 135,  71, 114, 138, 193, 100,  98,  98, 113, 133, 135,
-118, 222, 113, 111, 139, 103, 107, 126, 111, 119, 121, 122, 127, 135,
-141, 128, 114, 242, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
- 33,  60, 132, 138,  75, 100, 134, 203, 112,  99,  98, 105, 126, 131,
-115, 229, 107,  93, 121, 106, 108, 122, 106, 109, 114, 116, 127, 133,
-143, 128, 110, 242, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
- 24,  70, 118, 134,  76,  87, 130, 201, 110,  96,  99,  97, 119, 130,
-111, 229,  97, 104, 125, 102, 112, 125, 101, 109, 113, 114, 125, 129,
-142, 127, 112, 241, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
- 17,  65, 100, 121,  80,  75, 124, 174, 117, 100,  94,  93, 114, 128,
-110, 216, 103,  94, 113, 122, 118, 126, 113, 108, 105, 108, 122, 128,
-141, 125, 113, 238, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
- 12,  70,  82, 132,  78,  65, 118, 155, 136, 103,  97,  89, 106, 124,
-111, 215, 115, 123, 129,  99, 104, 127, 110, 108, 101, 109, 118, 126,
-136, 123, 110, 233, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  8,  66,  61, 117,  91,  59, 108, 195, 101, 112,  99,  99,  99, 116,
-106, 230, 127,  99, 144, 101, 118, 137, 117, 111, 106, 104, 116, 121,
-134, 122, 110, 223, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  6,  78,  42, 146, 101,  54,  94, 201, 116, 102, 110,  94,  92, 108,
-103, 214, 108, 111, 127, 102, 121, 132, 120, 121,  95,  98, 110, 121,
-129, 117, 107, 235, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  5,  93,  29, 145, 102,  52,  77, 216, 108, 115, 108, 102,  89,  97,
- 94, 229,  89, 103, 139, 120, 103, 151, 102, 100,  97,  96,  99, 111,
-125, 116, 104, 242, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  4, 105,  21, 145, 100,  54,  64, 217, 100, 122, 128,  87,  88,  91,
- 87, 230, 112,  80, 148,  95, 146, 123,  96, 140,  90,  91,  98, 106,
-122, 111, 100, 249, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  4, 130,  14, 142, 104,  56,  51, 208, 116, 135, 100,  89,  82,  84,
- 75, 239,  85,  85, 122, 125,  94, 144, 151, 136,  92,  97, 104, 109,
-113, 110,  91, 246, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  3, 126,   9, 172, 105,  57,  39, 219,  95, 120, 118,  96,  93,  75,
- 66, 241, 102, 134,  96, 156, 146, 162, 130, 112,  82,  89,  97, 101,
-116, 103,  82, 254, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  3, 149,   7, 182, 122,  54,  29, 224, 103, 100, 113,  96,  90,  74,
- 55, 250, 127,  94, 118,  93, 135, 160, 113, 130,  95, 117, 106,  96,
-111,  97,  77, 242, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  3, 150,   4, 170, 138,  59,  20, 229,  91, 150, 107,  98,  92,  68,
- 48, 245, 113,  64, 114, 111, 134, 127, 102, 104,  85, 118, 103, 107,
-102,  91,  72, 245, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  3, 171,   3, 165, 137,  62,  14, 211,  96, 127, 132, 121,  95,  62,
- 37, 248, 102,  57, 144,  85, 127, 191, 102,  97, 127, 104,  91, 102,
-107,  81,  64, 254, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  2, 166,   2, 196, 122,  65,  10, 243, 102,  93, 117,  92,  96,  63,
- 29, 251, 169, 159, 149,  96,  91, 139, 157,  40, 100,  89, 120,  92,
-109,  79,  58, 247, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  2, 176,   2, 189, 118,  48,   7, 219,  68,  43, 109,  96, 129,  75,
- 19, 254,   2,   3, 185,   6, 102, 127, 127, 127,   1, 131,  83,  99,
-107,  80,  45, 254, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  1, 205,   2, 208,  64,  89,   4, 223,  29, 169,  29, 123, 118,  76,
- 11, 240, 202, 243,  65,   6,  12, 243,  96,  55, 102, 102, 114, 102,
-107,  74,  31, 247, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  1, 216,   1, 214, 127,  94,   2, 234, 145,   3, 127, 106, 155,  80,
-  4, 247,   4,  65,  86, 127, 127, 127, 127, 102, 127, 143, 143, 108,
-113,  80,  16, 216, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
-```
-  2, 199,   1, 222,  93,  94,   1, 232,   2,  65,  74, 139, 201,  48,
-  2, 254, 169, 127,  52, 243, 251, 249, 102,  86, 202, 153,  65,  65,
-146,  69,   8, 238, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
-128, 128, 128, 128, 128, 128, 128
-```
-
diff --git a/doc/tables/quant.md b/doc/tables/quant.md
deleted file mode 100644 (file)
index 1fb80d7..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-#### Table M.13 – template quant tables
-
-`is_luma == true`:
-```
- 16,  11,  10,  16, 24, 40, 51,  61,  12,  12,  14, 19, 26, 58, 60,
- 55,  14,  13,  16, 24, 40, 57,  69,  56,  14,  17, 22, 29, 51, 87,
- 80,  62,  18,  22, 37, 56, 68, 109, 103,  77,  24, 35, 55, 64, 81,
-104, 113,  92,  49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98,
-112, 100, 103,  99
-```
-
-`is_luma == false`:
-```
-17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99, 24, 26,
-56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
-99, 99, 99, 99, 99, 99, 99, 99, 99, 99
-```
-
diff --git a/doc/tables/stock_counts.md b/doc/tables/stock_counts.md
deleted file mode 100644 (file)
index 6e8e445..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-#### Table M.9 – stock counts arrays
-
-`is_ac == 0`, `stock_index == 0`:
-```
-0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0
-```
-
-`is_ac == 0`, `stock_index == 1`:
-```
-0, 0, 1, 5, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0
-```
-
-`is_ac == 1`, `stock_index == 0`:
-```
-0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 126
-```
-
-`is_ac == 1`, `stock_index == 1`:
-```
-0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 120
-```
-
diff --git a/doc/tables/stock_quant.md b/doc/tables/stock_quant.md
deleted file mode 100644 (file)
index b32fd3c..0000000
+++ /dev/null
@@ -1,130 +0,0 @@
-#### Table M.12 – stock quant tables
-
-`is_luma == true`, `stock_index == 0`:
-```
- 3,  2,  2,  3,  5,  8, 10, 12,  2,  2,  3,  4,  5, 12, 12, 11,  3,  3,
- 3,  5,  8, 11, 14, 11,  3,  3,  4,  6, 10, 17, 16, 12,  4,  4,  7, 11,
-14, 22, 21, 15,  5,  7, 11, 13, 16, 21, 23, 18, 10, 13, 16, 17, 21, 24,
-24, 20, 14, 18, 19, 20, 22, 20, 21, 20
-```
-
-`is_luma == true`, `stock_index == 1`:
-```
- 8,  6,  5,  8, 12, 20, 26, 31,  6,  6,  7, 10, 13, 29, 30, 28,  7,  7,
- 8, 12, 20, 29, 35, 28,  7,  9, 11, 15, 26, 44, 40, 31,  9, 11, 19, 28,
-34, 55, 52, 39, 12, 18, 28, 32, 41, 52, 57, 46, 25, 32, 39, 44, 52, 61,
-60, 51, 36, 46, 48, 49, 56, 50, 52, 50
-```
-
-`is_luma == true`, `stock_index == 2`:
-```
- 6,  4,  4,  6, 10, 16, 20, 24,  5,  5,  6,  8, 10, 23, 24, 22,  6,  5,
- 6, 10, 16, 23, 28, 22,  6,  7,  9, 12, 20, 35, 32, 25,  7,  9, 15, 22,
-27, 44, 41, 31, 10, 14, 22, 26, 32, 42, 45, 37, 20, 26, 31, 35, 41, 48,
-48, 40, 29, 37, 38, 39, 45, 40, 41, 40
-```
-
-`is_luma == true`, `stock_index == 3`:
-```
- 5,  3,  3,  5,  7, 12, 15, 18,  4,  4,  4,  6,  8, 17, 18, 17,  4,  4,
- 5,  7, 12, 17, 21, 17,  4,  5,  7,  9, 15, 26, 24, 19,  5,  7, 11, 17,
-20, 33, 31, 23,  7, 11, 17, 19, 24, 31, 34, 28, 15, 19, 23, 26, 31, 36,
-36, 30, 22, 28, 29, 29, 34, 30, 31, 30
-```
-
-`is_luma == true`, `stock_index == 4`:
-```
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1
-```
-
-`is_luma == true`, `stock_index == 5`:
-```
- 2,  1,  1,  2,  2,  4,  5,  6,  1,  1,  1,  2,  3,  6,  6,  6,  1,  1,
- 2,  2,  4,  6,  7,  6,  1,  2,  2,  3,  5,  9,  8,  6,  2,  2,  4,  6,
- 7, 11, 10,  8,  2,  4,  6,  6,  8, 10, 11,  9,  5,  6,  8,  9, 10, 12,
-12, 10,  7,  9, 10, 10, 11, 10, 10, 10
-```
-
-`is_luma == true`, `stock_index == 6`:
-```
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  2,  2,  1,  1,  1,  1,
- 1,  2,  2,  3,  1,  1,  1,  1,  2,  2,  3,  3,  1,  1,  1,  2,  2,  3,
- 3,  3,  1,  1,  2,  2,  3,  3,  3,  3
-```
-
-`is_luma == true`, `stock_index == 7`:
-```
-10,  7,  6, 10, 14, 24, 31, 37,  7,  7,  8, 11, 16, 35, 36, 33,  8,  8,
-10, 14, 24, 34, 41, 34,  8, 10, 13, 17, 31, 52, 48, 37, 11, 13, 22, 34,
-41, 65, 62, 46, 14, 21, 33, 38, 49, 62, 68, 55, 29, 38, 47, 52, 62, 73,
-72, 61, 43, 55, 57, 59, 67, 60, 62, 59
-```
-
-`is_luma == false`, `stock_index == 0`:
-```
- 9,  9,  9, 12, 11, 12, 24, 13, 13, 24, 50, 33, 28, 33, 50, 50, 50, 50,
-50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
-50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
-50, 50, 50, 50, 50, 50, 50, 50, 50, 50
-```
-
-`is_luma == false`, `stock_index == 1`:
-```
- 3,  4,  5,  9, 20, 20, 20, 20,  4,  4,  5, 13, 20, 20, 20, 20,  5,  5,
-11, 20, 20, 20, 20, 20,  9, 13, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
-20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
-20, 20, 20, 20, 20, 20, 20, 20, 20, 20
-```
-
-`is_luma == false`, `stock_index == 2`:
-```
- 9,  9, 12, 24, 50, 50, 50, 50,  9, 11, 13, 33, 50, 50, 50, 50, 12, 13,
-28, 50, 50, 50, 50, 50, 24, 33, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
-50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
-50, 50, 50, 50, 50, 50, 50, 50, 50, 50
-```
-
-`is_luma == false`, `stock_index == 3`:
-```
- 5,  5,  7, 14, 30, 30, 30, 30,  5,  6,  8, 20, 30, 30, 30, 30,  7,  8,
-17, 30, 30, 30, 30, 30, 14, 20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
-30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30,
-30, 30, 30, 30, 30, 30, 30, 30, 30, 30
-```
-
-`is_luma == false`, `stock_index == 4`:
-```
- 7,  7, 10, 19, 40, 40, 40, 40,  7,  8, 10, 26, 40, 40, 40, 40, 10, 10,
-22, 40, 40, 40, 40, 40, 19, 26, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
-40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
-40, 40, 40, 40, 40, 40, 40, 40, 40, 40
-```
-
-`is_luma == false`, `stock_index == 5`:
-```
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
- 1,  1,  1,  1,  1,  1,  1,  1,  1,  1
-```
-
-`is_luma == false`, `stock_index == 6`:
-```
- 2,  2,  2,  5, 10, 10, 10, 10,  2,  2,  3,  7, 10, 10, 10, 10,  2,  3,
- 6, 10, 10, 10, 10, 10,  5,  7, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-10, 10, 10, 10, 10, 10, 10, 10, 10, 10
-```
-
-`is_luma == false`, `stock_index == 7`:
-```
-10, 11, 14, 28, 59, 59, 59, 59, 11, 13, 16, 40, 59, 59, 59, 59, 14, 16,
-34, 59, 59, 59, 59, 59, 28, 40, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
-59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59, 59,
-59, 59, 59, 59, 59, 59, 59, 59, 59, 59
-```
-
diff --git a/doc/tables/stock_values.md b/doc/tables/stock_values.md
deleted file mode 100644 (file)
index 8e67cff..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-#### Table M.10 – stock values arrays
-
-`is_ac == 0`, `stock_index == 0`:
-```
-0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 256
-```
-
-`is_ac == 0`, `stock_index == 1`:
-```
-0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 256
-```
-
-`is_ac == 1`, `stock_index == 0`:
-```
-  1,   2,   3,   0,   4,  17,   5,  18,  33,  49,  65,   6,  19,  81,
- 97,   7,  34, 113,  20,  50, 129, 145, 161,   8,  35,  66, 177, 193,
- 21,  82, 209, 240,  36,  51,  98, 114, 130,   9,  10,  22,  23,  24,
- 25,  26,  37,  38,  39,  40,  41,  42,  52,  53,  54,  55,  56,  57,
- 58,  67,  68,  69,  70,  71,  72,  73,  74,  83,  84,  85,  86,  87,
- 88,  89,  90,  99, 100, 101, 102, 103, 104, 105, 106, 115, 116, 117,
-118, 119, 120, 121, 122, 131, 132, 133, 134, 135, 136, 137, 138, 146,
-147, 148, 149, 150, 151, 152, 153, 154, 162, 163, 164, 165, 166, 167,
-168, 169, 170, 178, 179, 180, 181, 182, 183, 184, 185, 186, 194, 195,
-196, 197, 198, 199, 200, 201, 202, 210, 211, 212, 213, 214, 215, 216,
-217, 218, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 241, 242,
-243, 244, 245, 246, 247, 248, 249, 250, 256
-```
-
-`is_ac == 1`, `stock_index == 1`:
-```
-  0,   1,   2,   3,  17,   4,   5,  33,  49,   6,  18,  65,  81,   7,
- 97, 113,  19,  34,  50, 129,   8,  20,  66, 145, 161, 177, 193,   9,
- 35,  51,  82, 240,  21,  98, 114, 209,  10,  22,  36,  52, 225,  37,
-241,  23,  24,  25,  26,  38,  39,  40,  41,  42,  53,  54,  55,  56,
- 57,  58,  67,  68,  69,  70,  71,  72,  73,  74,  83,  84,  85,  86,
- 87,  88,  89,  90,  99, 100, 101, 102, 103, 104, 105, 106, 115, 116,
-117, 118, 119, 120, 121, 122, 130, 131, 132, 133, 134, 135, 136, 137,
-138, 146, 147, 148, 149, 150, 151, 152, 153, 154, 162, 163, 164, 165,
-166, 167, 168, 169, 170, 178, 179, 180, 181, 182, 183, 184, 185, 186,
-194, 195, 196, 197, 198, 199, 200, 201, 202, 210, 211, 212, 213, 214,
-215, 216, 217, 218, 226, 227, 228, 229, 230, 231, 232, 233, 234, 242,
-243, 244, 245, 246, 247, 248, 249, 250, 256
-```
-
diff --git a/doc/tables/symbol_order.md b/doc/tables/symbol_order.md
deleted file mode 100644 (file)
index a196c0f..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-#### Table M.11 – predefined symbol order
-
-`is_ac == 0`:
-```
-0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-```
-
-`is_ac == 1`:
-```
-  1,   0,   2,   3,  17,   4,   5,  33,  18,  49,  65,   6,  81,  19,
- 97,   7,  34, 113,  50, 129,  20, 145, 161,   8,  35,  66, 177, 193,
- 21,  82, 209, 240,  36,  51,  98, 114,   9, 130,  10,  22,  52, 225,
- 23,  37, 241,  24,  25,  26,  38,  39,  40,  41,  42,  53,  54,  55,
- 56,  57,  58,  67,  68,  69,  70,  71,  72,  73,  74,  83,  84,  85,
- 86,  87,  88,  89,  90,  99, 100, 101, 102, 103, 104, 105, 106, 115,
-116, 117, 118, 119, 120, 121, 122, 131, 132, 133, 134, 135, 136, 137,
-138, 146, 147, 148, 149, 150, 151, 152, 153, 154, 162, 163, 164, 165,
-166, 167, 168, 169, 170, 178, 179, 180, 181, 182, 183, 184, 185, 186,
-194, 195, 196, 197, 198, 199, 200, 201, 202, 210, 211, 212, 213, 214,
-215, 216, 217, 218, 226, 227, 228, 229, 230, 231, 232, 233, 234, 242,
-243, 244, 245, 246, 247, 248, 249, 250,  16,  32,  48,  64,  80,  96,
-112, 128, 144, 160, 176, 192, 208,  11,  12,  13,  14,  15,  27,  28,
- 29,  30,  31,  43,  44,  45,  46,  47,  59,  60,  61,  62,  63,  75,
- 76,  77,  78,  79,  91,  92,  93,  94,  95, 107, 108, 109, 110, 111,
-123, 124, 125, 126, 127, 139, 140, 141, 142, 143, 155, 156, 157, 158,
-159, 171, 172, 173, 174, 175, 187, 188, 189, 190, 191, 203, 204, 205,
-206, 207, 219, 220, 221, 222, 223, 224, 235, 236, 237, 238, 239, 251,
-252, 253, 254, 255
-```
-
index cfcfcb5..b1c0f91 100644 (file)
@@ -150,7 +150,7 @@ multiple image lines.
 
 **DCT size selection**: `ac_strategy.cc`
 
-**[Gaborish]**: `gaborish.h`.
+**[Gaborish]**: `enc_gaborish.h`.
 
 **[Edge preserving filter]**: `epf.h`
 
diff --git a/docker/Dockerfile.jpegxl-builder b/docker/Dockerfile.jpegxl-builder
deleted file mode 100644 (file)
index 16e0077..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-# Build an Ubuntu-based docker image with the installed software needed to
-# develop and test JPEG XL.
-
-FROM ubuntu:bionic
-
-# Set a prompt for when using it locally.
-ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
-
-COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
-
-COPY scripts /jpegxl_scripts
-
-ARG DEBIAN_FRONTEND=noninteractive
-
-RUN /jpegxl_scripts/jpegxl_builder.sh && \
-  rm -rf /jpegxl_scripts
diff --git a/docker/Dockerfile.jpegxl-builder-run-aarch64 b/docker/Dockerfile.jpegxl-builder-run-aarch64
deleted file mode 100644 (file)
index a9f38a4..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-# Build an Ubuntu-based docker image for aarch64 with the installed software
-# needed to run JPEG XL. This is only useful when running on actual aarch64
-# hardware.
-
-FROM arm64v8/ubuntu:bionic
-
-COPY scripts/99_norecommends /etc/apt/apt.conf.d/99_norecommends
-
-# Set a prompt for when using it locally.
-ENV PS1="\[\033[01;33m\]\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ "
-
-ARG DEBIAN_FRONTEND=noninteractive
-
-RUN set -ex; \
-  apt-get update -y; \
-  apt-get install -y \
-    bsdmainutils \
-    cmake \
-    curl \
-    ca-certificates \
-    extra-cmake-modules \
-    git \
-    imagemagick \
-    libjpeg8 \
-    libgif7 \
-    libgoogle-perftools4 \
-    libopenexr22 \
-    libpng16-16 \
-    libqt5x11extras5 \
-    libsdl2-2.0-0 \
-    parallel; \
-  rm -rf /var/lib/apt/lists/*;
diff --git a/docker/README.md b/docker/README.md
deleted file mode 100644 (file)
index 874df1c..0000000
+++ /dev/null
@@ -1,7 +0,0 @@
-### Docker container infrastructure for JPEG XL
-
-This directory contains the requirements to build a docker image for the
-JPEG XL project builder.
-
-Docker images need to be created and upload manually. See ./build.sh for
-details.
diff --git a/docker/build.sh b/docker/build.sh
deleted file mode 100755 (executable)
index 3d4727f..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-set -eu
-
-MYDIR=$(dirname $(realpath "$0"))
-
-declare -a TARGETS
-
-load_targets() {
-  # Built-in OSX "find" does not support "-m".
-  FIND=$(which "gfind" || which "find")
-  for f in $(${FIND} -maxdepth 1 -name 'Dockerfile.*' | sort); do
-    local target="${f#*Dockerfile.}"
-    TARGETS+=("${target}")
-  done
-}
-
-usage() {
-    cat >&2 <<EOF
-Use: $1 [targets]
-
-Available targets:
-  * all
-EOF
-  for target in "${TARGETS[@]}"; do
-    echo "  * ${target}" >&2
-  done
-}
-
-build_target() {
-  local target="$1"
-
-  local dockerfile="${MYDIR}/Dockerfile.${target}"
-  # JPEG XL builder images are stored in the gcr.io/jpegxl project.
-  local tag="gcr.io/jpegxl/${target}"
-
-  echo "Building ${target}"
-  if ! sudo docker build --no-cache -t "${tag}" -f "${dockerfile}" "${MYDIR}" \
-      >"${target}.log" 2>&1; then
-    echo "${target} failed. See ${target}.log" >&2
-  else
-    echo "Done, to upload image run:" >&2
-    echo "  sudo docker push ${tag}"
-    if [[ "${JPEGXL_PUSH:-}" == "1" ]]; then
-      echo "sudo docker push ${tag}" >&2
-      sudo docker push "${tag}"
-      # The RepoDigest is only created after it is pushed.
-      local fulltag=$(sudo docker inspect --format="{{.RepoDigests}}" "${tag}")
-      fulltag="${fulltag#[}"
-      fulltag="${fulltag%]}"
-      echo "Updating .gitlab-ci.yml to ${fulltag}" >&2
-      sed -E "s;${tag}@sha256:[0-9a-f]+;${fulltag};" \
-        -i "${MYDIR}/../.gitlab-ci.yml"
-    fi
-  fi
-}
-
-main() {
-  cd "${MYDIR}"
-  local target="${1:-}"
-
-  load_targets
-  if [[ -z "${target}" ]]; then
-    usage $0
-    exit 1
-  fi
-
-  if [[ "${target}" == "all" ]]; then
-    for target in "${TARGETS[@]}"; do
-      build_target "${target}"
-    done
-  else
-    for target in "$@"; do
-      build_target "${target}"
-    done
-  fi
-}
-
-main "$@"
diff --git a/docker/scripts/99_norecommends b/docker/scripts/99_norecommends
deleted file mode 100644 (file)
index 96d6728..0000000
+++ /dev/null
@@ -1 +0,0 @@
-APT::Install-Recommends "false";
diff --git a/docker/scripts/binutils_align_fix.patch b/docker/scripts/binutils_align_fix.patch
deleted file mode 100644 (file)
index 6066252..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-Description: fix lack of alignment in relocations (crashes on mingw)
-See https://sourceware.org/git/?p=binutils-gdb.git;a=patch;h=73af69e74974eaa155eec89867e3ccc77ab39f6d
-From: Marc <marc@groundctl.com>
-Date: Fri, 9 Nov 2018 11:13:50 +0000
-Subject: [PATCH] Allow for compilers that do not produce aligned .rdat
- sections in PE format files.
-
---- a/upstream/ld/scripttempl/pe.sc    2020-05-12 18:45:12.000000000 +0200
-+++ b/upstream/ld/scripttempl/pe.sc    2020-05-12 18:47:12.000000000 +0200
-@@ -143,6 +143,7 @@
-   .rdata ${RELOCATING+BLOCK(__section_alignment__)} :
-   {
-     ${R_RDATA}
-+    . = ALIGN(4);
-     ${RELOCATING+__rt_psrelocs_start = .;}
-     ${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
-     ${RELOCATING+__rt_psrelocs_end = .;}
---- a/upstream/ld/scripttempl/pep.sc   2020-05-12 18:45:19.000000000 +0200
-+++ b/upstream/ld/scripttempl/pep.sc   2020-05-12 18:47:18.000000000 +0200
-@@ -143,6 +143,7 @@
-   .rdata ${RELOCATING+BLOCK(__section_alignment__)} :
-   {
-     ${R_RDATA}
-+    . = ALIGN(4);
-     ${RELOCATING+__rt_psrelocs_start = .;}
-     ${RELOCATING+KEEP(*(.rdata_runtime_pseudo_reloc))}
-     ${RELOCATING+__rt_psrelocs_end = .;}
-
diff --git a/docker/scripts/emsdk_install.sh b/docker/scripts/emsdk_install.sh
deleted file mode 100755 (executable)
index 6cf225a..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-EMSDK_URL="https://github.com/emscripten-core/emsdk/archive/main.tar.gz"
-EMSDK_DIR="/opt/emsdk"
-
-EMSDK_RELEASE="2.0.23"
-
-set -eu -x
-
-# Temporary files cleanup hooks.
-CLEANUP_FILES=()
-cleanup() {
-  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
-    rm -fr "${CLEANUP_FILES[@]}"
-  fi
-}
-trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
-
-main() {
-  local workdir=$(mktemp -d --suffix=emsdk)
-  CLEANUP_FILES+=("${workdir}")
-
-  local emsdktar="${workdir}/emsdk.tar.gz"
-  curl --output "${emsdktar}" "${EMSDK_URL}" --location
-  mkdir -p "${EMSDK_DIR}"
-  tar -zxf "${emsdktar}" -C "${EMSDK_DIR}" --strip-components=1
-
-  cd "${EMSDK_DIR}"
-  ./emsdk install --shallow "${EMSDK_RELEASE}"
-  ./emsdk activate --embedded "${EMSDK_RELEASE}"
-}
-
-main "$@"
diff --git a/docker/scripts/jpegxl_builder.sh b/docker/scripts/jpegxl_builder.sh
deleted file mode 100755 (executable)
index bf9f19d..0000000
+++ /dev/null
@@ -1,518 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-# Main entry point for all the Dockerfile for jpegxl-builder. This centralized
-# file helps sharing code and configuration between Dockerfiles.
-
-set -eux
-
-MYDIR=$(dirname $(realpath "$0"))
-
-# libjpeg-turbo.
-JPEG_TURBO_RELEASE="2.0.4"
-JPEG_TURBO_URL="https://github.com/libjpeg-turbo/libjpeg-turbo/archive/${JPEG_TURBO_RELEASE}.tar.gz"
-JPEG_TURBO_SHA256="7777c3c19762940cff42b3ba4d7cd5c52d1671b39a79532050c85efb99079064"
-
-# zlib (dependency of libpng)
-ZLIB_RELEASE="1.2.11"
-ZLIB_URL="https://www.zlib.net/zlib-${ZLIB_RELEASE}.tar.gz"
-ZLIB_SHA256="c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1"
-# The name in the .pc and the .dll generated don't match in zlib for Windows
-# because they use different .dll names in Windows. We avoid that by defining
-# UNIX=1. We also install all the .dll files to ${prefix}/lib instead of the
-# default ${prefix}/bin.
-ZLIB_FLAGS='-DUNIX=1 -DINSTALL_PKGCONFIG_DIR=/${CMAKE_INSTALL_PREFIX}/lib/pkgconfig -DINSTALL_BIN_DIR=/${CMAKE_INSTALL_PREFIX}/lib'
-
-# libpng
-LIBPNG_RELEASE="1.6.37"
-LIBPNG_URL="https://github.com/glennrp/libpng/archive/v${LIBPNG_RELEASE}.tar.gz"
-LIBPNG_SHA256="ca74a0dace179a8422187671aee97dd3892b53e168627145271cad5b5ac81307"
-
-# giflib
-GIFLIB_RELEASE="5.2.1"
-GIFLIB_URL="https://netcologne.dl.sourceforge.net/project/giflib/giflib-${GIFLIB_RELEASE}.tar.gz"
-GIFLIB_SHA256="31da5562f44c5f15d63340a09a4fd62b48c45620cd302f77a6d9acf0077879bd"
-
-# A patch needed to compile GIFLIB in mingw.
-GIFLIB_PATCH_URL="https://github.com/msys2/MINGW-packages/raw/3afde38fcee7b3ba2cafd97d76cca8f06934504f/mingw-w64-giflib/001-mingw-build.patch"
-GIFLIB_PATCH_SHA256="2b2262ddea87fc07be82e10aeb39eb699239f883c899aa18a16e4d4e40af8ec8"
-
-# webp
-WEBP_RELEASE="1.0.2"
-WEBP_URL="https://codeload.github.com/webmproject/libwebp/tar.gz/v${WEBP_RELEASE}"
-WEBP_SHA256="347cf85ddc3497832b5fa9eee62164a37b249c83adae0ba583093e039bf4881f"
-
-# Google benchmark
-BENCHMARK_RELEASE="1.5.2"
-BENCHMARK_URL="https://github.com/google/benchmark/archive/v${BENCHMARK_RELEASE}.tar.gz"
-BENCHMARK_SHA256="dccbdab796baa1043f04982147e67bb6e118fe610da2c65f88912d73987e700c"
-BENCHMARK_FLAGS="-DGOOGLETEST_PATH=${MYDIR}/../../third_party/googletest"
-# attribute(format(__MINGW_PRINTF_FORMAT, ...)) doesn't work in our
-# environment, so we disable the warning.
-BENCHMARK_FLAGS="-DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_TESTING=OFF \
-  -DCMAKE_CXX_FLAGS=-Wno-ignored-attributes \
-  -DCMAKE_POSITION_INDEPENDENT_CODE=ON"
-
-# V8
-V8_VERSION="9.3.22"
-
-# Temporary files cleanup hooks.
-CLEANUP_FILES=()
-cleanup() {
-  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
-    rm -fr "${CLEANUP_FILES[@]}"
-  fi
-}
-trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
-
-# List of Ubuntu arch names supported by the builder (such as "i386").
-LIST_ARCHS=(
-  amd64
-  i386
-  arm64
-  armhf
-)
-
-# List of target triplets supported by the builder.
-LIST_TARGETS=(
-  x86_64-linux-gnu
-  i686-linux-gnu
-  arm-linux-gnueabihf
-  aarch64-linux-gnu
-)
-LIST_MINGW_TARGETS=(
-  i686-w64-mingw32
-  x86_64-w64-mingw32
-)
-LIST_WASM_TARGETS=(
-  wasm32
-)
-
-# Setup the apt repositories and supported architectures.
-setup_apt() {
-  apt-get update -y
-  apt-get install -y curl gnupg ca-certificates
-
-  apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 1E9377A2BA9EF27F
-
-  # node sources.
-  cat >/etc/apt/sources.list.d/nodesource.list <<EOF
-  deb https://deb.nodesource.com/node_14.x bionic main
-  deb-src https://deb.nodesource.com/node_14.x bionic main
-EOF
-  curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add -
-
-  local port_list=()
-  local main_list=()
-  local ubarch
-  for ubarch in "${LIST_ARCHS[@]}"; do
-    if [[ "${ubarch}" != "amd64" && "${ubarch}" != "i386" ]]; then
-      # other archs are not part of the main mirrors, but available in
-      # ports.ubuntu.com.
-      port_list+=("${ubarch}")
-    else
-      main_list+=("${ubarch}")
-    fi
-    # Add the arch to the system.
-    if [[ "${ubarch}" != "amd64" ]]; then
-      dpkg --add-architecture "${ubarch}"
-    fi
-  done
-
-  # Update the sources.list with the split of supported architectures.
-  local bkplist="/etc/apt/sources.list.bkp"
-  [[ -e "${bkplist}" ]] || \
-    mv /etc/apt/sources.list "${bkplist}"
-
-  local newlist="/etc/apt/sources.list.tmp"
-  rm -f "${newlist}"
-  port_list=$(echo "${port_list[@]}" | tr ' ' ,)
-  if [[ -n "${port_list}" ]]; then
-    local port_url="http://ports.ubuntu.com/ubuntu-ports/"
-    grep -v -E '^#' "${bkplist}" |
-      sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${port_list}] ${port_url} \\2;" \
-      >>"${newlist}"
-  fi
-
-  main_list=$(echo "${main_list[@]}" | tr ' ' ,)
-  grep -v -E '^#' "${bkplist}" |
-    sed -E "s;^deb (http[^ ]+) (.*)\$;deb [arch=${main_list}] \\1 \\2\ndeb-src [arch=${main_list}] \\1 \\2;" \
-    >>"${newlist}"
-  mv "${newlist}" /etc/apt/sources.list
-}
-
-install_pkgs() {
-  packages=(
-    # Native compilers (minimum for SIMD is clang-7)
-    clang-7 clang-format-7 clang-tidy-7
-
-    # TODO: Consider adding clang-8 to every builder:
-    #   clang-8 clang-format-8 clang-tidy-8
-
-    # For cross-compiling to Windows with mingw.
-    mingw-w64
-    wine64
-    wine-binfmt
-
-    # Native tools.
-    bsdmainutils
-    cmake
-    extra-cmake-modules
-    git
-    llvm
-    nasm
-    ninja-build
-    parallel
-    pkg-config
-
-    # For compiling / testing JNI wrapper. JDK8 is almost 2x smaller than JDK11
-    # openjdk-8-jdk-headless would be 50MB smaller, unfortunately, CMake
-    # does mistakenly thinks it does not contain JNI feature.
-    openjdk-8-jdk
-
-    # These are used by the ./ci.sh lint in the native builder.
-    clang-format-7
-    clang-format-8
-
-    # For coverage builds
-    gcovr
-
-    # For compiling giflib documentation.
-    xmlto
-
-    # Common libraries.
-    libstdc++-8-dev
-
-    # We don't use tcmalloc on archs other than amd64. This installs
-    # libgoogle-perftools4:amd64.
-    google-perftools
-
-    # NodeJS for running WASM tests
-    nodejs
-
-    # To generate API documentation.
-    doxygen
-
-    # Freezes version that builds (passes tests). Newer version
-    # (2.30-21ubuntu1~18.04.4) claims to fix "On Intel Skylake
-    # (-march=native) generated avx512 instruction can be wrong",
-    # but newly added tests does not pass. Perhaps the problem is
-    # that mingw package is not updated.
-    binutils-source=2.30-15ubuntu1
-  )
-
-  # Install packages that are arch-dependent.
-  local ubarch
-  for ubarch in "${LIST_ARCHS[@]}"; do
-    packages+=(
-      # Library dependencies. These normally depend on the target architecture
-      # we are compiling for and can't usually be installed for multiple
-      # architectures at the same time.
-      libgif7:"${ubarch}"
-      libjpeg-dev:"${ubarch}"
-      libpng-dev:"${ubarch}"
-      libqt5x11extras5-dev:"${ubarch}"
-
-      libstdc++-8-dev:"${ubarch}"
-      qtbase5-dev:"${ubarch}"
-
-      # For OpenEXR:
-      libilmbase12:"${ubarch}"
-      libopenexr22:"${ubarch}"
-
-      # TCMalloc dependency
-      libunwind-dev:"${ubarch}"
-
-      # Cross-compiling tools per arch.
-      libc6-dev-"${ubarch}"-cross
-      libstdc++-8-dev-"${ubarch}"-cross
-    )
-  done
-
-  local target
-  for target in "${LIST_TARGETS[@]}"; do
-    # Per target cross-compiling tools.
-    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
-      packages+=(
-        binutils-"${target}"
-        gcc-"${target}"
-      )
-    fi
-  done
-
-  # Install all the manual packages via "apt install" for the main arch. These
-  # will be installed for other archs via manual download and unpack.
-  apt install -y "${packages[@]}" "${UNPACK_PKGS[@]}"
-}
-
-# binutils <2.32 need a patch.
-install_binutils() {
-  local workdir=$(mktemp -d --suffix=_install)
-  CLEANUP_FILES+=("${workdir}")
-  pushd "${workdir}"
-  apt source binutils-mingw-w64
-  apt -y build-dep binutils-mingw-w64
-  cd binutils-mingw-w64-8ubuntu1
-  cp "${MYDIR}/binutils_align_fix.patch" debian/patches
-  echo binutils_align_fix.patch >> debian/patches/series
-  dpkg-buildpackage -b
-  cd ..
-  dpkg -i *deb
-  popd
-}
-
-# Install a library from the source code for multiple targets.
-# Usage: install_from_source <tar_url> <sha256> <target> [<target...>]
-install_from_source() {
-  local package="$1"
-  shift
-
-  local url
-  eval "url=\${${package}_URL}"
-  local sha256
-  eval "sha256=\${${package}_SHA256}"
-  # Optional package flags
-  local pkgflags
-  eval "pkgflags=\${${package}_FLAGS:-}"
-
-  local workdir=$(mktemp -d --suffix=_install)
-  CLEANUP_FILES+=("${workdir}")
-
-  local tarfile="${workdir}"/$(basename "${url}")
-  curl -L --output "${tarfile}" "${url}"
-  if ! echo "${sha256} ${tarfile}" | sha256sum -c --status -; then
-    echo "SHA256 mismatch for ${url}: expected ${sha256} but found:"
-    sha256sum "${tarfile}"
-    exit 1
-  fi
-
-  local target
-  for target in "$@"; do
-    echo "Installing ${package} for target ${target} from ${url}"
-
-    local srcdir="${workdir}/source-${target}"
-    mkdir -p "${srcdir}"
-    tar -zxf "${tarfile}" -C "${srcdir}" --strip-components=1
-
-    local prefix="/usr"
-    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
-      prefix="/usr/${target}"
-    fi
-
-    # Apply patches to buildfiles.
-    if [[ "${package}" == "GIFLIB" && "${target}" == *mingw32 ]]; then
-      # GIFLIB Makefile has several problems so we need to fix them here. We are
-      # using a patch from MSYS2 that already fixes the compilation for mingw.
-      local make_patch="${srcdir}/libgif.patch"
-      curl -L "${GIFLIB_PATCH_URL}" -o "${make_patch}"
-      echo "${GIFLIB_PATCH_SHA256} ${make_patch}" | sha256sum -c --status -
-      patch "${srcdir}/Makefile" < "${make_patch}"
-    elif [[ "${package}" == "LIBPNG" && "${target}" == wasm* ]]; then
-      # Cut the dependency to libm; there is pull request to fix it, so this
-      # might not be needed in the future.
-      sed -i 's/APPLE/EMSCRIPTEN/g' "${srcdir}/CMakeLists.txt"
-    fi
-
-    local cmake_args=()
-    local export_args=("CC=clang-7" "CXX=clang++-7")
-    local cmake="cmake"
-    local make="make"
-    local system_name="Linux"
-    if [[ "${target}" == *mingw32 ]]; then
-      system_name="Windows"
-      # When compiling with clang, CMake doesn't detect that we are using mingw.
-      cmake_args+=(
-        -DMINGW=1
-        # Googletest needs this when cross-compiling to windows
-        -DCMAKE_CROSSCOMPILING=1
-        -DHAVE_STD_REGEX=0
-        -DHAVE_POSIX_REGEX=0
-        -DHAVE_GNU_POSIX_REGEX=0
-      )
-      local windres=$(which ${target}-windres || true)
-      if [[ -n "${windres}" ]]; then
-        cmake_args+=(-DCMAKE_RC_COMPILER="${windres}")
-      fi
-    fi
-    if [[ "${target}" == wasm* ]]; then
-      system_name="WASM"
-      cmake="emcmake cmake"
-      make="emmake make"
-      export_args=()
-      cmake_args+=(
-        -DCMAKE_FIND_ROOT_PATH="${prefix}"
-        -DCMAKE_PREFIX_PATH="${prefix}"
-      )
-      # Static and shared library link to the same file -> race condition.
-      nproc=1
-    else
-      nproc=`nproc --all`
-    fi
-    cmake_args+=(-DCMAKE_SYSTEM_NAME="${system_name}")
-
-    if [[ "${target}" != "x86_64-linux-gnu" ]]; then
-      # Cross-compiling.
-      cmake_args+=(
-        -DCMAKE_C_COMPILER_TARGET="${target}"
-        -DCMAKE_CXX_COMPILER_TARGET="${target}"
-        -DCMAKE_SYSTEM_PROCESSOR="${target%%-*}"
-      )
-    fi
-
-    if [[ -e "${srcdir}/CMakeLists.txt" ]]; then
-      # Most packages use cmake for building which is easier to configure for
-      # cross-compiling.
-      if [[ "${package}" == "JPEG_TURBO" && "${target}" == wasm* ]]; then
-        # JT erroneously detects WASM CPU as i386 and tries to use asm.
-        # Wasm/Emscripten support for dynamic linking is incomplete; disable
-        # to avoid CMake warning.
-        cmake_args+=(-DWITH_SIMD=0 -DENABLE_SHARED=OFF)
-      fi
-      (
-        cd "${srcdir}"
-        export ${export_args[@]}
-        ${cmake} \
-          -DCMAKE_INSTALL_PREFIX="${prefix}" \
-          "${cmake_args[@]}" ${pkgflags}
-        ${make} -j${nproc}
-        ${make} install
-      )
-    elif [[ "${package}" == "GIFLIB" ]]; then
-      # GIFLIB doesn't yet have a cmake build system. There is a pull
-      # request in giflib for adding CMakeLists.txt so this might not be
-      # needed in the future.
-      (
-        cd "${srcdir}"
-        local giflib_make_flags=(
-          CFLAGS="-O2 --target=${target} -std=gnu99"
-          PREFIX="${prefix}"
-        )
-        if [[ "${target}" != wasm* ]]; then
-          giflib_make_flags+=(CC=clang-7)
-        fi
-        # giflib make dependencies are not properly set up so parallel building
-        # doesn't work for everything.
-        ${make} -j${nproc} libgif.a "${giflib_make_flags[@]}"
-        ${make} -j${nproc} all "${giflib_make_flags[@]}"
-        ${make} install "${giflib_make_flags[@]}"
-      )
-    else
-      echo "Don't know how to install ${package}"
-      exit 1
-    fi
-
-    # CMake mistakenly uses ".so" libraries and EMCC fails to link properly.
-    if [[ "${target}" == wasm* ]]; then
-      rm -f "${prefix}/lib"/*.so*
-    fi
-  done
-}
-
-# Packages that are manually unpacked for each architecture.
-UNPACK_PKGS=(
-  libgif-dev
-  libclang-common-7-dev
-
-  # For OpenEXR:
-  libilmbase-dev
-  libopenexr-dev
-
-  # TCMalloc
-  libgoogle-perftools-dev
-  libtcmalloc-minimal4
-  libgoogle-perftools4
-)
-
-# Main script entry point.
-main() {
-  cd "${MYDIR}"
-
-  # Configure the repositories with the sources for multi-arch cross
-  # compilation.
-  setup_apt
-  apt-get update -y
-  apt-get dist-upgrade -y
-
-  install_pkgs
-  install_binutils
-  apt clean
-
-  # Remove prebuilt Java classes cache.
-  rm /usr/lib/jvm/java-8-openjdk-amd64/jre/lib/amd64/server/classes.jsa
-
-  # Manually extract packages for the target arch that can't install it directly
-  # at the same time as the native ones.
-  local ubarch
-  for ubarch in "${LIST_ARCHS[@]}"; do
-    if [[ "${ubarch}" != "amd64" ]]; then
-      local pkg
-      for pkg in "${UNPACK_PKGS[@]}"; do
-        apt download "${pkg}":"${ubarch}"
-        dpkg -x "${pkg}"_*_"${ubarch}".deb /
-      done
-    fi
-  done
-  # TODO: Add clang from the llvm repos. This is problematic since we are
-  # installing libclang-common-7-dev:"${ubarch}" from the ubuntu ports repos
-  # which is not available in the llvm repos so it might have a different
-  # version than the ubuntu ones.
-
-  # Remove the win32 libgcc version. The gcc-mingw-w64-x86-64 (and i686)
-  # packages install two libgcc versions:
-  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-posix
-  #   /usr/lib/gcc/x86_64-w64-mingw32/7.3-win32
-  # (exact libgcc version number depends on the package version).
-  #
-  # Clang will pick the best libgcc, sorting by version, but it doesn't
-  # seem to be a way to specify one or the other one, except by passing
-  # -nostdlib and setting all the include paths from the command line.
-  # To check which one is being used you can run:
-  #   clang++-7 --target=x86_64-w64-mingw32 -v -print-libgcc-file-name
-  # We need to use the "posix" versions for thread support, so here we
-  # just remove the other one.
-  local target
-  for target in "${LIST_MINGW_TARGETS[@]}"; do
-    update-alternatives --set "${target}-gcc" $(which "${target}-gcc-posix")
-    local gcc_win32_path=$("${target}-cpp-win32" -print-libgcc-file-name)
-    rm -rf $(dirname "${gcc_win32_path}")
-  done
-
-  # TODO: Add msan for the target when cross-compiling. This only installs it
-  # for amd64.
-  ./msan_install.sh
-
-  # Build and install qemu user-linux targets.
-  ./qemu_install.sh
-
-  # Install emscripten SDK.
-  ./emsdk_install.sh
-
-  # Setup environment for building WASM libraries from sources.
-  source /opt/emsdk/emsdk_env.sh
-
-  # Install some dependency libraries manually for the different targets.
-
-  install_from_source JPEG_TURBO "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
-  install_from_source ZLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
-  install_from_source LIBPNG "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
-  install_from_source GIFLIB "${LIST_MINGW_TARGETS[@]}" "${LIST_WASM_TARGETS[@]}"
-  # webp in Ubuntu is relatively old so we install it from source for everybody.
-  install_from_source WEBP "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
-
-  install_from_source BENCHMARK "${LIST_TARGETS[@]}" "${LIST_MINGW_TARGETS[@]}"
-
-  # Install v8. v8 has better WASM SIMD support than NodeJS 14 (LTS).
-  # First we need the installer to install v8.
-  npm install jsvu -g
-  # install specific version;
-  HOME=/opt jsvu --os=linux64 "v8@${V8_VERSION}"
-  ln -s "/opt/.jsvu/v8-${V8_VERSION}" "/opt/.jsvu/v8"
-
-  # Cleanup.
-  find /var/lib/apt/lists/ -mindepth 1 -delete
-}
-
-main "$@"
diff --git a/docker/scripts/msan_install.sh b/docker/scripts/msan_install.sh
deleted file mode 100755 (executable)
index 0216f62..0000000
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-set -eu
-
-MYDIR=$(dirname $(realpath "$0"))
-
-# Convenience flag to pass both CMAKE_C_FLAGS and CMAKE_CXX_FLAGS
-CMAKE_FLAGS=${CMAKE_FLAGS:-}
-CMAKE_C_FLAGS=${CMAKE_C_FLAGS:-${CMAKE_FLAGS}}
-CMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS:-${CMAKE_FLAGS}}
-CMAKE_EXE_LINKER_FLAGS=${CMAKE_EXE_LINKER_FLAGS:-}
-
-CLANG_VERSION="${CLANG_VERSION:-}"
-# Detect the clang version suffix and store it in CLANG_VERSION. For example,
-# "6.0" for clang 6 or "7" for clang 7.
-detect_clang_version() {
-  if [[ -n "${CLANG_VERSION}" ]]; then
-    return 0
-  fi
-  local clang_version=$("${CC:-clang}" --version | head -n1)
-  local llvm_tag
-  case "${clang_version}" in
-    "clang version 6."*)
-      CLANG_VERSION="6.0"
-      ;;
-    "clang version 7."*)
-      CLANG_VERSION="7"
-      ;;
-    "clang version 8."*)
-      CLANG_VERSION="8"
-      ;;
-    "clang version 9."*)
-      CLANG_VERSION="9"
-      ;;
-    *)
-      echo "Unknown clang version: ${clang_version}" >&2
-      return 1
-  esac
-}
-
-# Temporary files cleanup hooks.
-CLEANUP_FILES=()
-cleanup() {
-  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
-    rm -fr "${CLEANUP_FILES[@]}"
-  fi
-}
-trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
-
-# Install libc++ libraries compiled with msan in the msan_prefix for the current
-# compiler version.
-cmd_msan_install() {
-  local tmpdir=$(mktemp -d)
-  CLEANUP_FILES+=("${tmpdir}")
-  # Detect the llvm to install:
-  export CC="${CC:-clang}"
-  export CXX="${CXX:-clang++}"
-  detect_clang_version
-  local llvm_tag
-  case "${CLANG_VERSION}" in
-    "6.0")
-      llvm_tag="llvmorg-6.0.1"
-      ;;
-    "7")
-      llvm_tag="llvmorg-7.0.1"
-      ;;
-    "8")
-      llvm_tag="llvmorg-8.0.0"
-      ;;
-    *)
-      echo "Unknown clang version: ${clang_version}" >&2
-      return 1
-  esac
-  local llvm_targz="${tmpdir}/${llvm_tag}.tar.gz"
-  curl -L --show-error -o "${llvm_targz}" \
-    "https://github.com/llvm/llvm-project/archive/${llvm_tag}.tar.gz"
-  tar -C "${tmpdir}" -zxf "${llvm_targz}"
-  local llvm_root="${tmpdir}/llvm-project-${llvm_tag}"
-
-  local msan_prefix="${HOME}/.msan/${CLANG_VERSION}"
-  rm -rf "${msan_prefix}"
-
-  declare -A CMAKE_EXTRAS
-  CMAKE_EXTRAS[libcxx]="\
-    -DLIBCXX_CXX_ABI=libstdc++ \
-    -DLIBCXX_INSTALL_EXPERIMENTAL_LIBRARY=ON"
-
-  for project in libcxx; do
-    local proj_build="${tmpdir}/build-${project}"
-    local proj_dir="${llvm_root}/${project}"
-    mkdir -p "${proj_build}"
-    cmake -B"${proj_build}" -H"${proj_dir}" \
-      -G Ninja \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DLLVM_USE_SANITIZER=Memory \
-      -DLLVM_PATH="${llvm_root}/llvm" \
-      -DLLVM_CONFIG_PATH="$(which llvm-config llvm-config-7 llvm-config-6.0 | \
-                            head -n1)" \
-      -DCMAKE_CXX_FLAGS="${CMAKE_CXX_FLAGS}" \
-      -DCMAKE_C_FLAGS="${CMAKE_C_FLAGS}" \
-      -DCMAKE_EXE_LINKER_FLAGS="${CMAKE_EXE_LINKER_FLAGS}" \
-      -DCMAKE_INSTALL_PREFIX="${msan_prefix}" \
-      ${CMAKE_EXTRAS[${project}]}
-    cmake --build "${proj_build}"
-    ninja -C "${proj_build}" install
-  done
-}
-
-main() {
-  set -x
-  for version in 6.0 7 8; do
-    if ! which "clang-${version}" >/dev/null; then
-      echo "Skipping msan install for clang version ${version}"
-      continue
-    fi
-    (
-     trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
-     export CLANG_VERSION=${version}
-     export CC=clang-${version}
-     export CXX=clang++-${version}
-     cmd_msan_install
-    ) &
-  done
-  wait
-}
-
-main "$@"
diff --git a/docker/scripts/qemu_install.sh b/docker/scripts/qemu_install.sh
deleted file mode 100755 (executable)
index 8106c44..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-QEMU_RELEASE="4.1.0"
-QEMU_URL="https://download.qemu.org/qemu-${QEMU_RELEASE}.tar.xz"
-QEMU_ARCHS=(
-  aarch64
-  arm
-  i386
-  # TODO: Consider adding these:
-  # aarch64_be
-  # mips64el
-  # mips64
-  # mips
-  # ppc64
-  # ppc
-)
-
-# Ubuntu packages not installed that are needed to build qemu.
-QEMU_BUILD_DEPS=(
-  libglib2.0-dev
-  libpixman-1-dev
-  flex
-  bison
-)
-
-set -eu -x
-
-# Temporary files cleanup hooks.
-CLEANUP_FILES=()
-cleanup() {
-  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
-    rm -fr "${CLEANUP_FILES[@]}"
-  fi
-}
-trap "{ set +x; } 2>/dev/null; cleanup" INT TERM EXIT
-
-main() {
-  local workdir=$(mktemp -d --suffix=qemu)
-  CLEANUP_FILES+=("${workdir}")
-
-  apt install -y "${QEMU_BUILD_DEPS[@]}"
-
-  local qemutar="${workdir}/qemu.tar.gz"
-  curl --output "${qemutar}" "${QEMU_URL}"
-  tar -Jxf "${qemutar}" -C "${workdir}"
-  local srcdir="${workdir}/qemu-${QEMU_RELEASE}"
-
-  local builddir="${workdir}/build"
-  local prefixdir="${workdir}/prefix"
-  mkdir -p "${builddir}"
-
-  # List of targets to build.
-  local targets=""
-  local make_targets=()
-  local target
-  for target in "${QEMU_ARCHS[@]}"; do
-    targets="${targets} ${target}-linux-user"
-    # Build just the linux-user targets.
-    make_targets+=("${target}-linux-user/all")
-  done
-
-  cd "${builddir}"
-  "${srcdir}/configure" \
-    --prefix="${prefixdir}" \
-    --static --disable-system --enable-linux-user \
-    --target-list="${targets}"
-
-  make -j $(nproc --all || echo 1) "${make_targets[@]}"
-
-  # Manually install these into the non-standard location. This script runs as
-  # root anyway.
-  for target in "${QEMU_ARCHS[@]}"; do
-    cp "${target}-linux-user/qemu-${target}" "/usr/bin/qemu-${target}-static"
-  done
-
-  apt autoremove -y --purge "${QEMU_BUILD_DEPS[@]}"
-}
-
-main "$@"
index 88dc27c..aee5fd4 100644 (file)
@@ -12,45 +12,16 @@ project(SAMPLE_LIBJXL LANGUAGES C CXX)
 # Use pkg-config to find libjxl.
 find_package(PkgConfig)
 pkg_check_modules(Jxl REQUIRED IMPORTED_TARGET libjxl)
+pkg_check_modules(JxlCms REQUIRED IMPORTED_TARGET libjxl_cms)
 pkg_check_modules(JxlThreads REQUIRED IMPORTED_TARGET libjxl_threads)
 
 # Build the example encoder/decoder binaries using the default shared libraries
 # installed.
 add_executable(decode_oneshot decode_oneshot.cc)
-target_link_libraries(decode_oneshot PkgConfig::Jxl PkgConfig::JxlThreads)
+target_link_libraries(decode_oneshot PkgConfig::Jxl PkgConfig::JxlCms PkgConfig::JxlThreads)
 
 add_executable(decode_progressive decode_progressive.cc)
-target_link_libraries(decode_progressive PkgConfig::Jxl PkgConfig::JxlThreads)
+target_link_libraries(decode_progressive PkgConfig::Jxl PkgConfig::JxlCms PkgConfig::JxlThreads)
 
 add_executable(encode_oneshot encode_oneshot.cc)
-target_link_libraries(encode_oneshot PkgConfig::Jxl PkgConfig::JxlThreads)
-
-
-# Building a static binary with the static libjxl dependencies. How to load
-# static library configs from pkg-config and how to build static binaries
-# depends on the platform, and building static binaries in general has problems.
-# If you don't need static binaries you can remove this section.
-add_library(StaticJxl INTERFACE IMPORTED GLOBAL)
-set_target_properties(StaticJxl PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${Jxl_STATIC_INCLUDE_DIR}"
-    INTERFACE_COMPILE_OPTIONS "${Jxl_STATIC_CFLAGS_OTHER}"
-    INTERFACE_LINK_LIBRARIES "${Jxl_STATIC_LDFLAGS}"
-)
-add_library(StaticJxlThreads INTERFACE IMPORTED GLOBAL)
-set_target_properties(StaticJxlThreads PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${JxlThreads_STATIC_INCLUDE_DIR}"
-    INTERFACE_COMPILE_OPTIONS "${JxlThreads_STATIC_CFLAGS_OTHER}"
-    # libgcc uses weak symbols for pthread which means that -lpthread is not
-    # linked when compiling a static binary. This is a platform-specific fix for
-    # that.
-    INTERFACE_LINK_LIBRARIES
-      "${JxlThreads_STATIC_LDFLAGS} -Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
-)
-
-add_executable(decode_oneshot_static decode_oneshot.cc)
-target_link_libraries(decode_oneshot_static
-  -static StaticJxl StaticJxlThreads)
-
-add_executable(encode_oneshot_static encode_oneshot.cc)
-target_link_libraries(encode_oneshot_static
-  -static StaticJxl StaticJxlThreads)
+target_link_libraries(encode_oneshot PkgConfig::Jxl PkgConfig::JxlCms PkgConfig::JxlThreads)
index adfe5f8..8ec999e 100644 (file)
@@ -7,16 +7,14 @@
 // available at once). The example outputs the pixels and color information to a
 // floating point image and an ICC profile on disk.
 
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
 #include <limits.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <string.h>
 
 #include <vector>
 
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-
 bool DecodeJpegXlExif(const uint8_t* jxl, size_t size,
                       std::vector<uint8_t>* exif) {
   auto dec = JxlDecoderMake(nullptr);
index 932193f..0772095 100644 (file)
@@ -7,7 +7,15 @@
 // available at once). The example outputs the pixels and color information to a
 // floating point image and an ICC profile on disk.
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
 #include <limits.h>
 #include <stdint.h>
 #include <stdio.h>
 
 #include <vector>
 
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/resizable_parallel_runner.h"
-#include "jxl/resizable_parallel_runner_cxx.h"
-
 /** Decodes JPEG XL image to floating point pixels and ICC Profile. Pixel are
  * stored as floating point, as interleaved RGBA (4 floating point values per
  * pixel), line per line from top to bottom.  Pixel values have nominal range
@@ -77,15 +80,14 @@ bool DecodeJpegXlOneShot(const uint8_t* jxl, size_t size,
       // Get the ICC color profile of the pixel data
       size_t icc_size;
       if (JXL_DEC_SUCCESS !=
-          JxlDecoderGetICCProfileSize(
-              dec.get(), &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)) {
+          JxlDecoderGetICCProfileSize(dec.get(), JXL_COLOR_PROFILE_TARGET_DATA,
+                                      &icc_size)) {
         fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
         return false;
       }
       icc_profile->resize(icc_size);
       if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
-                                 dec.get(), &format,
-                                 JXL_COLOR_PROFILE_TARGET_DATA,
+                                 dec.get(), JXL_COLOR_PROFILE_TARGET_DATA,
                                  icc_profile->data(), icc_profile->size())) {
         fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
         return false;
index 77d2a0f..a094cbe 100644 (file)
@@ -6,7 +6,15 @@
 // This C++ example decodes a JPEG XL image progressively (input bytes are
 // passed in chunks). The example outputs the intermediate steps to PAM files.
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
 #include <limits.h>
 #include <stdint.h>
 #include <stdio.h>
 
 #include <vector>
 
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/resizable_parallel_runner.h"
-#include "jxl/resizable_parallel_runner_cxx.h"
-
 bool WritePAM(const char* filename, const uint8_t* buffer, size_t w, size_t h) {
   FILE* fp = fopen(filename, "wb");
   if (!fp) {
@@ -30,7 +33,11 @@ bool WritePAM(const char* filename, const uint8_t* buffer, size_t w, size_t h) {
           "\nDEPTH 4\nMAXVAL 255\nTUPLTYPE "
           "RGB_ALPHA\nENDHDR\n",
           static_cast<uint64_t>(w), static_cast<uint64_t>(h));
-  fwrite(buffer, 1, w * h * 4, fp);
+  size_t num_bytes = w * h * 4;
+  if (fwrite(buffer, 1, num_bytes, fp) != num_bytes) {
+    fclose(fp);
+    return false;
+  };
   if (fclose(fp) != 0) {
     return false;
   }
@@ -121,16 +128,14 @@ bool DecodeJpegXlProgressive(const uint8_t* jxl, size_t size,
       // Get the ICC color profile of the pixel data
       size_t icc_size;
       if (JXL_DEC_SUCCESS !=
-          JxlDecoderGetICCProfileSize(dec.get(), &format,
-                                      JXL_COLOR_PROFILE_TARGET_ORIGINAL,
-                                      &icc_size)) {
+          JxlDecoderGetICCProfileSize(
+              dec.get(), JXL_COLOR_PROFILE_TARGET_ORIGINAL, &icc_size)) {
         fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
         return false;
       }
       icc_profile.resize(icc_size);
       if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
-                                 dec.get(), &format,
-                                 JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                 dec.get(), JXL_COLOR_PROFILE_TARGET_ORIGINAL,
                                  icc_profile.data(), icc_profile.size())) {
         fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
         return false;
@@ -149,11 +154,9 @@ bool DecodeJpegXlProgressive(const uint8_t* jxl, size_t size,
         return false;
       }
       pixels.resize(xsize * ysize * 4);
-      void* pixels_buffer = (void*)pixels.data();
-      size_t pixels_buffer_size = pixels.size() * sizeof(float);
       if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(dec.get(), &format,
-                                                         pixels_buffer,
-                                                         pixels_buffer_size)) {
+                                                         pixels.data(),
+                                                         pixels.size())) {
         fprintf(stderr, "JxlDecoderSetImageOutBuffer failed\n");
         return false;
       }
index f1cd9ab..a8daf48 100644 (file)
@@ -6,6 +6,10 @@
 // This example encodes a file containing a floating point image to another
 // file containing JPEG XL image with a single frame.
 
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
 #include <limits.h>
 #include <string.h>
 
 #include <string>
 #include <vector>
 
-#include "jxl/encode.h"
-#include "jxl/encode_cxx.h"
-#include "jxl/thread_parallel_runner.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-
 /**
  * Reads from .pfm file (Portable FloatMap)
  *
@@ -60,6 +59,7 @@ bool ReadPFM(const char* filename, std::vector<float>* pixels, uint32_t* xsize,
 
   size_t readsize = fread(data.data(), 1, size, file);
   if ((long)readsize != size) {
+    fclose(file);
     return false;
   }
   if (fclose(file) != 0) {
@@ -229,6 +229,7 @@ bool WriteFile(const std::vector<uint8_t>& bytes, const char* filename) {
   if (fwrite(bytes.data(), sizeof(uint8_t), bytes.size(), file) !=
       bytes.size()) {
     fprintf(stderr, "Could not write bytes to %s\n", filename);
+    fclose(file);
     return false;
   }
   if (fclose(file) != 0) {
diff --git a/experimental/fast_lossless/fast_lossless.cc b/experimental/fast_lossless/fast_lossless.cc
deleted file mode 100644 (file)
index 9b442aa..0000000
+++ /dev/null
@@ -1,1362 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "fast_lossless.h"
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <algorithm>
-#include <array>
-#include <memory>
-#include <queue>
-#include <vector>
-
-#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
-#error "system not known to be little endian"
-#endif
-
-struct BitWriter {
-  void Allocate(size_t maximum_bit_size) {
-    assert(data == nullptr);
-    // Leave some padding.
-    data.reset((uint8_t*)malloc(maximum_bit_size / 8 + 32));
-  }
-
-  void Write(uint32_t count, uint64_t bits) {
-    buffer |= bits << bits_in_buffer;
-    bits_in_buffer += count;
-    memcpy(data.get() + bytes_written, &buffer, 8);
-    size_t bytes_in_buffer = bits_in_buffer / 8;
-    bits_in_buffer -= bytes_in_buffer * 8;
-    buffer >>= bytes_in_buffer * 8;
-    bytes_written += bytes_in_buffer;
-  }
-
-  void ZeroPadToByte() {
-    if (bits_in_buffer != 0) {
-      Write(8 - bits_in_buffer, 0);
-    }
-  }
-
-  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
-  size_t bytes_written = 0;
-  size_t bits_in_buffer = 0;
-  uint64_t buffer = 0;
-};
-
-constexpr size_t kLZ77Offset = 224;
-constexpr size_t kLZ77MinLength = 16;
-
-struct PrefixCode {
-  static constexpr size_t kNumLZ77 = 17;
-  static constexpr size_t kNumRaw = 15;
-
-  alignas(32) uint8_t raw_nbits[16] = {};
-  alignas(32) uint8_t raw_bits[16] = {};
-  uint8_t lz77_nbits[kNumLZ77] = {};
-
-  uint16_t lz77_bits[kNumLZ77] = {};
-
-  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
-    constexpr uint16_t kNibbleLookup[16] = {
-        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
-        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
-    };
-    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
-                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
-                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
-                     (kNibbleLookup[bits >> 12]);
-    return rev16 >> (16 - nbits);
-  }
-
-  // Create the prefix codes given the code lengths.
-  // Supports the code lengths being split into two halves.
-  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
-                                   uint8_t* first_chunk_bits,
-                                   size_t first_chunk_size,
-                                   const uint8_t* second_chunk_nbits,
-                                   uint16_t* second_chunk_bits,
-                                   size_t second_chunk_size) {
-    uint8_t code_length_counts[16] = {};
-    for (size_t i = 0; i < first_chunk_size; i++) {
-      code_length_counts[first_chunk_nbits[i]]++;
-      assert(first_chunk_nbits[i] <= 7);
-      assert(first_chunk_nbits[i] > 0);
-    }
-    for (size_t i = 0; i < second_chunk_size; i++) {
-      code_length_counts[second_chunk_nbits[i]]++;
-    }
-
-    uint16_t next_code[16] = {};
-
-    uint16_t code = 0;
-    for (size_t i = 1; i < 16; i++) {
-      code = (code + code_length_counts[i - 1]) << 1;
-      next_code[i] = code;
-    }
-
-    for (size_t i = 0; i < first_chunk_size; i++) {
-      first_chunk_bits[i] =
-          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
-    }
-    for (size_t i = 0; i < second_chunk_size; i++) {
-      second_chunk_bits[i] =
-          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
-    }
-  }
-
-  PrefixCode(uint64_t* raw_counts, uint64_t* lz77_counts) {
-    // "merge" together all the lz77 counts in a single symbol for the level 1
-    // table (containing just the raw symbols, up to length 7).
-    uint64_t level1_counts[kNumRaw + 1];
-    memcpy(level1_counts, raw_counts, kNumRaw * sizeof(uint64_t));
-    size_t numraw = kNumRaw;
-    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
-
-    level1_counts[numraw] = 0;
-    for (size_t i = 0; i < kNumLZ77; i++) {
-      level1_counts[numraw] += lz77_counts[i];
-    }
-    uint8_t level1_nbits[kNumRaw + 1] = {};
-    ComputeCodeLengths(level1_counts, numraw + 1, 7, level1_nbits);
-
-    uint8_t level2_nbits[kNumLZ77] = {};
-    ComputeCodeLengths(lz77_counts, kNumLZ77, 15 - level1_nbits[numraw],
-                       level2_nbits);
-    for (size_t i = 0; i < numraw; i++) {
-      raw_nbits[i] = level1_nbits[i];
-    }
-    for (size_t i = 0; i < kNumLZ77; i++) {
-      lz77_nbits[i] =
-          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
-    }
-
-    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
-                         kNumLZ77);
-  }
-
-  static void ComputeCodeLengths(uint64_t* freqs, size_t n, size_t limit,
-                                 uint8_t* nbits) {
-    if (n <= 1) return;
-    assert(n <= (1 << limit));
-    assert(n <= 32);
-    int parent[64] = {};
-    int height[64] = {};
-    using QElem = std::pair<uint64_t, size_t>;
-    std::priority_queue<QElem, std::vector<QElem>, std::greater<QElem>> q;
-    // Standard Huffman code construction. On failure (i.e. if going beyond the
-    // length limit), try again with halved frequencies.
-    while (true) {
-      size_t num_nodes = 0;
-      for (size_t i = 0; i < n; i++) {
-        if (freqs[i] == 0) continue;
-        q.emplace(freqs[i], num_nodes++);
-      }
-      if (num_nodes <= 1) return;
-      while (q.size() > 1) {
-        QElem n1 = q.top();
-        q.pop();
-        QElem n2 = q.top();
-        q.pop();
-        size_t next = num_nodes++;
-        parent[n1.second] = next;
-        parent[n2.second] = next;
-        q.emplace(n1.first + n2.first, next);
-      }
-      assert(q.size() == 1);
-      q.pop();
-      bool is_ok = true;
-      for (size_t i = num_nodes - 1; i-- > 0;) {
-        height[i] = height[parent[i]] + 1;
-        is_ok &= height[i] <= limit;
-      }
-      if (is_ok) {
-        num_nodes = 0;
-        for (size_t i = 0; i < n; i++) {
-          if (freqs[i] == 0) continue;
-          nbits[i] = height[num_nodes++];
-        }
-        break;
-      } else {
-        for (size_t i = 0; i < n; i++) {
-          freqs[i] = (freqs[i] + 1) >> 1;
-        }
-      }
-    }
-  }
-
-  void WriteTo(BitWriter* writer) const {
-    uint64_t code_length_counts[18] = {};
-    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
-    for (size_t i = 0; i < kNumRaw; i++) {
-      code_length_counts[raw_nbits[i]]++;
-    }
-    for (size_t i = 0; i < kNumLZ77; i++) {
-      code_length_counts[lz77_nbits[i]]++;
-    }
-    uint8_t code_length_nbits[18] = {};
-    ComputeCodeLengths(code_length_counts, 18, 5, code_length_nbits);
-    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
-
-    // As per Brotli RFC.
-    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
-                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
-    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
-    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
-
-    // Encode lengths of code lengths.
-    size_t num_code_lengths = 18;
-    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
-      num_code_lengths--;
-    }
-    for (size_t i = 0; i < num_code_lengths; i++) {
-      int symbol = code_length_nbits[code_length_order[i]];
-      writer->Write(code_length_length_nbits[symbol],
-                    code_length_length_bits[symbol]);
-    }
-
-    // Compute the canonical codes for the codes that represent the lengths of
-    // the actual codes for data.
-    uint16_t code_length_bits[18] = {};
-    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
-                         code_length_bits, 18);
-    // Encode raw bit code lengths.
-    for (size_t i = 0; i < kNumRaw; i++) {
-      writer->Write(code_length_nbits[raw_nbits[i]],
-                    code_length_bits[raw_nbits[i]]);
-    }
-    size_t num_lz77 = kNumLZ77;
-    while (lz77_nbits[num_lz77 - 1] == 0) {
-      num_lz77--;
-    }
-    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-15 =
-    // 209.
-    static_assert(kLZ77Offset == 224, "");
-    static_assert(kNumRaw == 15, "");
-    writer->Write(code_length_nbits[17], code_length_bits[17]);
-    writer->Write(3, 0b010);  // 5
-    writer->Write(code_length_nbits[17], code_length_bits[17]);
-    writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
-    writer->Write(code_length_nbits[17], code_length_bits[17]);
-    writer->Write(3, 0b110);  // (27-2)*8 + 9 = 209
-    // Encode LZ77 symbols, with values 224+i*16.
-    for (size_t i = 0; i < num_lz77; i++) {
-      writer->Write(code_length_nbits[lz77_nbits[i]],
-                    code_length_bits[lz77_nbits[i]]);
-      if (i != num_lz77 - 1) {
-        // Encode gap between LZ77 symbols: 15 zeros.
-        writer->Write(code_length_nbits[17], code_length_bits[17]);
-        writer->Write(3, 0b000);  // 3
-        writer->Write(code_length_nbits[17], code_length_bits[17]);
-        writer->Write(3, 0b100);  // (3-2)*8+7 = 15
-      }
-    }
-  }
-};
-
-constexpr size_t kChunkSize = 16;
-
-void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
-                         uint32_t* bits) {
-  uint32_t n = 31 - __builtin_clz(value);
-  *token = value ? n + 1 : 0;
-  *nbits = value ? n : 0;
-  *bits = value ? value - (1 << n) : 0;
-}
-
-void AppendWriter(BitWriter* dest, const BitWriter* src) {
-  if (dest->bits_in_buffer == 0) {
-    memcpy(dest->data.get() + dest->bytes_written, src->data.get(),
-           src->bytes_written);
-    dest->bytes_written += src->bytes_written;
-  } else {
-    size_t i = 0;
-    uint64_t buf = dest->buffer;
-    uint64_t bits_in_buffer = dest->bits_in_buffer;
-    uint8_t* dest_buf = dest->data.get() + dest->bytes_written;
-    // Copy 8 bytes at a time until we reach the border.
-    for (; i + 8 < src->bytes_written; i += 8) {
-      uint64_t chunk;
-      memcpy(&chunk, src->data.get() + i, 8);
-      uint64_t out = buf | (chunk << bits_in_buffer);
-      memcpy(dest_buf + i, &out, 8);
-      buf = chunk >> (64 - bits_in_buffer);
-    }
-    dest->buffer = buf;
-    dest->bytes_written += i;
-    for (; i < src->bytes_written; i++) {
-      dest->Write(8, src->data[i]);
-    }
-  }
-  dest->Write(src->bits_in_buffer, src->buffer);
-}
-
-void AssembleFrame(size_t width, size_t height, size_t nb_chans,
-                   size_t bitdepth,
-                   const std::vector<std::array<BitWriter, 4>>& group_data,
-                   BitWriter* output) {
-  size_t total_size_groups = 0;
-  std::vector<size_t> group_sizes(group_data.size());
-  for (size_t i = 0; i < group_data.size(); i++) {
-    size_t sz = 0;
-    for (size_t j = 0; j < nb_chans; j++) {
-      const auto& writer = group_data[i][j];
-      sz += writer.bytes_written * 8 + writer.bits_in_buffer;
-    }
-    sz = (sz + 7) / 8;
-    group_sizes[i] = sz;
-    total_size_groups += sz * 8;
-  }
-  output->Allocate(1000 + group_data.size() * 32 + total_size_groups);
-
-  // Signature
-  output->Write(16, 0x0AFF);
-
-  // Size header, hand-crafted.
-  // Not small
-  output->Write(1, 0);
-
-  auto wsz = [output](size_t size) {
-    if (size - 1 < (1 << 9)) {
-      output->Write(2, 0b00);
-      output->Write(9, size - 1);
-    } else if (size - 1 < (1 << 13)) {
-      output->Write(2, 0b01);
-      output->Write(13, size - 1);
-    } else if (size - 1 < (1 << 18)) {
-      output->Write(2, 0b10);
-      output->Write(18, size - 1);
-    } else {
-      output->Write(2, 0b11);
-      output->Write(30, size - 1);
-    }
-  };
-
-  wsz(height);
-
-  // No special ratio.
-  output->Write(3, 0);
-
-  wsz(width);
-
-  // Hand-crafted ImageMetadata.
-  output->Write(1, 0);  // all_default
-  output->Write(1, 0);  // extra_fields
-  output->Write(1, 0);  // bit_depth.floating_point_sample
-  if (bitdepth == 8) {
-    output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
-  } else if (bitdepth == 10) {
-    output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
-  } else if (bitdepth == 12) {
-    output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
-  } else {
-    output->Write(2, 0b11);  // 1 + u(6)
-    output->Write(6, bitdepth - 1);
-  }
-  output->Write(1, 1);  // 16-bit-buffer sufficient
-  bool have_alpha = (nb_chans == 2 || nb_chans == 4);
-  if (have_alpha) {
-    output->Write(2, 0b01);  // One extra channel
-    output->Write(1, 1);     // ... all_default (ie. 8-bit alpha)
-  } else {
-    output->Write(2, 0b00);  // No extra channel
-  }
-  output->Write(1, 0);  // Not XYB
-  if (nb_chans > 1) {
-    output->Write(1, 1);  // color_encoding.all_default (sRGB)
-  } else {
-    output->Write(1, 0);     // color_encoding.all_default false
-    output->Write(1, 0);     // color_encoding.want_icc false
-    output->Write(2, 1);     // grayscale
-    output->Write(2, 1);     // D65
-    output->Write(1, 0);     // no gamma transfer function
-    output->Write(2, 0b10);  // tf: 2 + u(4)
-    output->Write(4, 11);    // tf of sRGB
-    output->Write(2, 1);     // relative rendering intent
-  }
-  output->Write(2, 0b00);  // No extensions.
-
-  output->Write(1, 1);  // all_default transform data
-
-  // No ICC, no preview. Frame should start at byte boundery.
-  output->ZeroPadToByte();
-
-  auto wsz_fh = [output](size_t size) {
-    if (size < (1 << 8)) {
-      output->Write(2, 0b00);
-      output->Write(8, size);
-    } else if (size - 256 < (1 << 11)) {
-      output->Write(2, 0b01);
-      output->Write(11, size - 256);
-    } else if (size - 2304 < (1 << 14)) {
-      output->Write(2, 0b10);
-      output->Write(14, size - 2304);
-    } else {
-      output->Write(2, 0b11);
-      output->Write(30, size - 18688);
-    }
-  };
-
-  // Handcrafted frame header.
-  output->Write(1, 0);     // all_default
-  output->Write(2, 0b00);  // regular frame
-  output->Write(1, 1);     // modular
-  output->Write(2, 0b00);  // default flags
-  output->Write(1, 0);     // not YCbCr
-  output->Write(2, 0b00);  // no upsampling
-  if (have_alpha) {
-    output->Write(2, 0b00);  // no alpha upsampling
-  }
-  output->Write(2, 0b01);  // default group size
-  output->Write(2, 0b00);  // exactly one pass
-  if (width % kChunkSize == 0) {
-    output->Write(1, 0);  // no custom size or origin
-  } else {
-    output->Write(1, 1);  // custom size
-    wsz_fh(0);            // x0 = 0
-    wsz_fh(0);            // y0 = 0
-    wsz_fh((width + kChunkSize - 1) / kChunkSize *
-           kChunkSize);  // xsize rounded up to chunk size
-    wsz_fh(height);      // ysize same
-  }
-  output->Write(2, 0b00);  // kReplace blending mode
-  if (have_alpha) {
-    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
-  }
-  output->Write(1, 1);     // is_last
-  output->Write(2, 0b00);  // a frame has no name
-  output->Write(1, 0);     // loop filter is not all_default
-  output->Write(1, 0);     // no gaborish
-  output->Write(2, 0);     // 0 EPF iters
-  output->Write(2, 0b00);  // No LF extensions
-  output->Write(2, 0b00);  // No FH extensions
-
-  output->Write(1, 0);      // No TOC permutation
-  output->ZeroPadToByte();  // TOC is byte-aligned.
-  for (size_t i = 0; i < group_data.size(); i++) {
-    size_t sz = group_sizes[i];
-    if (sz < (1 << 10)) {
-      output->Write(2, 0b00);
-      output->Write(10, sz);
-    } else if (sz - 1024 < (1 << 14)) {
-      output->Write(2, 0b01);
-      output->Write(14, sz - 1024);
-    } else if (sz - 17408 < (1 << 22)) {
-      output->Write(2, 0b10);
-      output->Write(22, sz - 17408);
-    } else {
-      output->Write(2, 0b11);
-      output->Write(30, sz - 4211712);
-    }
-  }
-  output->ZeroPadToByte();  // Groups are byte-aligned.
-
-  for (size_t i = 0; i < group_data.size(); i++) {
-    for (size_t j = 0; j < nb_chans; j++) {
-      AppendWriter(output, &group_data[i][j]);
-    }
-    output->ZeroPadToByte();
-  }
-}
-
-void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
-                           const PrefixCode& code, BitWriter* output) {
-  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
-  // No patches, spline or noise.
-  output->Write(1, 1);  // default DC dequantization factors (?)
-  output->Write(1, 1);  // use global tree / histograms
-  output->Write(1, 0);  // no lz77 for the tree
-
-  output->Write(1, 1);   // simple code for the tree's context map
-  output->Write(2, 0);   // all contexts clustered together
-  output->Write(1, 1);   // use prefix code for tree
-  output->Write(4, 15);  // don't do hybriduint for tree - 2 symbols anyway
-  output->Write(7, 0b0100101);  // Alphabet size is 6: we need 0 and 5 (var16)
-  output->Write(2, 1);          // simple prefix code
-  output->Write(2, 1);          // with two symbols
-  output->Write(3, 0);          // 0
-  output->Write(3, 5);          // 5
-  output->Write(5, 0b00010);    // tree repr: predictor is 5, all else 0
-
-  output->Write(1, 1);     // Enable lz77 for the main bitstream
-  output->Write(2, 0b00);  // lz77 offset 224
-  static_assert(kLZ77Offset == 224, "");
-  output->Write(10, 0b0000011111);  // lz77 min length 16
-  static_assert(kLZ77MinLength == 16, "");
-  output->Write(4, 4);  // 404 hybrid uint config for lz77: 4
-  output->Write(3, 0);  // 0
-  output->Write(3, 4);  // 4
-  output->Write(1, 1);  // simple code for the context map
-  output->Write(2, 1);  // two clusters
-  output->Write(1, 1);  // raw/lz77 length histogram last
-  output->Write(1, 0);  // distance histogram first
-  output->Write(1, 1);  // use prefix codes
-  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
-  output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
-  // Distance alphabet size:
-  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
-  // Symbol + LZ77 alphabet size:
-  output->Write(1, 1);    // > 1
-  output->Write(4, 8);    // <= 512
-  output->Write(8, 255);  // == 512
-
-  // Distance histogram:
-  output->Write(2, 1);  // simple prefix code
-  output->Write(2, 0);  // with one symbol
-  output->Write(1, 1);  // 1
-
-  // Symbol + lz77 histogram:
-  code.WriteTo(output);
-
-  // Group header for global modular image.
-  output->Write(1, 1);  // Global tree
-  output->Write(1, 1);  // All default wp
-}
-
-void PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
-                     size_t nb_chans, size_t bitdepth, const PrefixCode& code,
-                     BitWriter* output) {
-  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
-  if (nb_chans > 2) {
-    output->Write(2, 0b01);     // 1 transform
-    output->Write(2, 0b00);     // RCT
-    output->Write(5, 0b00000);  // Starting from ch 0
-    output->Write(2, 0b00);     // YCoCg
-  } else {
-    output->Write(2, 0b00);  // no transforms
-  }
-  if (!is_single_group) {
-    output->ZeroPadToByte();
-  }
-}
-
-void EncodeHybridUint404_Mul16(uint32_t value, uint32_t* token_div16,
-                               uint32_t* nbits, uint32_t* bits) {
-  // NOTE: token in libjxl is actually << 4.
-  uint32_t n = 31 - __builtin_clz(value);
-  *token_div16 = value < 16 ? 0 : n - 3;
-  *nbits = value < 16 ? 0 : n - 4;
-  *bits = value < 16 ? 0 : (value >> 4) - (1 << *nbits);
-}
-
-#ifdef FASTLL_ENABLE_AVX2_INTRINSICS
-#include <immintrin.h>
-void EncodeChunk(const uint16_t* residuals, const PrefixCode& prefix_code,
-                 BitWriter& output) {
-  static_assert(kChunkSize == 16, "Chunk size must be 16");
-  auto value = _mm256_load_si256((__m256i*)residuals);
-
-  // we know that residuals[i] has at most 12 bits, so we just need 3 nibbles
-  // and don't need to mask the third. However we do need to set the high
-  // byte to 0xFF, which will make table lookups return 0.
-  auto lo_nibble =
-      _mm256_or_si256(_mm256_and_si256(value, _mm256_set1_epi16(0xF)),
-                      _mm256_set1_epi16(0xFF00));
-  auto mi_nibble = _mm256_or_si256(
-      _mm256_and_si256(_mm256_srli_epi16(value, 4), _mm256_set1_epi16(0xF)),
-      _mm256_set1_epi16(0xFF00));
-  auto hi_nibble =
-      _mm256_or_si256(_mm256_srli_epi16(value, 8), _mm256_set1_epi16(0xFF00));
-
-  auto lo_lut = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
-  auto mi_lut = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
-  auto hi_lut = _mm256_broadcastsi128_si256(_mm_setr_epi8(
-      0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
-
-  auto lo_token = _mm256_shuffle_epi8(lo_lut, lo_nibble);
-  auto mi_token = _mm256_shuffle_epi8(mi_lut, mi_nibble);
-  auto hi_token = _mm256_shuffle_epi8(hi_lut, hi_nibble);
-
-  auto token = _mm256_max_epi16(lo_token, _mm256_max_epi16(mi_token, hi_token));
-  auto nbits = _mm256_subs_epu16(token, _mm256_set1_epi16(1));
-
-  // Compute 1<<nbits.
-  auto pow2_lo_lut = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
-                    1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
-  auto pow2_hi_lut = _mm256_broadcastsi128_si256(
-      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
-                    1 << 4, 1 << 5, 1 << 6, 1u << 7));
-
-  auto nbits_masked = _mm256_or_si256(nbits, _mm256_set1_epi16(0xFF00));
-
-  auto nbits_pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, nbits_masked);
-  auto nbits_pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, nbits_masked);
-
-  auto nbits_pow2 =
-      _mm256_or_si256(_mm256_slli_epi16(nbits_pow2_hi, 8), nbits_pow2_lo);
-
-  auto bits = _mm256_subs_epu16(value, nbits_pow2);
-
-  auto token_masked = _mm256_or_si256(token, _mm256_set1_epi16(0xFF00));
-
-  // huff_nbits <= 6.
-  auto huff_nbits =
-      _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(
-                              _mm_load_si128((__m128i*)prefix_code.raw_nbits)),
-                          token_masked);
-
-  auto huff_bits =
-      _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(
-                              _mm_load_si128((__m128i*)prefix_code.raw_bits)),
-                          token_masked);
-
-  auto huff_nbits_masked =
-      _mm256_or_si256(huff_nbits, _mm256_set1_epi16(0xFF00));
-
-  auto bits_shifted = _mm256_mullo_epi16(
-      bits, _mm256_shuffle_epi8(pow2_lo_lut, huff_nbits_masked));
-
-  nbits = _mm256_add_epi16(nbits, huff_nbits);
-  bits = _mm256_or_si256(bits_shifted, huff_bits);
-
-  // Merge nbits and bits from 16-bit to 32-bit lanes.
-  auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
-  auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
-  auto bits_hi16 = _mm256_srli_epi32(bits, 16);
-  auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));
-
-  nbits = _mm256_add_epi32(nbits_hi16, nbits_lo16);
-  bits = _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
-
-  // Merge 32 -> 64 bit lanes.
-  auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
-  auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
-  auto bits_hi32 = _mm256_srli_epi64(bits, 32);
-  auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));
-
-  nbits = _mm256_add_epi64(nbits_hi32, nbits_lo32);
-  bits = _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
-
-  alignas(32) uint64_t nbits_simd[4] = {};
-  alignas(32) uint64_t bits_simd[4] = {};
-
-  _mm256_store_si256((__m256i*)nbits_simd, nbits);
-  _mm256_store_si256((__m256i*)bits_simd, bits);
-
-  // Manually merge the buffer bits with the SIMD bits.
-  // Necessary because Write() is only guaranteed to work with <=56 bits.
-  // Trying to SIMD-fy this code results in slower speed (and definitely less
-  // clarity).
-  {
-    for (size_t i = 0; i < 4; i++) {
-      output.buffer |= bits_simd[i] << output.bits_in_buffer;
-      memcpy(output.data.get() + output.bytes_written, &output.buffer, 8);
-      // If >> 64, next_buffer is unused.
-      uint64_t next_buffer = bits_simd[i] >> (64 - output.bits_in_buffer);
-      output.bits_in_buffer += nbits_simd[i];
-      // This `if` seems to be faster than using ternaries.
-      if (output.bits_in_buffer >= 64) {
-        output.buffer = next_buffer;
-        output.bits_in_buffer -= 64;
-        output.bytes_written += 8;
-      }
-    }
-    memcpy(output.data.get() + output.bytes_written, &output.buffer, 8);
-    size_t bytes_in_buffer = output.bits_in_buffer / 8;
-    output.bits_in_buffer -= bytes_in_buffer * 8;
-    output.buffer >>= bytes_in_buffer * 8;
-    output.bytes_written += bytes_in_buffer;
-  }
-}
-#endif
-
-#ifdef FASTLL_ENABLE_NEON_INTRINSICS
-#include <arm_neon.h>
-
-void EncodeChunk(const uint16_t* residuals, const PrefixCode& code,
-                 BitWriter& output) {
-  uint16x8_t res = vld1q_u16(residuals);
-  uint16x8_t token = vsubq_u16(vdupq_n_u16(16), vclzq_u16(res));
-  uint16x8_t nbits = vqsubq_u16(token, vdupq_n_u16(1));
-  uint16x8_t bits = vqsubq_u16(res, vshlq_s16(vdupq_n_s16(1), nbits));
-  uint16x8_t huff_bits =
-      vandq_u16(vdupq_n_u16(0xFF), vqtbl1q_u8(vld1q_u8(code.raw_bits), token));
-  uint16x8_t huff_nbits =
-      vandq_u16(vdupq_n_u16(0xFF), vqtbl1q_u8(vld1q_u8(code.raw_nbits), token));
-  bits = vorrq_u16(vshlq_u16(bits, huff_nbits), huff_bits);
-  nbits = vaddq_u16(nbits, huff_nbits);
-
-  // Merge nbits and bits from 16-bit to 32-bit lanes.
-  uint32x4_t nbits_lo16 = vandq_u32(nbits, vdupq_n_u32(0xFFFF));
-  uint32x4_t bits_hi16 = vshlq_u32(vshrq_n_u32(bits, 16), nbits_lo16);
-  uint32x4_t bits_lo16 = vandq_u32(bits, vdupq_n_u32(0xFFFF));
-
-  uint32x4_t nbits32 = vsraq_n_u32(nbits_lo16, nbits, 16);
-  uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
-
-  // Merging up to 64 bits is not faster.
-
-  // Manually merge the buffer bits with the SIMD bits.
-  // A bit faster.
-  for (size_t i = 0; i < 4; i++) {
-    output.buffer |= bits32[i] << output.bits_in_buffer;
-    memcpy(output.data.get() + output.bytes_written, &output.buffer, 8);
-    output.bits_in_buffer += nbits32[i];
-    size_t bytes_in_buffer = output.bits_in_buffer / 8;
-    output.bits_in_buffer -= bytes_in_buffer * 8;
-    output.buffer >>= bytes_in_buffer * 8;
-    output.bytes_written += bytes_in_buffer;
-  }
-}
-#endif
-
-template <size_t bytedepth>
-struct ChunkEncoder {
-  static void EncodeRle(size_t count, const PrefixCode& code,
-                        BitWriter& output) {
-    if (count == 0) return;
-    count -= kLZ77MinLength;
-    unsigned token_div16, nbits, bits;
-    EncodeHybridUint404_Mul16(count, &token_div16, &nbits, &bits);
-    output.Write(
-        code.lz77_nbits[token_div16] + nbits,
-        (bits << code.lz77_nbits[token_div16]) | code.lz77_bits[token_div16]);
-  }
-
-  inline void Chunk(size_t run, uint16_t* residuals) {
-    EncodeRle(run, *code, *output);
-#if defined(FASTLL_ENABLE_AVX2_INTRINSICS) && FASTLL_ENABLE_AVX2_INTRINSICS
-    if (bytedepth == 1) {
-      EncodeChunk(residuals, *code, *output);
-      return;
-    }
-#elif defined(FASTLL_ENABLE_NEON_INTRINSICS) && FASTLL_ENABLE_NEON_INTRINSICS
-    if (bytedepth == 1) {
-      EncodeChunk(residuals, *code, *output);
-      if (kChunkSize > 8) {
-        EncodeChunk(residuals + 8, *code, *output);
-      }
-      return;
-    }
-#endif
-    for (size_t ix = 0; ix < kChunkSize; ix++) {
-      unsigned token, nbits, bits;
-      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
-      output->Write(code->raw_nbits[token] + nbits,
-                    code->raw_bits[token] | bits << code->raw_nbits[token]);
-    }
-  }
-
-  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
-
-  const PrefixCode* code;
-  BitWriter* output;
-};
-
-struct ChunkSampleCollector {
-  void Rle(size_t count, uint64_t* lz77_counts) {
-    if (count == 0) return;
-    count -= kLZ77MinLength;
-    unsigned token_div16, nbits, bits;
-    EncodeHybridUint404_Mul16(count, &token_div16, &nbits, &bits);
-    lz77_counts[token_div16]++;
-  }
-
-  inline void Chunk(size_t run, uint16_t* residuals) {
-    // Run is broken. Encode the run and encode the individual vector.
-    Rle(run, lz77_counts);
-    for (size_t ix = 0; ix < kChunkSize; ix++) {
-      unsigned token, nbits, bits;
-      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
-      raw_counts[token]++;
-    }
-  }
-
-  // don't count final run since we don't know how long it really is
-  void Finalize(size_t run) {}
-
-  uint64_t* raw_counts;
-  uint64_t* lz77_counts;
-};
-
-constexpr uint16_t PackSigned(int16_t value) {
-  return (static_cast<uint16_t>(value) << 1) ^
-         ((static_cast<uint16_t>(~value) >> 15) - 1);
-}
-
-template <typename T>
-struct ChannelRowProcessor {
-  T* t;
-  inline void ProcessChunk(const int16_t* row, const int16_t* row_left,
-                           const int16_t* row_top, const int16_t* row_topleft) {
-    bool continue_rle = true;
-    alignas(32) uint16_t residuals[kChunkSize] = {};
-    for (size_t ix = 0; ix < kChunkSize; ix++) {
-      int16_t px = row[ix];
-      int16_t left = row_left[ix];
-      int16_t top = row_top[ix];
-      int16_t topleft = row_topleft[ix];
-      int16_t ac = left - topleft;
-      int16_t ab = left - top;
-      int16_t bc = top - topleft;
-      int16_t grad = static_cast<int16_t>(static_cast<uint16_t>(ac) +
-                                          static_cast<uint16_t>(top));
-      int16_t d = ab ^ bc;
-      int16_t clamp = d < 0 ? top : left;
-      int16_t s = ac ^ bc;
-      int16_t pred = s < 0 ? grad : clamp;
-      residuals[ix] = PackSigned(px - pred);
-      continue_rle &= residuals[ix] == last;
-    }
-    // Run continues, nothing to do.
-    if (continue_rle) {
-      run += kChunkSize;
-    } else {
-      // Run is broken. Encode the run and encode the individual vector.
-      t->Chunk(run, residuals);
-      run = 0;
-    }
-    last = residuals[kChunkSize - 1];
-  }
-  void ProcessRow(const int16_t* row, const int16_t* row_left,
-                  const int16_t* row_top, const int16_t* row_topleft,
-                  size_t xs) {
-    for (size_t x = 0; x + kChunkSize <= xs; x += kChunkSize) {
-      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x);
-    }
-  }
-
-  void Finalize() { t->Finalize(run); }
-  size_t run = 0;
-  uint16_t last = 0xFFFF;  // Can never appear
-};
-
-template <typename Processor, size_t nb_chans, size_t bytedepth>
-void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
-                      size_t oxs, size_t xs, size_t yskip, size_t ys,
-                      size_t row_stride, Processor* processors) {
-  constexpr size_t kPadding = 16;
-
-  int16_t group_data[nb_chans][2][256 + kPadding * 2] = {};
-  int16_t allzero[nb_chans] = {};
-  int16_t allone[nb_chans];
-  auto get_pixel = [&](size_t x, size_t y, size_t channel) {
-    int16_t p = rgba[row_stride * (y0 + y) + (x0 + x) * nb_chans * bytedepth +
-                     channel * bytedepth];
-    if (bytedepth == 2) {
-      p <<= 8;
-      p |= rgba[row_stride * (y0 + y) + (x0 + x) * nb_chans * 2 + channel * 2 +
-                1];
-    }
-    return p;
-  };
-
-  for (size_t i = 0; i < nb_chans; i++) allone[i] = 0xffff;
-  for (size_t y = 0; y < ys; y++) {
-    // Pre-fill rows with YCoCg converted pixels.
-    for (size_t x = 0; x < oxs; x++) {
-      if (nb_chans < 3) {
-        int16_t luma = get_pixel(x, y, 0);
-        group_data[0][y & 1][x + kPadding] = luma;
-        if (nb_chans == 2) {
-          int16_t a = get_pixel(x, y, 1);
-          group_data[1][y & 1][x + kPadding] = a;
-        }
-      } else {
-        int16_t r = get_pixel(x, y, 0);
-        int16_t g = get_pixel(x, y, 1);
-        int16_t b = get_pixel(x, y, 2);
-        if (nb_chans == 4) {
-          int16_t a = get_pixel(x, y, 3);
-          group_data[3][y & 1][x + kPadding] = a;
-          group_data[1][y & 1][x + kPadding] = a ? r - b : 0;
-          int16_t tmp = b + (group_data[1][y & 1][x + kPadding] >> 1);
-          group_data[2][y & 1][x + kPadding] = a ? g - tmp : 0;
-          group_data[0][y & 1][x + kPadding] =
-              a ? tmp + (group_data[2][y & 1][x + kPadding] >> 1) : 0;
-        } else {
-          group_data[1][y & 1][x + kPadding] = r - b;
-          int16_t tmp = b + (group_data[1][y & 1][x + kPadding] >> 1);
-          group_data[2][y & 1][x + kPadding] = g - tmp;
-          group_data[0][y & 1][x + kPadding] =
-              tmp + (group_data[2][y & 1][x + kPadding] >> 1);
-        }
-      }
-      for (size_t c = 0; c < nb_chans; c++) {
-        allzero[c] |= group_data[c][y & 1][x + kPadding];
-        allone[c] &= group_data[c][y & 1][x + kPadding];
-      }
-    }
-    // Deal with x == 0.
-    for (size_t c = 0; c < nb_chans; c++) {
-      group_data[c][y & 1][kPadding - 1] =
-          y > 0 ? group_data[c][(y - 1) & 1][kPadding] : 0;
-      // Fix topleft.
-      group_data[c][(y - 1) & 1][kPadding - 1] =
-          y > 0 ? group_data[c][(y - 1) & 1][kPadding] : 0;
-    }
-    // Fill in padding.
-    for (size_t c = 0; c < nb_chans; c++) {
-      for (size_t x = oxs; x < xs; x++) {
-        group_data[c][y & 1][kPadding + x] =
-            group_data[c][y & 1][kPadding + oxs - 1];
-      }
-    }
-    if (y < yskip) continue;
-    for (size_t c = 0; c < nb_chans; c++) {
-      if (y > 0 && (allzero[c] == 0 || (allone[c] == 0xff && bytedepth == 1))) {
-        processors[c].run += xs;
-        continue;
-      }
-
-      // Get pointers to px/left/top/topleft data to speedup loop.
-      const int16_t* row = &group_data[c][y & 1][kPadding];
-      const int16_t* row_left = &group_data[c][y & 1][kPadding - 1];
-      const int16_t* row_top =
-          y == 0 ? row_left : &group_data[c][(y - 1) & 1][kPadding];
-      const int16_t* row_topleft =
-          y == 0 ? row_left : &group_data[c][(y - 1) & 1][kPadding - 1];
-
-      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
-    }
-  }
-  for (size_t c = 0; c < nb_chans; c++) {
-    processors[c].Finalize();
-  }
-}
-
-template <size_t nb_chans, size_t bytedepth>
-void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t oxs,
-                    size_t ys, size_t row_stride, bool is_single_group,
-                    const PrefixCode& code, std::array<BitWriter, 4>& output) {
-  size_t xs = (oxs + kChunkSize - 1) / kChunkSize * kChunkSize;
-  for (size_t i = 0; i < nb_chans; i++) {
-    if (is_single_group && i == 0) continue;
-    output[i].Allocate(16 * xs * ys * bytedepth + 4);
-  }
-  if (!is_single_group) {
-    // Group header for modular image.
-    // When the image is single-group, the global modular image is the one that
-    // contains the pixel data, and there is no group header.
-    output[0].Write(1, 1);     // Global tree
-    output[0].Write(1, 1);     // All default wp
-    output[0].Write(2, 0b00);  // 0 transforms
-  }
-
-  ChunkEncoder<bytedepth> encoders[nb_chans];
-  ChannelRowProcessor<ChunkEncoder<bytedepth>> row_encoders[nb_chans];
-  for (size_t c = 0; c < nb_chans; c++) {
-    row_encoders[c].t = &encoders[c];
-    encoders[c].output = &output[c];
-    encoders[c].code = &code;
-  }
-  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<bytedepth>>, nb_chans,
-                   bytedepth>(rgba, x0, y0, oxs, xs, 0, ys, row_stride,
-                              row_encoders);
-}
-
-constexpr int kHashExp = 16;
-constexpr uint32_t kHashSize = 1 << kHashExp;
-constexpr uint32_t kHashMultiplier = 2654435761;
-constexpr int kMaxColors = 512;
-
-// can be any function that returns a value in 0 .. kHashSize-1
-// has to map 0 to 0
-inline uint32_t pixel_hash(uint32_t p) {
-  return (p * kHashMultiplier) >> (32 - kHashExp);
-}
-
-template <typename Processor, size_t nb_chans>
-void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
-                             size_t oxs, size_t xs, size_t yskip, size_t ys,
-                             size_t row_stride, const int16_t* lookup,
-                             Processor* processors) {
-  constexpr size_t kPadding = 16;
-
-  int16_t group_data[2][256 + kPadding * 2] = {};
-  Processor& row_encoder = processors[0];
-
-  for (size_t y = 0; y < ys; y++) {
-    // Pre-fill rows with palette converted pixels.
-    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
-    for (size_t x = 0; x < oxs; x++) {
-      uint32_t p = 0;
-      memcpy(&p, inrow + x * nb_chans, nb_chans);
-      group_data[y & 1][x + kPadding] = lookup[pixel_hash(p)];
-    }
-    // Deal with x == 0.
-    group_data[y & 1][kPadding - 1] =
-        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
-    // Fix topleft.
-    group_data[(y - 1) & 1][kPadding - 1] =
-        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
-    // Fill in padding.
-    for (size_t x = oxs; x < xs; x++) {
-      group_data[y & 1][kPadding + x] = group_data[y & 1][kPadding + oxs - 1];
-    }
-    // Get pointers to px/left/top/topleft data to speedup loop.
-    const int16_t* row = &group_data[y & 1][kPadding];
-    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
-    const int16_t* row_top =
-        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
-    const int16_t* row_topleft =
-        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
-
-    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
-  }
-  row_encoder.Finalize();
-}
-
-template <size_t nb_chans>
-void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
-                           size_t oxs, size_t ys, size_t row_stride,
-                           bool is_single_group, const PrefixCode& code,
-                           const int16_t* lookup, BitWriter& output) {
-  size_t xs = (oxs + kChunkSize - 1) / kChunkSize * kChunkSize;
-
-  if (!is_single_group) {
-    output.Allocate(16 * xs * ys + 4);
-    // Group header for modular image.
-    // When the image is single-group, the global modular image is the one that
-    // contains the pixel data, and there is no group header.
-    output.Write(1, 1);     // Global tree
-    output.Write(1, 1);     // All default wp
-    output.Write(2, 0b00);  // 0 transforms
-  }
-
-  ChunkEncoder<1> encoder;
-  ChannelRowProcessor<ChunkEncoder<1>> row_encoder;
-
-  row_encoder.t = &encoder;
-  encoder.output = &output;
-  encoder.code = &code;
-  ProcessImageAreaPalette<ChannelRowProcessor<ChunkEncoder<1>>, nb_chans>(
-      rgba, x0, y0, oxs, xs, 0, ys, row_stride, lookup, &row_encoder);
-}
-
-template <size_t nb_chans, size_t bytedepth>
-void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
-                    size_t row_stride, size_t row_count, uint64_t* raw_counts,
-                    uint64_t* lz77_counts, bool palette,
-                    const int16_t* lookup) {
-  ChunkSampleCollector sample_collectors[nb_chans];
-  ChannelRowProcessor<ChunkSampleCollector> row_sample_collectors[nb_chans];
-  for (size_t c = 0; c < nb_chans; c++) {
-    row_sample_collectors[c].t = &sample_collectors[c];
-    sample_collectors[c].raw_counts = raw_counts;
-    sample_collectors[c].lz77_counts = lz77_counts;
-  }
-  if (palette) {
-    assert(bytedepth == 1);
-    ProcessImageAreaPalette<ChannelRowProcessor<ChunkSampleCollector>,
-                            nb_chans>(rgba, x0, y0, xs, xs, 1, 1 + row_count,
-                                      row_stride, lookup,
-                                      row_sample_collectors);
-  } else {
-    ProcessImageArea<ChannelRowProcessor<ChunkSampleCollector>, nb_chans,
-                     bytedepth>(rgba, x0, y0, xs, xs, 1, 1 + row_count,
-                                row_stride, row_sample_collectors);
-  }
-}
-
-void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
-                            const PrefixCode& code,
-                            const std::vector<uint32_t>& palette,
-                            size_t pcolors_real, BitWriter* output) {
-  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
-  output->Write(2, 0b01);     // 1 transform
-  output->Write(2, 0b01);     // Palette
-  output->Write(5, 0b00000);  // Starting from ch 0
-  output->Write(2, 0b10);     // 4-channel palette (RGBA)
-  size_t pcolors = (pcolors_real + kChunkSize - 1) / kChunkSize * kChunkSize;
-  // pcolors <= kMaxColors + kChunkSize - 1
-  static_assert(kMaxColors + kChunkSize < 1281,
-                "add code to signal larger palette sizes");
-  if (pcolors < 256) {
-    output->Write(2, 0b00);
-    output->Write(8, pcolors);
-  } else {
-    output->Write(2, 0b01);
-    output->Write(10, pcolors - 256);
-  }
-
-  output->Write(2, 0b00);  // nb_deltas == 0
-  output->Write(4, 0);     // Zero predictor for delta palette
-  // Encode palette
-  ChunkEncoder<1> encoder;
-  ChannelRowProcessor<ChunkEncoder<1>> row_encoder;
-  row_encoder.t = &encoder;
-  encoder.output = output;
-  encoder.code = &code;
-  int16_t p[4][32 + 1024] = {};
-  uint8_t prgba[4];
-  int i = 0;
-  int have_zero = 0;
-  if (palette[pcolors_real - 1] == 0) have_zero = 1;
-  for (; i < pcolors; i++) {
-    if (i < pcolors_real) {
-      memcpy(prgba, &palette[i], 4);
-    }
-    p[0][16 + i + have_zero] = prgba[0];
-    p[1][16 + i + have_zero] = prgba[1];
-    p[2][16 + i + have_zero] = prgba[2];
-    p[3][16 + i + have_zero] = prgba[3];
-  }
-  p[0][15] = 0;
-  row_encoder.ProcessRow(p[0] + 16, p[0] + 15, p[0] + 15, p[0] + 15, pcolors);
-  p[1][15] = p[0][16];
-  p[0][15] = p[0][16];
-  row_encoder.ProcessRow(p[1] + 16, p[1] + 15, p[0] + 16, p[0] + 15, pcolors);
-  p[2][15] = p[1][16];
-  p[1][15] = p[1][16];
-  row_encoder.ProcessRow(p[2] + 16, p[2] + 15, p[1] + 16, p[1] + 15, pcolors);
-  p[3][15] = p[2][16];
-  p[2][15] = p[2][16];
-  row_encoder.ProcessRow(p[3] + 16, p[3] + 15, p[2] + 16, p[2] + 15, pcolors);
-  row_encoder.Finalize();
-
-  if (!is_single_group) {
-    output->ZeroPadToByte();
-  }
-}
-
-template <size_t nb_chans, size_t bytedepth>
-size_t LLEnc(const unsigned char* rgba, size_t width, size_t stride,
-             size_t height, size_t bitdepth, int effort,
-             unsigned char** output) {
-  size_t bytes_per_sample = (bitdepth > 8 ? 2 : 1);
-  assert(bytedepth == bytes_per_sample);
-  assert(width != 0);
-  assert(height != 0);
-  assert(stride >= nb_chans * bytes_per_sample * width);
-  (void)bytes_per_sample;
-
-  // Count colors to try palette
-  std::vector<uint32_t> palette(kHashSize);
-  palette[0] = 1;
-  int16_t lookup[kHashSize];
-  lookup[0] = 0;
-  int pcolors = 0;
-  bool collided =
-      effort < 2 || bitdepth != 8 || nb_chans < 4;  // todo: also do rgb palette
-  for (size_t y = 0; y < height && !collided; y++) {
-    const unsigned char* r = rgba + stride * y;
-    size_t x = 0;
-    if (nb_chans == 4) {
-      // this is just an unrolling of the next loop
-      for (; x + 7 < width; x += 8) {
-        uint32_t p[8], index[8];
-        memcpy(p, r + x * 4, 32);
-        for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
-        for (int i = 0; i < 8; i++) {
-          uint32_t init_entry = index[i] ? 0 : 1;
-          if (init_entry != palette[index[i]] && p[i] != palette[index[i]]) {
-            collided = true;
-          }
-        }
-        for (int i = 0; i < 8; i++) palette[index[i]] = p[i];
-      }
-      for (; x < width; x++) {
-        uint32_t p;
-        memcpy(&p, r + x * 4, 4);
-        uint32_t index = pixel_hash(p);
-        uint32_t init_entry = index ? 0 : 1;
-        if (init_entry != palette[index] && p != palette[index]) {
-          collided = true;
-        }
-        palette[index] = p;
-      }
-    } else {
-      for (; x < width; x++) {
-        uint32_t p = 0;
-        memcpy(&p, r + x * nb_chans, nb_chans);
-        uint32_t index = pixel_hash(p);
-        uint32_t init_entry = index ? 0 : 1;
-        if (init_entry != palette[index] && p != palette[index]) {
-          collided = true;
-        }
-        palette[index] = p;
-      }
-    }
-  }
-
-  int nb_entries = 0;
-  if (!collided) {
-    if (palette[0] == 0) pcolors = 1;
-    if (palette[0] == 1) palette[0] = 0;
-    bool have_color = false;
-    uint8_t minG = 255, maxG = 0;
-    for (int k = 0; k < kHashSize; k++) {
-      if (palette[k] == 0) continue;
-      uint8_t p[4];
-      memcpy(p, &palette[k], 4);
-      // move entries to front so sort has less work
-      palette[nb_entries] = palette[k];
-      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
-      if (p[1] < minG) minG = p[1];
-      if (p[1] > maxG) maxG = p[1];
-      nb_entries++;
-      // don't do palette if too many colors are needed
-      if (nb_entries + pcolors > kMaxColors) {
-        collided = true;
-        break;
-      }
-    }
-    if (!have_color) {
-      // don't do palette if it's just grayscale without many holes
-      if (maxG - minG < nb_entries * 1.4f) collided = true;
-    }
-  }
-  if (!collided) {
-    std::sort(
-        palette.begin(), palette.begin() + nb_entries,
-        [](uint32_t ap, uint32_t bp) {
-          if (ap == 0) return false;
-          if (bp == 0) return true;
-          uint8_t a[4], b[4];
-          memcpy(a, &ap, 4);
-          memcpy(b, &bp, 4);
-          float ay, by;
-          ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
-          by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
-          return ay < by;  // sort on alpha*luma
-        });
-    for (int k = 0; k < nb_entries; k++) {
-      if (palette[k] == 0) break;
-      lookup[pixel_hash(palette[k])] = pcolors++;
-    }
-  }
-
-  // Width gets padded to kChunkSize, but this computation doesn't change
-  // because of that.
-  size_t num_groups_x = (width + 255) / 256;
-  size_t num_groups_y = (height + 255) / 256;
-  size_t num_dc_groups_x = (width + 2047) / 2048;
-  size_t num_dc_groups_y = (height + 2047) / 2048;
-
-  uint64_t raw_counts[16] = {};
-  uint64_t lz77_counts[17] = {};
-
-  // sample the middle (effort * 2) rows of every group
-  for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
-    size_t xg = g % num_groups_x;
-    size_t yg = g / num_groups_x;
-    int y_offset = yg * 256;
-    int y_max = std::min<size_t>(height - yg * 256, 256);
-    int y_begin = y_offset + std::max<int>(0, y_max - 2 * effort) / 2;
-    int y_count =
-        std::min<int>(2 * effort * y_max / 256, y_offset + y_max - y_begin - 1);
-    int x_max =
-        std::min<size_t>(width - xg * 256, 256) / kChunkSize * kChunkSize;
-    CollectSamples<nb_chans, bytedepth>(rgba, xg * 256, y_begin, x_max, stride,
-                                        y_count, raw_counts, lz77_counts,
-                                        !collided, lookup);
-  }
-
-  uint64_t base_raw_counts[16] = {3843, 852, 1270, 1214, 1014, 727, 481, 300,
-                                  159,  51,  5,    1,    1,    1,   1,   1};
-
-  bool doing_ycocg = nb_chans > 2 && collided;
-  for (size_t i = bitdepth + 2 + (doing_ycocg ? 1 : 0); i < 16; i++) {
-    base_raw_counts[i] = 0;
-  }
-  uint64_t base_lz77_counts[17] = {
-      // short runs will be sampled, but long ones won't.
-      // near full-group run is quite common (e.g. all-opaque alpha)
-      18, 12, 9, 11, 15, 2, 2, 1, 1, 1, 1, 2, 300, 0, 0, 0, 0};
-
-  for (size_t i = 0; i < 16; i++) {
-    raw_counts[i] = (raw_counts[i] << 8) + base_raw_counts[i];
-  }
-  if (!collided) {
-    unsigned token, nbits, bits;
-    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
-    // ensure all palette indices can actually be encoded
-    for (size_t i = 0; i < token + 1; i++)
-      raw_counts[i] = std::max<uint64_t>(raw_counts[i], 1);
-    // these tokens are only used for the palette itself so they can get a bad
-    // code
-    for (size_t i = token + 1; i < 10; i++) raw_counts[i] = 1;
-  }
-  for (size_t i = 0; i < 17; i++) {
-    lz77_counts[i] = (lz77_counts[i] << 8) + base_lz77_counts[i];
-  }
-  alignas(32) PrefixCode hcode(raw_counts, lz77_counts);
-
-  BitWriter writer;
-
-  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
-
-  size_t num_groups = onegroup ? 1
-                               : (2 + num_dc_groups_x * num_dc_groups_y +
-                                  num_groups_x * num_groups_y);
-
-  std::vector<std::array<BitWriter, 4>> group_data(num_groups);
-  if (collided) {
-    PrepareDCGlobal(onegroup, width, height, nb_chans, bitdepth, hcode,
-                    &group_data[0][0]);
-  } else {
-    PrepareDCGlobalPalette(onegroup, width, height, hcode, palette, pcolors,
-                           &group_data[0][0]);
-  }
-#pragma omp parallel for
-  for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
-    size_t xg = g % num_groups_x;
-    size_t yg = g / num_groups_x;
-    size_t group_id =
-        onegroup ? 0 : (2 + num_dc_groups_x * num_dc_groups_y + g);
-    size_t xs = std::min<size_t>(width - xg * 256, 256);
-    size_t ys = std::min<size_t>(height - yg * 256, 256);
-    size_t x0 = xg * 256;
-    size_t y0 = yg * 256;
-    auto& gd = group_data[group_id];
-    if (collided) {
-      WriteACSection<nb_chans, bytedepth>(rgba, x0, y0, xs, ys, stride,
-                                          onegroup, hcode, gd);
-
-    } else {
-      WriteACSectionPalette<nb_chans>(rgba, x0, y0, xs, ys, stride, onegroup,
-                                      hcode, lookup, gd[0]);
-    }
-  }
-
-  AssembleFrame(width, height, nb_chans, bitdepth, group_data, &writer);
-
-  *output = writer.data.release();
-  return writer.bytes_written;
-}
-
-size_t FastLosslessEncode(const unsigned char* rgba, size_t width,
-                          size_t stride, size_t height, size_t nb_chans,
-                          size_t bitdepth, int effort, unsigned char** output) {
-  assert(bitdepth <= 12);
-  assert(bitdepth > 0);
-  assert(nb_chans <= 4);
-  assert(nb_chans != 0);
-  if (bitdepth <= 8) {
-    if (nb_chans == 1) {
-      return LLEnc<1, 1>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-    if (nb_chans == 2) {
-      return LLEnc<2, 1>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-    if (nb_chans == 3) {
-      return LLEnc<3, 1>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-    if (nb_chans == 4) {
-      return LLEnc<4, 1>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-  } else {
-    if (nb_chans == 1) {
-      return LLEnc<1, 2>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-    if (nb_chans == 2) {
-      return LLEnc<2, 2>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-    if (nb_chans == 3) {
-      return LLEnc<3, 2>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-    if (nb_chans == 4) {
-      return LLEnc<4, 2>(rgba, width, stride, height, bitdepth, effort, output);
-    }
-  }
-  return 0;
-}
diff --git a/experimental/fast_lossless/fast_lossless.h b/experimental/fast_lossless/fast_lossless.h
deleted file mode 100644 (file)
index f7940e5..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef FAST_LOSSLESS_H
-#define FAST_LOSSLESS_H
-#include <stdlib.h>
-
-size_t FastLosslessEncode(const unsigned char* rgba, size_t width,
-                          size_t row_stride, size_t height, size_t nb_chans,
-                          size_t bitdepth, int effort, unsigned char** output);
-
-#endif
diff --git a/flake.lock b/flake.lock
new file mode 100644 (file)
index 0000000..3bfd004
--- /dev/null
@@ -0,0 +1,61 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
+      "locked": {
+        "lastModified": 1701680307,
+        "narHash": "sha256-kAuep2h5ajznlPMD9rnQyffWG8EM/C73lejGofXvdM8=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "4022d587cbbfd70fe950c1e2083a02621806a725",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1702312524,
+        "narHash": "sha256-gkZJRDBUCpTPBvQk25G0B7vfbpEYM5s5OZqghkjZsnE=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "a9bf124c46ef298113270b1f84a164865987a91c",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644 (file)
index 0000000..4832f5b
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,39 @@
+{
+  inputs = {
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+    flake-utils.url = "github:numtide/flake-utils";
+  };
+  outputs = { self, nixpkgs, flake-utils }:
+    flake-utils.lib.eachDefaultSystem
+      (system:
+        let
+          pkgs = import nixpkgs {
+            inherit system;
+          };
+        in
+        with pkgs;
+        {
+          devShells.default = mkShell {
+            buildInputs = [
+              clang
+              cmake
+              pkg-config
+              gtest
+              gmock
+              doxygen
+              graphviz
+              python3
+              libclang.python
+              libpng
+              giflib
+              lcms2
+              brotli
+            ];
+            shellHook = ''
+              export CC=clang
+              export CXX=clang++
+            '';
+          };
+        }
+      );
+}
diff --git a/lib/BUILD b/lib/BUILD
new file mode 100644 (file)
index 0000000..d835ca1
--- /dev/null
+++ b/lib/BUILD
@@ -0,0 +1,310 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Load sources/headers/tests lists.
+load(
+    "jxl_lists.bzl",
+    "libjxl_base_sources",
+    "libjxl_cms_sources",
+    "libjxl_codec_apng_sources",
+    "libjxl_codec_exr_sources",
+    "libjxl_codec_gif_sources",
+    "libjxl_codec_jpegli_sources",
+    "libjxl_codec_jpg_sources",
+    "libjxl_codec_jxl_sources",
+    "libjxl_codec_npy_sources",
+    "libjxl_codec_pgx_sources",
+    "libjxl_codec_pnm_sources",
+    "libjxl_dec_box_sources",
+    "libjxl_dec_jpeg_sources",
+    "libjxl_dec_sources",
+    "libjxl_enc_sources",
+    "libjxl_extras_for_tools_sources",
+    "libjxl_extras_sources",
+    #'libjxl_gbench_sources',
+    "libjxl_jpegli_lib_version",
+    "libjxl_jpegli_libjpeg_helper_files",
+    "libjxl_jpegli_sources",
+    "libjxl_jpegli_testlib_files",
+    "libjxl_jpegli_tests",
+    "libjxl_major_version",
+    "libjxl_minor_version",
+    "libjxl_patch_version",
+    "libjxl_public_headers",
+    "libjxl_testlib_files",
+    "libjxl_tests",
+    "libjxl_threads_public_headers",
+    "libjxl_threads_sources",
+)
+load(
+    "jxl_vars.bzl",
+    "libjxl_deps_brotli",
+    "libjxl_deps_exr",
+    "libjxl_deps_gif",
+    "libjxl_deps_gtest",
+    "libjxl_deps_hwy",
+    "libjxl_deps_hwy_nanobenchmark",
+    "libjxl_deps_hwy_test_util",
+    "libjxl_deps_jpeg",
+    "libjxl_deps_png",
+    "libjxl_deps_runfiles",
+    "libjxl_deps_skcms",
+    "libjxl_deps_testdata",
+    "libjxl_root_package",
+    "libjxl_test_shards",
+    "libjxl_test_timeouts",
+)
+load("@bazel_skylib//rules:expand_template.bzl", "expand_template")
+load("@bazel_skylib//rules:copy_file.bzl", "copy_file")
+
+DEFAULT_VISIBILITY = ["//:__subpackages__"]
+
+DEFAULT_COMPATIBILITY = []
+
+INCLUDES_DIR = "include"
+
+package(
+    default_visibility = ["//:__subpackages__"],
+)
+
+licenses(["notice"])
+
+exports_files(["LICENSE"])
+
+EXPORT_TEMPLATE = """
+#ifndef @_EXPORT_H
+#define @_EXPORT_H
+
+#define @_EXPORT
+#define @_NO_EXPORT
+
+#ifndef @_DEPRECATED
+#  define @_DEPRECATED __attribute__ ((__deprecated__))
+#endif
+
+#endif
+"""
+
+JXL_EXPORT_H = INCLUDES_DIR + "/jxl/jxl_export.h"
+
+genrule(
+    name = "create_jxl_export",
+    outs = [JXL_EXPORT_H],
+    cmd = "echo '" + EXPORT_TEMPLATE.replace("@", "JXL") + "' > $@",
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+JXL_CMS_EXPORT_H = INCLUDES_DIR + "/jxl/jxl_cms_export.h"
+
+genrule(
+    name = "create_jxl_cms_export",
+    outs = [JXL_CMS_EXPORT_H],
+    cmd = "echo '" + EXPORT_TEMPLATE.replace("@", "JXL_CMS") + "' > $@",
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+JXL_THREADS_EXPORT_H = INCLUDES_DIR + "/jxl/jxl_threads_export.h"
+
+genrule(
+    name = "create_jxl_threads_export",
+    outs = [JXL_THREADS_EXPORT_H],
+    cmd = "echo '" + EXPORT_TEMPLATE.replace("@", "JXL_THREADS") + "' > $@",
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+JXL_VERSION_H = INCLUDES_DIR + "/jxl/version.h"
+
+expand_template(
+    name = "expand_jxl_version",
+    out = JXL_VERSION_H,
+    compatible_with = DEFAULT_COMPATIBILITY,
+    substitutions = {
+        "@JPEGXL_MAJOR_VERSION@": str(libjxl_major_version),
+        "@JPEGXL_MINOR_VERSION@": str(libjxl_minor_version),
+        "@JPEGXL_PATCH_VERSION@": str(libjxl_patch_version),
+    },
+    template = "jxl/version.h.in",
+)
+
+cc_library(
+    name = "jxl_version",
+    hdrs = [JXL_VERSION_H],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    strip_include_prefix = INCLUDES_DIR,
+)
+
+JPEGLI_JCONFIG_H = INCLUDES_DIR + "/jpegli/jconfig.h"
+
+JPEGLI_JMORECFG_H = INCLUDES_DIR + "/jpegli/jmorecfg.h"
+
+JPEGLI_JPEGLIB_H = INCLUDES_DIR + "/jpegli/jpeglib.h"
+
+copy_file(
+    name = "expand_jconfig",
+    src = "@libjpeg_turbo//:jconfig.h",
+    out = JPEGLI_JCONFIG_H,
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+copy_file(
+    name = "copy_jmorecfg",
+    src = "@libjpeg_turbo//:jmorecfg.h",
+    out = JPEGLI_JMORECFG_H,
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+copy_file(
+    name = "copy_jpeglib",
+    src = "@libjpeg_turbo//:jpeglib.h",
+    out = JPEGLI_JPEGLIB_H,
+    compatible_with = DEFAULT_COMPATIBILITY,
+)
+
+cc_library(
+    name = "includes",
+    hdrs = libjxl_public_headers + [
+        JXL_EXPORT_H,
+        JXL_CMS_EXPORT_H,
+    ],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    strip_include_prefix = INCLUDES_DIR,
+    deps = [":jxl_version"],
+)
+
+cc_library(
+    name = "libjpeg_includes",
+    hdrs = [
+        JPEGLI_JCONFIG_H,
+        JPEGLI_JMORECFG_H,
+        JPEGLI_JPEGLIB_H,
+    ],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    strip_include_prefix = INCLUDES_DIR + "/jpegli",
+)
+
+cc_library(
+    name = "base",
+    srcs = [path for path in libjxl_base_sources if path.endswith(".cc")],
+    hdrs = [path for path in libjxl_base_sources if path.endswith(".h")],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    deps = [
+        ":includes",
+    ] + libjxl_deps_hwy,
+)
+
+cc_library(
+    name = "jpegxl",
+    srcs = libjxl_dec_sources + libjxl_dec_box_sources + libjxl_dec_jpeg_sources + libjxl_enc_sources + libjxl_cms_sources,
+    compatible_with = DEFAULT_COMPATIBILITY,
+    defines = ["JPEGXL_ENABLE_SKCMS=1"],
+    deps = [
+        ":base",
+        ":includes",
+    ] + libjxl_deps_brotli + libjxl_deps_hwy + libjxl_deps_skcms,
+)
+
+cc_library(
+    name = "jpegxl_private",
+    hdrs = [
+        path
+        for path in libjxl_dec_sources + libjxl_dec_box_sources + libjxl_dec_jpeg_sources + libjxl_enc_sources + libjxl_cms_sources
+        if path.endswith(".h") and not path.endswith("-inl.h")
+    ],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    deps = [":jpegxl"],
+)
+
+cc_library(
+    name = "jpegxl_threads",
+    srcs = libjxl_threads_sources,
+    hdrs = libjxl_threads_public_headers + [JXL_THREADS_EXPORT_H],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    strip_include_prefix = INCLUDES_DIR,
+    deps = [
+        ":base",
+        ":includes",
+    ],
+)
+
+CODEC_FILES = libjxl_codec_apng_sources + libjxl_codec_exr_sources + libjxl_codec_gif_sources + libjxl_codec_jpegli_sources + libjxl_codec_jpg_sources + libjxl_codec_jxl_sources + libjxl_codec_npy_sources + libjxl_codec_pgx_sources + libjxl_codec_pnm_sources
+
+CODEC_SRCS = [path for path in CODEC_FILES if path.endswith(".cc")]
+
+CODEC_HDRS = [path for path in CODEC_FILES if path.endswith(".h")]
+
+cc_library(
+    name = "jpegli",
+    srcs = libjxl_jpegli_sources,
+    hdrs = [
+        "jpegli/common_internal.h",  # TODO(eustas): should not be here
+    ],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    deps = [
+        ":jpegxl_private",
+        ":libjpeg_includes",
+    ] + libjxl_deps_hwy,
+)
+
+# TODO(eustas): build codecs separately?
+cc_library(
+    name = "jpegxl_extras",
+    srcs = libjxl_extras_sources + libjxl_extras_for_tools_sources + CODEC_SRCS,
+    hdrs = CODEC_HDRS,
+    compatible_with = DEFAULT_COMPATIBILITY,
+    defines = [
+        "JPEGXL_ENABLE_APNG=1",
+        "JPEGXL_ENABLE_EXR=1",
+        "JPEGXL_ENABLE_GIF=1",
+        "JPEGXL_ENABLE_JPEG=1",
+        "JPEGXL_ENABLE_JPEGLI=1",
+    ],
+    deps = [
+        ":jpegli",
+        ":jpegxl_private",
+        ":jpegxl_threads",
+        ":jxl_version",
+    ] + libjxl_deps_exr + libjxl_deps_gif + libjxl_deps_jpeg + libjxl_deps_png,
+)
+
+TESTLIB_FILES = libjxl_testlib_files + libjxl_jpegli_testlib_files + libjxl_jpegli_libjpeg_helper_files
+
+cc_library(
+    name = "test_utils",
+    testonly = 1,
+    srcs = [path for path in TESTLIB_FILES if not path.endswith(".h")],
+    hdrs = [path for path in TESTLIB_FILES if path.endswith(".h")],
+    compatible_with = DEFAULT_COMPATIBILITY,
+    defines = [
+        'JPEGXL_ROOT_PACKAGE=\'"' + libjxl_root_package + '"\'',
+    ],
+    deps = [
+        ":jpegli",
+        ":jpegxl_extras",
+        ":jpegxl_private",
+    ] + libjxl_deps_runfiles,
+)
+
+TESTS = [path.partition(".")[0] for path in libjxl_tests + libjxl_jpegli_tests]
+
+[
+    cc_test(
+        name = test,
+        timeout = libjxl_test_timeouts.get(test, "moderate"),
+        srcs = [
+            test + ".cc",
+            "jpegli/testing.h",
+            "jxl/testing.h",
+        ],
+        data = ["//:testdata"],
+        shard_count = libjxl_test_shards.get(test, 1),
+        deps = [
+            ":jpegxl_extras",
+            ":jpegxl_private",
+            ":jpegxl_threads",
+            ":test_utils",
+        ] + libjxl_deps_gtest + libjxl_deps_hwy_test_util + libjxl_deps_hwy_nanobenchmark,
+    )
+    for test in TESTS
+]
index 5c8e0ba..aa7f8c5 100644 (file)
@@ -4,7 +4,7 @@
 # license that can be found in the LICENSE file.
 
 set(JPEGXL_MAJOR_VERSION 0)
-set(JPEGXL_MINOR_VERSION 7)
+set(JPEGXL_MINOR_VERSION 9)
 set(JPEGXL_PATCH_VERSION 0)
 set(JPEGXL_LIBRARY_VERSION
     "${JPEGXL_MAJOR_VERSION}.${JPEGXL_MINOR_VERSION}.${JPEGXL_PATCH_VERSION}")
@@ -15,131 +15,153 @@ set(JPEGXL_LIBRARY_VERSION
 # It is important to update this value when making incompatible API/ABI changes
 # so that programs that depend on libjxl can update their dependencies. Semantic
 # versioning allows 0.y.z to have incompatible changes in minor versions.
-set(JPEGXL_SO_MINOR_VERSION 7)
+set(JPEGXL_SO_MINOR_VERSION 9)
 if (JPEGXL_MAJOR_VERSION EQUAL 0)
-set(JPEGXL_LIBRARY_SOVERSION
-    "${JPEGXL_MAJOR_VERSION}.${JPEGXL_SO_MINOR_VERSION}")
+  set(JPEGXL_LIBRARY_SOVERSION
+      "${JPEGXL_MAJOR_VERSION}.${JPEGXL_SO_MINOR_VERSION}")
 else()
-set(JPEGXL_LIBRARY_SOVERSION "${JPEGXL_MAJOR_VERSION}")
+  set(JPEGXL_LIBRARY_SOVERSION "${JPEGXL_MAJOR_VERSION}")
 endif()
 
 
 # List of warning and feature flags for our library and tests.
 if (MSVC)
-set(JPEGXL_INTERNAL_FLAGS
-  # TODO(janwas): add flags
-)
-else ()
-set(JPEGXL_INTERNAL_FLAGS
-  # F_FLAGS
-  -fmerge-all-constants
-  -fno-builtin-fwrite
-  -fno-builtin-fread
-
-  # WARN_FLAGS
-  -Wall
-  -Wextra
-  -Wc++11-compat
-  -Warray-bounds
-  -Wformat-security
-  -Wimplicit-fallthrough
-  -Wno-register  # Needed by public headers in lcms
-  -Wno-unused-function
-  -Wno-unused-parameter
-  -Wnon-virtual-dtor
-  -Woverloaded-virtual
-  -Wvla
-)
-
-# Warning flags supported by clang.
-if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-  list(APPEND JPEGXL_INTERNAL_FLAGS
-    -Wdeprecated-increment-bool
-    # TODO(deymo): Add -Wextra-semi once we update third_party/highway.
-    # -Wextra-semi
-    -Wfloat-overflow-conversion
-    -Wfloat-zero-conversion
-    -Wfor-loop-analysis
-    -Wgnu-redeclared-enum
-    -Winfinite-recursion
-    -Wliteral-conversion
-    -Wno-c++98-compat
-    -Wno-unused-command-line-argument
-    -Wprivate-header
-    -Wself-assign
-    -Wstring-conversion
-    -Wtautological-overlap-compare
-    -Wthread-safety-analysis
-    -Wundefined-func-template
-    -Wunreachable-code
-    -Wunused-comparison
+  set(JPEGXL_INTERNAL_FLAGS
+    # TODO(janwas): add flags
   )
-  if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
-    list(APPEND HWY_FLAGS -Wc++2a-extensions)
-  endif()
-endif()  # Clang
-
-if (WIN32)
-  list(APPEND JPEGXL_INTERNAL_FLAGS
-    -Wno-cast-align
-    -Wno-double-promotion
-    -Wno-float-equal
-    -Wno-format-nonliteral
-    -Wno-shadow
-    -Wno-sign-conversion
-    -Wno-zero-as-null-pointer-constant
+else ()
+  set(JPEGXL_INTERNAL_FLAGS
+    # F_FLAGS
+    -fmerge-all-constants
+    -fno-builtin-fwrite
+    -fno-builtin-fread
+
+    # WARN_FLAGS
+    -Wall
+    -Wextra
+    -Wc++11-compat
+    -Warray-bounds
+    -Wformat-security
+    -Wimplicit-fallthrough
+    -Wno-register  # Needed by public headers in lcms
+    -Wno-unused-function
+    -Wno-unused-parameter
+    -Wnon-virtual-dtor
+    -Woverloaded-virtual
+    -Wvla
   )
 
+  # Warning flags supported by clang.
   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     list(APPEND JPEGXL_INTERNAL_FLAGS
-      -Wno-used-but-marked-unused
-      -Wno-unused-template
-      -Wno-unused-member-function
-      -Wno-shadow-field-in-constructor
-      -Wno-language-extension-token
-      -Wno-global-constructors
-      -Wno-c++98-compat-pedantic
+      -Wdeprecated-increment-bool
+      # TODO(deymo): Add -Wextra-semi once we update third_party/highway.
+      # -Wextra-semi
+      -Wfloat-overflow-conversion
+      -Wfloat-zero-conversion
+      -Wfor-loop-analysis
+      -Wgnu-redeclared-enum
+      -Winfinite-recursion
+      -Wliteral-conversion
+      -Wno-c++98-compat
+      -Wno-unused-command-line-argument
+      -Wprivate-header
+      -Wself-assign
+      -Wstring-conversion
+      -Wtautological-overlap-compare
+      -Wthread-safety-analysis
+      -Wundefined-func-template
+      -Wunreachable-code
+      -Wunused-comparison
     )
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+      list(APPEND HWY_FLAGS -Wc++2a-extensions)
+    endif()
   endif()  # Clang
-else()  # WIN32
-  list(APPEND JPEGXL_INTERNAL_FLAGS
-    -fsized-deallocation
-    -fno-exceptions
 
-    # Language flags
-    -fmath-errno
-  )
+  if (WIN32)
+    list(APPEND JPEGXL_INTERNAL_FLAGS
+      -Wno-cast-align
+      -Wno-double-promotion
+      -Wno-float-equal
+      -Wno-format-nonliteral
+      -Wno-shadow
+      -Wno-sign-conversion
+      -Wno-zero-as-null-pointer-constant
+    )
 
-  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      list(APPEND JPEGXL_INTERNAL_FLAGS
+        -Wno-used-but-marked-unused
+        -Wno-unused-template
+        -Wno-unused-member-function
+        -Wno-shadow-field-in-constructor
+        -Wno-language-extension-token
+        -Wno-global-constructors
+        -Wno-c++98-compat-pedantic
+      )
+    endif()  # Clang
+  else()  # WIN32
     list(APPEND JPEGXL_INTERNAL_FLAGS
-      -fnew-alignment=8
-      -fno-cxx-exceptions
-      -fno-slp-vectorize
-      -fno-vectorize
+      -fsized-deallocation
+      -fno-exceptions
 
-      -disable-free
-      -disable-llvm-verifier
+      # Language flags
+      -fmath-errno
     )
-  endif()  # Clang
-endif()  # WIN32
-
-# Internal flags for coverage builds:
-if(JPEGXL_ENABLE_COVERAGE)
-set(JPEGXL_COVERAGE_FLAGS
-    -g -O0 -fprofile-arcs -ftest-coverage
-    -DJXL_ENABLE_ASSERT=0 -DJXL_ENABLE_CHECK=0
-)
-endif()  # JPEGXL_ENABLE_COVERAGE
+
+    if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      list(APPEND JPEGXL_INTERNAL_FLAGS
+        -fnew-alignment=8
+        -fno-cxx-exceptions
+        -fno-slp-vectorize
+        -fno-vectorize
+
+        -disable-free
+        -disable-llvm-verifier
+      )
+    endif()  # Clang
+  endif()  # WIN32
 endif()  #!MSVC
 
+if (JPEGXL_ENABLE_SKCMS)
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_SKCMS=1)
+endif ()
+
+# strips the -internal suffix from all the elements in LIST
+function(strip_internal OUTPUT_VAR LIB_LIST)
+  foreach(lib IN LISTS ${LIB_LIST})
+    string(REGEX REPLACE "-internal$" "" lib "${lib}")
+    list(APPEND out_list "${lib}")
+  endforeach()
+  set(${OUTPUT_VAR} ${out_list} PARENT_SCOPE)
+endfunction()
+
+# set variables for jxl_cms.cmake and jxl.cmake
+if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
+    set(PKGCONFIG_TARGET_INCLUDES "${CMAKE_INSTALL_INCLUDEDIR}")
+else()
+    set(PKGCONFIG_TARGET_INCLUDES "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
+if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
+    set(PKGCONFIG_TARGET_LIBS "${CMAKE_INSTALL_LIBDIR}")
+else()
+    set(PKGCONFIG_TARGET_LIBS "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
+endif()
+
+# The jxl_cms library definition.
+include(jxl_cms.cmake)
 # The jxl library definition.
 include(jxl.cmake)
 
 # Other libraries outside the core jxl library.
-if(JPEGXL_ENABLE_TOOLS)
+if(JPEGXL_ENABLE_TOOLS OR BUILD_TESTING)
   include(jxl_extras.cmake)
 endif()
 include(jxl_threads.cmake)
+if (JPEGXL_ENABLE_JPEGLI)
+  include(jpegli.cmake)
+endif()
 
 # Install all the library headers from the source and the generated ones. There
 # is no distinction on which libraries use which header since it is expected
@@ -149,18 +171,14 @@ install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/jxl
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/jxl
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
 
-# Profiler for libjxl
-include(jxl_profiler.cmake)
-
 if(BUILD_TESTING)
-# Unittests
-cmake_policy(SET CMP0057 NEW)  # https://gitlab.kitware.com/cmake/cmake/issues/18198
-include(GoogleTest)
+  include(GoogleTest)
+endif()
 
 # Tests for the jxl library.
 include(jxl_tests.cmake)
 
-# Google benchmark for the jxl library
-include(jxl_benchmark.cmake)
-
-endif()  # BUILD_TESTING
+if(BUILD_TESTING)
+  # Google benchmark for the jxl library
+  include(jxl_benchmark.cmake)
+endif()
diff --git a/lib/extras/alpha_blend.cc b/lib/extras/alpha_blend.cc
new file mode 100644 (file)
index 0000000..50c141c
--- /dev/null
@@ -0,0 +1,63 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/alpha_blend.h"
+
+#include "lib/extras/packed_image.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+void AlphaBlend(PackedFrame* frame, float background[3]) {
+  if (!frame) return;
+  const PackedImage& im = frame->color;
+  JxlPixelFormat format = im.format;
+  if (format.num_channels != 2 && format.num_channels != 4) {
+    return;
+  }
+  --format.num_channels;
+  PackedImage blended(im.xsize, im.ysize, format);
+  // TODO(szabadka) SIMDify this and make it work for float16.
+  for (size_t y = 0; y < im.ysize; ++y) {
+    for (size_t x = 0; x < im.xsize; ++x) {
+      if (format.num_channels == 2) {
+        float g = im.GetPixelValue(y, x, 0);
+        float a = im.GetPixelValue(y, x, 1);
+        float out = g * a + background[0] * (1 - a);
+        blended.SetPixelValue(y, x, 0, out);
+      } else {
+        float r = im.GetPixelValue(y, x, 0);
+        float g = im.GetPixelValue(y, x, 1);
+        float b = im.GetPixelValue(y, x, 2);
+        float a = im.GetPixelValue(y, x, 3);
+        float out_r = r * a + background[0] * (1 - a);
+        float out_g = g * a + background[1] * (1 - a);
+        float out_b = b * a + background[2] * (1 - a);
+        blended.SetPixelValue(y, x, 0, out_r);
+        blended.SetPixelValue(y, x, 1, out_g);
+        blended.SetPixelValue(y, x, 2, out_b);
+      }
+    }
+  }
+  frame->color = blended.Copy();
+}
+
+}  // namespace
+
+void AlphaBlend(PackedPixelFile* ppf, float background[3]) {
+  if (!ppf || ppf->info.alpha_bits == 0) {
+    return;
+  }
+  ppf->info.alpha_bits = 0;
+  AlphaBlend(ppf->preview_frame.get(), background);
+  for (auto& frame : ppf->frames) {
+    AlphaBlend(&frame, background);
+  }
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/lib/extras/alpha_blend.h b/lib/extras/alpha_blend.h
new file mode 100644 (file)
index 0000000..4d78e86
--- /dev/null
@@ -0,0 +1,19 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ALPHA_BLEND_H_
+#define LIB_EXTRAS_ALPHA_BLEND_H_
+
+#include "lib/extras/packed_image.h"
+
+namespace jxl {
+namespace extras {
+
+void AlphaBlend(PackedPixelFile* ppf, float background[3]);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ALPHA_BLEND_H_
index 774b4cc..3ba31f2 100644 (file)
@@ -5,27 +5,18 @@
 
 #include "lib/extras/codec.h"
 
-#include "jxl/decode.h"
-#include "jxl/types.h"
-#include "lib/extras/packed_image.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
+#include <jxl/decode.h>
+#include <jxl/types.h>
 
-#if JPEGXL_ENABLE_APNG
+#include "lib/extras/dec/decode.h"
 #include "lib/extras/enc/apng.h"
-#endif
-#if JPEGXL_ENABLE_JPEG
-#include "lib/extras/enc/jpg.h"
-#endif
-#if JPEGXL_ENABLE_EXR
 #include "lib/extras/enc/exr.h"
-#endif
-
-#include "lib/extras/dec/decode.h"
+#include "lib/extras/enc/jpg.h"
 #include "lib/extras/enc/pgx.h"
 #include "lib/extras/enc/pnm.h"
+#include "lib/extras/packed_image.h"
 #include "lib/extras/packed_image_convert.h"
-#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/image_bundle.h"
 
 namespace jxl {
@@ -38,30 +29,21 @@ constexpr size_t kMinBytes = 9;
 
 Status SetFromBytes(const Span<const uint8_t> bytes,
                     const extras::ColorHints& color_hints, CodecInOut* io,
-                    ThreadPool* pool, extras::Codec* orig_codec) {
+                    ThreadPool* pool, const SizeConstraints* constraints,
+                    extras::Codec* orig_codec) {
   if (bytes.size() < kMinBytes) return JXL_FAILURE("Too few bytes");
 
   extras::PackedPixelFile ppf;
-  if (extras::DecodeBytes(bytes, color_hints, io->constraints, &ppf,
-                          orig_codec)) {
+  if (extras::DecodeBytes(bytes, color_hints, &ppf, constraints, orig_codec)) {
     return ConvertPackedPixelFileToCodecInOut(ppf, pool, io);
   }
   return JXL_FAILURE("Codecs failed to decode");
 }
 
-Status SetFromFile(const std::string& pathname,
-                   const extras::ColorHints& color_hints, CodecInOut* io,
-                   ThreadPool* pool, extras::Codec* orig_codec) {
-  std::vector<uint8_t> encoded;
-  JXL_RETURN_IF_ERROR(ReadFile(pathname, &encoded));
-  JXL_RETURN_IF_ERROR(SetFromBytes(Span<const uint8_t>(encoded), color_hints,
-                                   io, pool, orig_codec));
-  return true;
-}
-
 Status Encode(const CodecInOut& io, const extras::Codec codec,
               const ColorEncoding& c_desired, size_t bits_per_sample,
               std::vector<uint8_t>* bytes, ThreadPool* pool) {
+  bytes->clear();
   JXL_CHECK(!io.Main().c_current().ICC().empty());
   JXL_CHECK(!c_desired.ICC().empty());
   io.CheckMetadata();
@@ -77,22 +59,22 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
   std::ostringstream os;
   switch (codec) {
     case extras::Codec::kPNG:
-#if JPEGXL_ENABLE_APNG
       encoder = extras::GetAPNGEncoder();
-      break;
-#else
-      return JXL_FAILURE("JPEG XL was built without (A)PNG support");
-#endif
+      if (encoder) {
+        break;
+      } else {
+        return JXL_FAILURE("JPEG XL was built without (A)PNG support");
+      }
     case extras::Codec::kJPG:
-#if JPEGXL_ENABLE_JPEG
       format.data_type = JXL_TYPE_UINT8;
       encoder = extras::GetJPEGEncoder();
-      os << io.jpeg_quality;
-      encoder->SetOption("q", os.str());
-      break;
-#else
-      return JXL_FAILURE("JPEG XL was built without JPEG support");
-#endif
+      if (encoder) {
+        os << io.jpeg_quality;
+        encoder->SetOption("q", os.str());
+        break;
+      } else {
+        return JXL_FAILURE("JPEG XL was built without JPEG support");
+      }
     case extras::Codec::kPNM:
       if (io.Main().HasAlpha()) {
         encoder = extras::GetPAMEncoder();
@@ -102,14 +84,9 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
         encoder = extras::GetPPMEncoder();
       } else {
         format.data_type = JXL_TYPE_FLOAT;
-        format.endianness = JXL_NATIVE_ENDIAN;
+        format.endianness = JXL_LITTLE_ENDIAN;
         encoder = extras::GetPFMEncoder();
       }
-      if (!c_desired.IsSRGB()) {
-        JXL_WARNING(
-            "PNM encoder cannot store custom ICC profile; decoder "
-            "will need hint key=color_space to get the same values");
-      }
       break;
     case extras::Codec::kPGX:
       encoder = extras::GetPGXEncoder();
@@ -117,13 +94,17 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
     case extras::Codec::kGIF:
       return JXL_FAILURE("Encoding to GIF is not implemented");
     case extras::Codec::kEXR:
-#if JPEGXL_ENABLE_EXR
       format.data_type = JXL_TYPE_FLOAT;
       encoder = extras::GetEXREncoder();
-      break;
-#else
-      return JXL_FAILURE("JPEG XL was built without OpenEXR support");
-#endif
+      if (encoder) {
+        break;
+      } else {
+        return JXL_FAILURE("JPEG XL was built without OpenEXR support");
+      }
+    case extras::Codec::kJXL:
+      // TODO(user): implement
+      return JXL_FAILURE("Codec::kJXL is not supported yet");
+
     case extras::Codec::kUnknown:
       return JXL_FAILURE("Cannot encode using Codec::kUnknown");
   }
@@ -135,6 +116,11 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
   extras::PackedPixelFile ppf;
   JXL_RETURN_IF_ERROR(
       ConvertCodecInOutToPackedPixelFile(io, format, c_desired, pool, &ppf));
+  ppf.info.bits_per_sample = bits_per_sample;
+  if (format.data_type == JXL_TYPE_FLOAT) {
+    ppf.info.bits_per_sample = 32;
+    ppf.info.exponent_bits_per_sample = 8;
+  }
   extras::EncodedImage encoded_image;
   JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded_image, pool));
   JXL_ASSERT(encoded_image.bitstreams.size() == 1);
@@ -143,15 +129,15 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
   return true;
 }
 
-Status EncodeToFile(const CodecInOut& io, const ColorEncoding& c_desired,
-                    size_t bits_per_sample, const std::string& pathname,
-                    ThreadPool* pool) {
-  const std::string extension = Extension(pathname);
-  const extras::Codec codec =
-      extras::CodecFromExtension(extension, &bits_per_sample);
+Status Encode(const CodecInOut& io, const ColorEncoding& c_desired,
+              size_t bits_per_sample, const std::string& pathname,
+              std::vector<uint8_t>* bytes, ThreadPool* pool) {
+  std::string extension;
+  const extras::Codec codec = extras::CodecFromPath(
+      pathname, &bits_per_sample, /* filename */ nullptr, &extension);
 
   // Warn about incorrect usage of PGM/PGX/PPM - only the latter supports
-  // color, but CodecFromExtension lumps them all together.
+  // color, but CodecFromPath lumps them all together.
   if (codec == extras::Codec::kPNM && extension != ".pfm") {
     if (io.Main().HasAlpha() && extension != ".pam") {
       JXL_WARNING(
@@ -174,16 +160,14 @@ Status EncodeToFile(const CodecInOut& io, const ColorEncoding& c_desired,
     bits_per_sample = 16;
   }
 
-  std::vector<uint8_t> encoded;
-  return Encode(io, codec, c_desired, bits_per_sample, &encoded, pool) &&
-         WriteFile(encoded, pathname);
+  return Encode(io, codec, c_desired, bits_per_sample, bytes, pool);
 }
 
-Status EncodeToFile(const CodecInOut& io, const std::string& pathname,
-                    ThreadPool* pool) {
+Status Encode(const CodecInOut& io, const std::string& pathname,
+              std::vector<uint8_t>* bytes, ThreadPool* pool) {
   // TODO(lode): need to take the floating_point_sample field into account
-  return EncodeToFile(io, io.metadata.m.color_encoding,
-                      io.metadata.m.bit_depth.bits_per_sample, pathname, pool);
+  return Encode(io, io.metadata.m.color_encoding,
+                io.metadata.m.bit_depth.bits_per_sample, pathname, bytes, pool);
 }
 
 }  // namespace jxl
index 73fdc80..6b39ffd 100644 (file)
@@ -17,7 +17,6 @@
 #include "lib/extras/dec/decode.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
 
+struct SizeConstraints;
+
 // Decodes "bytes" and sets io->metadata.m.
 // color_space_hint may specify the color space, otherwise, defaults to sRGB.
 Status SetFromBytes(Span<const uint8_t> bytes,
                     const extras::ColorHints& color_hints, CodecInOut* io,
                     ThreadPool* pool = nullptr,
+                    const SizeConstraints* constraints = nullptr,
                     extras::Codec* orig_codec = nullptr);
 // Helper function to use no color_space_hint.
 JXL_INLINE Status SetFromBytes(const Span<const uint8_t> bytes, CodecInOut* io,
                                ThreadPool* pool = nullptr,
+                               const SizeConstraints* constraints = nullptr,
                                extras::Codec* orig_codec = nullptr) {
-  return SetFromBytes(bytes, extras::ColorHints(), io, pool, orig_codec);
+  return SetFromBytes(bytes, extras::ColorHints(), io, pool, constraints,
+                      orig_codec);
 }
 
-// Reads from file and calls SetFromBytes.
-Status SetFromFile(const std::string& pathname,
-                   const extras::ColorHints& color_hints, CodecInOut* io,
-                   ThreadPool* pool = nullptr,
-                   extras::Codec* orig_codec = nullptr);
-
 // Replaces "bytes" with an encoding of pixels transformed from c_current
 // color space to c_desired.
 Status Encode(const CodecInOut& io, extras::Codec codec,
@@ -52,12 +50,12 @@ Status Encode(const CodecInOut& io, extras::Codec codec,
               std::vector<uint8_t>* bytes, ThreadPool* pool = nullptr);
 
 // Deduces codec, calls Encode and writes to file.
-Status EncodeToFile(const CodecInOut& io, const ColorEncoding& c_desired,
-                    size_t bits_per_sample, const std::string& pathname,
-                    ThreadPool* pool = nullptr);
+Status Encode(const CodecInOut& io, const ColorEncoding& c_desired,
+              size_t bits_per_sample, const std::string& pathname,
+              std::vector<uint8_t>* bytes, ThreadPool* pool = nullptr);
 // Same, but defaults to metadata.original color_encoding and bits_per_sample.
-Status EncodeToFile(const CodecInOut& io, const std::string& pathname,
-                    ThreadPool* pool = nullptr);
+Status Encode(const CodecInOut& io, const std::string& pathname,
+              std::vector<uint8_t>* bytes, ThreadPool* pool = nullptr);
 
 }  // namespace jxl
 
index 19cac39..6e86ba9 100644 (file)
@@ -6,30 +6,28 @@
 #include "lib/extras/codec.h"
 
 #include <stddef.h>
-#include <stdio.h>
 
 #include <algorithm>
+#include <cstdint>
 #include <sstream>
 #include <string>
 #include <utility>
 #include <vector>
 
-#include "lib/extras/dec/pgx.h"
+#include "lib/extras/common.h"
+#include "lib/extras/dec/decode.h"
 #include "lib/extras/dec/pnm.h"
 #include "lib/extras/enc/encode.h"
-#include "lib/extras/packed_image_convert.h"
-#include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/random.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/image.h"
-#include "lib/jxl/image_bundle.h"
-#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
+
+using test::ThreadPoolForTests;
+
 namespace extras {
 namespace {
 
@@ -37,7 +35,6 @@ using ::testing::AllOf;
 using ::testing::Contains;
 using ::testing::Field;
 using ::testing::IsEmpty;
-using ::testing::NotNull;
 using ::testing::SizeIs;
 
 std::string ExtensionFromCodec(Codec codec, const bool is_gray,
@@ -51,18 +48,14 @@ std::string ExtensionFromCodec(Codec codec, const bool is_gray,
     case Codec::kPNG:
       return ".png";
     case Codec::kPNM:
+      if (bits_per_sample == 32) return ".pfm";
       if (has_alpha) return ".pam";
-      if (is_gray) return ".pgm";
-      return (bits_per_sample == 32) ? ".pfm" : ".ppm";
-    case Codec::kGIF:
-      return ".gif";
+      return is_gray ? ".pgm" : ".ppm";
     case Codec::kEXR:
       return ".exr";
-    case Codec::kUnknown:
+    default:
       return std::string();
   }
-  JXL_UNREACHABLE;
-  return std::string();
 }
 
 void VerifySameImage(const PackedImage& im0, size_t bits_per_sample0,
@@ -110,17 +103,16 @@ JxlColorEncoding CreateTestColorEncoding(bool is_gray) {
   // Roundtrip through internal color encoding to fill in primaries and white
   // point CIE xy coordinates.
   ColorEncoding c_internal;
-  JXL_CHECK(ConvertExternalToInternalColorEncoding(c, &c_internal));
-  ConvertInternalToExternalColorEncoding(c_internal, &c);
+  JXL_CHECK(c_internal.FromExternal(c));
+  c = c_internal.ToExternal();
   return c;
 }
 
 std::vector<uint8_t> GenerateICC(JxlColorEncoding color_encoding) {
   ColorEncoding c;
-  JXL_CHECK(ConvertExternalToInternalColorEncoding(color_encoding, &c));
-  JXL_CHECK(c.CreateICC());
-  PaddedBytes icc = c.ICC();
-  return std::vector<uint8_t>(icc.begin(), icc.end());
+  JXL_CHECK(c.FromExternal(color_encoding));
+  JXL_CHECK(!c.ICC().empty());
+  return c.ICC();
 }
 
 void StoreRandomValue(uint8_t* out, Rng* rng, JxlPixelFormat format,
@@ -173,10 +165,11 @@ struct TestImageParams {
   bool is_gray;
   bool add_alpha;
   bool big_endian;
+  bool add_extra_channels;
 
   bool ShouldTestRoundtrip() const {
     if (codec == Codec::kPNG) {
-      return true;
+      return bits_per_sample <= 16;
     } else if (codec == Codec::kPNM) {
       // TODO(szabadka) Make PNM encoder endianness-aware.
       return ((bits_per_sample <= 16 && big_endian) ||
@@ -213,7 +206,7 @@ struct TestImageParams {
   std::string DebugString() const {
     std::ostringstream os;
     os << "bps:" << bits_per_sample << " gr:" << is_gray << " al:" << add_alpha
-       << " be: " << big_endian;
+       << " be: " << big_endian << " ec: " << add_extra_channels;
     return os.str();
   }
 };
@@ -233,6 +226,19 @@ void CreateTestImage(const TestImageParams& params, PackedPixelFile* ppf) {
 
   PackedFrame frame(params.xsize, params.ysize, params.PixelFormat());
   FillPackedImage(params.bits_per_sample, &frame.color);
+  if (params.add_extra_channels) {
+    for (size_t i = 0; i < 7; ++i) {
+      JxlPixelFormat ec_format = params.PixelFormat();
+      ec_format.num_channels = 1;
+      PackedImage ec(params.xsize, params.ysize, ec_format);
+      FillPackedImage(params.bits_per_sample, &ec);
+      frame.extra_channels.emplace_back(std::move(ec));
+      PackedExtraChannel pec;
+      pec.ec_info.bits_per_sample = params.bits_per_sample;
+      pec.ec_info.type = static_cast<JxlExtraChannelType>(i);
+      ppf->extra_channels_info.emplace_back(std::move(pec));
+    }
+  }
   ppf->frames.emplace_back(std::move(frame));
 }
 
@@ -249,33 +255,67 @@ void TestRoundTrip(const TestImageParams& params, ThreadPool* pool) {
 
   EncodedImage encoded;
   auto encoder = Encoder::FromExtension(extension);
-  ASSERT_TRUE(encoder.get());
+  if (!encoder) {
+    fprintf(stderr, "Skipping test because of missing codec support.\n");
+    return;
+  }
   ASSERT_TRUE(encoder->Encode(ppf_in, &encoded, pool));
   ASSERT_EQ(encoded.bitstreams.size(), 1);
 
   PackedPixelFile ppf_out;
-  ASSERT_TRUE(DecodeBytes(Span<const uint8_t>(encoded.bitstreams[0]),
-                          ColorHints(), SizeConstraints(), &ppf_out));
-
-  if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
-      params.codec != Codec::kEXR) {
+  ColorHints color_hints;
+  if (params.codec == Codec::kPNM || params.codec == Codec::kPGX) {
+    color_hints.Add("color_space",
+                    params.is_gray ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
+  }
+  ASSERT_TRUE(DecodeBytes(Bytes(encoded.bitstreams[0]), color_hints, &ppf_out));
+  if (params.codec == Codec::kPNG && ppf_out.icc.empty()) {
+    // Decoding a PNG may drop the ICC profile if there's a valid cICP chunk.
+    // Rendering intent is not preserved in this case.
+    EXPECT_EQ(ppf_in.color_encoding.color_space,
+              ppf_out.color_encoding.color_space);
+    EXPECT_EQ(ppf_in.color_encoding.white_point,
+              ppf_out.color_encoding.white_point);
+    if (ppf_in.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
+      EXPECT_EQ(ppf_in.color_encoding.primaries,
+                ppf_out.color_encoding.primaries);
+    }
+    EXPECT_EQ(ppf_in.color_encoding.transfer_function,
+              ppf_out.color_encoding.transfer_function);
+    EXPECT_EQ(ppf_out.color_encoding.rendering_intent,
+              JXL_RENDERING_INTENT_RELATIVE);
+  } else if (params.codec != Codec::kPNM && params.codec != Codec::kPGX &&
+             params.codec != Codec::kEXR) {
     EXPECT_EQ(ppf_in.icc, ppf_out.icc);
   }
 
   ASSERT_EQ(ppf_out.frames.size(), 1);
-  VerifySameImage(ppf_in.frames[0].color, ppf_in.info.bits_per_sample,
-                  ppf_out.frames[0].color, ppf_out.info.bits_per_sample,
+  const auto& frame_in = ppf_in.frames[0];
+  const auto& frame_out = ppf_out.frames[0];
+  VerifySameImage(frame_in.color, ppf_in.info.bits_per_sample, frame_out.color,
+                  ppf_out.info.bits_per_sample,
                   /*lossless=*/params.codec != Codec::kJPG);
+  ASSERT_EQ(frame_in.extra_channels.size(), frame_out.extra_channels.size());
+  ASSERT_EQ(ppf_out.extra_channels_info.size(),
+            frame_out.extra_channels.size());
+  for (size_t i = 0; i < frame_in.extra_channels.size(); ++i) {
+    VerifySameImage(frame_in.extra_channels[i], ppf_in.info.bits_per_sample,
+                    frame_out.extra_channels[i], ppf_out.info.bits_per_sample,
+                    /*lossless=*/true);
+    EXPECT_EQ(ppf_out.extra_channels_info[i].ec_info.type,
+              ppf_in.extra_channels_info[i].ec_info.type);
+  }
 }
 
 TEST(CodecTest, TestRoundTrip) {
-  ThreadPoolInternal pool(12);
+  ThreadPoolForTests pool(12);
 
   TestImageParams params;
   params.xsize = 7;
   params.ysize = 4;
 
-  for (Codec codec : AvailableCodecs()) {
+  for (Codec codec :
+       {Codec::kPNG, Codec::kPNM, Codec::kPGX, Codec::kEXR, Codec::kJPG}) {
     for (int bits_per_sample : {4, 8, 10, 12, 16, 32}) {
       for (bool is_gray : {false, true}) {
         for (bool add_alpha : {false, true}) {
@@ -285,7 +325,12 @@ TEST(CodecTest, TestRoundTrip) {
             params.is_gray = is_gray;
             params.add_alpha = add_alpha;
             params.big_endian = big_endian;
+            params.add_extra_channels = false;
             TestRoundTrip(params, &pool);
+            if (codec == Codec::kPNM && add_alpha) {
+              params.add_extra_channels = true;
+              TestRoundTrip(params, &pool);
+            }
           }
         }
       }
@@ -293,192 +338,39 @@ TEST(CodecTest, TestRoundTrip) {
   }
 }
 
-CodecInOut DecodeRoundtrip(const std::string& pathname, ThreadPool* pool,
-                           const ColorHints& color_hints = ColorHints()) {
-  CodecInOut io;
-  const PaddedBytes orig = ReadTestData(pathname);
-  JXL_CHECK(
-      SetFromBytes(Span<const uint8_t>(orig), color_hints, &io, pool, nullptr));
-  const ImageBundle& ib1 = io.Main();
-
-  // Encode/Decode again to make sure Encode carries through all metadata.
-  std::vector<uint8_t> encoded;
-  JXL_CHECK(Encode(io, Codec::kPNG, io.metadata.m.color_encoding,
-                   io.metadata.m.bit_depth.bits_per_sample, &encoded, pool));
-
-  CodecInOut io2;
-  JXL_CHECK(SetFromBytes(Span<const uint8_t>(encoded), color_hints, &io2, pool,
-                         nullptr));
-  const ImageBundle& ib2 = io2.Main();
-  EXPECT_EQ(Description(ib1.metadata()->color_encoding),
-            Description(ib2.metadata()->color_encoding));
-  EXPECT_EQ(Description(ib1.c_current()), Description(ib2.c_current()));
-
-  size_t bits_per_sample = io2.metadata.m.bit_depth.bits_per_sample;
-
-  // "Same" pixels?
-  double max_l1 = bits_per_sample <= 12 ? 1.3 : 2E-3;
-  double max_rel = bits_per_sample <= 12 ? 6E-3 : 1E-4;
-  if (ib1.metadata()->color_encoding.IsGray()) {
-    max_rel *= 2.0;
-  } else if (ib1.metadata()->color_encoding.primaries != Primaries::kSRGB) {
-    // Need more tolerance for large gamuts (anything but sRGB)
-    max_l1 *= 1.5;
-    max_rel *= 3.0;
-  }
-  VerifyRelativeError(ib1.color(), ib2.color(), max_l1, max_rel);
-
-  // Simulate the encoder removing profile and decoder restoring it.
-  if (!ib2.metadata()->color_encoding.WantICC()) {
-    io2.metadata.m.color_encoding.InternalRemoveICC();
-    EXPECT_TRUE(io2.metadata.m.color_encoding.CreateICC());
-  }
-
-  return io2;
-}
-
-#if 0
-TEST(CodecTest, TestMetadataSRGB) {
-  ThreadPoolInternal pool(12);
-
-  const char* paths[] = {"external/raw.pixls/DJI-FC6310-16bit_srgb8_v4_krita.png",
-                         "external/raw.pixls/Google-Pixel2XL-16bit_srgb8_v4_krita.png",
-                         "external/raw.pixls/HUAWEI-EVA-L09-16bit_srgb8_dt.png",
-                         "external/raw.pixls/Nikon-D300-12bit_srgb8_dt.png",
-                         "external/raw.pixls/Sony-DSC-RX1RM2-14bit_srgb8_v4_krita.png"};
-  for (const char* relative_pathname : paths) {
-    const CodecInOut io =
-        DecodeRoundtrip(relative_pathname, Codec::kPNG, &pool);
-    EXPECT_EQ(8, io.metadata.m.bit_depth.bits_per_sample);
-    EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-    EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample);
-
-    EXPECT_EQ(64, io.xsize());
-    EXPECT_EQ(64, io.ysize());
-    EXPECT_FALSE(io.metadata.m.HasAlpha());
-
-    const ColorEncoding& c_original = io.metadata.m.color_encoding;
-    EXPECT_FALSE(c_original.ICC().empty());
-    EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
-    EXPECT_EQ(WhitePoint::kD65, c_original.white_point);
-    EXPECT_EQ(Primaries::kSRGB, c_original.primaries);
-    EXPECT_TRUE(c_original.tf.IsSRGB());
-  }
-}
-
-TEST(CodecTest, TestMetadataLinear) {
-  ThreadPoolInternal pool(12);
-
-  const char* paths[3] = {
-      "external/raw.pixls/Google-Pixel2XL-16bit_acescg_g1_v4_krita.png",
-      "external/raw.pixls/HUAWEI-EVA-L09-16bit_709_g1_dt.png",
-      "external/raw.pixls/Nikon-D300-12bit_2020_g1_dt.png",
-  };
-  const WhitePoint white_points[3] = {WhitePoint::kCustom, WhitePoint::kD65,
-                                      WhitePoint::kD65};
-  const Primaries primaries[3] = {Primaries::kCustom, Primaries::kSRGB,
-                                  Primaries::k2100};
-
-  for (size_t i = 0; i < 3; ++i) {
-    const CodecInOut io = DecodeRoundtrip(paths[i], Codec::kPNG, &pool);
-    EXPECT_EQ(16, io.metadata.m.bit_depth.bits_per_sample);
-    EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-    EXPECT_EQ(0, io.metadata.m.bit_depth.exponent_bits_per_sample);
-
-    EXPECT_EQ(64, io.xsize());
-    EXPECT_EQ(64, io.ysize());
-    EXPECT_FALSE(io.metadata.m.HasAlpha());
-
-    const ColorEncoding& c_original = io.metadata.m.color_encoding;
-    EXPECT_FALSE(c_original.ICC().empty());
-    EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
-    EXPECT_EQ(white_points[i], c_original.white_point);
-    EXPECT_EQ(primaries[i], c_original.primaries);
-    EXPECT_TRUE(c_original.tf.IsLinear());
-  }
-}
-
-TEST(CodecTest, TestMetadataICC) {
-  ThreadPoolInternal pool(12);
-
-  const char* paths[] = {
-      "external/raw.pixls/DJI-FC6310-16bit_709_v4_krita.png",
-      "external/raw.pixls/Sony-DSC-RX1RM2-14bit_709_v4_krita.png",
-  };
-  for (const char* relative_pathname : paths) {
-    const CodecInOut io =
-        DecodeRoundtrip(relative_pathname, Codec::kPNG, &pool);
-    EXPECT_GE(16, io.metadata.m.bit_depth.bits_per_sample);
-    EXPECT_LE(14, io.metadata.m.bit_depth.bits_per_sample);
-
-    EXPECT_EQ(64, io.xsize());
-    EXPECT_EQ(64, io.ysize());
-    EXPECT_FALSE(io.metadata.m.HasAlpha());
-
-    const ColorEncoding& c_original = io.metadata.m.color_encoding;
-    EXPECT_FALSE(c_original.ICC().empty());
-    EXPECT_EQ(RenderingIntent::kPerceptual, c_original.rendering_intent);
-    EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
-    EXPECT_EQ(WhitePoint::kD65, c_original.white_point);
-    EXPECT_EQ(Primaries::kSRGB, c_original.primaries);
-    EXPECT_EQ(TransferFunction::k709, c_original.tf.GetTransferFunction());
+TEST(CodecTest, LosslessPNMRoundtrip) {
+  ThreadPoolForTests pool(12);
+
+  static const char* kChannels[] = {"", "g", "ga", "rgb", "rgba"};
+  static const char* kExtension[] = {"", ".pgm", ".pam", ".ppm", ".pam"};
+  for (size_t bit_depth = 1; bit_depth <= 16; ++bit_depth) {
+    for (size_t channels = 1; channels <= 4; ++channels) {
+      if (bit_depth == 1 && (channels == 2 || channels == 4)) continue;
+      std::string extension(kExtension[channels]);
+      std::string filename = "jxl/flower/flower_small." +
+                             std::string(kChannels[channels]) + ".depth" +
+                             std::to_string(bit_depth) + extension;
+      const std::vector<uint8_t> orig = jxl::test::ReadTestData(filename);
+
+      PackedPixelFile ppf;
+      ColorHints color_hints;
+      color_hints.Add("color_space",
+                      channels < 3 ? "Gra_D65_Rel_SRG" : "RGB_D65_SRG_Rel_SRG");
+      ASSERT_TRUE(
+          DecodeBytes(Bytes(orig.data(), orig.size()), color_hints, &ppf));
+
+      EncodedImage encoded;
+      auto encoder = Encoder::FromExtension(extension);
+      ASSERT_TRUE(encoder.get());
+      ASSERT_TRUE(encoder->Encode(ppf, &encoded, &pool));
+      ASSERT_EQ(encoded.bitstreams.size(), 1);
+      ASSERT_EQ(orig.size(), encoded.bitstreams[0].size());
+      EXPECT_EQ(0,
+                memcmp(orig.data(), encoded.bitstreams[0].data(), orig.size()));
+    }
   }
 }
 
-TEST(CodecTest, Testexternal/pngsuite) {
-  ThreadPoolInternal pool(12);
-
-  // Ensure we can load PNG with text, japanese UTF-8, compressed text.
-  (void)DecodeRoundtrip("external/pngsuite/ct1n0g04.png", Codec::kPNG, &pool);
-  (void)DecodeRoundtrip("external/pngsuite/ctjn0g04.png", Codec::kPNG, &pool);
-  (void)DecodeRoundtrip("external/pngsuite/ctzn0g04.png", Codec::kPNG, &pool);
-
-  // Extract gAMA
-  const CodecInOut b1 =
-      DecodeRoundtrip("external/pngsuite/g10n3p04.png", Codec::kPNG, &pool);
-  EXPECT_TRUE(b1.metadata.color_encoding.tf.IsLinear());
-
-  // Extract cHRM
-  const CodecInOut b_p =
-      DecodeRoundtrip("external/pngsuite/ccwn2c08.png", Codec::kPNG, &pool);
-  EXPECT_EQ(Primaries::kSRGB, b_p.metadata.color_encoding.primaries);
-  EXPECT_EQ(WhitePoint::kD65, b_p.metadata.color_encoding.white_point);
-
-  // Extract EXIF from (new-style) dedicated chunk
-  const CodecInOut b_exif =
-      DecodeRoundtrip("external/pngsuite/exif2c08.png", Codec::kPNG, &pool);
-  EXPECT_EQ(978, b_exif.blobs.exif.size());
-}
-#endif
-
-void VerifyWideGamutMetadata(const std::string& relative_pathname,
-                             const Primaries primaries, ThreadPool* pool) {
-  const CodecInOut io = DecodeRoundtrip(relative_pathname, pool);
-
-  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-
-  const ColorEncoding& c_original = io.metadata.m.color_encoding;
-  EXPECT_FALSE(c_original.ICC().empty());
-  EXPECT_EQ(RenderingIntent::kAbsolute, c_original.rendering_intent);
-  EXPECT_EQ(ColorSpace::kRGB, c_original.GetColorSpace());
-  EXPECT_EQ(WhitePoint::kD65, c_original.white_point);
-  EXPECT_EQ(primaries, c_original.primaries);
-}
-
-TEST(CodecTest, TestWideGamut) {
-  ThreadPoolInternal pool(12);
-  // VerifyWideGamutMetadata("external/wide-gamut-tests/P3-sRGB-color-bars.png",
-  //                        Primaries::kP3, &pool);
-  VerifyWideGamutMetadata("external/wide-gamut-tests/P3-sRGB-color-ring.png",
-                          Primaries::kP3, &pool);
-  // VerifyWideGamutMetadata("external/wide-gamut-tests/R2020-sRGB-color-bars.png",
-  //                        Primaries::k2100, &pool);
-  // VerifyWideGamutMetadata("external/wide-gamut-tests/R2020-sRGB-color-ring.png",
-  //                        Primaries::k2100, &pool);
-}
-
 TEST(CodecTest, TestPNM) { TestCodecPNM(); }
 
 TEST(CodecTest, FormatNegotiation) {
@@ -520,13 +412,15 @@ TEST(CodecTest, EncodeToPNG) {
   ThreadPool* const pool = nullptr;
 
   std::unique_ptr<Encoder> png_encoder = Encoder::FromExtension(".png");
-  ASSERT_THAT(png_encoder, NotNull());
+  if (!png_encoder) {
+    fprintf(stderr, "Skipping test because of missing codec support.\n");
+    return;
+  }
 
-  const PaddedBytes original_png =
-      ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
+  const std::vector<uint8_t> original_png = jxl::test::ReadTestData(
+      "external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
   PackedPixelFile ppf;
-  ASSERT_TRUE(extras::DecodeBytes(Span<const uint8_t>(original_png),
-                                  ColorHints(), SizeConstraints(), &ppf));
+  ASSERT_TRUE(extras::DecodeBytes(Bytes(original_png), ColorHints(), &ppf));
 
   const JxlPixelFormat& format = ppf.frames.front().color.format;
   ASSERT_THAT(
@@ -540,9 +434,8 @@ TEST(CodecTest, EncodeToPNG) {
   ASSERT_THAT(encoded_png.bitstreams, SizeIs(1));
 
   PackedPixelFile decoded_ppf;
-  ASSERT_TRUE(
-      extras::DecodeBytes(Span<const uint8_t>(encoded_png.bitstreams.front()),
-                          ColorHints(), SizeConstraints(), &decoded_ppf));
+  ASSERT_TRUE(extras::DecodeBytes(Bytes(encoded_png.bitstreams.front()),
+                                  ColorHints(), &decoded_ppf));
 
   ASSERT_EQ(decoded_ppf.info.bits_per_sample, ppf.info.bits_per_sample);
   ASSERT_EQ(decoded_ppf.frames.size(), 1);
diff --git a/lib/extras/common.cc b/lib/extras/common.cc
new file mode 100644 (file)
index 0000000..e85b43a
--- /dev/null
@@ -0,0 +1,61 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/common.h"
+
+#include <jxl/codestream_header.h>
+#include <jxl/types.h>
+
+#include <cstddef>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+Status SelectFormat(const std::vector<JxlPixelFormat>& accepted_formats,
+                    const JxlBasicInfo& basic_info, JxlPixelFormat* format) {
+  const size_t original_bit_depth = basic_info.bits_per_sample;
+  size_t current_bit_depth = 0;
+  size_t num_alpha_channels = (basic_info.alpha_bits != 0 ? 1 : 0);
+  size_t num_channels = basic_info.num_color_channels + num_alpha_channels;
+  for (;;) {
+    for (const JxlPixelFormat& candidate : accepted_formats) {
+      if (candidate.num_channels != num_channels) continue;
+      const size_t candidate_bit_depth =
+          PackedImage::BitsPerChannel(candidate.data_type);
+      if (
+          // Candidate bit depth is less than what we have and still enough
+          (original_bit_depth <= candidate_bit_depth &&
+           candidate_bit_depth < current_bit_depth) ||
+          // Or larger than the too-small bit depth we currently have
+          (current_bit_depth < candidate_bit_depth &&
+           current_bit_depth < original_bit_depth)) {
+        *format = candidate;
+        current_bit_depth = candidate_bit_depth;
+      }
+    }
+    if (current_bit_depth == 0) {
+      if (num_channels > basic_info.num_color_channels) {
+        // Try dropping the alpha channel.
+        --num_channels;
+        continue;
+      }
+      return JXL_FAILURE("no appropriate format found");
+    }
+    break;
+  }
+  if (current_bit_depth < original_bit_depth) {
+    JXL_WARNING("encoding %" PRIuS "-bit original to %" PRIuS " bits",
+                original_bit_depth, current_bit_depth);
+  }
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/lib/extras/common.h b/lib/extras/common.h
new file mode 100644 (file)
index 0000000..88ed581
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_COMMON_H_
+#define LIB_EXTRAS_COMMON_H_
+
+#include <jxl/codestream_header.h>
+#include <jxl/types.h>
+
+#include <vector>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+// TODO(sboukortt): consider exposing this as part of the C API.
+Status SelectFormat(const std::vector<JxlPixelFormat>& accepted_formats,
+                    const JxlBasicInfo& basic_info, JxlPixelFormat* format);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_COMMON_H_
index 5667466..f77dab7 100644 (file)
  *
  */
 
-#include <stdio.h>
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
 #include <string.h>
 
 #include <string>
 #include <utility>
 #include <vector>
 
-#include "jxl/codestream_header.h"
-#include "jxl/encode.h"
+#include "lib/extras/size_constraints.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/scope_guard.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/sanitizers.h"
+#if JPEGXL_ENABLE_APNG
 #include "png.h" /* original (unpatched) libpng is ok */
+#endif
 
 namespace jxl {
 namespace extras {
 
+#if JPEGXL_ENABLE_APNG
 namespace {
 
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+
 /* hIST chunk tail is not proccesed properly; skip this chunk completely;
    see https://github.com/glennrp/libpng/pull/413 */
 const png_byte kIgnoredPngChunks[] = {
@@ -73,11 +80,145 @@ Status DecodeSRGB(const unsigned char* payload, const size_t payload_size,
   if (payload_size != 1) return JXL_FAILURE("Wrong sRGB size");
   // (PNG uses the same values as ICC.)
   if (payload[0] >= 4) return JXL_FAILURE("Invalid Rendering Intent");
+  color_encoding->white_point = JXL_WHITE_POINT_D65;
+  color_encoding->primaries = JXL_PRIMARIES_SRGB;
+  color_encoding->transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
   color_encoding->rendering_intent =
       static_cast<JxlRenderingIntent>(payload[0]);
   return true;
 }
 
+// If the cICP profile is not fully supported, return false and leave
+// color_encoding unmodified.
+Status DecodeCICP(const unsigned char* payload, const size_t payload_size,
+                  JxlColorEncoding* color_encoding) {
+  if (payload_size != 4) return JXL_FAILURE("Wrong cICP size");
+  JxlColorEncoding color_enc = *color_encoding;
+
+  // From https://www.itu.int/rec/T-REC-H.273-202107-I/en
+  if (payload[0] == 1) {
+    // IEC 61966-2-1 sRGB
+    color_enc.primaries = JXL_PRIMARIES_SRGB;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 4) {
+    // Rec. ITU-R BT.470-6 System M
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.67;
+    color_enc.primaries_red_xy[1] = 0.33;
+    color_enc.primaries_green_xy[0] = 0.21;
+    color_enc.primaries_green_xy[1] = 0.71;
+    color_enc.primaries_blue_xy[0] = 0.14;
+    color_enc.primaries_blue_xy[1] = 0.08;
+    color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
+    color_enc.white_point_xy[0] = 0.310;
+    color_enc.white_point_xy[1] = 0.316;
+  } else if (payload[0] == 5) {
+    // Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.64;
+    color_enc.primaries_red_xy[1] = 0.33;
+    color_enc.primaries_green_xy[0] = 0.29;
+    color_enc.primaries_green_xy[1] = 0.60;
+    color_enc.primaries_blue_xy[0] = 0.15;
+    color_enc.primaries_blue_xy[1] = 0.06;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 6 || payload[0] == 7) {
+    // SMPTE ST 170 (2004) / SMPTE ST 240 (1999)
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.630;
+    color_enc.primaries_red_xy[1] = 0.340;
+    color_enc.primaries_green_xy[0] = 0.310;
+    color_enc.primaries_green_xy[1] = 0.595;
+    color_enc.primaries_blue_xy[0] = 0.155;
+    color_enc.primaries_blue_xy[1] = 0.070;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 8) {
+    // Generic film (colour filters using Illuminant C)
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.681;
+    color_enc.primaries_red_xy[1] = 0.319;
+    color_enc.primaries_green_xy[0] = 0.243;
+    color_enc.primaries_green_xy[1] = 0.692;
+    color_enc.primaries_blue_xy[0] = 0.145;
+    color_enc.primaries_blue_xy[1] = 0.049;
+    color_enc.white_point = JXL_WHITE_POINT_CUSTOM;
+    color_enc.white_point_xy[0] = 0.310;
+    color_enc.white_point_xy[1] = 0.316;
+  } else if (payload[0] == 9) {
+    // Rec. ITU-R BT.2100-2
+    color_enc.primaries = JXL_PRIMARIES_2100;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 10) {
+    // CIE 1931 XYZ
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 1;
+    color_enc.primaries_red_xy[1] = 0;
+    color_enc.primaries_green_xy[0] = 0;
+    color_enc.primaries_green_xy[1] = 1;
+    color_enc.primaries_blue_xy[0] = 0;
+    color_enc.primaries_blue_xy[1] = 0;
+    color_enc.white_point = JXL_WHITE_POINT_E;
+  } else if (payload[0] == 11) {
+    // SMPTE RP 431-2 (2011)
+    color_enc.primaries = JXL_PRIMARIES_P3;
+    color_enc.white_point = JXL_WHITE_POINT_DCI;
+  } else if (payload[0] == 12) {
+    // SMPTE EG 432-1 (2010)
+    color_enc.primaries = JXL_PRIMARIES_P3;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else if (payload[0] == 22) {
+    color_enc.primaries = JXL_PRIMARIES_CUSTOM;
+    color_enc.primaries_red_xy[0] = 0.630;
+    color_enc.primaries_red_xy[1] = 0.340;
+    color_enc.primaries_green_xy[0] = 0.295;
+    color_enc.primaries_green_xy[1] = 0.605;
+    color_enc.primaries_blue_xy[0] = 0.155;
+    color_enc.primaries_blue_xy[1] = 0.077;
+    color_enc.white_point = JXL_WHITE_POINT_D65;
+  } else {
+    JXL_WARNING("Unsupported primaries specified in cICP chunk: %d",
+                static_cast<int>(payload[0]));
+    return false;
+  }
+
+  if (payload[1] == 1 || payload[1] == 6 || payload[1] == 14 ||
+      payload[1] == 15) {
+    // Rec. ITU-R BT.709-6
+    color_enc.transfer_function = JXL_TRANSFER_FUNCTION_709;
+  } else if (payload[1] == 4) {
+    // Rec. ITU-R BT.1700-0 625 PAL and 625 SECAM
+    color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+    color_enc.gamma = 1 / 2.2;
+  } else if (payload[1] == 5) {
+    // Rec. ITU-R BT.470-6 System B, G
+    color_enc.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+    color_enc.gamma = 1 / 2.8;
+  } else if (payload[1] == 8 || payload[1] == 13 || payload[1] == 16 ||
+             payload[1] == 17 || payload[1] == 18) {
+    // These codes all match the corresponding JXL enum values
+    color_enc.transfer_function = static_cast<JxlTransferFunction>(payload[1]);
+  } else {
+    JXL_WARNING("Unsupported transfer function specified in cICP chunk: %d",
+                static_cast<int>(payload[1]));
+    return false;
+  }
+
+  if (payload[2] != 0) {
+    JXL_WARNING("Unsupported color space specified in cICP chunk: %d",
+                static_cast<int>(payload[2]));
+    return false;
+  }
+  if (payload[3] != 1) {
+    JXL_WARNING("Unsupported full-range flag specified in cICP chunk: %d",
+                static_cast<int>(payload[3]));
+    return false;
+  }
+  // cICP has no rendering intent, so use the default
+  color_enc.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
+  *color_encoding = color_enc;
+  return true;
+}
+
 Status DecodeGAMA(const unsigned char* payload, const size_t payload_size,
                   JxlColorEncoding* color_encoding) {
   if (payload_size != 4) return JXL_FAILURE("Wrong gAMA size");
@@ -129,6 +270,11 @@ class BlobsReaderPNG {
       return false;
     }
     if (type == "exif") {
+      // Remove "Exif\0\0" prefix if present
+      if (bytes.size() >= sizeof kExifSignature &&
+          memcmp(bytes.data(), kExifSignature, sizeof kExifSignature) == 0) {
+        bytes.erase(bytes.begin(), bytes.begin() + sizeof kExifSignature);
+      }
       if (!metadata->exif.empty()) {
         JXL_WARNING("overwriting EXIF (%" PRIuS " bytes) with base16 (%" PRIuS
                     " bytes)",
@@ -136,9 +282,9 @@ class BlobsReaderPNG {
       }
       metadata->exif = std::move(bytes);
     } else if (type == "iptc") {
-      // TODO (jon): Deal with IPTC in some way
+      // TODO(jon): Deal with IPTC in some way
     } else if (type == "8bim") {
-      // TODO (jon): Deal with 8bim in some way
+      // TODO(jon): Deal with 8bim in some way
     } else if (type == "xmp") {
       if (!metadata->xmp.empty()) {
         JXL_WARNING("overwriting XMP (%" PRIuS " bytes) with base16 (%" PRIuS
@@ -228,6 +374,10 @@ class BlobsReaderPNG {
     // We parsed so far a \n, some number of non \n characters and are now
     // pointing at a \n.
     if (*(pos++) != '\n') return false;
+    // Skip leading spaces
+    while (pos < encoded_end && *pos == ' ') {
+      pos++;
+    }
     uint32_t bytes_to_decode = 0;
     JXL_RETURN_IF_ERROR(DecodeDecimal(&pos, encoded_end, &bytes_to_decode));
 
@@ -274,6 +424,7 @@ constexpr uint32_t kId_fcTL = 0x4C546366;
 constexpr uint32_t kId_IDAT = 0x54414449;
 constexpr uint32_t kId_fdAT = 0x54416466;
 constexpr uint32_t kId_IEND = 0x444E4549;
+constexpr uint32_t kId_cICP = 0x50434963;
 constexpr uint32_t kId_iCCP = 0x50434369;
 constexpr uint32_t kId_sRGB = 0x42475273;
 constexpr uint32_t kId_gAMA = 0x414D4167;
@@ -342,6 +493,12 @@ int processing_start(png_structp& png_ptr, png_infop& info_ptr, void* frame_ptr,
                      std::vector<std::vector<uint8_t>>& chunksInfo) {
   unsigned char header[8] = {137, 80, 78, 71, 13, 10, 26, 10};
 
+  // Cleanup prior decoder, if any.
+  png_destroy_read_struct(&png_ptr, &info_ptr, 0);
+  // Just in case. Not all versions on libpng wipe-out the pointers.
+  png_ptr = nullptr;
+  info_ptr = nullptr;
+
   png_ptr = png_create_read_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
   info_ptr = png_create_info_struct(png_ptr);
   if (!png_ptr || !info_ptr) return 1;
@@ -403,11 +560,20 @@ int processing_finish(png_structp png_ptr, png_infop info_ptr,
 }
 
 }  // namespace
+#endif
+
+bool CanDecodeAPNG() {
+#if JPEGXL_ENABLE_APNG
+  return true;
+#else
+  return false;
+#endif
+}
 
 Status DecodeImageAPNG(const Span<const uint8_t> bytes,
-                       const ColorHints& color_hints,
-                       const SizeConstraints& constraints,
-                       PackedPixelFile* ppf) {
+                       const ColorHints& color_hints, PackedPixelFile* ppf,
+                       const SizeConstraints* constraints) {
+#if JPEGXL_ENABLE_APNG
   Reader r;
   unsigned int id, j, w, h, w0, h0, x0, y0;
   unsigned int delay_num, delay_den, dop, bop, rowbytes, imagesize;
@@ -419,6 +585,7 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
   std::vector<std::vector<uint8_t>> chunksInfo;
   bool isAnimated = false;
   bool hasInfo = false;
+  bool seenFctl = false;
   APNGFrame frameRaw = {};
   uint32_t num_channels;
   JxlPixelFormat format;
@@ -457,7 +624,8 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
 
   ppf->frames.clear();
 
-  bool have_color = false, have_srgb = false;
+  bool have_color = false;
+  bool have_cicp = false, have_iccp = false, have_srgb = false;
   bool errorstate = true;
   if (id == kId_IHDR && chunkIHDR.size() == 25) {
     x0 = 0;
@@ -478,12 +646,14 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
     ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
     ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
     ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
+    ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_RELATIVE;
 
     if (!processing_start(png_ptr, info_ptr, (void*)&frameRaw, hasInfo,
                           chunkIHDR, chunksInfo)) {
       while (!r.Eof()) {
         id = read_chunk(&r, &chunk);
         if (!id) break;
+        seenFctl |= (id == kId_fcTL);
 
         if (id == kId_acTL && !hasInfo && !isAnimated) {
           isAnimated = true;
@@ -544,11 +714,16 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
           }
         } else if (id == kId_IDAT) {
           // First IDAT chunk means we now have all header info
+          if (seenFctl) {
+            // `fcTL` chunk must appear after all `IDAT` chunks
+            return JXL_FAILURE("IDAT chunk after fcTL chunk");
+          }
           hasInfo = true;
           JXL_CHECK(w == png_get_image_width(png_ptr, info_ptr));
           JXL_CHECK(h == png_get_image_height(png_ptr, info_ptr));
           int colortype = png_get_color_type(png_ptr, info_ptr);
-          ppf->info.bits_per_sample = png_get_bit_depth(png_ptr, info_ptr);
+          int png_bit_depth = png_get_bit_depth(png_ptr, info_ptr);
+          ppf->info.bits_per_sample = png_bit_depth;
           png_color_8p sigbits = NULL;
           png_get_sBIT(png_ptr, info_ptr, &sigbits);
           if (colortype & 1) {
@@ -559,8 +734,18 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
             ppf->info.num_color_channels = 3;
             ppf->color_encoding.color_space = JXL_COLOR_SPACE_RGB;
             if (sigbits && sigbits->red == sigbits->green &&
-                sigbits->green == sigbits->blue)
+                sigbits->green == sigbits->blue) {
               ppf->info.bits_per_sample = sigbits->red;
+            } else if (sigbits) {
+              int maxbps = std::max(sigbits->red,
+                                    std::max(sigbits->green, sigbits->blue));
+              JXL_WARNING(
+                  "sBIT chunk: bit depths for R, G, and B are not the same (%i "
+                  "%i %i), while in JPEG XL they have to be the same. Setting "
+                  "RGB bit depth to %i.",
+                  sigbits->red, sigbits->green, sigbits->blue, maxbps);
+              ppf->info.bits_per_sample = maxbps;
+            }
           } else {
             ppf->info.num_color_channels = 1;
             ppf->color_encoding.color_space = JXL_COLOR_SPACE_GRAY;
@@ -569,12 +754,12 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
           if (colortype & 4 ||
               png_get_valid(png_ptr, info_ptr, PNG_INFO_tRNS)) {
             ppf->info.alpha_bits = ppf->info.bits_per_sample;
-            if (sigbits) {
-              if (sigbits->alpha &&
-                  sigbits->alpha != ppf->info.bits_per_sample) {
-                return JXL_FAILURE("Unsupported alpha bit-depth");
-              }
-              ppf->info.alpha_bits = sigbits->alpha;
+            if (sigbits && sigbits->alpha != ppf->info.bits_per_sample) {
+              JXL_WARNING(
+                  "sBIT chunk: bit depths for RGBA are inconsistent "
+                  "(%i %i %i %i). Setting A bitdepth to %i.",
+                  sigbits->red, sigbits->green, sigbits->blue, sigbits->alpha,
+                  ppf->info.bits_per_sample);
             }
           } else {
             ppf->info.alpha_bits = 0;
@@ -584,7 +769,7 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
                                                  : JXL_COLOR_SPACE_RGB);
           ppf->info.xsize = w;
           ppf->info.ysize = h;
-          JXL_RETURN_IF_ERROR(VerifyDimensions(&constraints, w, h));
+          JXL_RETURN_IF_ERROR(VerifyDimensions(constraints, w, h));
           num_channels =
               ppf->info.num_color_channels + (ppf->info.alpha_bits ? 1 : 0);
           format = {
@@ -594,6 +779,9 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
               /*endianness=*/JXL_BIG_ENDIAN,
               /*align=*/0,
           };
+          if (png_bit_depth > 8 && format.data_type == JXL_TYPE_UINT8) {
+            png_set_strip_16(png_ptr);
+          }
           bytes_per_pixel =
               num_channels * (format.data_type == JXL_TYPE_UINT16 ? 2 : 1);
           rowbytes = w * bytes_per_pixel;
@@ -607,13 +795,26 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
             break;
           }
         } else if (id == kId_fdAT && isAnimated) {
+          if (!hasInfo) {
+            return JXL_FAILURE("fDAT chunk before iDAT");
+          }
           png_save_uint_32(chunk.data() + 4, chunk.size() - 16);
           memcpy(chunk.data() + 8, "IDAT", 4);
           if (processing_data(png_ptr, info_ptr, chunk.data() + 4,
                               chunk.size() - 4)) {
             break;
           }
-        } else if (id == kId_iCCP) {
+        } else if (id == kId_cICP) {
+          // Color profile chunks: cICP has the highest priority, followed by
+          // iCCP and sRGB (which shouldn't co-exist, but if they do, we use
+          // iCCP), followed finally by gAMA and cHRM.
+          if (DecodeCICP(chunk.data() + 8, chunk.size() - 12,
+                         &ppf->color_encoding)) {
+            have_cicp = true;
+            have_color = true;
+            ppf->icc.clear();
+          }
+        } else if (!have_cicp && id == kId_iCCP) {
           if (processing_data(png_ptr, info_ptr, chunk.data(), chunk.size())) {
             JXL_WARNING("Corrupt iCCP chunk");
             break;
@@ -630,19 +831,20 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
           if (ok && proflen) {
             ppf->icc.assign(profile, profile + proflen);
             have_color = true;
+            have_iccp = true;
           } else {
             // TODO(eustas): JXL_WARNING?
           }
-        } else if (id == kId_sRGB) {
+        } else if (!have_cicp && !have_iccp && id == kId_sRGB) {
           JXL_RETURN_IF_ERROR(DecodeSRGB(chunk.data() + 8, chunk.size() - 12,
                                          &ppf->color_encoding));
           have_srgb = true;
           have_color = true;
-        } else if (id == kId_gAMA) {
+        } else if (!have_cicp && !have_srgb && !have_iccp && id == kId_gAMA) {
           JXL_RETURN_IF_ERROR(DecodeGAMA(chunk.data() + 8, chunk.size() - 12,
                                          &ppf->color_encoding));
           have_color = true;
-        } else if (id == kId_cHRM) {
+        } else if (!have_cicp && !have_srgb && !have_iccp && id == kId_cHRM) {
           JXL_RETURN_IF_ERROR(DecodeCHRM(chunk.data() + 8, chunk.size() - 12,
                                          &ppf->color_encoding));
           have_color = true;
@@ -665,12 +867,6 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
       }
     }
 
-    if (have_srgb) {
-      ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
-      ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
-      ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
-      ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
-    }
     JXL_RETURN_IF_ERROR(ApplyColorHints(
         color_hints, have_color, ppf->info.num_color_channels == 1, ppf));
   }
@@ -706,31 +902,29 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
     size_t xsize = frame.data.xsize;
     size_t ysize = frame.data.ysize;
     if (previous_frame_should_be_cleared) {
-      size_t xs = frame.data.xsize;
-      size_t ys = frame.data.ysize;
       size_t px0 = frames[i - 1].x0;
       size_t py0 = frames[i - 1].y0;
       size_t pxs = frames[i - 1].xsize;
       size_t pys = frames[i - 1].ysize;
-      if (px0 >= x0 && py0 >= y0 && px0 + pxs <= x0 + xs &&
-          py0 + pys <= y0 + ys && frame.blend_op == BLEND_OP_SOURCE &&
+      if (px0 >= x0 && py0 >= y0 && px0 + pxs <= x0 + xsize &&
+          py0 + pys <= y0 + ysize && frame.blend_op == BLEND_OP_SOURCE &&
           use_for_next_frame) {
         // If the previous frame is entirely contained in the current frame and
         // we are using BLEND_OP_SOURCE, nothing special needs to be done.
         ppf->frames.emplace_back(std::move(frame.data));
-      } else if (px0 == x0 && py0 == y0 && px0 + pxs == x0 + xs &&
-                 py0 + pys == y0 + ys && use_for_next_frame) {
+      } else if (px0 == x0 && py0 == y0 && px0 + pxs == x0 + xsize &&
+                 py0 + pys == y0 + ysize && use_for_next_frame) {
         // If the new frame has the same size as the old one, but we are
         // blending, we can instead just not blend.
         should_blend = false;
         ppf->frames.emplace_back(std::move(frame.data));
-      } else if (px0 <= x0 && py0 <= y0 && px0 + pxs >= x0 + xs &&
-                 py0 + pys >= y0 + ys && use_for_next_frame) {
+      } else if (px0 <= x0 && py0 <= y0 && px0 + pxs >= x0 + xsize &&
+                 py0 + pys >= y0 + ysize && use_for_next_frame) {
         // If the new frame is contained within the old frame, we can pad the
         // new frame with zeros and not blend.
         PackedImage new_data(pxs, pys, frame.data.format);
         memset(new_data.pixels(), 0, new_data.pixels_size);
-        for (size_t y = 0; y < ys; y++) {
+        for (size_t y = 0; y < ysize; y++) {
           size_t bytes_per_pixel =
               PackedImage::BitsPerChannel(new_data.format.data_type) *
               new_data.format.num_channels / 8;
@@ -739,7 +933,7 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
                      bytes_per_pixel * (x0 - px0),
                  static_cast<const uint8_t*>(frame.data.pixels()) +
                      frame.data.stride * y,
-                 xs * bytes_per_pixel);
+                 xsize * bytes_per_pixel);
         }
 
         x0 = px0;
@@ -749,19 +943,21 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
         should_blend = false;
         ppf->frames.emplace_back(std::move(new_data));
       } else {
-        // If all else fails, insert a dummy blank frame with kReplace.
+        // If all else fails, insert a placeholder blank frame with kReplace.
         PackedImage blank(pxs, pys, frame.data.format);
         memset(blank.pixels(), 0, blank.pixels_size);
         ppf->frames.emplace_back(std::move(blank));
         auto& pframe = ppf->frames.back();
         pframe.frame_info.layer_info.crop_x0 = px0;
         pframe.frame_info.layer_info.crop_y0 = py0;
-        pframe.frame_info.layer_info.xsize = frame.xsize;
-        pframe.frame_info.layer_info.ysize = frame.ysize;
+        pframe.frame_info.layer_info.xsize = pxs;
+        pframe.frame_info.layer_info.ysize = pys;
         pframe.frame_info.duration = 0;
-        pframe.frame_info.layer_info.have_crop = 0;
+        bool is_full_size = px0 == 0 && py0 == 0 && pxs == ppf->info.xsize &&
+                            pys == ppf->info.ysize;
+        pframe.frame_info.layer_info.have_crop = is_full_size ? 0 : 1;
         pframe.frame_info.layer_info.blend_info.blendmode = JXL_BLEND_REPLACE;
-        pframe.frame_info.layer_info.blend_info.source = 0;
+        pframe.frame_info.layer_info.blend_info.source = 1;
         pframe.frame_info.layer_info.save_as_reference = 1;
         ppf->frames.emplace_back(std::move(frame.data));
       }
@@ -780,7 +976,7 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
     bool is_full_size = x0 == 0 && y0 == 0 && xsize == ppf->info.xsize &&
                         ysize == ppf->info.ysize;
     pframe.frame_info.layer_info.have_crop = is_full_size ? 0 : 1;
-    pframe.frame_info.layer_info.blend_info.source = should_blend ? 1 : 0;
+    pframe.frame_info.layer_info.blend_info.source = 1;
     pframe.frame_info.layer_info.blend_info.alpha = 0;
     pframe.frame_info.layer_info.save_as_reference = use_for_next_frame ? 1 : 0;
 
@@ -791,6 +987,9 @@ Status DecodeImageAPNG(const Span<const uint8_t> bytes,
   ppf->frames.back().frame_info.is_last = true;
 
   return true;
+#else
+  return false;
+#endif
 }
 
 }  // namespace extras
index a68f6f8..d91364b 100644 (file)
 #include "lib/extras/dec/color_hints.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
+
+struct SizeConstraints;
+
 namespace extras {
 
+bool CanDecodeAPNG();
+
 // Decodes `bytes` into `ppf`.
 Status DecodeImageAPNG(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                       const SizeConstraints& constraints,
-                       PackedPixelFile* ppf);
+                       PackedPixelFile* ppf,
+                       const SizeConstraints* constraints = nullptr);
 
 }  // namespace extras
 }  // namespace jxl
index 2325b50..54f6aa4 100644 (file)
@@ -69,9 +69,9 @@ Status ParseEnum(const std::string& token, const EnumName<T>* enum_values,
   }
   return false;
 }
-#define ARRAYSIZE(X) (sizeof(X) / sizeof((X)[0]))
+#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
 #define PARSE_ENUM(type, token, value) \
-  ParseEnum<type>(token, k##type##Names, ARRAYSIZE(k##type##Names), value)
+  ParseEnum<type>(token, k##type##Names, ARRAY_SIZE(k##type##Names), value)
 
 class Tokenizer {
  public:
index 989d591..23680ff 100644 (file)
@@ -6,9 +6,10 @@
 #ifndef LIB_EXTRAS_COLOR_DESCRIPTION_H_
 #define LIB_EXTRAS_COLOR_DESCRIPTION_H_
 
+#include <jxl/color_encoding.h>
+
 #include <string>
 
-#include "jxl/color_encoding.h"
 #include "lib/jxl/base/status.h"
 
 namespace jxl {
index 8ae9e5d..e6e34f0 100644 (file)
@@ -5,9 +5,9 @@
 
 #include "lib/extras/dec/color_description.h"
 
-#include "gtest/gtest.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 
@@ -21,8 +21,7 @@ TEST(ColorDescriptionTest, RoundTripAll) {
     JxlColorEncoding c_external = {};
     EXPECT_TRUE(ParseDescription(description, &c_external));
     ColorEncoding c_internal;
-    EXPECT_TRUE(
-        ConvertExternalToInternalColorEncoding(c_external, &c_internal));
+    EXPECT_TRUE(c_internal.FromExternal(c_external));
     EXPECT_TRUE(c_original.SameColorEncoding(c_internal))
         << "Where c_original=" << c_original
         << " and c_internal=" << c_internal;
index cf7d3e3..5c6d7b8 100644 (file)
@@ -5,9 +5,12 @@
 
 #include "lib/extras/dec/color_hints.h"
 
-#include "jxl/encode.h"
+#include <jxl/encode.h>
+
+#include <vector>
+
 #include "lib/extras/dec/color_description.h"
-#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/base/status.h"
 
 namespace jxl {
 namespace extras {
@@ -15,19 +18,15 @@ namespace extras {
 Status ApplyColorHints(const ColorHints& color_hints,
                        const bool color_already_set, const bool is_gray,
                        PackedPixelFile* ppf) {
-  if (color_already_set) {
-    return color_hints.Foreach(
-        [](const std::string& key, const std::string& /*value*/) {
-          JXL_WARNING("Decoder ignoring %s hint", key.c_str());
-          return true;
-        });
-  }
-
-  bool got_color_space = false;
+  bool got_color_space = color_already_set;
 
   JXL_RETURN_IF_ERROR(color_hints.Foreach(
-      [is_gray, ppf, &got_color_space](const std::string& key,
-                                       const std::string& value) -> Status {
+      [color_already_set, is_gray, ppf, &got_color_space](
+          const std::string& key, const std::string& value) -> Status {
+        if (color_already_set && (key == "color_space" || key == "icc")) {
+          JXL_WARNING("Decoder ignoring %s hint", key.c_str());
+          return true;
+        }
         if (key == "color_space") {
           JxlColorEncoding c_original_external;
           if (!ParseDescription(value, &c_original_external)) {
@@ -41,9 +40,23 @@ Status ApplyColorHints(const ColorHints& color_hints,
           }
 
           got_color_space = true;
-        } else if (key == "icc_pathname") {
-          JXL_RETURN_IF_ERROR(ReadFile(value, &ppf->icc));
+        } else if (key == "icc") {
+          const uint8_t* data = reinterpret_cast<const uint8_t*>(value.data());
+          std::vector<uint8_t> icc(data, data + value.size());
+          ppf->icc.swap(icc);
           got_color_space = true;
+        } else if (key == "exif") {
+          const uint8_t* data = reinterpret_cast<const uint8_t*>(value.data());
+          std::vector<uint8_t> blob(data, data + value.size());
+          ppf->metadata.exif.swap(blob);
+        } else if (key == "xmp") {
+          const uint8_t* data = reinterpret_cast<const uint8_t*>(value.data());
+          std::vector<uint8_t> blob(data, data + value.size());
+          ppf->metadata.xmp.swap(blob);
+        } else if (key == "jumbf") {
+          const uint8_t* data = reinterpret_cast<const uint8_t*>(value.data());
+          std::vector<uint8_t> blob(data, data + value.size());
+          ppf->metadata.jumbf.swap(blob);
         } else {
           JXL_WARNING("Ignoring %s hint", key.c_str());
         }
@@ -51,7 +64,6 @@ Status ApplyColorHints(const ColorHints& color_hints,
       }));
 
   if (!got_color_space) {
-    JXL_WARNING("No color_space/icc_pathname given, assuming sRGB");
     ppf->color_encoding.color_space =
         is_gray ? JXL_COLOR_SPACE_GRAY : JXL_COLOR_SPACE_RGB;
     ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
index 9c7de88..036f203 100644 (file)
@@ -10,6 +10,8 @@
 // information into the file, and those that support it may not have it.
 // To allow attaching color information to those file formats the caller can
 // define these color hints.
+// Besides color space information, 'ColorHints' may also include other
+// additional information such as Exif, XMP and JUMBF metadata.
 
 #include <stddef.h>
 #include <stdint.h>
index 8712e03..9149208 100644 (file)
@@ -7,18 +7,11 @@
 
 #include <locale>
 
-#if JPEGXL_ENABLE_APNG
 #include "lib/extras/dec/apng.h"
-#endif
-#if JPEGXL_ENABLE_EXR
 #include "lib/extras/dec/exr.h"
-#endif
-#if JPEGXL_ENABLE_GIF
 #include "lib/extras/dec/gif.h"
-#endif
-#if JPEGXL_ENABLE_JPEG
 #include "lib/extras/dec/jpg.h"
-#endif
+#include "lib/extras/dec/jxl.h"
 #include "lib/extras/dec/pgx.h"
 #include "lib/extras/dec/pnm.h"
 
@@ -29,59 +22,89 @@ namespace {
 // Any valid encoding is larger (ensures codecs can read the first few bytes)
 constexpr size_t kMinBytes = 9;
 
-}  // namespace
-
-std::vector<Codec> AvailableCodecs() {
-  std::vector<Codec> out;
-#if JPEGXL_ENABLE_APNG
-  out.push_back(Codec::kPNG);
-#endif
-#if JPEGXL_ENABLE_EXR
-  out.push_back(Codec::kEXR);
-#endif
-#if JPEGXL_ENABLE_GIF
-  out.push_back(Codec::kGIF);
-#endif
-#if JPEGXL_ENABLE_JPEG
-  out.push_back(Codec::kJPG);
-#endif
-  out.push_back(Codec::kPGX);
-  out.push_back(Codec::kPNM);
-  return out;
-}
+void BasenameAndExtension(const std::string& path, std::string* filename,
+                          std::string* extension) {
+  // Pattern: "png:name" or "png:-"
+  size_t pos = path.find_first_of(':');
+  if (pos != std::string::npos) {
+    *extension = "." + path.substr(0, pos);
+    *filename = path.substr(pos + 1);
+    //+ ((path.length() == pos + 2 && path.substr(pos + 1, 1) == "-") ? "" :
+    //*extension);
+    return;
+  }
 
-Codec CodecFromExtension(std::string extension,
-                         size_t* JXL_RESTRICT bits_per_sample) {
-  std::transform(
-      extension.begin(), extension.end(), extension.begin(),
-      [](char c) { return std::tolower(c, std::locale::classic()); });
-  if (extension == ".png") return Codec::kPNG;
+  // Pattern: "name.png"
+  pos = path.find_last_of('.');
+  if (pos != std::string::npos) {
+    *extension = path.substr(pos);
+    *filename = path;
+    return;
+  }
 
-  if (extension == ".jpg") return Codec::kJPG;
-  if (extension == ".jpeg") return Codec::kJPG;
+  // Extension not found
+  *filename = path;
+  *extension = "";
+}
 
-  if (extension == ".pgx") return Codec::kPGX;
+}  // namespace
 
-  if (extension == ".pam") return Codec::kPNM;
-  if (extension == ".pnm") return Codec::kPNM;
-  if (extension == ".pgm") return Codec::kPNM;
-  if (extension == ".ppm") return Codec::kPNM;
-  if (extension == ".pfm") {
+Codec CodecFromPath(std::string path, size_t* JXL_RESTRICT bits_per_sample,
+                    std::string* filename, std::string* extension) {
+  std::string base;
+  std::string ext;
+  BasenameAndExtension(path, &base, &ext);
+  if (filename) *filename = base;
+  if (extension) *extension = ext;
+
+  std::transform(ext.begin(), ext.end(), ext.begin(), [](char c) {
+    return std::tolower(c, std::locale::classic());
+  });
+  if (ext == ".png") return Codec::kPNG;
+
+  if (ext == ".jpg") return Codec::kJPG;
+  if (ext == ".jpeg") return Codec::kJPG;
+
+  if (ext == ".pgx") return Codec::kPGX;
+
+  if (ext == ".pam") return Codec::kPNM;
+  if (ext == ".pnm") return Codec::kPNM;
+  if (ext == ".pgm") return Codec::kPNM;
+  if (ext == ".ppm") return Codec::kPNM;
+  if (ext == ".pfm") {
     if (bits_per_sample != nullptr) *bits_per_sample = 32;
     return Codec::kPNM;
   }
 
-  if (extension == ".gif") return Codec::kGIF;
+  if (ext == ".gif") return Codec::kGIF;
 
-  if (extension == ".exr") return Codec::kEXR;
+  if (ext == ".exr") return Codec::kEXR;
 
   return Codec::kUnknown;
 }
 
+bool CanDecode(Codec codec) {
+  switch (codec) {
+    case Codec::kEXR:
+      return CanDecodeEXR();
+    case Codec::kGIF:
+      return CanDecodeGIF();
+    case Codec::kJPG:
+      return CanDecodeJPG();
+    case Codec::kPNG:
+      return CanDecodeAPNG();
+    case Codec::kPNM:
+    case Codec::kPGX:
+    case Codec::kJXL:
+      return true;
+    default:
+      return false;
+  }
+}
+
 Status DecodeBytes(const Span<const uint8_t> bytes,
-                   const ColorHints& color_hints,
-                   const SizeConstraints& constraints,
-                   extras::PackedPixelFile* ppf, Codec* orig_codec) {
+                   const ColorHints& color_hints, extras::PackedPixelFile* ppf,
+                   const SizeConstraints* constraints, Codec* orig_codec) {
   if (bytes.size() < kMinBytes) return JXL_FAILURE("Too few bytes");
 
   *ppf = extras::PackedPixelFile();
@@ -90,33 +113,42 @@ Status DecodeBytes(const Span<const uint8_t> bytes,
   ppf->info.uses_original_profile = true;
   ppf->info.orientation = JXL_ORIENT_IDENTITY;
 
-  Codec codec;
-#if JPEGXL_ENABLE_APNG
-  if (DecodeImageAPNG(bytes, color_hints, constraints, ppf)) {
-    codec = Codec::kPNG;
-  } else
-#endif
-      if (DecodeImagePGX(bytes, color_hints, constraints, ppf)) {
-    codec = Codec::kPGX;
-  } else if (DecodeImagePNM(bytes, color_hints, constraints, ppf)) {
-    codec = Codec::kPNM;
-  }
-#if JPEGXL_ENABLE_GIF
-  else if (DecodeImageGIF(bytes, color_hints, constraints, ppf)) {
-    codec = Codec::kGIF;
-  }
-#endif
-#if JPEGXL_ENABLE_JPEG
-  else if (DecodeImageJPG(bytes, color_hints, constraints, ppf)) {
-    codec = Codec::kJPG;
-  }
-#endif
-#if JPEGXL_ENABLE_EXR
-  else if (DecodeImageEXR(bytes, color_hints, constraints, ppf)) {
-    codec = Codec::kEXR;
-  }
-#endif
-  else {
+  const auto choose_codec = [&]() -> Codec {
+    if (DecodeImageAPNG(bytes, color_hints, ppf, constraints)) {
+      return Codec::kPNG;
+    }
+    if (DecodeImagePGX(bytes, color_hints, ppf, constraints)) {
+      return Codec::kPGX;
+    }
+    if (DecodeImagePNM(bytes, color_hints, ppf, constraints)) {
+      return Codec::kPNM;
+    }
+    JXLDecompressParams dparams = {};
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
+      dparams.accepted_formats.push_back(
+          {num_channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, /*align=*/0});
+    }
+    size_t decoded_bytes;
+    if (DecodeImageJXL(bytes.data(), bytes.size(), dparams, &decoded_bytes,
+                       ppf) &&
+        ApplyColorHints(color_hints, true, ppf->info.num_color_channels == 1,
+                        ppf)) {
+      return Codec::kJXL;
+    }
+    if (DecodeImageGIF(bytes, color_hints, ppf, constraints)) {
+      return Codec::kGIF;
+    }
+    if (DecodeImageJPG(bytes, color_hints, ppf, constraints)) {
+      return Codec::kJPG;
+    }
+    if (DecodeImageEXR(bytes, color_hints, ppf, constraints)) {
+      return Codec::kEXR;
+    }
+    return Codec::kUnknown;
+  };
+
+  Codec codec = choose_codec();
+  if (codec == Codec::kUnknown) {
     return JXL_FAILURE("Codecs failed to decode");
   }
   if (orig_codec) *orig_codec = codec;
index 7f0ff70..0f864dd 100644 (file)
 #include "lib/extras/dec/color_hints.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
+
+struct SizeConstraints;
+
 namespace extras {
 
-// Codecs supported by CodecInOut::Encode.
+// Codecs supported by DecodeBytes.
 enum class Codec : uint32_t {
-  kUnknown,  // for CodecFromExtension
+  kUnknown,  // for CodecFromPath
   kPNG,
   kPNM,
   kPGX,
   kJPG,
   kGIF,
-  kEXR
+  kEXR,
+  kJXL
 };
 
-std::vector<Codec> AvailableCodecs();
+bool CanDecode(Codec codec);
 
 // If and only if extension is ".pfm", *bits_per_sample is updated to 32 so
 // that Encode() would encode to PFM instead of PPM.
-Codec CodecFromExtension(std::string extension,
-                         size_t* JXL_RESTRICT bits_per_sample = nullptr);
+Codec CodecFromPath(std::string path,
+                    size_t* JXL_RESTRICT bits_per_sample = nullptr,
+                    std::string* filename = nullptr,
+                    std::string* extension = nullptr);
 
 // Decodes "bytes" info *ppf.
 // color_space_hint may specify the color space, otherwise, defaults to sRGB.
 Status DecodeBytes(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                   const SizeConstraints& constraints,
-                   extras::PackedPixelFile* ppf, Codec* orig_codec = nullptr);
+                   extras::PackedPixelFile* ppf,
+                   const SizeConstraints* constraints = nullptr,
+                   Codec* orig_codec = nullptr);
 
 }  // namespace extras
 }  // namespace jxl
index ddb6d53..821e0f4 100644 (file)
@@ -5,20 +5,22 @@
 
 #include "lib/extras/dec/exr.h"
 
+#if JPEGXL_ENABLE_EXR
 #include <ImfChromaticitiesAttribute.h>
 #include <ImfIO.h>
 #include <ImfRgbaFile.h>
 #include <ImfStandardAttributes.h>
+#endif
 
 #include <vector>
 
 namespace jxl {
 namespace extras {
 
+#if JPEGXL_ENABLE_EXR
 namespace {
 
 namespace OpenEXR = OPENEXR_IMF_NAMESPACE;
-namespace Imath = IMATH_NAMESPACE;
 
 // OpenEXR::Int64 is deprecated in favor of using uint64_t directly, but using
 // uint64_t as recommended causes build failures with previous OpenEXR versions
@@ -60,10 +62,20 @@ class InMemoryIStream : public OpenEXR::IStream {
 };
 
 }  // namespace
+#endif
+
+bool CanDecodeEXR() {
+#if JPEGXL_ENABLE_EXR
+  return true;
+#else
+  return false;
+#endif
+}
 
 Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                      const SizeConstraints& constraints,
-                      PackedPixelFile* ppf) {
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
+#if JPEGXL_ENABLE_EXR
   InMemoryIStream is(bytes);
 
 #ifdef __EXCEPTIONS
@@ -71,7 +83,8 @@ Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
   try {
     input_ptr.reset(new OpenEXR::RgbaInputFile(is));
   } catch (...) {
-    return JXL_FAILURE("OpenEXR failed to parse input");
+    // silently return false if it is not an EXR file
+    return false;
   }
   OpenEXR::RgbaInputFile& input = *input_ptr;
 #else
@@ -87,7 +100,7 @@ Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
 
   const float intensity_target = OpenEXR::hasWhiteLuminance(input.header())
                                      ? OpenEXR::whiteLuminance(input.header())
-                                     : kDefaultIntensityTarget;
+                                     : 0;
 
   auto image_size = input.displayWindow().size();
   // Size is computed as max - min, but both bounds are inclusive.
@@ -144,6 +157,7 @@ Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
            std::min(input.dataWindow().max.x, input.displayWindow().max.x);
            ++exr_x) {
         const int image_x = exr_x - input.displayWindow().min.x;
+        // TODO(eustas): UB: OpenEXR::Rgba is not TriviallyCopyable
         memcpy(row + image_x * pixel_size,
                input_row + (exr_x - input.dataWindow().min.x), pixel_size);
       }
@@ -178,6 +192,9 @@ Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
   }
   ppf->info.intensity_target = intensity_target;
   return true;
+#else
+  return false;
+#endif
 }
 
 }  // namespace extras
index 6af4e6b..0605cbb 100644 (file)
 #include "lib/extras/dec/color_hints.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
+
+struct SizeConstraints;
+
 namespace extras {
 
+bool CanDecodeEXR();
+
 // Decodes `bytes` into `ppf`. color_hints are ignored.
 Status DecodeImageEXR(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                      const SizeConstraints& constraints, PackedPixelFile* ppf);
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
 
 }  // namespace extras
 }  // namespace jxl
index 5167bf5..3d96394 100644 (file)
@@ -5,20 +5,24 @@
 
 #include "lib/extras/dec/gif.h"
 
+#if JPEGXL_ENABLE_GIF
 #include <gif_lib.h>
+#endif
+#include <jxl/codestream_header.h>
 #include <string.h>
 
 #include <memory>
 #include <utility>
 #include <vector>
 
-#include "jxl/codestream_header.h"
+#include "lib/extras/size_constraints.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/sanitizers.h"
 
 namespace jxl {
 namespace extras {
 
+#if JPEGXL_ENABLE_GIF
 namespace {
 
 struct ReadState {
@@ -38,21 +42,6 @@ struct PackedRgb {
   uint8_t r, g, b;
 };
 
-// Gif does not support partial transparency, so this considers any nonzero
-// alpha channel value as opaque.
-bool AllOpaque(const PackedImage& color) {
-  for (size_t y = 0; y < color.ysize; ++y) {
-    const PackedRgba* const JXL_RESTRICT row =
-        static_cast<const PackedRgba*>(color.pixels()) + y * color.xsize;
-    for (size_t x = 0; x < color.xsize; ++x) {
-      if (row[x].a == 0) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
 void ensure_have_alpha(PackedFrame* frame) {
   if (!frame->extra_channels.empty()) return;
   const JxlPixelFormat alpha_format{
@@ -67,12 +56,21 @@ void ensure_have_alpha(PackedFrame* frame) {
   std::fill_n(static_cast<uint8_t*>(frame->extra_channels[0].pixels()),
               frame->color.xsize * frame->color.ysize, 255u);
 }
-
 }  // namespace
+#endif
+
+bool CanDecodeGIF() {
+#if JPEGXL_ENABLE_GIF
+  return true;
+#else
+  return false;
+#endif
+}
 
 Status DecodeImageGIF(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                      const SizeConstraints& constraints,
-                      PackedPixelFile* ppf) {
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
+#if JPEGXL_ENABLE_GIF
   int error = GIF_OK;
   ReadState state = {bytes};
   const auto ReadFromSpan = [](GifFileType* const gif, GifByteType* const bytes,
@@ -111,20 +109,20 @@ Status DecodeImageGIF(Span<const uint8_t> bytes, const ColorHints& color_hints,
                        sizeof(*gif->SavedImages) * gif->ImageCount);
 
   JXL_RETURN_IF_ERROR(
-      VerifyDimensions<uint32_t>(&constraints, gif->SWidth, gif->SHeight));
+      VerifyDimensions<uint32_t>(constraints, gif->SWidth, gif->SHeight));
   uint64_t total_pixel_count =
       static_cast<uint64_t>(gif->SWidth) * gif->SHeight;
   for (int i = 0; i < gif->ImageCount; ++i) {
     const SavedImage& image = gif->SavedImages[i];
     uint32_t w = image.ImageDesc.Width;
     uint32_t h = image.ImageDesc.Height;
-    JXL_RETURN_IF_ERROR(VerifyDimensions<uint32_t>(&constraints, w, h));
+    JXL_RETURN_IF_ERROR(VerifyDimensions<uint32_t>(constraints, w, h));
     uint64_t pixel_count = static_cast<uint64_t>(w) * h;
     if (total_pixel_count + pixel_count < total_pixel_count) {
       return JXL_FAILURE("Image too big");
     }
     total_pixel_count += pixel_count;
-    if (total_pixel_count > constraints.dec_max_pixels) {
+    if (constraints && (total_pixel_count > constraints->dec_max_pixels)) {
       return JXL_FAILURE("Image too big");
     }
   }
@@ -408,6 +406,9 @@ Status DecodeImageGIF(Span<const uint8_t> bytes, const ColorHints& color_hints,
     }
   }
   return true;
+#else
+  return false;
+#endif
 }
 
 }  // namespace extras
index b359517..4d5be86 100644 (file)
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
+
+struct SizeConstraints;
+
 namespace extras {
 
+bool CanDecodeGIF();
+
 // Decodes `bytes` into `ppf`. color_hints are ignored.
 Status DecodeImageGIF(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                      const SizeConstraints& constraints, PackedPixelFile* ppf);
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
 
 }  // namespace extras
 }  // namespace jxl
diff --git a/lib/extras/dec/jpegli.cc b/lib/extras/dec/jpegli.cc
new file mode 100644 (file)
index 0000000..ffa1b79
--- /dev/null
@@ -0,0 +1,271 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/dec/jpegli.h"
+
+#include <setjmp.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+constexpr int kExifMarker = JPEG_APP0 + 1;
+constexpr int kICCMarker = JPEG_APP0 + 2;
+
+static inline bool IsJPG(const std::vector<uint8_t>& bytes) {
+  if (bytes.size() < 2) return false;
+  if (bytes[0] != 0xFF || bytes[1] != 0xD8) return false;
+  return true;
+}
+
+bool MarkerIsExif(const jpeg_saved_marker_ptr marker) {
+  return marker->marker == kExifMarker &&
+         marker->data_length >= sizeof kExifSignature + 2 &&
+         std::equal(std::begin(kExifSignature), std::end(kExifSignature),
+                    marker->data);
+}
+
+Status ReadICCProfile(jpeg_decompress_struct* const cinfo,
+                      std::vector<uint8_t>* const icc) {
+  uint8_t* icc_data_ptr;
+  unsigned int icc_data_len;
+  if (jpegli_read_icc_profile(cinfo, &icc_data_ptr, &icc_data_len)) {
+    icc->assign(icc_data_ptr, icc_data_ptr + icc_data_len);
+    free(icc_data_ptr);
+    return true;
+  }
+  return false;
+}
+
+void ReadExif(jpeg_decompress_struct* const cinfo,
+              std::vector<uint8_t>* const exif) {
+  constexpr size_t kExifSignatureSize = sizeof kExifSignature;
+  for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
+       marker = marker->next) {
+    // marker is initialized by libjpeg, which we are not instrumenting with
+    // msan.
+    msan::UnpoisonMemory(marker, sizeof(*marker));
+    msan::UnpoisonMemory(marker->data, marker->data_length);
+    if (!MarkerIsExif(marker)) continue;
+    size_t marker_length = marker->data_length - kExifSignatureSize;
+    exif->resize(marker_length);
+    std::copy_n(marker->data + kExifSignatureSize, marker_length, exif->data());
+    return;
+  }
+}
+
+JpegliDataType ConvertDataType(JxlDataType type) {
+  switch (type) {
+    case JXL_TYPE_UINT8:
+      return JPEGLI_TYPE_UINT8;
+    case JXL_TYPE_UINT16:
+      return JPEGLI_TYPE_UINT16;
+    case JXL_TYPE_FLOAT:
+      return JPEGLI_TYPE_FLOAT;
+    default:
+      return JPEGLI_TYPE_UINT8;
+  }
+}
+
+JpegliEndianness ConvertEndianness(JxlEndianness type) {
+  switch (type) {
+    case JXL_NATIVE_ENDIAN:
+      return JPEGLI_NATIVE_ENDIAN;
+    case JXL_BIG_ENDIAN:
+      return JPEGLI_BIG_ENDIAN;
+    case JXL_LITTLE_ENDIAN:
+      return JPEGLI_LITTLE_ENDIAN;
+    default:
+      return JPEGLI_NATIVE_ENDIAN;
+  }
+}
+
+JxlColorSpace ConvertColorSpace(J_COLOR_SPACE colorspace) {
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      return JXL_COLOR_SPACE_GRAY;
+    case JCS_RGB:
+      return JXL_COLOR_SPACE_RGB;
+    default:
+      return JXL_COLOR_SPACE_UNKNOWN;
+  }
+}
+
+void MyErrorExit(j_common_ptr cinfo) {
+  jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
+  (*cinfo->err->output_message)(cinfo);
+  jpegli_destroy_decompress(reinterpret_cast<j_decompress_ptr>(cinfo));
+  longjmp(*env, 1);
+}
+
+void MyOutputMessage(j_common_ptr cinfo) {
+#if JXL_DEBUG_WARNING == 1
+  char buf[JMSG_LENGTH_MAX + 1];
+  (*cinfo->err->format_message)(cinfo, buf);
+  buf[JMSG_LENGTH_MAX] = 0;
+  JXL_WARNING("%s", buf);
+#endif
+}
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors) {
+  JXL_CHECK(colormap != nullptr);
+  std::vector<uint8_t> tmp(xsize * components);
+  for (size_t x = 0; x < xsize; ++x) {
+    JXL_CHECK(row[x] < num_colors);
+    for (int c = 0; c < components; ++c) {
+      tmp[x * components + c] = colormap[c][row[x]];
+    }
+  }
+  memcpy(row, tmp.data(), tmp.size());
+}
+
+}  // namespace
+
+Status DecodeJpeg(const std::vector<uint8_t>& compressed,
+                  const JpegDecompressParams& dparams, ThreadPool* pool,
+                  PackedPixelFile* ppf) {
+  // Don't do anything for non-JPEG files (no need to report an error)
+  if (!IsJPG(compressed)) return false;
+
+  // TODO(veluca): use JPEGData also for pixels?
+
+  // We need to declare all the non-trivial destructor local variables before
+  // the call to setjmp().
+  std::unique_ptr<JSAMPLE[]> row;
+
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    // Setup error handling in jpeg library so we can deal with broken jpegs in
+    // the fuzzer.
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpegli_std_error(&jerr);
+    jerr.error_exit = &MyErrorExit;
+    jerr.output_message = &MyOutputMessage;
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = static_cast<void*>(&env);
+
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo,
+                   reinterpret_cast<const unsigned char*>(compressed.data()),
+                   compressed.size());
+    jpegli_save_markers(&cinfo, kICCMarker, 0xFFFF);
+    jpegli_save_markers(&cinfo, kExifMarker, 0xFFFF);
+    const auto failure = [&cinfo](const char* str) -> Status {
+      jpegli_abort_decompress(&cinfo);
+      jpegli_destroy_decompress(&cinfo);
+      return JXL_FAILURE("%s", str);
+    };
+    jpegli_read_header(&cinfo, TRUE);
+    // Might cause CPU-zip bomb.
+    if (cinfo.arith_code) {
+      return failure("arithmetic code JPEGs are not supported");
+    }
+    int nbcomp = cinfo.num_components;
+    if (nbcomp != 1 && nbcomp != 3) {
+      return failure("unsupported number of components in JPEG");
+    }
+    if (dparams.force_rgb) {
+      cinfo.out_color_space = JCS_RGB;
+    } else if (dparams.force_grayscale) {
+      cinfo.out_color_space = JCS_GRAYSCALE;
+    }
+    if (!ReadICCProfile(&cinfo, &ppf->icc)) {
+      ppf->icc.clear();
+      // Default to SRGB
+      ppf->color_encoding.color_space =
+          ConvertColorSpace(cinfo.out_color_space);
+      ppf->color_encoding.white_point = JXL_WHITE_POINT_D65;
+      ppf->color_encoding.primaries = JXL_PRIMARIES_SRGB;
+      ppf->color_encoding.transfer_function = JXL_TRANSFER_FUNCTION_SRGB;
+      ppf->color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
+    }
+    ReadExif(&cinfo, &ppf->metadata.exif);
+
+    ppf->info.xsize = cinfo.image_width;
+    ppf->info.ysize = cinfo.image_height;
+    if (dparams.output_data_type == JXL_TYPE_UINT8) {
+      ppf->info.bits_per_sample = 8;
+      ppf->info.exponent_bits_per_sample = 0;
+    } else if (dparams.output_data_type == JXL_TYPE_UINT16) {
+      ppf->info.bits_per_sample = 16;
+      ppf->info.exponent_bits_per_sample = 0;
+    } else if (dparams.output_data_type == JXL_TYPE_FLOAT) {
+      ppf->info.bits_per_sample = 32;
+      ppf->info.exponent_bits_per_sample = 8;
+    } else {
+      return failure("unsupported data type");
+    }
+    ppf->info.uses_original_profile = true;
+
+    // No alpha in JPG
+    ppf->info.alpha_bits = 0;
+    ppf->info.alpha_exponent_bits = 0;
+    ppf->info.orientation = JXL_ORIENT_IDENTITY;
+
+    jpegli_set_output_format(&cinfo, ConvertDataType(dparams.output_data_type),
+                             ConvertEndianness(dparams.output_endianness));
+
+    if (dparams.num_colors > 0) {
+      cinfo.quantize_colors = TRUE;
+      cinfo.desired_number_of_colors = dparams.num_colors;
+      cinfo.two_pass_quantize = dparams.two_pass_quant;
+      cinfo.dither_mode = (J_DITHER_MODE)dparams.dither_mode;
+    }
+
+    jpegli_start_decompress(&cinfo);
+
+    ppf->info.num_color_channels = cinfo.out_color_components;
+    const JxlPixelFormat format{
+        /*num_channels=*/static_cast<uint32_t>(cinfo.out_color_components),
+        dparams.output_data_type,
+        dparams.output_endianness,
+        /*align=*/0,
+    };
+    ppf->frames.clear();
+    // Allocates the frame buffer.
+    ppf->frames.emplace_back(cinfo.image_width, cinfo.image_height, format);
+    const auto& frame = ppf->frames.back();
+    JXL_ASSERT(sizeof(JSAMPLE) * cinfo.out_color_components *
+                   cinfo.image_width <=
+               frame.color.stride);
+
+    for (size_t y = 0; y < cinfo.image_height; ++y) {
+      JSAMPROW rows[] = {reinterpret_cast<JSAMPLE*>(
+          static_cast<uint8_t*>(frame.color.pixels()) +
+          frame.color.stride * y)};
+      jpegli_read_scanlines(&cinfo, rows, 1);
+      if (dparams.num_colors > 0) {
+        UnmapColors(rows[0], cinfo.output_width, cinfo.out_color_components,
+                    cinfo.colormap, cinfo.actual_number_of_colors);
+      }
+    }
+
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_decompress(&cinfo);
+  return success;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/lib/extras/dec/jpegli.h b/lib/extras/dec/jpegli.h
new file mode 100644 (file)
index 0000000..574df54
--- /dev/null
@@ -0,0 +1,41 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_DEC_JPEGLI_H_
+#define LIB_EXTRAS_DEC_JPEGLI_H_
+
+// Decodes JPG pixels and metadata in memory using the libjpegli library.
+
+#include <jxl/types.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+struct JpegDecompressParams {
+  JxlDataType output_data_type = JXL_TYPE_UINT8;
+  JxlEndianness output_endianness = JXL_NATIVE_ENDIAN;
+  bool force_rgb = false;
+  bool force_grayscale = false;
+  int num_colors = 0;
+  bool two_pass_quant = true;
+  // 0 = none, 1 = ordered, 2 = Floyd-Steinberg
+  int dither_mode = 2;
+};
+
+Status DecodeJpeg(const std::vector<uint8_t>& compressed,
+                  const JpegDecompressParams& dparams, ThreadPool* pool,
+                  PackedPixelFile* ppf);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_DEC_JPEGLI_H_
index 6b92f4a..3c8a4bc 100644 (file)
@@ -5,8 +5,10 @@
 
 #include "lib/extras/dec/jpg.h"
 
+#if JPEGXL_ENABLE_JPEG
 #include <jpeglib.h>
 #include <setjmp.h>
+#endif
 #include <stdint.h>
 
 #include <algorithm>
 #include <utility>
 #include <vector>
 
+#include "lib/extras/size_constraints.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/sanitizers.h"
 
 namespace jxl {
 namespace extras {
 
+#if JPEGXL_ENABLE_JPEG
 namespace {
 
 constexpr unsigned char kICCSignature[12] = {
@@ -160,12 +164,35 @@ void MyOutputMessage(j_common_ptr cinfo) {
 #endif
 }
 
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors) {
+  JXL_CHECK(colormap != nullptr);
+  std::vector<uint8_t> tmp(xsize * components);
+  for (size_t x = 0; x < xsize; ++x) {
+    JXL_CHECK(row[x] < num_colors);
+    for (int c = 0; c < components; ++c) {
+      tmp[x * components + c] = colormap[c][row[x]];
+    }
+  }
+  memcpy(row, tmp.data(), tmp.size());
+}
+
 }  // namespace
+#endif
+
+bool CanDecodeJPG() {
+#if JPEGXL_ENABLE_JPEG
+  return true;
+#else
+  return false;
+#endif
+}
 
 Status DecodeImageJPG(const Span<const uint8_t> bytes,
-                      const ColorHints& color_hints,
-                      const SizeConstraints& constraints,
-                      PackedPixelFile* ppf) {
+                      const ColorHints& color_hints, PackedPixelFile* ppf,
+                      const SizeConstraints* constraints,
+                      const JPGDecompressParams* dparams) {
+#if JPEGXL_ENABLE_JPEG
   // Don't do anything for non-JPEG files (no need to report an error)
   if (!IsJPG(bytes)) return false;
 
@@ -176,10 +203,7 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
   std::unique_ptr<JSAMPLE[]> row;
 
   const auto try_catch_block = [&]() -> bool {
-    jpeg_decompress_struct cinfo;
-    // cinfo is initialized by libjpeg, which we are not instrumenting with
-    // msan, therefore we need to initialize cinfo here.
-    msan::UnpoisonMemory(&cinfo, sizeof(cinfo));
+    jpeg_decompress_struct cinfo = {};
     // Setup error handling in jpeg library so we can deal with broken jpegs in
     // the fuzzer.
     jpeg_error_mgr jerr;
@@ -207,8 +231,7 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
     if (read_header_result == JPEG_SUSPENDED) {
       return failure("truncated JPEG input");
     }
-    if (!VerifyDimensions(&constraints, cinfo.image_width,
-                          cinfo.image_height)) {
+    if (!VerifyDimensions(constraints, cinfo.image_width, cinfo.image_height)) {
       return failure("image too big");
     }
     // Might cause CPU-zip bomb.
@@ -252,12 +275,21 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
     ppf->info.num_color_channels = nbcomp;
     ppf->info.orientation = JXL_ORIENT_IDENTITY;
 
+    if (dparams && dparams->num_colors > 0) {
+      cinfo.quantize_colors = TRUE;
+      cinfo.desired_number_of_colors = dparams->num_colors;
+      cinfo.two_pass_quantize = dparams->two_pass_quant;
+      cinfo.dither_mode = (J_DITHER_MODE)dparams->dither_mode;
+    }
+
     jpeg_start_decompress(&cinfo);
-    JXL_ASSERT(cinfo.output_components == nbcomp);
+    JXL_ASSERT(cinfo.out_color_components == nbcomp);
+    JxlDataType data_type =
+        ppf->info.bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16;
 
     const JxlPixelFormat format{
         /*num_channels=*/static_cast<uint32_t>(nbcomp),
-        /*data_type=*/BITS_IN_JSAMPLE == 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16,
+        data_type,
         /*endianness=*/JXL_NATIVE_ENDIAN,
         /*align=*/0,
     };
@@ -265,9 +297,19 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
     // Allocates the frame buffer.
     ppf->frames.emplace_back(cinfo.image_width, cinfo.image_height, format);
     const auto& frame = ppf->frames.back();
-    JXL_ASSERT(sizeof(JSAMPLE) * cinfo.output_components * cinfo.image_width <=
+    JXL_ASSERT(sizeof(JSAMPLE) * cinfo.out_color_components *
+                   cinfo.image_width <=
                frame.color.stride);
 
+    if (cinfo.quantize_colors) {
+      jxl::msan::UnpoisonMemory(cinfo.colormap, cinfo.out_color_components *
+                                                    sizeof(cinfo.colormap[0]));
+      for (int c = 0; c < cinfo.out_color_components; ++c) {
+        jxl::msan::UnpoisonMemory(
+            cinfo.colormap[c],
+            cinfo.actual_number_of_colors * sizeof(cinfo.colormap[c][0]));
+      }
+    }
     for (size_t y = 0; y < cinfo.image_height; ++y) {
       JSAMPROW rows[] = {reinterpret_cast<JSAMPLE*>(
           static_cast<uint8_t*>(frame.color.pixels()) +
@@ -275,6 +317,10 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
       jpeg_read_scanlines(&cinfo, rows, 1);
       msan::UnpoisonMemory(rows[0], sizeof(JSAMPLE) * cinfo.output_components *
                                         cinfo.image_width);
+      if (dparams && dparams->num_colors > 0) {
+        UnmapColors(rows[0], cinfo.output_width, cinfo.out_color_components,
+                    cinfo.colormap, cinfo.actual_number_of_colors);
+      }
     }
 
     jpeg_finish_decompress(&cinfo);
@@ -283,6 +329,9 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes,
   };
 
   return try_catch_block();
+#else
+  return false;
+#endif
 }
 
 }  // namespace extras
index 66b3452..6e7b2f7 100644 (file)
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/color_hints.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
+
+struct SizeConstraints;
+
 namespace extras {
 
+bool CanDecodeJPG();
+
+struct JPGDecompressParams {
+  int num_colors = 0;
+  bool two_pass_quant = false;
+  // 0 = none, 1 = ordered, 2 = Floyd-Steinberg
+  int dither_mode = 0;
+};
+
 // Decodes `bytes` into `ppf`. color_hints are ignored.
 // `elapsed_deinterleave`, if non-null, will be set to the time (in seconds)
 // that it took to deinterleave the raw JSAMPLEs to planar floats.
 Status DecodeImageJPG(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                      const SizeConstraints& constraints, PackedPixelFile* ppf);
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr,
+                      const JPGDecompressParams* dparams = nullptr);
 
 }  // namespace extras
 }  // namespace jxl
index 0e10356..1f3a3ff 100644 (file)
@@ -5,12 +5,15 @@
 
 #include "lib/extras/dec/jxl.h"
 
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/types.h"
+#include <jxl/cms.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/types.h>
+
+#include "lib/extras/common.h"
 #include "lib/extras/dec/color_description.h"
-#include "lib/extras/enc/encode.h"
 #include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/exif.h"
 
 namespace jxl {
 namespace extras {
@@ -68,11 +71,48 @@ struct BoxProcessor {
   }
 };
 
+void SetBitDepthFromDataType(JxlDataType data_type, uint32_t* bits_per_sample,
+                             uint32_t* exponent_bits_per_sample) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      *bits_per_sample = 8;
+      *exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_UINT16:
+      *bits_per_sample = 16;
+      *exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_FLOAT16:
+      *bits_per_sample = 16;
+      *exponent_bits_per_sample = 5;
+      break;
+    case JXL_TYPE_FLOAT:
+      *bits_per_sample = 32;
+      *exponent_bits_per_sample = 8;
+      break;
+  }
+}
+
+template <typename T>
+void UpdateBitDepth(JxlBitDepth bit_depth, JxlDataType data_type, T* info) {
+  if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    SetBitDepthFromDataType(data_type, &info->bits_per_sample,
+                            &info->exponent_bits_per_sample);
+  } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
+    info->bits_per_sample = bit_depth.bits_per_sample;
+    info->exponent_bits_per_sample = bit_depth.exponent_bits_per_sample;
+  }
+}
+
 }  // namespace
 
 bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
                     const JXLDecompressParams& dparams, size_t* decoded_bytes,
                     PackedPixelFile* ppf, std::vector<uint8_t>* jpeg_bytes) {
+  JxlSignature sig = JxlSignatureCheck(bytes, bytes_size);
+  // silently return false if this is not a JXL file
+  if (sig == JXL_SIG_INVALID) return false;
+
   auto decoder = JxlDecoderMake(/*memory_manager=*/nullptr);
   JxlDecoder* dec = decoder.get();
   ppf->frames.clear();
@@ -86,12 +126,7 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
 
   JxlPixelFormat format;
   std::vector<JxlPixelFormat> accepted_formats = dparams.accepted_formats;
-  if (accepted_formats.empty()) {
-    for (const uint32_t num_channels : {1, 2, 3, 4}) {
-      accepted_formats.push_back(
-          {num_channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, /*align=*/0});
-    }
-  }
+
   JxlColorEncoding color_encoding;
   size_t num_color_channels = 0;
   if (!dparams.color_space.empty()) {
@@ -107,7 +142,9 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
   bool can_reconstruct_jpeg = false;
   std::vector<uint8_t> jpeg_data_chunk;
   if (jpeg_bytes != nullptr) {
-    jpeg_data_chunk.resize(16384);
+    // This bound is very likely to be enough to hold the entire
+    // reconstructed JPEG, to avoid having to do expensive retries.
+    jpeg_data_chunk.resize(bytes_size * 3 / 2 + 1024);
     jpeg_bytes->resize(0);
   }
 
@@ -128,6 +165,10 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
   } else {
     events |= (JXL_DEC_COLOR_ENCODING | JXL_DEC_FRAME | JXL_DEC_PREVIEW_IMAGE |
                JXL_DEC_BOX);
+    if (accepted_formats.empty()) {
+      // decoding just the metadata, not the pixel data
+      events ^= (JXL_DEC_FULL_IMAGE | JXL_DEC_PREVIEW_IMAGE);
+    }
   }
   if (JXL_DEC_SUCCESS != JxlDecoderSubscribeEvents(dec, events)) {
     fprintf(stderr, "JxlDecoderSubscribeEvents failed\n");
@@ -165,7 +206,7 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
     return false;
   }
   uint32_t progression_index = 0;
-  bool codestream_done = false;
+  bool codestream_done = accepted_formats.empty();
   BoxProcessor boxes(dec);
   for (;;) {
     JxlDecoderStatus status = JxlDecoderProcessInput(dec);
@@ -185,8 +226,12 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
         }
         break;
       }
+      size_t released_size = JxlDecoderReleaseInput(dec);
       fprintf(stderr,
-              "Input file is truncated and allow_partial_input was disabled.");
+              "Input file is truncated (total bytes: %" PRIuS
+              ", processed bytes: %" PRIuS
+              ") and --allow_partial_files is not present.\n",
+              bytes_size, bytes_size - released_size);
       return false;
     } else if (status == JXL_DEC_BOX) {
       boxes.FinalizeOutput();
@@ -240,11 +285,16 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
         fprintf(stderr, "JxlDecoderGetBasicInfo failed\n");
         return false;
       }
+      if (accepted_formats.empty()) continue;
       if (num_color_channels != 0) {
         // Mark the change in number of color channels due to the requested
         // color space.
         ppf->info.num_color_channels = num_color_channels;
       }
+      if (dparams.output_bitdepth.type == JXL_BIT_DEPTH_CUSTOM) {
+        // Select format based on custom bits per sample.
+        ppf->info.bits_per_sample = dparams.output_bitdepth.bits_per_sample;
+      }
       // Select format according to accepted formats.
       if (!jxl::extras::SelectFormat(accepted_formats, ppf->info, &format)) {
         fprintf(stderr, "SelectFormat failed\n");
@@ -254,9 +304,11 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
       if (!have_alpha) {
         // Mark in the basic info that alpha channel was dropped.
         ppf->info.alpha_bits = 0;
-      } else if (dparams.unpremultiply_alpha) {
-        // Mark in the basic info that alpha was unpremultiplied.
-        ppf->info.alpha_premultiplied = false;
+      } else {
+        if (dparams.unpremultiply_alpha) {
+          // Mark in the basic info that alpha was unpremultiplied.
+          ppf->info.alpha_premultiplied = false;
+        }
       }
       bool alpha_found = false;
       for (uint32_t i = 0; i < ppf->info.num_extra_channels; ++i) {
@@ -287,6 +339,7 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
                   "Warning: --color_space ignored because the image is "
                   "not XYB encoded.\n");
         } else {
+          JxlDecoderSetCms(dec, *JxlGetDefaultCms());
           if (JXL_DEC_SUCCESS !=
               JxlDecoderSetPreferredColorProfile(dec, &color_encoding)) {
             fprintf(stderr, "Failed to set color space.\n");
@@ -296,34 +349,35 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
       }
       size_t icc_size = 0;
       JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_DATA;
-      if (JXL_DEC_SUCCESS !=
-          JxlDecoderGetICCProfileSize(dec, nullptr, target, &icc_size)) {
-        fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
-      }
-      if (icc_size != 0) {
-        ppf->icc.resize(icc_size);
+      ppf->color_encoding.color_space = JXL_COLOR_SPACE_UNKNOWN;
+      if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsEncodedProfile(
+                                 dec, target, &ppf->color_encoding) ||
+          dparams.need_icc) {
+        // only get ICC if it is not an Enum color encoding
         if (JXL_DEC_SUCCESS !=
-            JxlDecoderGetColorAsICCProfile(dec, nullptr, target,
-                                           ppf->icc.data(), icc_size)) {
-          fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
-          return false;
+            JxlDecoderGetICCProfileSize(dec, target, &icc_size)) {
+          fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
+        }
+        if (icc_size != 0) {
+          ppf->icc.resize(icc_size);
+          if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
+                                     dec, target, ppf->icc.data(), icc_size)) {
+            fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
+            return false;
+          }
         }
-      }
-      if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsEncodedProfile(
-                                 dec, nullptr, target, &ppf->color_encoding)) {
-        ppf->color_encoding.color_space = JXL_COLOR_SPACE_UNKNOWN;
       }
       icc_size = 0;
       target = JXL_COLOR_PROFILE_TARGET_ORIGINAL;
       if (JXL_DEC_SUCCESS !=
-          JxlDecoderGetICCProfileSize(dec, nullptr, target, &icc_size)) {
+          JxlDecoderGetICCProfileSize(dec, target, &icc_size)) {
         fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
       }
       if (icc_size != 0) {
         ppf->orig_icc.resize(icc_size);
         if (JXL_DEC_SUCCESS !=
-            JxlDecoderGetColorAsICCProfile(dec, nullptr, target,
-                                           ppf->orig_icc.data(), icc_size)) {
+            JxlDecoderGetColorAsICCProfile(dec, target, ppf->orig_icc.data(),
+                                           icc_size)) {
           fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
           return false;
         }
@@ -421,9 +475,21 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
           return false;
         }
       }
+      if (JXL_DEC_SUCCESS !=
+          JxlDecoderSetImageOutBitDepth(dec, &dparams.output_bitdepth)) {
+        fprintf(stderr, "JxlDecoderSetImageOutBitDepth failed\n");
+        return false;
+      }
+      UpdateBitDepth(dparams.output_bitdepth, format.data_type, &ppf->info);
+      bool have_alpha = (format.num_channels == 2 || format.num_channels == 4);
+      if (have_alpha) {
+        // Interleaved alpha channels has the same bit depth as color channels.
+        ppf->info.alpha_bits = ppf->info.bits_per_sample;
+        ppf->info.alpha_exponent_bits = ppf->info.exponent_bits_per_sample;
+      }
       JxlPixelFormat ec_format = format;
       ec_format.num_channels = 1;
-      for (const auto& eci : ppf->extra_channels_info) {
+      for (auto& eci : ppf->extra_channels_info) {
         frame.extra_channels.emplace_back(jxl::extras::PackedImage(
             ppf->info.xsize, ppf->info.ysize, ec_format));
         auto& ec = frame.extra_channels.back();
@@ -446,6 +512,8 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
           fprintf(stderr, "JxlDecoderSetExtraChannelBuffer failed\n");
           return false;
         }
+        UpdateBitDepth(dparams.output_bitdepth, ec_format.data_type,
+                       &eci.ec_info);
       }
     } else if (status == JXL_DEC_SUCCESS) {
       // Decoding finished successfully.
@@ -463,6 +531,28 @@ bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
     }
   }
   boxes.FinalizeOutput();
+  if (!ppf->metadata.exif.empty()) {
+    // Verify that Exif box has a valid TIFF header at the specified offset.
+    // Discard bytes preceding the header.
+    if (ppf->metadata.exif.size() >= 4) {
+      uint32_t offset = LoadBE32(ppf->metadata.exif.data());
+      if (offset <= ppf->metadata.exif.size() - 8) {
+        std::vector<uint8_t> exif(ppf->metadata.exif.begin() + 4 + offset,
+                                  ppf->metadata.exif.end());
+        bool bigendian;
+        if (IsExif(exif, &bigendian)) {
+          ppf->metadata.exif = std::move(exif);
+        } else {
+          fprintf(stderr, "Warning: invalid TIFF header in Exif\n");
+        }
+      } else {
+        fprintf(stderr, "Warning: invalid Exif offset: %" PRIu32 "\n", offset);
+      }
+    } else {
+      fprintf(stderr, "Warning: invalid Exif length: %" PRIuS "\n",
+              ppf->metadata.exif.size());
+    }
+  }
   if (jpeg_bytes != nullptr) {
     if (!can_reconstruct_jpeg) return false;
     size_t used_jpeg_output =
index c462fa4..cbada1f 100644 (file)
@@ -8,14 +8,14 @@
 
 // Decodes JPEG XL images in memory.
 
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
 #include <stdint.h>
 
 #include <limits>
 #include <string>
 #include <vector>
 
-#include "jxl/parallel_runner.h"
-#include "jxl/types.h"
 #include "lib/extras/packed_image.h"
 
 namespace jxl {
@@ -41,6 +41,10 @@ struct JXLDecompressParams {
   // Whether truncated input should be treated as an error.
   bool allow_partial_input = false;
 
+  // Set to true if an ICC profile has to be synthesized for Enum color
+  // encodings
+  bool need_icc = false;
+
   // How many passes to decode at most. By default, decode everything.
   uint32_t max_passes = std::numeric_limits<uint32_t>::max();
 
@@ -53,6 +57,9 @@ struct JXLDecompressParams {
   bool use_image_callback = true;
   // Whether to unpremultiply colors for associated alpha channels.
   bool unpremultiply_alpha = false;
+
+  // Controls the effective bit depth of the output pixels.
+  JxlBitDepth output_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
 };
 
 bool DecodeImageJXL(const uint8_t* bytes, size_t bytes_size,
index 1417348..a99eb0f 100644 (file)
@@ -7,6 +7,7 @@
 
 #include <string.h>
 
+#include "lib/extras/size_constraints.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
 
@@ -145,15 +146,14 @@ class Parser {
 }  // namespace
 
 Status DecodeImagePGX(const Span<const uint8_t> bytes,
-                      const ColorHints& color_hints,
-                      const SizeConstraints& constraints,
-                      PackedPixelFile* ppf) {
+                      const ColorHints& color_hints, PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
   Parser parser(bytes);
   HeaderPGX header = {};
   const uint8_t* pos;
   if (!parser.ParseHeader(&header, &pos)) return false;
   JXL_RETURN_IF_ERROR(
-      VerifyDimensions(&constraints, header.xsize, header.ysize));
+      VerifyDimensions(constraints, header.xsize, header.ysize));
   if (header.bits_per_sample == 0 || header.bits_per_sample > 32) {
     return JXL_FAILURE("PGX: bits_per_sample invalid");
   }
index 38aedf5..ce852e6 100644 (file)
 #include "lib/extras/dec/color_hints.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
+
+struct SizeConstraints;
+
 namespace extras {
 
 // Decodes `bytes` into `ppf`.
 Status DecodeImagePGX(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                      const SizeConstraints& constraints, PackedPixelFile* ppf);
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
 
 }  // namespace extras
 }  // namespace jxl
index 41e6bf8..5dbc314 100644 (file)
@@ -5,16 +5,18 @@
 
 #include "lib/extras/dec/pgx.h"
 
-#include "gtest/gtest.h"
+#include <cstring>
+
 #include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace extras {
 namespace {
 
 Span<const uint8_t> MakeSpan(const char* str) {
-  return Span<const uint8_t>(reinterpret_cast<const uint8_t*>(str),
-                             strlen(str));
+  return Bytes(reinterpret_cast<const uint8_t*>(str), strlen(str));
 }
 
 TEST(CodecPGXTest, Test8bits) {
@@ -23,8 +25,7 @@ TEST(CodecPGXTest, Test8bits) {
   PackedPixelFile ppf;
   ThreadPool* pool = nullptr;
 
-  EXPECT_TRUE(DecodeImagePGX(MakeSpan(pgx.c_str()), ColorHints(),
-                             SizeConstraints(), &ppf));
+  EXPECT_TRUE(DecodeImagePGX(MakeSpan(pgx.c_str()), ColorHints(), &ppf));
   CodecInOut io;
   EXPECT_TRUE(ConvertPackedPixelFileToCodecInOut(ppf, pool, &io));
 
@@ -51,8 +52,7 @@ TEST(CodecPGXTest, Test16bits) {
   PackedPixelFile ppf;
   ThreadPool* pool = nullptr;
 
-  EXPECT_TRUE(DecodeImagePGX(MakeSpan(pgx.c_str()), ColorHints(),
-                             SizeConstraints(), &ppf));
+  EXPECT_TRUE(DecodeImagePGX(MakeSpan(pgx.c_str()), ColorHints(), &ppf));
   CodecInOut io;
   EXPECT_TRUE(ConvertPackedPixelFileToCodecInOut(ppf, pool, &io));
 
index 03aecef..c576385 100644 (file)
@@ -8,6 +8,9 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <cmath>
+
+#include "lib/extras/size_constraints.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
@@ -16,16 +19,6 @@ namespace jxl {
 namespace extras {
 namespace {
 
-struct HeaderPNM {
-  size_t xsize;
-  size_t ysize;
-  bool is_gray;    // PGM
-  bool has_alpha;  // PAM
-  size_t bits_per_sample;
-  bool floating_point;
-  bool big_endian;
-};
-
 class Parser {
  public:
   explicit Parser(const Span<const uint8_t> bytes)
@@ -183,16 +176,20 @@ class Parser {
   Status ParseHeaderPAM(HeaderPNM* header, const uint8_t** pos) {
     size_t depth = 3;
     size_t max_val = 255;
+    JXL_RETURN_IF_ERROR(SkipWhitespace());
     while (!MatchString("ENDHDR", /*skipws=*/false)) {
-      JXL_RETURN_IF_ERROR(SkipWhitespace());
       if (MatchString("WIDTH")) {
         JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
       } else if (MatchString("HEIGHT")) {
         JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
       } else if (MatchString("DEPTH")) {
         JXL_RETURN_IF_ERROR(ParseUnsigned(&depth));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
       } else if (MatchString("MAXVAL")) {
         JXL_RETURN_IF_ERROR(ParseUnsigned(&max_val));
+        JXL_RETURN_IF_ERROR(SkipWhitespace());
       } else if (MatchString("TUPLTYPE")) {
         if (MatchString("RGB_ALPHA")) {
           header->has_alpha = true;
@@ -209,6 +206,20 @@ class Parser {
         } else if (MatchString("BLACKANDWHITE")) {
           header->is_gray = true;
           max_val = 1;
+        } else if (MatchString("Alpha")) {
+          header->ec_types.push_back(JXL_CHANNEL_ALPHA);
+        } else if (MatchString("Depth")) {
+          header->ec_types.push_back(JXL_CHANNEL_DEPTH);
+        } else if (MatchString("SpotColor")) {
+          header->ec_types.push_back(JXL_CHANNEL_SPOT_COLOR);
+        } else if (MatchString("SelectionMask")) {
+          header->ec_types.push_back(JXL_CHANNEL_SELECTION_MASK);
+        } else if (MatchString("Black")) {
+          header->ec_types.push_back(JXL_CHANNEL_BLACK);
+        } else if (MatchString("CFA")) {
+          header->ec_types.push_back(JXL_CHANNEL_CFA);
+        } else if (MatchString("Thermal")) {
+          header->ec_types.push_back(JXL_CHANNEL_THERMAL);
         } else {
           return JXL_FAILURE("PAM: unknown TUPLTYPE");
         }
@@ -223,13 +234,13 @@ class Parser {
     }
     size_t num_channels = header->is_gray ? 1 : 3;
     if (header->has_alpha) num_channels++;
-    if (num_channels != depth) {
+    if (num_channels + header->ec_types.size() != depth) {
       return JXL_FAILURE("PAM: bad DEPTH");
     }
     if (max_val == 0 || max_val >= 65536) {
       return JXL_FAILURE("PAM: bad MAXVAL");
     }
-    // e.g When `max_val` is 1 , we want 1 bit:
+    // e.g. When `max_val` is 1 , we want 1 bit:
     header->bits_per_sample = FloorLog2Nonzero(max_val) + 1;
     if ((1u << header->bits_per_sample) - 1 != max_val)
       return JXL_FAILURE("PNM: unsupported MaxVal (expected 2^n - 1)");
@@ -298,30 +309,98 @@ class Parser {
 };
 
 Span<const uint8_t> MakeSpan(const char* str) {
-  return Span<const uint8_t>(reinterpret_cast<const uint8_t*>(str),
-                             strlen(str));
+  return Bytes(reinterpret_cast<const uint8_t*>(str), strlen(str));
+}
+
+void ReadLinePNM(void* opaque, size_t xpos, size_t ypos, size_t xsize,
+                 uint8_t* buffer, size_t len) {
+  ChunkedPNMDecoder* dec = reinterpret_cast<ChunkedPNMDecoder*>(opaque);
+  const size_t bytes_per_channel =
+      DivCeil(dec->header.bits_per_sample, jxl::kBitsPerByte);
+  const size_t pixel_offset = ypos * dec->header.xsize + xpos;
+  const size_t num_channels = dec->header.is_gray ? 1 : 3;
+  const size_t offset = pixel_offset * num_channels * bytes_per_channel;
+  const size_t num_bytes = xsize * num_channels * bytes_per_channel;
+  if (fseek(dec->f, dec->data_start + offset, SEEK_SET) != 0) {
+    return;
+  }
+  JXL_ASSERT(num_bytes == len);
+  if (num_bytes != fread(buffer, 1, num_bytes, dec->f)) {
+    JXL_WARNING("Failed to read from PNM file\n");
+  }
 }
 
 }  // namespace
 
-Status DecodeImagePNM(const Span<const uint8_t> bytes,
-                      const ColorHints& color_hints,
-                      const SizeConstraints& constraints,
+Status DecodeImagePNM(ChunkedPNMDecoder* dec, const ColorHints& color_hints,
                       PackedPixelFile* ppf) {
+  std::vector<uint8_t> buffer(10 * 1024);
+  const size_t bytes_read = fread(buffer.data(), 1, buffer.size(), dec->f);
+  if (ferror(dec->f) || bytes_read > buffer.size()) {
+    return false;
+  }
+  Span<const uint8_t> span(buffer);
+  Parser parser(span);
+  HeaderPNM& header = dec->header;
+  const uint8_t* pos = nullptr;
+  if (!parser.ParseHeader(&header, &pos)) {
+    return false;
+  }
+  dec->data_start = pos - &buffer[0];
+
+  if (header.bits_per_sample == 0 || header.bits_per_sample > 16) {
+    return JXL_FAILURE("Invalid bits_per_sample");
+  }
+  if (header.has_alpha || !header.ec_types.empty() || header.floating_point) {
+    return JXL_FAILURE("Only PGM and PPM inputs are supported");
+  }
+
+  // PPM specifies that in the raster, the sample values are "nonlinear"
+  // (BP.709, with gamma number of 2.2). Deviate from the specification and
+  // assume `sRGB` in our implementation.
+  JXL_RETURN_IF_ERROR(ApplyColorHints(color_hints, /*color_already_set=*/false,
+                                      header.is_gray, ppf));
+
+  ppf->info.xsize = header.xsize;
+  ppf->info.ysize = header.ysize;
+  ppf->info.bits_per_sample = header.bits_per_sample;
+  ppf->info.exponent_bits_per_sample = 0;
+  ppf->info.orientation = JXL_ORIENT_IDENTITY;
+  ppf->info.alpha_bits = 0;
+  ppf->info.alpha_exponent_bits = 0;
+  ppf->info.num_color_channels = (header.is_gray ? 1 : 3);
+  ppf->info.num_extra_channels = 0;
+
+  const JxlDataType data_type =
+      header.bits_per_sample > 8 ? JXL_TYPE_UINT16 : JXL_TYPE_UINT8;
+  const JxlPixelFormat format{
+      /*num_channels=*/ppf->info.num_color_channels,
+      /*data_type=*/data_type,
+      /*endianness=*/header.big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN,
+      /*align=*/0,
+  };
+  ppf->chunked_frames.emplace_back(header.xsize, header.ysize, format, dec,
+                                   ReadLinePNM);
+  return true;
+}
+
+Status DecodeImagePNM(const Span<const uint8_t> bytes,
+                      const ColorHints& color_hints, PackedPixelFile* ppf,
+                      const SizeConstraints* constraints) {
   Parser parser(bytes);
   HeaderPNM header = {};
   const uint8_t* pos = nullptr;
   if (!parser.ParseHeader(&header, &pos)) return false;
   JXL_RETURN_IF_ERROR(
-      VerifyDimensions(&constraints, header.xsize, header.ysize));
+      VerifyDimensions(constraints, header.xsize, header.ysize));
 
   if (header.bits_per_sample == 0 || header.bits_per_sample > 32) {
     return JXL_FAILURE("PNM: bits_per_sample invalid");
   }
 
-  // PPM specify that in the raster, the sample values are "nonlinear" (BP.709,
-  // with gamma number of 2.2). Deviate from the specification and assume
-  // `sRGB` in our implementation.
+  // PPM specifies that in the raster, the sample values are "nonlinear"
+  // (BP.709, with gamma number of 2.2). Deviate from the specification and
+  // assume `sRGB` in our implementation.
   JXL_RETURN_IF_ERROR(ApplyColorHints(color_hints, /*color_already_set=*/false,
                                       header.is_gray, ppf));
 
@@ -341,7 +420,17 @@ Status DecodeImagePNM(const Span<const uint8_t> bytes,
   ppf->info.alpha_bits = (header.has_alpha ? ppf->info.bits_per_sample : 0);
   ppf->info.alpha_exponent_bits = 0;
   ppf->info.num_color_channels = (header.is_gray ? 1 : 3);
-  ppf->info.num_extra_channels = (header.has_alpha ? 1 : 0);
+  uint32_t num_alpha_channels = (header.has_alpha ? 1 : 0);
+  uint32_t num_interleaved_channels =
+      ppf->info.num_color_channels + num_alpha_channels;
+  ppf->info.num_extra_channels = num_alpha_channels + header.ec_types.size();
+
+  for (auto type : header.ec_types) {
+    PackedExtraChannel pec;
+    pec.ec_info.bits_per_sample = ppf->info.bits_per_sample;
+    pec.ec_info.type = type;
+    ppf->extra_channels_info.emplace_back(std::move(pec));
+  }
 
   JxlDataType data_type;
   if (header.floating_point) {
@@ -356,27 +445,50 @@ Status DecodeImagePNM(const Span<const uint8_t> bytes,
   }
 
   const JxlPixelFormat format{
-      /*num_channels=*/ppf->info.num_color_channels +
-          ppf->info.num_extra_channels,
+      /*num_channels=*/num_interleaved_channels,
       /*data_type=*/data_type,
       /*endianness=*/header.big_endian ? JXL_BIG_ENDIAN : JXL_LITTLE_ENDIAN,
       /*align=*/0,
   };
+  const JxlPixelFormat ec_format{1, format.data_type, format.endianness, 0};
   ppf->frames.clear();
   ppf->frames.emplace_back(header.xsize, header.ysize, format);
   auto* frame = &ppf->frames.back();
-
+  for (size_t i = 0; i < header.ec_types.size(); ++i) {
+    frame->extra_channels.emplace_back(header.xsize, header.ysize, ec_format);
+  }
   size_t pnm_remaining_size = bytes.data() + bytes.size() - pos;
   if (pnm_remaining_size < frame->color.pixels_size) {
     return JXL_FAILURE("PNM file too small");
   }
-  const bool flipped_y = header.bits_per_sample == 32;  // PFMs are flipped
+
   uint8_t* out = reinterpret_cast<uint8_t*>(frame->color.pixels());
-  for (size_t y = 0; y < header.ysize; ++y) {
-    size_t y_in = flipped_y ? header.ysize - 1 - y : y;
-    const uint8_t* row_in = &pos[y_in * frame->color.stride];
-    uint8_t* row_out = &out[y * frame->color.stride];
-    memcpy(row_out, row_in, frame->color.stride);
+  std::vector<uint8_t*> ec_out(header.ec_types.size());
+  for (size_t i = 0; i < ec_out.size(); ++i) {
+    ec_out[i] = reinterpret_cast<uint8_t*>(frame->extra_channels[i].pixels());
+  }
+  if (ec_out.empty()) {
+    const bool flipped_y = header.bits_per_sample == 32;  // PFMs are flipped
+    for (size_t y = 0; y < header.ysize; ++y) {
+      size_t y_in = flipped_y ? header.ysize - 1 - y : y;
+      const uint8_t* row_in = &pos[y_in * frame->color.stride];
+      uint8_t* row_out = &out[y * frame->color.stride];
+      memcpy(row_out, row_in, frame->color.stride);
+    }
+  } else {
+    size_t pwidth = PackedImage::BitsPerChannel(data_type) / 8;
+    for (size_t y = 0; y < header.ysize; ++y) {
+      for (size_t x = 0; x < header.xsize; ++x) {
+        memcpy(out, pos, frame->color.pixel_stride());
+        out += frame->color.pixel_stride();
+        pos += frame->color.pixel_stride();
+        for (auto& p : ec_out) {
+          memcpy(p, pos, pwidth);
+          pos += pwidth;
+          p += pwidth;
+        }
+      }
+    }
   }
   return true;
 }
index f637483..9b68e56 100644 (file)
 #include "lib/extras/dec/color_hints.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 
 namespace jxl {
+
+struct SizeConstraints;
+
 namespace extras {
 
 // Decodes `bytes` into `ppf`. color_hints may specify "color_space", which
 // defaults to sRGB.
 Status DecodeImagePNM(Span<const uint8_t> bytes, const ColorHints& color_hints,
-                      const SizeConstraints& constraints, PackedPixelFile* ppf);
+                      PackedPixelFile* ppf,
+                      const SizeConstraints* constraints = nullptr);
 
 void TestCodecPNM();
 
+struct HeaderPNM {
+  size_t xsize;
+  size_t ysize;
+  bool is_gray;    // PGM
+  bool has_alpha;  // PAM
+  size_t bits_per_sample;
+  bool floating_point;
+  bool big_endian;
+  std::vector<JxlExtraChannelType> ec_types;  // PAM
+};
+
+struct ChunkedPNMDecoder {
+  FILE* f;
+  HeaderPNM header = {};
+  size_t data_start;
+};
+
+Status DecodeImagePNM(ChunkedPNMDecoder* dec, const ColorHints& color_hints,
+                      PackedPixelFile* ppf);
+
 }  // namespace extras
 }  // namespace jxl
 
index db6cf9e..f2f8754 100644 (file)
@@ -36,7 +36,6 @@
  *
  */
 
-#include <stdio.h>
 #include <string.h>
 
 #include <string>
 #include "lib/extras/exif.h"
 #include "lib/jxl/base/byte_order.h"
 #include "lib/jxl/base/printf_macros.h"
+#if JPEGXL_ENABLE_APNG
 #include "png.h" /* original (unpatched) libpng is ok */
+#endif
 
 namespace jxl {
 namespace extras {
 
+#if JPEGXL_ENABLE_APNG
 namespace {
 
+constexpr unsigned char kExifSignature[6] = {0x45, 0x78, 0x69,
+                                             0x66, 0x00, 0x00};
+
 class APNGEncoder : public Encoder {
  public:
   std::vector<JxlPixelFormat> AcceptedFormats() const override {
     std::vector<JxlPixelFormat> formats;
     for (const uint32_t num_channels : {1, 2, 3, 4}) {
       for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
-        formats.push_back(JxlPixelFormat{num_channels, data_type,
-                                         JXL_BIG_ENDIAN, /*align=*/0});
+        for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
+          formats.push_back(
+              JxlPixelFormat{num_channels, data_type, endianness, /*align=*/0});
+        }
       }
     }
     return formats;
@@ -96,12 +103,24 @@ class BlobsWriterPNG {
       // identity to avoid repeated orientation.
       std::vector<uint8_t> exif = blobs.exif;
       ResetExifOrientation(exif);
+      // By convention, the data is prefixed with "Exif\0\0" when stored in
+      // the legacy (and non-standard) "Raw profile type exif" text chunk
+      // currently used here.
+      // TODO(user): Store Exif data in an eXIf chunk instead, which always
+      //             begins with the TIFF header.
+      if (exif.size() >= sizeof kExifSignature &&
+          memcmp(exif.data(), kExifSignature, sizeof kExifSignature) != 0) {
+        exif.insert(exif.begin(), kExifSignature,
+                    kExifSignature + sizeof kExifSignature);
+      }
       JXL_RETURN_IF_ERROR(EncodeBase16("exif", exif, strings));
     }
     if (!blobs.iptc.empty()) {
       JXL_RETURN_IF_ERROR(EncodeBase16("iptc", blobs.iptc, strings));
     }
     if (!blobs.xmp.empty()) {
+      // TODO(user): Store XMP data in an "XML:com.adobe.xmp" text chunk
+      //             instead.
       JXL_RETURN_IF_ERROR(EncodeBase16("xmp", blobs.xmp, strings));
     }
     return true;
@@ -142,7 +161,7 @@ class BlobsWriterPNG {
   }
 };
 
-void MaybeAddCICP(JxlColorEncoding c_enc, png_structp png_ptr,
+void MaybeAddCICP(const JxlColorEncoding& c_enc, png_structp png_ptr,
                   png_infop info_ptr) {
   png_byte cicp_data[4] = {};
   png_unknown_chunk cicp_chunk;
@@ -172,13 +191,80 @@ void MaybeAddCICP(JxlColorEncoding c_enc, png_structp png_ptr,
   cicp_data[3] = 1;
   cicp_chunk.data = cicp_data;
   cicp_chunk.size = sizeof(cicp_data);
-  cicp_chunk.location = PNG_HAVE_PLTE;
+  cicp_chunk.location = PNG_HAVE_IHDR;
   memcpy(cicp_chunk.name, "cICP", 5);
-  png_set_keep_unknown_chunks(png_ptr, 3,
+  png_set_keep_unknown_chunks(png_ptr, PNG_HANDLE_CHUNK_ALWAYS,
                               reinterpret_cast<const png_byte*>("cICP"), 1);
   png_set_unknown_chunks(png_ptr, info_ptr, &cicp_chunk, 1);
 }
 
+bool MaybeAddSRGB(const JxlColorEncoding& c_enc, png_structp png_ptr,
+                  png_infop info_ptr) {
+  if (c_enc.transfer_function == JXL_TRANSFER_FUNCTION_SRGB &&
+      (c_enc.color_space == JXL_COLOR_SPACE_GRAY ||
+       (c_enc.color_space == JXL_COLOR_SPACE_RGB &&
+        c_enc.primaries == JXL_PRIMARIES_SRGB &&
+        c_enc.white_point == JXL_WHITE_POINT_D65))) {
+    png_set_sRGB(png_ptr, info_ptr, c_enc.rendering_intent);
+    png_set_cHRM_fixed(png_ptr, info_ptr, 31270, 32900, 64000, 33000, 30000,
+                       60000, 15000, 6000);
+    png_set_gAMA_fixed(png_ptr, info_ptr, 45455);
+    return true;
+  }
+  return false;
+}
+
+void MaybeAddCHRM(const JxlColorEncoding& c_enc, png_structp png_ptr,
+                  png_infop info_ptr) {
+  if (c_enc.color_space != JXL_COLOR_SPACE_RGB) return;
+  if (c_enc.primaries == 0) return;
+  png_set_cHRM(png_ptr, info_ptr, c_enc.white_point_xy[0],
+               c_enc.white_point_xy[1], c_enc.primaries_red_xy[0],
+               c_enc.primaries_red_xy[1], c_enc.primaries_green_xy[0],
+               c_enc.primaries_green_xy[1], c_enc.primaries_blue_xy[0],
+               c_enc.primaries_blue_xy[1]);
+}
+
+void MaybeAddGAMA(const JxlColorEncoding& c_enc, png_structp png_ptr,
+                  png_infop info_ptr) {
+  switch (c_enc.transfer_function) {
+    case JXL_TRANSFER_FUNCTION_LINEAR:
+      png_set_gAMA_fixed(png_ptr, info_ptr, PNG_FP_1);
+      break;
+    case JXL_TRANSFER_FUNCTION_SRGB:
+      png_set_gAMA_fixed(png_ptr, info_ptr, 45455);
+      break;
+    case JXL_TRANSFER_FUNCTION_GAMMA:
+      png_set_gAMA(png_ptr, info_ptr, c_enc.gamma);
+      break;
+
+    default:;
+      // No gAMA chunk.
+  }
+}
+
+void MaybeAddCLLi(const JxlColorEncoding& c_enc, const float intensity_target,
+                  png_structp png_ptr, png_infop info_ptr) {
+  if (c_enc.transfer_function != JXL_TRANSFER_FUNCTION_PQ) return;
+
+  const uint32_t max_cll =
+      static_cast<uint32_t>(10000.f * Clamp1(intensity_target, 0.f, 10000.f));
+  png_byte chunk_data[8] = {};
+  chunk_data[0] = (max_cll >> 24) & 0xFF;
+  chunk_data[1] = (max_cll >> 16) & 0xFF;
+  chunk_data[2] = (max_cll >> 8) & 0xFF;
+  chunk_data[3] = max_cll & 0xFF;
+  // Leave MaxFALL set to 0.
+  png_unknown_chunk chunk;
+  memcpy(chunk.name, "cLLi", 5);
+  chunk.data = chunk_data;
+  chunk.size = sizeof chunk_data;
+  chunk.location = PNG_HAVE_IHDR;
+  png_set_keep_unknown_chunks(png_ptr, PNG_HANDLE_CHUNK_ALWAYS,
+                              reinterpret_cast<const png_byte*>("cLLi"), 1);
+  png_set_unknown_chunks(png_ptr, info_ptr, &chunk, 1);
+}
+
 Status APNGEncoder::EncodePackedPixelFileToAPNG(
     const PackedPixelFile& ppf, ThreadPool* pool,
     std::vector<uint8_t>* bytes) const {
@@ -233,21 +319,7 @@ Status APNGEncoder::EncodePackedPixelFileToAPNG(
       } else {
         memcpy(&out[0], in, out_size);
       }
-    } else if (format.data_type == JXL_TYPE_FLOAT) {
-      float mul = 65535.0;
-      const uint8_t* p_in = in;
-      uint8_t* p_out = out.data();
-      for (size_t i = 0; i < num_samples; ++i, p_in += 4, p_out += 2) {
-        uint32_t val = (format.endianness == JXL_BIG_ENDIAN ? LoadBE32(p_in)
-                                                            : LoadLE32(p_in));
-        float fval;
-        memcpy(&fval, &val, 4);
-        StoreBE16(static_cast<uint32_t>(fval * mul + 0.5), p_out);
-      }
-    } else {
-      return JXL_FAILURE("Unsupported pixel data type");
     }
-
     png_structp png_ptr;
     png_infop info_ptr;
 
@@ -272,11 +344,19 @@ Status APNGEncoder::EncodePackedPixelFileToAPNG(
                  PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
                  PNG_FILTER_TYPE_BASE);
     if (count == 0) {
-      MaybeAddCICP(ppf.color_encoding, png_ptr, info_ptr);
-      if (!ppf.icc.empty()) {
-        png_set_benign_errors(png_ptr, 1);
-        png_set_iCCP(png_ptr, info_ptr, "1", 0, ppf.icc.data(), ppf.icc.size());
+      if (!MaybeAddSRGB(ppf.color_encoding, png_ptr, info_ptr)) {
+        MaybeAddCICP(ppf.color_encoding, png_ptr, info_ptr);
+        if (!ppf.icc.empty()) {
+          png_set_benign_errors(png_ptr, 1);
+          png_set_iCCP(png_ptr, info_ptr, "1", 0, ppf.icc.data(),
+                       ppf.icc.size());
+        }
+        MaybeAddCHRM(ppf.color_encoding, png_ptr, info_ptr);
+        MaybeAddGAMA(ppf.color_encoding, png_ptr, info_ptr);
       }
+      MaybeAddCLLi(ppf.color_encoding, ppf.info.intensity_target, png_ptr,
+                   info_ptr);
+
       std::vector<std::string> textstrings;
       JXL_RETURN_IF_ERROR(BlobsWriterPNG::Encode(ppf.metadata, &textstrings));
       for (size_t kk = 0; kk + 1 < textstrings.size(); kk += 2) {
@@ -360,9 +440,14 @@ Status APNGEncoder::EncodePackedPixelFileToAPNG(
 }
 
 }  // namespace
+#endif
 
 std::unique_ptr<Encoder> GetAPNGEncoder() {
+#if JPEGXL_ENABLE_APNG
   return jxl::make_unique<APNGEncoder>();
+#else
+  return nullptr;
+#endif
 }
 
 }  // namespace extras
index dc593d2..8c9a148 100644 (file)
@@ -7,24 +7,17 @@
 
 #include <locale>
 
-#if JPEGXL_ENABLE_APNG
 #include "lib/extras/enc/apng.h"
-#endif
-#if JPEGXL_ENABLE_EXR
 #include "lib/extras/enc/exr.h"
-#endif
-#if JPEGXL_ENABLE_JPEG
 #include "lib/extras/enc/jpg.h"
-#endif
 #include "lib/extras/enc/npy.h"
 #include "lib/extras/enc/pgx.h"
 #include "lib/extras/enc/pnm.h"
-#include "lib/jxl/base/printf_macros.h"
 
 namespace jxl {
 namespace extras {
 
-Status Encoder::VerifyBasicInfo(const JxlBasicInfo& info) const {
+Status Encoder::VerifyBasicInfo(const JxlBasicInfo& info) {
   if (info.xsize == 0 || info.ysize == 0) {
     return JXL_FAILURE("Empty image");
   }
@@ -40,8 +33,34 @@ Status Encoder::VerifyBasicInfo(const JxlBasicInfo& info) const {
   return true;
 }
 
-Status Encoder::VerifyPackedImage(const PackedImage& image,
-                                  const JxlBasicInfo& info) const {
+Status Encoder::VerifyFormat(const JxlPixelFormat& format) const {
+  for (auto f : AcceptedFormats()) {
+    if (f.num_channels != format.num_channels) continue;
+    if (f.data_type != format.data_type) continue;
+    if (f.data_type == JXL_TYPE_UINT8 || f.endianness == format.endianness) {
+      return true;
+    }
+  }
+  return JXL_FAILURE("Format is not in the list of accepted formats.");
+}
+
+Status Encoder::VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
+                               uint32_t exponent_bits) {
+  if ((data_type == JXL_TYPE_UINT8 &&
+       (bits_per_sample == 0 || bits_per_sample > 8 || exponent_bits != 0)) ||
+      (data_type == JXL_TYPE_UINT16 &&
+       (bits_per_sample <= 8 || bits_per_sample > 16 || exponent_bits != 0)) ||
+      (data_type == JXL_TYPE_FLOAT16 &&
+       (bits_per_sample > 16 || exponent_bits > 5))) {
+    return JXL_FAILURE(
+        "Incompatible data_type %d and bit depth %u with exponent bits %u",
+        (int)data_type, bits_per_sample, exponent_bits);
+  }
+  return true;
+}
+
+Status Encoder::VerifyImageSize(const PackedImage& image,
+                                const JxlBasicInfo& info) {
   if (image.pixels() == nullptr) {
     return JXL_FAILURE("Invalid image.");
   }
@@ -57,77 +76,60 @@ Status Encoder::VerifyPackedImage(const PackedImage& image,
       image.format.num_channels != info_num_channels) {
     return JXL_FAILURE("Frame size does not match image size");
   }
-  if (info.bits_per_sample >
-      PackedImage::BitsPerChannel(image.format.data_type)) {
-    return JXL_FAILURE("Bit depth does not fit pixel data type");
-  }
   return true;
 }
 
-Status SelectFormat(const std::vector<JxlPixelFormat>& accepted_formats,
-                    const JxlBasicInfo& basic_info, JxlPixelFormat* format) {
-  const size_t original_bit_depth = basic_info.bits_per_sample;
-  size_t current_bit_depth = 0;
-  size_t num_alpha_channels = (basic_info.alpha_bits != 0 ? 1 : 0);
-  size_t num_channels = basic_info.num_color_channels + num_alpha_channels;
-  for (;;) {
-    for (const JxlPixelFormat& candidate : accepted_formats) {
-      if (candidate.num_channels != num_channels) continue;
-      const size_t candidate_bit_depth =
-          PackedImage::BitsPerChannel(candidate.data_type);
-      if (
-          // Candidate bit depth is less than what we have and still enough
-          (original_bit_depth <= candidate_bit_depth &&
-           candidate_bit_depth < current_bit_depth) ||
-          // Or larger than the too-small bit depth we currently have
-          (current_bit_depth < candidate_bit_depth &&
-           current_bit_depth < original_bit_depth)) {
-        *format = candidate;
-        current_bit_depth = candidate_bit_depth;
-      }
-    }
-    if (current_bit_depth == 0) {
-      if (num_channels > basic_info.num_color_channels) {
-        // Try dropping the alpha channel.
-        --num_channels;
-        continue;
-      }
-      return JXL_FAILURE("no appropriate format found");
-    }
-    break;
-  }
-  if (current_bit_depth < original_bit_depth) {
-    JXL_WARNING("encoding %" PRIuS "-bit original to %" PRIuS " bits",
-                original_bit_depth, current_bit_depth);
-  }
+Status Encoder::VerifyPackedImage(const PackedImage& image,
+                                  const JxlBasicInfo& info) const {
+  JXL_RETURN_IF_ERROR(VerifyImageSize(image, info));
+  JXL_RETURN_IF_ERROR(VerifyFormat(image.format));
+  JXL_RETURN_IF_ERROR(VerifyBitDepth(image.format.data_type,
+                                     info.bits_per_sample,
+                                     info.exponent_bits_per_sample));
   return true;
 }
 
+template <int metadata>
+class MetadataEncoder : public Encoder {
+ public:
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    std::vector<JxlPixelFormat> formats;
+    // empty, i.e. no need for actual pixel data
+    return formats;
+  }
+
+  Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded,
+                ThreadPool* pool) const override {
+    JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
+    encoded->icc.clear();
+    encoded->bitstreams.resize(1);
+    if (metadata == 0) encoded->bitstreams.front() = ppf.metadata.exif;
+    if (metadata == 1) encoded->bitstreams.front() = ppf.metadata.xmp;
+    if (metadata == 2) encoded->bitstreams.front() = ppf.metadata.jumbf;
+    return true;
+  }
+};
+
 std::unique_ptr<Encoder> Encoder::FromExtension(std::string extension) {
   std::transform(
       extension.begin(), extension.end(), extension.begin(),
       [](char c) { return std::tolower(c, std::locale::classic()); });
-#if JPEGXL_ENABLE_APNG
   if (extension == ".png" || extension == ".apng") return GetAPNGEncoder();
-#endif
-
-#if JPEGXL_ENABLE_JPEG
   if (extension == ".jpg") return GetJPEGEncoder();
   if (extension == ".jpeg") return GetJPEGEncoder();
-#endif
-
   if (extension == ".npy") return GetNumPyEncoder();
-
   if (extension == ".pgx") return GetPGXEncoder();
-
   if (extension == ".pam") return GetPAMEncoder();
   if (extension == ".pgm") return GetPGMEncoder();
   if (extension == ".ppm") return GetPPMEncoder();
+  if (extension == ".pnm") return GetPNMEncoder();
   if (extension == ".pfm") return GetPFMEncoder();
-
-#if JPEGXL_ENABLE_EXR
   if (extension == ".exr") return GetEXREncoder();
-#endif
+  if (extension == ".exif") return jxl::make_unique<MetadataEncoder<0>>();
+  if (extension == ".xmp") return jxl::make_unique<MetadataEncoder<1>>();
+  if (extension == ".xml") return jxl::make_unique<MetadataEncoder<1>>();
+  if (extension == ".jumbf") return jxl::make_unique<MetadataEncoder<2>>();
+  if (extension == ".jumb") return jxl::make_unique<MetadataEncoder<2>>();
 
   return nullptr;
 }
index 92eec50..da5f509 100644 (file)
@@ -8,10 +8,17 @@
 
 // Facade for image encoders.
 
+#include <jxl/codestream_header.h>
+#include <jxl/types.h>
+
+#include <cstdint>
+#include <memory>
 #include <string>
 #include <unordered_map>
+#include <utility>
+#include <vector>
 
-#include "lib/extras/dec/decode.h"
+#include "lib/extras/packed_image.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 
@@ -20,7 +27,7 @@ namespace extras {
 
 struct EncodedImage {
   // One (if the format supports animations or the image has only one frame) or
-  // more sequential bitstreams.
+  // more 1quential bitstreams.
   std::vector<std::vector<uint8_t>> bitstreams;
 
   // For each extra channel one or more sequential bitstreams.
@@ -43,6 +50,8 @@ class Encoder {
 
   virtual ~Encoder() = default;
 
+  // Set of pixel formats that this encoder takes as input.
+  // If empty, the 'encoder' does not need any pixels (it's metadata-only).
   virtual std::vector<JxlPixelFormat> AcceptedFormats() const = 0;
 
   // Any existing data in encoded_image is discarded.
@@ -53,12 +62,18 @@ class Encoder {
     options_[std::move(name)] = std::move(value);
   }
 
+  static Status VerifyBasicInfo(const JxlBasicInfo& info);
+  static Status VerifyImageSize(const PackedImage& image,
+                                const JxlBasicInfo& info);
+  static Status VerifyBitDepth(JxlDataType data_type, uint32_t bits_per_sample,
+                               uint32_t exponent_bits);
+
  protected:
   const std::unordered_map<std::string, std::string>& options() const {
     return options_;
   }
 
-  Status VerifyBasicInfo(const JxlBasicInfo& info) const;
+  Status VerifyFormat(const JxlPixelFormat& format) const;
 
   Status VerifyPackedImage(const PackedImage& image,
                            const JxlBasicInfo& info) const;
@@ -67,10 +82,6 @@ class Encoder {
   std::unordered_map<std::string, std::string> options_;
 };
 
-// TODO(sboukortt): consider exposing this as part of the C API.
-Status SelectFormat(const std::vector<JxlPixelFormat>& accepted_formats,
-                    const JxlBasicInfo& basic_info, JxlPixelFormat* format);
-
 }  // namespace extras
 }  // namespace jxl
 
index 05e05f9..d4005c3 100644 (file)
@@ -5,20 +5,23 @@
 
 #include "lib/extras/enc/exr.h"
 
+#if JPEGXL_ENABLE_EXR
 #include <ImfChromaticitiesAttribute.h>
 #include <ImfIO.h>
 #include <ImfRgbaFile.h>
 #include <ImfStandardAttributes.h>
+#endif
+#include <jxl/codestream_header.h>
 
 #include <vector>
 
-#include "jxl/codestream_header.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/byte_order.h"
 
 namespace jxl {
 namespace extras {
 
+#if JPEGXL_ENABLE_EXR
 namespace {
 
 namespace OpenEXR = OPENEXR_IMF_NAMESPACE;
@@ -110,7 +113,7 @@ Status EncodeImageEXR(const PackedImage& image, const JxlBasicInfo& info,
   chromaticities.white =
       Imath::V2f(c_enc.white_point_xy[0], c_enc.white_point_xy[1]);
   OpenEXR::addChromaticities(header, chromaticities);
-  OpenEXR::addWhiteLuminance(header, 255.0f);
+  OpenEXR::addWhiteLuminance(header, info.intensity_target);
 
   auto loadFloat =
       format.endianness == JXL_BIG_ENDIAN ? LoadBEFloat : LoadLEFloat;
@@ -162,7 +165,7 @@ class EXREncoder : public Encoder {
   std::vector<JxlPixelFormat> AcceptedFormats() const override {
     std::vector<JxlPixelFormat> formats;
     for (const uint32_t num_channels : {1, 2, 3, 4}) {
-      for (const JxlDataType data_type : {JXL_TYPE_FLOAT, JXL_TYPE_FLOAT16}) {
+      for (const JxlDataType data_type : {JXL_TYPE_FLOAT}) {
         for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
           formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
                                            /*data_type=*/data_type,
@@ -191,9 +194,14 @@ class EXREncoder : public Encoder {
 };
 
 }  // namespace
+#endif
 
 std::unique_ptr<Encoder> GetEXREncoder() {
+#if JPEGXL_ENABLE_EXR
   return jxl::make_unique<EXREncoder>();
+#else
+  return nullptr;
+#endif
 }
 
 }  // namespace extras
diff --git a/lib/extras/enc/jpegli.cc b/lib/extras/enc/jpegli.cc
new file mode 100644 (file)
index 0000000..3b78764
--- /dev/null
@@ -0,0 +1,523 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/jpegli.h"
+
+#include <jxl/cms.h>
+#include <jxl/codestream_header.h>
+#include <setjmp.h>
+#include <stdint.h>
+
+#include "lib/extras/enc/encode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jxl/enc_xyb.h"
+
+namespace jxl {
+namespace extras {
+
+namespace {
+
+void MyErrorExit(j_common_ptr cinfo) {
+  jmp_buf* env = static_cast<jmp_buf*>(cinfo->client_data);
+  (*cinfo->err->output_message)(cinfo);
+  jpegli_destroy_compress(reinterpret_cast<j_compress_ptr>(cinfo));
+  longjmp(*env, 1);
+}
+
+Status VerifyInput(const PackedPixelFile& ppf) {
+  const JxlBasicInfo& info = ppf.info;
+  JXL_RETURN_IF_ERROR(Encoder::VerifyBasicInfo(info));
+  if (ppf.frames.size() != 1) {
+    return JXL_FAILURE("JPEG input must have exactly one frame.");
+  }
+  const PackedImage& image = ppf.frames[0].color;
+  JXL_RETURN_IF_ERROR(Encoder::VerifyImageSize(image, info));
+  if (image.format.data_type == JXL_TYPE_FLOAT16) {
+    return JXL_FAILURE("FLOAT16 input is not supported.");
+  }
+  JXL_RETURN_IF_ERROR(Encoder::VerifyBitDepth(image.format.data_type,
+                                              info.bits_per_sample,
+                                              info.exponent_bits_per_sample));
+  if ((image.format.data_type == JXL_TYPE_UINT8 && info.bits_per_sample != 8) ||
+      (image.format.data_type == JXL_TYPE_UINT16 &&
+       info.bits_per_sample != 16)) {
+    return JXL_FAILURE("Only full bit depth unsigned types are supported.");
+  }
+  return true;
+}
+
+Status GetColorEncoding(const PackedPixelFile& ppf,
+                        ColorEncoding* color_encoding) {
+  if (!ppf.icc.empty()) {
+    IccBytes icc = ppf.icc;
+    JXL_RETURN_IF_ERROR(
+        color_encoding->SetICC(std::move(icc), JxlGetDefaultCms()));
+  } else {
+    JXL_RETURN_IF_ERROR(color_encoding->FromExternal(ppf.color_encoding));
+  }
+  if (color_encoding->ICC().empty()) {
+    return JXL_FAILURE("Invalid color encoding.");
+  }
+  return true;
+}
+
+bool HasICCProfile(const std::vector<uint8_t>& app_data) {
+  size_t pos = 0;
+  while (pos < app_data.size()) {
+    if (pos + 16 > app_data.size()) return false;
+    uint8_t marker = app_data[pos + 1];
+    size_t marker_len = (app_data[pos + 2] << 8) + app_data[pos + 3] + 2;
+    if (marker == 0xe2 && memcmp(&app_data[pos + 4], "ICC_PROFILE", 12) == 0) {
+      return true;
+    }
+    pos += marker_len;
+  }
+  return false;
+}
+
+Status WriteAppData(j_compress_ptr cinfo,
+                    const std::vector<uint8_t>& app_data) {
+  size_t pos = 0;
+  while (pos < app_data.size()) {
+    if (pos + 4 > app_data.size()) {
+      return JXL_FAILURE("Incomplete APP header.");
+    }
+    uint8_t marker = app_data[pos + 1];
+    size_t marker_len = (app_data[pos + 2] << 8) + app_data[pos + 3] + 2;
+    if (app_data[pos] != 0xff || marker < 0xe0 || marker > 0xef) {
+      return JXL_FAILURE("Invalid APP marker %02x %02x", app_data[pos], marker);
+    }
+    if (marker_len <= 4) {
+      return JXL_FAILURE("Invalid APP marker length.");
+    }
+    if (pos + marker_len > app_data.size()) {
+      return JXL_FAILURE("Incomplete APP data");
+    }
+    jpegli_write_marker(cinfo, marker, &app_data[pos + 4], marker_len - 4);
+    pos += marker_len;
+  }
+  return true;
+}
+
+static constexpr int kICCMarker = 0xe2;
+constexpr unsigned char kICCSignature[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00};
+static constexpr uint8_t kUnknownTf = 2;
+static constexpr unsigned char kCICPTagSignature[4] = {0x63, 0x69, 0x63, 0x70};
+static constexpr size_t kCICPTagSize = 12;
+
+bool FindCICPTag(const uint8_t* icc_data, size_t len, bool is_first_chunk,
+                 size_t* cicp_offset, size_t* cicp_length, uint8_t* cicp_tag,
+                 size_t* cicp_pos) {
+  if (is_first_chunk) {
+    // Look up the offset of the CICP tag from the first chunk of ICC data.
+    if (len < 132) {
+      return false;
+    }
+    uint32_t tag_count = LoadBE32(&icc_data[128]);
+    if (len < 132 + 12 * tag_count) {
+      return false;
+    }
+    for (uint32_t i = 0; i < tag_count; ++i) {
+      if (memcmp(&icc_data[132 + 12 * i], kCICPTagSignature, 4) == 0) {
+        *cicp_offset = LoadBE32(&icc_data[136 + 12 * i]);
+        *cicp_length = LoadBE32(&icc_data[140 + 12 * i]);
+      }
+    }
+    if (*cicp_length < kCICPTagSize) {
+      return false;
+    }
+  }
+  if (*cicp_offset < len) {
+    size_t n_bytes = std::min(len - *cicp_offset, kCICPTagSize - *cicp_pos);
+    memcpy(&cicp_tag[*cicp_pos], &icc_data[*cicp_offset], n_bytes);
+    *cicp_pos += n_bytes;
+    *cicp_offset = 0;
+  } else {
+    *cicp_offset -= len;
+  }
+  return true;
+}
+
+uint8_t LookupCICPTransferFunctionFromAppData(const uint8_t* app_data,
+                                              size_t len) {
+  size_t last_index = 0;
+  size_t cicp_offset = 0;
+  size_t cicp_length = 0;
+  uint8_t cicp_tag[kCICPTagSize] = {};
+  size_t cicp_pos = 0;
+  size_t pos = 0;
+  while (pos < len) {
+    const uint8_t* marker = &app_data[pos];
+    if (pos + 4 > len) {
+      return kUnknownTf;
+    }
+    size_t marker_size = (marker[2] << 8) + marker[3] + 2;
+    if (pos + marker_size > len) {
+      return kUnknownTf;
+    }
+    if (marker_size < 18 || marker[0] != 0xff || marker[1] != kICCMarker ||
+        memcmp(&marker[4], kICCSignature, 12) != 0) {
+      pos += marker_size;
+      continue;
+    }
+    uint8_t index = marker[16];
+    uint8_t total = marker[17];
+    const uint8_t* payload = marker + 18;
+    const size_t payload_size = marker_size - 18;
+    if (index != last_index + 1 || index > total) {
+      return kUnknownTf;
+    }
+    if (!FindCICPTag(payload, payload_size, last_index == 0, &cicp_offset,
+                     &cicp_length, &cicp_tag[0], &cicp_pos)) {
+      return kUnknownTf;
+    }
+    if (cicp_pos == kCICPTagSize) {
+      break;
+    }
+    ++last_index;
+  }
+  if (cicp_pos >= kCICPTagSize && memcmp(cicp_tag, kCICPTagSignature, 4) == 0) {
+    return cicp_tag[9];
+  }
+  return kUnknownTf;
+}
+
+uint8_t LookupCICPTransferFunctionFromICCProfile(const uint8_t* icc_data,
+                                                 size_t len) {
+  size_t cicp_offset = 0;
+  size_t cicp_length = 0;
+  uint8_t cicp_tag[kCICPTagSize] = {};
+  size_t cicp_pos = 0;
+  if (!FindCICPTag(icc_data, len, true, &cicp_offset, &cicp_length,
+                   &cicp_tag[0], &cicp_pos)) {
+    return kUnknownTf;
+  }
+  if (cicp_pos >= kCICPTagSize && memcmp(cicp_tag, kCICPTagSignature, 4) == 0) {
+    return cicp_tag[9];
+  }
+  return kUnknownTf;
+}
+
+JpegliDataType ConvertDataType(JxlDataType type) {
+  switch (type) {
+    case JXL_TYPE_UINT8:
+      return JPEGLI_TYPE_UINT8;
+    case JXL_TYPE_UINT16:
+      return JPEGLI_TYPE_UINT16;
+    case JXL_TYPE_FLOAT:
+      return JPEGLI_TYPE_FLOAT;
+    default:
+      return JPEGLI_TYPE_UINT8;
+  }
+}
+
+JpegliEndianness ConvertEndianness(JxlEndianness endianness) {
+  switch (endianness) {
+    case JXL_NATIVE_ENDIAN:
+      return JPEGLI_NATIVE_ENDIAN;
+    case JXL_LITTLE_ENDIAN:
+      return JPEGLI_LITTLE_ENDIAN;
+    case JXL_BIG_ENDIAN:
+      return JPEGLI_BIG_ENDIAN;
+    default:
+      return JPEGLI_NATIVE_ENDIAN;
+  }
+}
+
+void ToFloatRow(const uint8_t* row_in, JxlPixelFormat format, size_t len,
+                float* row_out) {
+  bool is_little_endian =
+      (format.endianness == JXL_LITTLE_ENDIAN ||
+       (format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian()));
+  static constexpr double kMul8 = 1.0 / 255.0;
+  static constexpr double kMul16 = 1.0 / 65535.0;
+  if (format.data_type == JXL_TYPE_UINT8) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = row_in[x] * kMul8;
+    }
+  } else if (format.data_type == JXL_TYPE_UINT16 && is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadLE16(&row_in[2 * x]) * kMul16;
+    }
+  } else if (format.data_type == JXL_TYPE_UINT16 && !is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadBE16(&row_in[2 * x]) * kMul16;
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT && is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadLEFloat(&row_in[4 * x]);
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT && !is_little_endian) {
+    for (size_t x = 0; x < len; ++x) {
+      row_out[x] = LoadBEFloat(&row_in[4 * x]);
+    }
+  }
+}
+
+Status EncodeJpegToTargetSize(const PackedPixelFile& ppf,
+                              const JpegSettings& jpeg_settings,
+                              size_t target_size, ThreadPool* pool,
+                              std::vector<uint8_t>* output) {
+  output->clear();
+  size_t best_error = std::numeric_limits<size_t>::max();
+  float distance0 = -1.0f;
+  float distance1 = -1.0f;
+  float distance = 1.0f;
+  for (int step = 0; step < 15; ++step) {
+    JpegSettings settings = jpeg_settings;
+    settings.libjpeg_quality = 0;
+    settings.distance = distance;
+    settings.target_size = 0;
+    std::vector<uint8_t> compressed;
+    JXL_RETURN_IF_ERROR(EncodeJpeg(ppf, settings, pool, &compressed));
+    size_t size = compressed.size();
+    // prefer being under the target size to being over it
+    size_t error = size < target_size
+                       ? target_size - size
+                       : static_cast<size_t>(1.2f * (size - target_size));
+    if (error < best_error) {
+      best_error = error;
+      std::swap(*output, compressed);
+    }
+    float rel_error = size * 1.0f / target_size;
+    if (std::abs(rel_error - 1.0f) < 0.002f) {
+      break;
+    }
+    if (size < target_size) {
+      distance1 = distance;
+    } else {
+      distance0 = distance;
+    }
+    if (distance1 == -1) {
+      distance *= std::pow(rel_error, 1.5) * 1.05;
+    } else if (distance0 == -1) {
+      distance *= std::pow(rel_error, 1.5) * 0.95;
+    } else {
+      distance = 0.5 * (distance0 + distance1);
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed) {
+  if (jpeg_settings.libjpeg_quality > 0) {
+    auto encoder = Encoder::FromExtension(".jpg");
+    encoder->SetOption("q", std::to_string(jpeg_settings.libjpeg_quality));
+    if (!jpeg_settings.libjpeg_chroma_subsampling.empty()) {
+      encoder->SetOption("chroma_subsampling",
+                         jpeg_settings.libjpeg_chroma_subsampling);
+    }
+    EncodedImage encoded;
+    JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded, pool));
+    size_t target_size = encoded.bitstreams[0].size();
+    return EncodeJpegToTargetSize(ppf, jpeg_settings, target_size, pool,
+                                  compressed);
+  }
+  if (jpeg_settings.target_size > 0) {
+    return EncodeJpegToTargetSize(ppf, jpeg_settings, jpeg_settings.target_size,
+                                  pool, compressed);
+  }
+  JXL_RETURN_IF_ERROR(VerifyInput(ppf));
+
+  ColorEncoding color_encoding;
+  JXL_RETURN_IF_ERROR(GetColorEncoding(ppf, &color_encoding));
+
+  ColorSpaceTransform c_transform(*JxlGetDefaultCms());
+  ColorEncoding xyb_encoding;
+  if (jpeg_settings.xyb) {
+    if (ppf.info.num_color_channels != 3) {
+      return JXL_FAILURE("Only RGB input is supported in XYB mode.");
+    }
+    if (HasICCProfile(jpeg_settings.app_data)) {
+      return JXL_FAILURE("APP data ICC profile is not supported in XYB mode.");
+    }
+    const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(false);
+    JXL_RETURN_IF_ERROR(
+        c_transform.Init(color_encoding, c_desired, 255.0f, ppf.info.xsize, 1));
+    xyb_encoding.SetColorSpace(jxl::ColorSpace::kXYB);
+    xyb_encoding.SetRenderingIntent(jxl::RenderingIntent::kPerceptual);
+    JXL_RETURN_IF_ERROR(xyb_encoding.CreateICC());
+  }
+  const ColorEncoding& output_encoding =
+      jpeg_settings.xyb ? xyb_encoding : color_encoding;
+
+  // We need to declare all the non-trivial destructor local variables
+  // before the call to setjmp().
+  std::vector<uint8_t> pixels;
+  unsigned char* output_buffer = nullptr;
+  unsigned long output_size = 0;
+  std::vector<uint8_t> row_bytes;
+  size_t rowlen = RoundUpTo(ppf.info.xsize, VectorSize());
+  hwy::AlignedFreeUniquePtr<float[]> xyb_tmp =
+      hwy::AllocateAligned<float>(6 * rowlen);
+  hwy::AlignedFreeUniquePtr<float[]> premul_absorb =
+      hwy::AllocateAligned<float>(VectorSize() * 12);
+  ComputePremulAbsorb(255.0f, premul_absorb.get());
+
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpegli_std_error(&jerr);
+    jerr.error_exit = &MyErrorExit;
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = static_cast<void*>(&env);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &output_buffer, &output_size);
+    const JxlBasicInfo& info = ppf.info;
+    cinfo.image_width = info.xsize;
+    cinfo.image_height = info.ysize;
+    cinfo.input_components = info.num_color_channels;
+    cinfo.in_color_space =
+        cinfo.input_components == 1 ? JCS_GRAYSCALE : JCS_RGB;
+    if (jpeg_settings.xyb) {
+      jpegli_set_xyb_mode(&cinfo);
+    } else if (jpeg_settings.use_std_quant_tables) {
+      jpegli_use_standard_quant_tables(&cinfo);
+    }
+    uint8_t cicp_tf = kUnknownTf;
+    if (!jpeg_settings.app_data.empty()) {
+      cicp_tf = LookupCICPTransferFunctionFromAppData(
+          jpeg_settings.app_data.data(), jpeg_settings.app_data.size());
+    } else if (!output_encoding.IsSRGB()) {
+      cicp_tf = LookupCICPTransferFunctionFromICCProfile(
+          output_encoding.ICC().data(), output_encoding.ICC().size());
+    }
+    jpegli_set_cicp_transfer_function(&cinfo, cicp_tf);
+    jpegli_set_defaults(&cinfo);
+    if (!jpeg_settings.chroma_subsampling.empty()) {
+      if (jpeg_settings.chroma_subsampling == "444") {
+        cinfo.comp_info[0].h_samp_factor = 1;
+        cinfo.comp_info[0].v_samp_factor = 1;
+      } else if (jpeg_settings.chroma_subsampling == "440") {
+        cinfo.comp_info[0].h_samp_factor = 1;
+        cinfo.comp_info[0].v_samp_factor = 2;
+      } else if (jpeg_settings.chroma_subsampling == "422") {
+        cinfo.comp_info[0].h_samp_factor = 2;
+        cinfo.comp_info[0].v_samp_factor = 1;
+      } else if (jpeg_settings.chroma_subsampling == "420") {
+        cinfo.comp_info[0].h_samp_factor = 2;
+        cinfo.comp_info[0].v_samp_factor = 2;
+      } else {
+        return false;
+      }
+      for (int i = 1; i < cinfo.num_components; ++i) {
+        cinfo.comp_info[i].h_samp_factor = 1;
+        cinfo.comp_info[i].v_samp_factor = 1;
+      }
+    }
+    jpegli_enable_adaptive_quantization(
+        &cinfo, jpeg_settings.use_adaptive_quantization);
+    if (jpeg_settings.psnr_target > 0.0) {
+      jpegli_set_psnr(&cinfo, jpeg_settings.psnr_target,
+                      jpeg_settings.search_tolerance,
+                      jpeg_settings.min_distance, jpeg_settings.max_distance);
+    } else if (jpeg_settings.quality > 0.0) {
+      float distance = jpegli_quality_to_distance(jpeg_settings.quality);
+      jpegli_set_distance(&cinfo, distance, TRUE);
+    } else {
+      jpegli_set_distance(&cinfo, jpeg_settings.distance, TRUE);
+    }
+    jpegli_set_progressive_level(&cinfo, jpeg_settings.progressive_level);
+    cinfo.optimize_coding = jpeg_settings.optimize_coding;
+    if (!jpeg_settings.app_data.empty()) {
+      // Make sure jpegli_start_compress() does not write any APP markers.
+      cinfo.write_JFIF_header = false;
+      cinfo.write_Adobe_marker = false;
+    }
+    const PackedImage& image = ppf.frames[0].color;
+    if (jpeg_settings.xyb) {
+      jpegli_set_input_format(&cinfo, JPEGLI_TYPE_FLOAT, JPEGLI_NATIVE_ENDIAN);
+    } else {
+      jpegli_set_input_format(&cinfo, ConvertDataType(image.format.data_type),
+                              ConvertEndianness(image.format.endianness));
+    }
+    jpegli_start_compress(&cinfo, TRUE);
+    if (!jpeg_settings.app_data.empty()) {
+      JXL_RETURN_IF_ERROR(WriteAppData(&cinfo, jpeg_settings.app_data));
+    }
+    if ((jpeg_settings.app_data.empty() && !output_encoding.IsSRGB()) ||
+        jpeg_settings.xyb) {
+      jpegli_write_icc_profile(&cinfo, output_encoding.ICC().data(),
+                               output_encoding.ICC().size());
+    }
+    const uint8_t* pixels = reinterpret_cast<const uint8_t*>(image.pixels());
+    if (jpeg_settings.xyb) {
+      float* src_buf = c_transform.BufSrc(0);
+      float* dst_buf = c_transform.BufDst(0);
+      for (size_t y = 0; y < image.ysize; ++y) {
+        // convert to float
+        ToFloatRow(&pixels[y * image.stride], image.format, 3 * image.xsize,
+                   src_buf);
+        // convert to linear srgb
+        if (!c_transform.Run(0, src_buf, dst_buf)) {
+          return false;
+        }
+        // deinterleave channels
+        float* row0 = &xyb_tmp[0];
+        float* row1 = &xyb_tmp[rowlen];
+        float* row2 = &xyb_tmp[2 * rowlen];
+        for (size_t x = 0; x < image.xsize; ++x) {
+          row0[x] = dst_buf[3 * x + 0];
+          row1[x] = dst_buf[3 * x + 1];
+          row2[x] = dst_buf[3 * x + 2];
+        }
+        // convert to xyb
+        LinearRGBRowToXYB(row0, row1, row2, premul_absorb.get(), image.xsize);
+        // scale xyb
+        ScaleXYBRow(row0, row1, row2, image.xsize);
+        // interleave channels
+        float* row_out = &xyb_tmp[3 * rowlen];
+        for (size_t x = 0; x < image.xsize; ++x) {
+          row_out[3 * x + 0] = row0[x];
+          row_out[3 * x + 1] = row1[x];
+          row_out[3 * x + 2] = row2[x];
+        }
+        // feed to jpegli as native endian floats
+        JSAMPROW row[] = {reinterpret_cast<uint8_t*>(row_out)};
+        jpegli_write_scanlines(&cinfo, row, 1);
+      }
+    } else {
+      row_bytes.resize(image.stride);
+      if (cinfo.num_components == (int)image.format.num_channels) {
+        for (size_t y = 0; y < info.ysize; ++y) {
+          memcpy(&row_bytes[0], pixels + y * image.stride, image.stride);
+          JSAMPROW row[] = {row_bytes.data()};
+          jpegli_write_scanlines(&cinfo, row, 1);
+        }
+      } else {
+        for (size_t y = 0; y < info.ysize; ++y) {
+          int bytes_per_channel =
+              PackedImage::BitsPerChannel(image.format.data_type) / 8;
+          int bytes_per_pixel = cinfo.num_components * bytes_per_channel;
+          for (size_t x = 0; x < info.xsize; ++x) {
+            memcpy(&row_bytes[x * bytes_per_pixel],
+                   &pixels[y * image.stride + x * image.pixel_stride()],
+                   bytes_per_pixel);
+          }
+          JSAMPROW row[] = {row_bytes.data()};
+          jpegli_write_scanlines(&cinfo, row, 1);
+        }
+      }
+    }
+    jpegli_finish_compress(&cinfo);
+    compressed->resize(output_size);
+    std::copy_n(output_buffer, output_size, compressed->data());
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  if (output_buffer) free(output_buffer);
+  return success;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/lib/extras/enc/jpegli.h b/lib/extras/enc/jpegli.h
new file mode 100644 (file)
index 0000000..9538b2e
--- /dev/null
@@ -0,0 +1,53 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_JPEGLI_H_
+#define LIB_EXTRAS_ENC_JPEGLI_H_
+
+// Encodes JPG pixels and metadata in memory using the libjpegli library.
+
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+namespace extras {
+
+struct JpegSettings {
+  bool xyb = false;
+  size_t target_size = 0;
+  float quality = 0.0f;
+  float distance = 1.f;
+  bool use_adaptive_quantization = true;
+  bool use_std_quant_tables = false;
+  int progressive_level = 2;
+  bool optimize_coding = true;
+  std::string chroma_subsampling;
+  int libjpeg_quality = 0;
+  std::string libjpeg_chroma_subsampling;
+  // Parameters for selecting distance based on PSNR target.
+  float psnr_target = 0.0f;
+  float search_tolerance = 0.01;
+  float min_distance = 0.1f;
+  float max_distance = 25.0f;
+  // If not empty, must contain concatenated APP marker segments. In this case,
+  // these and only these APP marker segments will be written to the JPEG
+  // output. In xyb mode app_data must not contain an ICC profile, in this
+  // case an additional APP2 ICC profile for the XYB colorspace will be emitted.
+  std::vector<uint8_t> app_data;
+};
+
+Status EncodeJpeg(const PackedPixelFile& ppf, const JpegSettings& jpeg_settings,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_JPEGLI_H_
index 93a39dd..f1355bb 100644 (file)
@@ -5,12 +5,18 @@
 
 #include "lib/extras/enc/jpg.h"
 
+#if JPEGXL_ENABLE_JPEG
 #include <jpeglib.h>
 #include <setjmp.h>
+#endif
 #include <stdint.h>
 
 #include <algorithm>
+#include <array>
+#include <cmath>
+#include <fstream>
 #include <iterator>
+#include <memory>
 #include <numeric>
 #include <sstream>
 #include <utility>
 #include "lib/jxl/sanitizers.h"
 #if JPEGXL_ENABLE_SJPEG
 #include "sjpeg.h"
+#include "sjpegi.h"
 #endif
 
 namespace jxl {
 namespace extras {
 
+#if JPEGXL_ENABLE_JPEG
 namespace {
 
 constexpr unsigned char kICCSignature[12] = {
@@ -42,6 +50,142 @@ enum class JpegEncoder {
   kSJpeg,
 };
 
+#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
+
+// Popular jpeg scan scripts
+// The fields of the individual scans are:
+// comps_in_scan, component_index[], Ss, Se, Ah, Al
+static constexpr jpeg_scan_info kScanScript1[] = {
+    {1, {0}, 0, 0, 0, 0},   //
+    {1, {1}, 0, 0, 0, 0},   //
+    {1, {2}, 0, 0, 0, 0},   //
+    {1, {0}, 1, 8, 0, 0},   //
+    {1, {0}, 9, 63, 0, 0},  //
+    {1, {1}, 1, 63, 0, 0},  //
+    {1, {2}, 1, 63, 0, 0},  //
+};
+static constexpr size_t kNumScans1 = ARRAY_SIZE(kScanScript1);
+
+static constexpr jpeg_scan_info kScanScript2[] = {
+    {1, {0}, 0, 0, 0, 0},   //
+    {1, {1}, 0, 0, 0, 0},   //
+    {1, {2}, 0, 0, 0, 0},   //
+    {1, {0}, 1, 2, 0, 1},   //
+    {1, {0}, 3, 63, 0, 1},  //
+    {1, {0}, 1, 63, 1, 0},  //
+    {1, {1}, 1, 63, 0, 0},  //
+    {1, {2}, 1, 63, 0, 0},  //
+};
+static constexpr size_t kNumScans2 = ARRAY_SIZE(kScanScript2);
+
+static constexpr jpeg_scan_info kScanScript3[] = {
+    {1, {0}, 0, 0, 0, 0},   //
+    {1, {1}, 0, 0, 0, 0},   //
+    {1, {2}, 0, 0, 0, 0},   //
+    {1, {0}, 1, 63, 0, 2},  //
+    {1, {0}, 1, 63, 2, 1},  //
+    {1, {0}, 1, 63, 1, 0},  //
+    {1, {1}, 1, 63, 0, 0},  //
+    {1, {2}, 1, 63, 0, 0},  //
+};
+static constexpr size_t kNumScans3 = ARRAY_SIZE(kScanScript3);
+
+static constexpr jpeg_scan_info kScanScript4[] = {
+    {3, {0, 1, 2}, 0, 0, 0, 1},  //
+    {1, {0}, 1, 5, 0, 2},        //
+    {1, {2}, 1, 63, 0, 1},       //
+    {1, {1}, 1, 63, 0, 1},       //
+    {1, {0}, 6, 63, 0, 2},       //
+    {1, {0}, 1, 63, 2, 1},       //
+    {3, {0, 1, 2}, 0, 0, 1, 0},  //
+    {1, {2}, 1, 63, 1, 0},       //
+    {1, {1}, 1, 63, 1, 0},       //
+    {1, {0}, 1, 63, 1, 0},       //
+};
+static constexpr size_t kNumScans4 = ARRAY_SIZE(kScanScript4);
+
+static constexpr jpeg_scan_info kScanScript5[] = {
+    {3, {0, 1, 2}, 0, 0, 0, 1},  //
+    {1, {0}, 1, 5, 0, 2},        //
+    {1, {1}, 1, 5, 0, 2},        //
+    {1, {2}, 1, 5, 0, 2},        //
+    {1, {1}, 6, 63, 0, 2},       //
+    {1, {2}, 6, 63, 0, 2},       //
+    {1, {0}, 6, 63, 0, 2},       //
+    {1, {0}, 1, 63, 2, 1},       //
+    {1, {1}, 1, 63, 2, 1},       //
+    {1, {2}, 1, 63, 2, 1},       //
+    {3, {0, 1, 2}, 0, 0, 1, 0},  //
+    {1, {0}, 1, 63, 1, 0},       //
+    {1, {1}, 1, 63, 1, 0},       //
+    {1, {2}, 1, 63, 1, 0},       //
+};
+static constexpr size_t kNumScans5 = ARRAY_SIZE(kScanScript5);
+
+// default progressive mode of jpegli
+static constexpr jpeg_scan_info kScanScript6[] = {
+    {3, {0, 1, 2}, 0, 0, 0, 0},  //
+    {1, {0}, 1, 2, 0, 0},        //
+    {1, {1}, 1, 2, 0, 0},        //
+    {1, {2}, 1, 2, 0, 0},        //
+    {1, {0}, 3, 63, 0, 2},       //
+    {1, {1}, 3, 63, 0, 2},       //
+    {1, {2}, 3, 63, 0, 2},       //
+    {1, {0}, 3, 63, 2, 1},       //
+    {1, {1}, 3, 63, 2, 1},       //
+    {1, {2}, 3, 63, 2, 1},       //
+    {1, {0}, 3, 63, 1, 0},       //
+    {1, {1}, 3, 63, 1, 0},       //
+    {1, {2}, 3, 63, 1, 0},       //
+};
+static constexpr size_t kNumScans6 = ARRAY_SIZE(kScanScript6);
+
+// Adapt RGB scan info to grayscale jpegs.
+void FilterScanComponents(const jpeg_compress_struct* cinfo,
+                          jpeg_scan_info* si) {
+  const int all_comps_in_scan = si->comps_in_scan;
+  si->comps_in_scan = 0;
+  for (int j = 0; j < all_comps_in_scan; ++j) {
+    const int component = si->component_index[j];
+    if (component < cinfo->input_components) {
+      si->component_index[si->comps_in_scan++] = component;
+    }
+  }
+}
+
+Status SetJpegProgression(int progressive_id,
+                          std::vector<jpeg_scan_info>* scan_infos,
+                          jpeg_compress_struct* cinfo) {
+  if (progressive_id < 0) {
+    return true;
+  }
+  if (progressive_id == 0) {
+    jpeg_simple_progression(cinfo);
+    return true;
+  }
+  constexpr const jpeg_scan_info* kScanScripts[] = {kScanScript1, kScanScript2,
+                                                    kScanScript3, kScanScript4,
+                                                    kScanScript5, kScanScript6};
+  constexpr size_t kNumScans[] = {kNumScans1, kNumScans2, kNumScans3,
+                                  kNumScans4, kNumScans5, kNumScans6};
+  if (progressive_id > static_cast<int>(ARRAY_SIZE(kNumScans))) {
+    return JXL_FAILURE("Unknown jpeg scan script id %d", progressive_id);
+  }
+  const jpeg_scan_info* scan_script = kScanScripts[progressive_id - 1];
+  const size_t num_scans = kNumScans[progressive_id - 1];
+  // filter scan script for number of components
+  for (size_t i = 0; i < num_scans; ++i) {
+    jpeg_scan_info scan_info = scan_script[i];
+    FilterScanComponents(cinfo, &scan_info);
+    if (scan_info.comps_in_scan > 0) {
+      scan_infos->emplace_back(std::move(scan_info));
+    }
+  }
+  cinfo->scan_info = scan_infos->data();
+  cinfo->num_scans = scan_infos->size();
+  return true;
+}
+
 bool IsSRGBEncoding(const JxlColorEncoding& c) {
   return ((c.color_space == JXL_COLOR_SPACE_RGB ||
            c.color_space == JXL_COLOR_SPACE_GRAY) &&
@@ -106,18 +250,37 @@ Status SetChromaSubsampling(const std::string& subsampling,
   return false;
 }
 
+struct JpegParams {
+  // Common between sjpeg and libjpeg
+  int quality = 100;
+  std::string chroma_subsampling = "444";
+  // Libjpeg parameters
+  int progressive_id = -1;
+  bool optimize_coding = true;
+  bool is_xyb = false;
+  // Sjpeg parameters
+  int libjpeg_quality = 0;
+  std::string libjpeg_chroma_subsampling = "444";
+  float psnr_target = 0;
+  std::string custom_base_quant_fn;
+  float search_q_start = 65.0f;
+  float search_q_min = 1.0f;
+  float search_q_max = 100.0f;
+  int search_max_iters = 20;
+  float search_tolerance = 0.1f;
+  float search_q_precision = 0.01f;
+  float search_first_iter_slope = 3.0f;
+  bool enable_adaptive_quant = true;
+};
+
 Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
                          const std::vector<uint8_t>& icc,
-                         std::vector<uint8_t> exif, size_t quality,
-                         const std::string& chroma_subsampling,
+                         std::vector<uint8_t> exif, const JpegParams& params,
                          std::vector<uint8_t>* bytes) {
   if (BITS_IN_JSAMPLE != 8 || sizeof(JSAMPLE) != 1) {
     return JXL_FAILURE("Only 8 bit JSAMPLE is supported.");
   }
-  jpeg_compress_struct cinfo;
-  // cinfo is initialized by libjpeg, which we are not instrumenting with
-  // msan.
-  msan::UnpoisonMemory(&cinfo, sizeof(cinfo));
+  jpeg_compress_struct cinfo = {};
   jpeg_error_mgr jerr;
   cinfo.err = jpeg_std_error(&jerr);
   jpeg_create_compress(&cinfo);
@@ -129,11 +292,19 @@ Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
   cinfo.input_components = info.num_color_channels;
   cinfo.in_color_space = info.num_color_channels == 1 ? JCS_GRAYSCALE : JCS_RGB;
   jpeg_set_defaults(&cinfo);
-  cinfo.optimize_coding = TRUE;
+  cinfo.optimize_coding = params.optimize_coding;
   if (cinfo.input_components == 3) {
-    JXL_RETURN_IF_ERROR(SetChromaSubsampling(chroma_subsampling, &cinfo));
+    JXL_RETURN_IF_ERROR(
+        SetChromaSubsampling(params.chroma_subsampling, &cinfo));
   }
-  jpeg_set_quality(&cinfo, quality, TRUE);
+  if (params.is_xyb) {
+    // Tell libjpeg not to convert XYB data to YCbCr.
+    jpeg_set_colorspace(&cinfo, JCS_RGB);
+  }
+  jpeg_set_quality(&cinfo, params.quality, TRUE);
+  std::vector<jpeg_scan_info> scan_infos;
+  JXL_RETURN_IF_ERROR(
+      SetJpegProgression(params.progressive_id, &scan_infos, &cinfo));
   jpeg_start_compress(&cinfo, TRUE);
   if (!icc.empty()) {
     WriteICCProfile(&cinfo, icc);
@@ -145,13 +316,39 @@ Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
   if (cinfo.input_components > 3 || cinfo.input_components < 0)
     return JXL_FAILURE("invalid numbers of components");
 
-  std::vector<uint8_t> raw_bytes(image.pixels_size);
-  memcpy(&raw_bytes[0], reinterpret_cast<const uint8_t*>(image.pixels()),
-         image.pixels_size);
-  for (size_t y = 0; y < info.ysize; ++y) {
-    JSAMPROW row[] = {raw_bytes.data() + y * image.stride};
-
-    jpeg_write_scanlines(&cinfo, row, 1);
+  std::vector<uint8_t> row_bytes(image.stride);
+  const uint8_t* pixels = reinterpret_cast<const uint8_t*>(image.pixels());
+  if (cinfo.num_components == (int)image.format.num_channels &&
+      image.format.data_type == JXL_TYPE_UINT8) {
+    for (size_t y = 0; y < info.ysize; ++y) {
+      memcpy(&row_bytes[0], pixels + y * image.stride, image.stride);
+      JSAMPROW row[] = {row_bytes.data()};
+      jpeg_write_scanlines(&cinfo, row, 1);
+    }
+  } else if (image.format.data_type == JXL_TYPE_UINT8) {
+    for (size_t y = 0; y < info.ysize; ++y) {
+      const uint8_t* image_row = pixels + y * image.stride;
+      for (size_t x = 0; x < info.xsize; ++x) {
+        const uint8_t* image_pixel = image_row + x * image.pixel_stride();
+        memcpy(&row_bytes[x * cinfo.num_components], image_pixel,
+               cinfo.num_components);
+      }
+      JSAMPROW row[] = {row_bytes.data()};
+      jpeg_write_scanlines(&cinfo, row, 1);
+    }
+  } else {
+    for (size_t y = 0; y < info.ysize; ++y) {
+      const uint8_t* image_row = pixels + y * image.stride;
+      for (size_t x = 0; x < info.xsize; ++x) {
+        const uint8_t* image_pixel = image_row + x * image.pixel_stride();
+        for (int c = 0; c < cinfo.num_components; ++c) {
+          uint32_t val16 = (image_pixel[2 * c] << 8) + image_pixel[2 * c + 1];
+          row_bytes[x * cinfo.num_components + c] = (val16 + 128) / 257;
+        }
+      }
+      JSAMPROW row[] = {row_bytes.data()};
+      jpeg_write_scanlines(&cinfo, row, 1);
+    }
   }
   jpeg_finish_compress(&cinfo);
   jpeg_destroy_compress(&cinfo);
@@ -164,15 +361,93 @@ Status EncodeWithLibJpeg(const PackedImage& image, const JxlBasicInfo& info,
   return true;
 }
 
+#if JPEGXL_ENABLE_SJPEG
+struct MySearchHook : public sjpeg::SearchHook {
+  uint8_t base_tables[2][64];
+  float q_start;
+  float q_precision;
+  float first_iter_slope;
+  void ReadBaseTables(const std::string& fn) {
+    const uint8_t kJPEGAnnexKMatrices[2][64] = {
+        {16, 11, 10, 16, 24,  40,  51,  61,  12, 12, 14, 19, 26,  58,  60,  55,
+         14, 13, 16, 24, 40,  57,  69,  56,  14, 17, 22, 29, 51,  87,  80,  62,
+         18, 22, 37, 56, 68,  109, 103, 77,  24, 35, 55, 64, 81,  104, 113, 92,
+         49, 64, 78, 87, 103, 121, 120, 101, 72, 92, 95, 98, 112, 100, 103, 99},
+        {17, 18, 24, 47, 99, 99, 99, 99, 18, 21, 26, 66, 99, 99, 99, 99,
+         24, 26, 56, 99, 99, 99, 99, 99, 47, 66, 99, 99, 99, 99, 99, 99,
+         99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
+         99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99}};
+    memcpy(base_tables[0], kJPEGAnnexKMatrices[0], sizeof(base_tables[0]));
+    memcpy(base_tables[1], kJPEGAnnexKMatrices[1], sizeof(base_tables[1]));
+    if (!fn.empty()) {
+      std::ifstream f(fn);
+      std::string line;
+      int idx = 0;
+      while (idx < 128 && std::getline(f, line)) {
+        if (line.empty() || line[0] == '#') continue;
+        std::istringstream line_stream(line);
+        std::string token;
+        while (idx < 128 && std::getline(line_stream, token, ',')) {
+          uint8_t val = std::stoi(token);
+          base_tables[idx / 64][idx % 64] = val;
+          idx++;
+        }
+      }
+    }
+  }
+  bool Setup(const sjpeg::EncoderParam& param) override {
+    sjpeg::SearchHook::Setup(param);
+    q = q_start;
+    return true;
+  }
+  void NextMatrix(int idx, uint8_t dst[64]) override {
+    float factor = (q <= 0)       ? 5000.0f
+                   : (q < 50.0f)  ? 5000.0f / q
+                   : (q < 100.0f) ? 2 * (100.0f - q)
+                                  : 0.0f;
+    sjpeg::SetQuantMatrix(base_tables[idx], factor, dst);
+  }
+  bool Update(float result) override {
+    value = result;
+    if (fabs(value - target) < tolerance * target) {
+      return true;
+    }
+    if (value > target) {
+      qmax = q;
+    } else {
+      qmin = q;
+    }
+    if (qmin == qmax) {
+      return true;
+    }
+    const float last_q = q;
+    if (pass == 0) {
+      q += first_iter_slope *
+           (for_size ? 0.1 * std::log(target / value) : (target - value));
+      q = std::max(qmin, std::min(qmax, q));
+    } else {
+      q = (qmin + qmax) / 2.;
+    }
+    return (pass > 0 && fabs(q - last_q) < q_precision);
+  }
+  ~MySearchHook() override {}
+};
+#endif
+
 Status EncodeWithSJpeg(const PackedImage& image, const JxlBasicInfo& info,
                        const std::vector<uint8_t>& icc,
-                       std::vector<uint8_t> exif, size_t quality,
-                       const std::string& chroma_subsampling,
+                       std::vector<uint8_t> exif, const JpegParams& params,
                        std::vector<uint8_t>* bytes) {
 #if !JPEGXL_ENABLE_SJPEG
   return JXL_FAILURE("JPEG XL was built without sjpeg support");
 #else
-  sjpeg::EncoderParam param(quality);
+  if (image.format.data_type != JXL_TYPE_UINT8) {
+    return JXL_FAILURE("Unsupported pixel data type");
+  }
+  if (info.alpha_bits > 0) {
+    return JXL_FAILURE("alpha is not supported");
+  }
+  sjpeg::EncoderParam param(params.quality);
   if (!icc.empty()) {
     param.iccp.assign(icc.begin(), icc.end());
   }
@@ -180,13 +455,43 @@ Status EncodeWithSJpeg(const PackedImage& image, const JxlBasicInfo& info,
     ResetExifOrientation(exif);
     param.exif.assign(exif.begin(), exif.end());
   }
-  if (chroma_subsampling == "444") {
+  if (params.chroma_subsampling == "444") {
     param.yuv_mode = SJPEG_YUV_444;
-  } else if (chroma_subsampling == "420") {
+  } else if (params.chroma_subsampling == "420") {
+    param.yuv_mode = SJPEG_YUV_420;
+  } else if (params.chroma_subsampling == "420sharp") {
     param.yuv_mode = SJPEG_YUV_SHARP;
   } else {
     return JXL_FAILURE("sjpeg does not support this chroma subsampling mode");
   }
+  param.adaptive_quantization = params.enable_adaptive_quant;
+  std::unique_ptr<MySearchHook> hook;
+  if (params.libjpeg_quality > 0) {
+    JpegParams libjpeg_params;
+    libjpeg_params.quality = params.libjpeg_quality;
+    libjpeg_params.chroma_subsampling = params.libjpeg_chroma_subsampling;
+    std::vector<uint8_t> libjpeg_bytes;
+    JXL_RETURN_IF_ERROR(EncodeWithLibJpeg(image, info, icc, exif,
+                                          libjpeg_params, &libjpeg_bytes));
+    param.target_mode = sjpeg::EncoderParam::TARGET_SIZE;
+    param.target_value = libjpeg_bytes.size();
+  }
+  if (params.psnr_target > 0) {
+    param.target_mode = sjpeg::EncoderParam::TARGET_PSNR;
+    param.target_value = params.psnr_target;
+  }
+  if (param.target_mode != sjpeg::EncoderParam::TARGET_NONE) {
+    param.passes = params.search_max_iters;
+    param.tolerance = params.search_tolerance;
+    param.qmin = params.search_q_min;
+    param.qmax = params.search_q_max;
+    hook.reset(new MySearchHook());
+    hook->ReadBaseTables(params.custom_base_quant_fn);
+    hook->q_start = params.search_q_start;
+    hook->q_precision = params.search_q_precision;
+    hook->first_iter_slope = params.search_first_iter_slope;
+    param.search_hook = hook.get();
+  }
   size_t stride = info.xsize * 3;
   const uint8_t* pixels = reinterpret_cast<const uint8_t*>(image.pixels());
   std::string output;
@@ -202,27 +507,20 @@ Status EncodeWithSJpeg(const PackedImage& image, const JxlBasicInfo& info,
 Status EncodeImageJPG(const PackedImage& image, const JxlBasicInfo& info,
                       const std::vector<uint8_t>& icc,
                       std::vector<uint8_t> exif, JpegEncoder encoder,
-                      size_t quality, const std::string& chroma_subsampling,
-                      ThreadPool* pool, std::vector<uint8_t>* bytes) {
-  if (image.format.data_type != JXL_TYPE_UINT8) {
-    return JXL_FAILURE("Unsupported pixel data type");
-  }
-  if (info.alpha_bits > 0) {
-    return JXL_FAILURE("alpha is not supported");
-  }
-  if (quality > 100) {
+                      const JpegParams& params, ThreadPool* pool,
+                      std::vector<uint8_t>* bytes) {
+  if (params.quality > 100) {
     return JXL_FAILURE("please specify a 0-100 JPEG quality");
   }
 
   switch (encoder) {
     case JpegEncoder::kLibJpeg:
-      JXL_RETURN_IF_ERROR(EncodeWithLibJpeg(image, info, icc, std::move(exif),
-                                            quality, chroma_subsampling,
-                                            bytes));
+      JXL_RETURN_IF_ERROR(
+          EncodeWithLibJpeg(image, info, icc, std::move(exif), params, bytes));
       break;
     case JpegEncoder::kSJpeg:
-      JXL_RETURN_IF_ERROR(EncodeWithSJpeg(image, info, icc, std::move(exif),
-                                          quality, chroma_subsampling, bytes));
+      JXL_RETURN_IF_ERROR(
+          EncodeWithSJpeg(image, info, icc, std::move(exif), params, bytes));
       break;
     default:
       return JXL_FAILURE("tried to use an unknown JPEG encoder");
@@ -234,43 +532,72 @@ Status EncodeImageJPG(const PackedImage& image, const JxlBasicInfo& info,
 class JPEGEncoder : public Encoder {
   std::vector<JxlPixelFormat> AcceptedFormats() const override {
     std::vector<JxlPixelFormat> formats;
-    for (const uint32_t num_channels : {1, 3}) {
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
       for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
         formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
                                          /*data_type=*/JXL_TYPE_UINT8,
                                          /*endianness=*/endianness,
                                          /*align=*/0});
       }
+      formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
+                                       /*data_type=*/JXL_TYPE_UINT16,
+                                       /*endianness=*/JXL_BIG_ENDIAN,
+                                       /*align=*/0});
     }
     return formats;
   }
   Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
                 ThreadPool* pool = nullptr) const override {
     JXL_RETURN_IF_ERROR(VerifyBasicInfo(ppf.info));
-    const auto& options = this->options();
-    int quality = 100;
-    auto it_quality = options.find("q");
-    if (it_quality != options.end()) {
-      std::istringstream is(it_quality->second);
-      JXL_RETURN_IF_ERROR(static_cast<bool>(is >> quality));
-    }
-    std::string chroma_subsampling = "444";
-    auto it_chroma_subsampling = options.find("chroma_subsampling");
-    if (it_chroma_subsampling != options.end()) {
-      chroma_subsampling = it_chroma_subsampling->second;
-    }
     JpegEncoder jpeg_encoder = JpegEncoder::kLibJpeg;
-    auto it_encoder = options.find("jpeg_encoder");
-    if (it_encoder != options.end()) {
-      if (it_encoder->second == "libjpeg") {
-        jpeg_encoder = JpegEncoder::kLibJpeg;
-      } else if (it_encoder->second == "sjpeg") {
-        jpeg_encoder = JpegEncoder::kSJpeg;
-      } else {
-        return JXL_FAILURE("unknown jpeg encoder \"%s\"",
-                           it_encoder->second.c_str());
+    JpegParams params;
+    for (const auto& it : options()) {
+      if (it.first == "q") {
+        std::istringstream is(it.second);
+        JXL_RETURN_IF_ERROR(static_cast<bool>(is >> params.quality));
+      } else if (it.first == "libjpeg_quality") {
+        std::istringstream is(it.second);
+        JXL_RETURN_IF_ERROR(static_cast<bool>(is >> params.libjpeg_quality));
+      } else if (it.first == "chroma_subsampling") {
+        params.chroma_subsampling = it.second;
+      } else if (it.first == "libjpeg_chroma_subsampling") {
+        params.libjpeg_chroma_subsampling = it.second;
+      } else if (it.first == "jpeg_encoder") {
+        if (it.second == "libjpeg") {
+          jpeg_encoder = JpegEncoder::kLibJpeg;
+        } else if (it.second == "sjpeg") {
+          jpeg_encoder = JpegEncoder::kSJpeg;
+        } else {
+          return JXL_FAILURE("unknown jpeg encoder \"%s\"", it.second.c_str());
+        }
+      } else if (it.first == "progressive") {
+        std::istringstream is(it.second);
+        JXL_RETURN_IF_ERROR(static_cast<bool>(is >> params.progressive_id));
+      } else if (it.first == "optimize" && it.second == "OFF") {
+        params.optimize_coding = false;
+      } else if (it.first == "adaptive_q" && it.second == "OFF") {
+        params.enable_adaptive_quant = false;
+      } else if (it.first == "psnr") {
+        params.psnr_target = std::stof(it.second);
+      } else if (it.first == "base_quant_fn") {
+        params.custom_base_quant_fn = it.second;
+      } else if (it.first == "search_q_start") {
+        params.search_q_start = std::stof(it.second);
+      } else if (it.first == "search_q_min") {
+        params.search_q_min = std::stof(it.second);
+      } else if (it.first == "search_q_max") {
+        params.search_q_max = std::stof(it.second);
+      } else if (it.first == "search_max_iters") {
+        params.search_max_iters = std::stoi(it.second);
+      } else if (it.first == "search_tolerance") {
+        params.search_tolerance = std::stof(it.second);
+      } else if (it.first == "search_q_precision") {
+        params.search_q_precision = std::stof(it.second);
+      } else if (it.first == "search_first_iter_slope") {
+        params.search_first_iter_slope = std::stof(it.second);
       }
     }
+    params.is_xyb = (ppf.color_encoding.color_space == JXL_COLOR_SPACE_XYB);
     std::vector<uint8_t> icc;
     if (!IsSRGBEncoding(ppf.color_encoding)) {
       icc = ppf.icc;
@@ -281,17 +608,22 @@ class JPEGEncoder : public Encoder {
       JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
       encoded_image->bitstreams.emplace_back();
       JXL_RETURN_IF_ERROR(EncodeImageJPG(
-          frame.color, ppf.info, icc, ppf.metadata.exif, jpeg_encoder, quality,
-          chroma_subsampling, pool, &encoded_image->bitstreams.back()));
+          frame.color, ppf.info, icc, ppf.metadata.exif, jpeg_encoder, params,
+          pool, &encoded_image->bitstreams.back()));
     }
     return true;
   }
 };
 
 }  // namespace
+#endif
 
 std::unique_ptr<Encoder> GetJPEGEncoder() {
+#if JPEGXL_ENABLE_JPEG
   return jxl::make_unique<JPEGEncoder>();
+#else
+  return nullptr;
+#endif
 }
 
 }  // namespace extras
diff --git a/lib/extras/enc/jxl.cc b/lib/extras/enc/jxl.cc
new file mode 100644 (file)
index 0000000..054d15e
--- /dev/null
@@ -0,0 +1,359 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/extras/enc/jxl.h"
+
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+
+#include "lib/jxl/exif.h"
+
+namespace jxl {
+namespace extras {
+
+JxlEncoderStatus SetOption(const JXLOption& opt,
+                           JxlEncoderFrameSettings* settings) {
+  return opt.is_float
+             ? JxlEncoderFrameSettingsSetFloatOption(settings, opt.id, opt.fval)
+             : JxlEncoderFrameSettingsSetOption(settings, opt.id, opt.ival);
+}
+
+bool SetFrameOptions(const std::vector<JXLOption>& options, size_t frame_index,
+                     size_t* option_idx, JxlEncoderFrameSettings* settings) {
+  while (*option_idx < options.size()) {
+    const auto& opt = options[*option_idx];
+    if (opt.frame_index > frame_index) {
+      break;
+    }
+    if (JXL_ENC_SUCCESS != SetOption(opt, settings)) {
+      fprintf(stderr, "Setting option id %d failed.\n", opt.id);
+      return false;
+    }
+    (*option_idx)++;
+  }
+  return true;
+}
+
+bool SetupFrame(JxlEncoder* enc, JxlEncoderFrameSettings* settings,
+                const JxlFrameHeader& frame_header,
+                const JXLCompressParams& params, const PackedPixelFile& ppf,
+                size_t frame_index, size_t num_alpha_channels,
+                size_t num_interleaved_alpha, size_t& option_idx) {
+  if (JXL_ENC_SUCCESS != JxlEncoderSetFrameHeader(settings, &frame_header)) {
+    fprintf(stderr, "JxlEncoderSetFrameHeader() failed.\n");
+    return false;
+  }
+  if (!SetFrameOptions(params.options, frame_index, &option_idx, settings)) {
+    return false;
+  }
+  if (num_alpha_channels > 0) {
+    JxlExtraChannelInfo extra_channel_info;
+    JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &extra_channel_info);
+    extra_channel_info.bits_per_sample = ppf.info.alpha_bits;
+    extra_channel_info.exponent_bits_per_sample = ppf.info.alpha_exponent_bits;
+    if (params.premultiply != -1) {
+      if (params.premultiply != 0 && params.premultiply != 1) {
+        fprintf(stderr, "premultiply must be one of: -1, 0, 1.\n");
+        return false;
+      }
+      extra_channel_info.alpha_premultiplied = params.premultiply;
+    }
+    if (JXL_ENC_SUCCESS !=
+        JxlEncoderSetExtraChannelInfo(enc, 0, &extra_channel_info)) {
+      fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
+      return false;
+    }
+    // We take the extra channel blend info frame_info, but don't do
+    // clamping.
+    JxlBlendInfo extra_channel_blend_info = frame_header.layer_info.blend_info;
+    extra_channel_blend_info.clamp = JXL_FALSE;
+    JxlEncoderSetExtraChannelBlendInfo(settings, 0, &extra_channel_blend_info);
+  }
+  // Add extra channel info for the rest of the extra channels.
+  for (size_t i = 0; i < ppf.info.num_extra_channels; ++i) {
+    if (i < ppf.extra_channels_info.size()) {
+      const auto& ec_info = ppf.extra_channels_info[i].ec_info;
+      if (JXL_ENC_SUCCESS != JxlEncoderSetExtraChannelInfo(
+                                 enc, num_interleaved_alpha + i, &ec_info)) {
+        fprintf(stderr, "JxlEncoderSetExtraChannelInfo() failed.\n");
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool ReadCompressedOutput(JxlEncoder* enc, std::vector<uint8_t>* compressed) {
+  compressed->clear();
+  compressed->resize(4096);
+  uint8_t* next_out = compressed->data();
+  size_t avail_out = compressed->size() - (next_out - compressed->data());
+  JxlEncoderStatus result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (result == JXL_ENC_NEED_MORE_OUTPUT) {
+    result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
+    if (result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed->data();
+      compressed->resize(compressed->size() * 2);
+      next_out = compressed->data() + offset;
+      avail_out = compressed->size() - offset;
+    }
+  }
+  compressed->resize(next_out - compressed->data());
+  if (result != JXL_ENC_SUCCESS) {
+    fprintf(stderr, "JxlEncoderProcessOutput failed.\n");
+    return false;
+  }
+  return true;
+}
+
+bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
+                    const std::vector<uint8_t>* jpeg_bytes,
+                    std::vector<uint8_t>* compressed) {
+  auto encoder = JxlEncoderMake(/*memory_manager=*/nullptr);
+  JxlEncoder* enc = encoder.get();
+
+  if (params.allow_expert_options) {
+    JxlEncoderAllowExpertOptions(enc);
+  }
+
+  if (params.runner_opaque != nullptr &&
+      JXL_ENC_SUCCESS != JxlEncoderSetParallelRunner(enc, params.runner,
+                                                     params.runner_opaque)) {
+    fprintf(stderr, "JxlEncoderSetParallelRunner failed\n");
+    return false;
+  }
+
+  if (params.HasOutputProcessor() &&
+      JXL_ENC_SUCCESS !=
+          JxlEncoderSetOutputProcessor(enc, params.output_processor)) {
+    fprintf(stderr, "JxlEncoderSetOutputProcessorfailed\n");
+    return false;
+  }
+
+  auto settings = JxlEncoderFrameSettingsCreate(enc, nullptr);
+  size_t option_idx = 0;
+  if (!SetFrameOptions(params.options, 0, &option_idx, settings)) {
+    return false;
+  }
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderSetFrameDistance(settings, params.distance)) {
+    fprintf(stderr, "Setting frame distance failed.\n");
+    return false;
+  }
+  if (params.debug_image) {
+    JxlEncoderSetDebugImageCallback(settings, params.debug_image,
+                                    params.debug_image_opaque);
+  }
+  if (params.stats) {
+    JxlEncoderCollectStats(settings, params.stats);
+  }
+
+  bool use_boxes = !ppf.metadata.exif.empty() || !ppf.metadata.xmp.empty() ||
+                   !ppf.metadata.jumbf.empty() || !ppf.metadata.iptc.empty();
+  bool use_container = params.use_container || use_boxes ||
+                       (jpeg_bytes && params.jpeg_store_metadata);
+
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderUseContainer(enc, static_cast<int>(use_container))) {
+    fprintf(stderr, "JxlEncoderUseContainer failed.\n");
+    return false;
+  }
+
+  if (jpeg_bytes) {
+    if (params.jpeg_store_metadata &&
+        JXL_ENC_SUCCESS != JxlEncoderStoreJPEGMetadata(enc, JXL_TRUE)) {
+      fprintf(stderr, "Storing JPEG metadata failed.\n");
+      return false;
+    }
+    if (!params.jpeg_store_metadata && params.jpeg_strip_exif) {
+      JxlEncoderFrameSettingsSetOption(settings,
+                                       JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF, 0);
+    }
+    if (!params.jpeg_store_metadata && params.jpeg_strip_xmp) {
+      JxlEncoderFrameSettingsSetOption(settings,
+                                       JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP, 0);
+    }
+    if (params.jpeg_strip_jumbf) {
+      JxlEncoderFrameSettingsSetOption(
+          settings, JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF, 0);
+    }
+    if (JXL_ENC_SUCCESS != JxlEncoderAddJPEGFrame(settings, jpeg_bytes->data(),
+                                                  jpeg_bytes->size())) {
+      JxlEncoderError error = JxlEncoderGetError(enc);
+      if (error == JXL_ENC_ERR_BAD_INPUT) {
+        fprintf(stderr,
+                "Error while decoding the JPEG image. It may be corrupt (e.g. "
+                "truncated) or of an unsupported type (e.g. CMYK).\n");
+      } else if (error == JXL_ENC_ERR_JBRD) {
+        fprintf(stderr,
+                "JPEG bitstream reconstruction data could not be created. "
+                "Possibly there is too much tail data.\n"
+                "Try using --jpeg_store_metadata 0, to losslessly "
+                "recompress the JPEG image data without bitstream "
+                "reconstruction data.\n");
+      } else {
+        fprintf(stderr, "JxlEncoderAddJPEGFrame() failed.\n");
+      }
+      return false;
+    }
+  } else {
+    size_t num_alpha_channels = 0;  // Adjusted below.
+    JxlBasicInfo basic_info = ppf.info;
+    basic_info.xsize *= params.already_downsampled;
+    basic_info.ysize *= params.already_downsampled;
+    if (basic_info.alpha_bits > 0) num_alpha_channels = 1;
+    if (params.intensity_target > 0) {
+      basic_info.intensity_target = params.intensity_target;
+    }
+    basic_info.num_extra_channels =
+        std::max<uint32_t>(num_alpha_channels, ppf.info.num_extra_channels);
+    basic_info.num_color_channels = ppf.info.num_color_channels;
+    const bool lossless = params.distance == 0;
+    basic_info.uses_original_profile = lossless;
+    if (params.override_bitdepth != 0) {
+      basic_info.bits_per_sample = params.override_bitdepth;
+      basic_info.exponent_bits_per_sample =
+          params.override_bitdepth == 32 ? 8 : 0;
+    }
+    if (JXL_ENC_SUCCESS !=
+        JxlEncoderSetCodestreamLevel(enc, params.codestream_level)) {
+      fprintf(stderr, "Setting --codestream_level failed.\n");
+      return false;
+    }
+    if (JXL_ENC_SUCCESS != JxlEncoderSetBasicInfo(enc, &basic_info)) {
+      fprintf(stderr, "JxlEncoderSetBasicInfo() failed.\n");
+      return false;
+    }
+    if (JXL_ENC_SUCCESS !=
+        JxlEncoderSetUpsamplingMode(enc, params.already_downsampled,
+                                    params.upsampling_mode)) {
+      fprintf(stderr, "JxlEncoderSetUpsamplingMode() failed.\n");
+      return false;
+    }
+    if (JXL_ENC_SUCCESS !=
+        JxlEncoderSetFrameBitDepth(settings, &params.input_bitdepth)) {
+      fprintf(stderr, "JxlEncoderSetFrameBitDepth() failed.\n");
+      return false;
+    }
+    if (num_alpha_channels != 0 &&
+        JXL_ENC_SUCCESS != JxlEncoderSetExtraChannelDistance(
+                               settings, 0, params.alpha_distance)) {
+      fprintf(stderr, "Setting alpha distance failed.\n");
+      return false;
+    }
+    if (lossless &&
+        JXL_ENC_SUCCESS != JxlEncoderSetFrameLossless(settings, JXL_TRUE)) {
+      fprintf(stderr, "JxlEncoderSetFrameLossless() failed.\n");
+      return false;
+    }
+    if (!ppf.icc.empty()) {
+      if (JXL_ENC_SUCCESS !=
+          JxlEncoderSetICCProfile(enc, ppf.icc.data(), ppf.icc.size())) {
+        fprintf(stderr, "JxlEncoderSetICCProfile() failed.\n");
+        return false;
+      }
+    } else {
+      if (JXL_ENC_SUCCESS !=
+          JxlEncoderSetColorEncoding(enc, &ppf.color_encoding)) {
+        fprintf(stderr, "JxlEncoderSetColorEncoding() failed.\n");
+        return false;
+      }
+    }
+
+    if (use_boxes) {
+      if (JXL_ENC_SUCCESS != JxlEncoderUseBoxes(enc)) {
+        fprintf(stderr, "JxlEncoderUseBoxes() failed.\n");
+        return false;
+      }
+      // Prepend 4 zero bytes to exif for tiff header offset
+      std::vector<uint8_t> exif_with_offset;
+      bool bigendian;
+      if (IsExif(ppf.metadata.exif, &bigendian)) {
+        exif_with_offset.resize(ppf.metadata.exif.size() + 4);
+        memcpy(exif_with_offset.data() + 4, ppf.metadata.exif.data(),
+               ppf.metadata.exif.size());
+      }
+      const struct BoxInfo {
+        const char* type;
+        const std::vector<uint8_t>& bytes;
+      } boxes[] = {
+          {"Exif", exif_with_offset},
+          {"xml ", ppf.metadata.xmp},
+          {"jumb", ppf.metadata.jumbf},
+          {"xml ", ppf.metadata.iptc},
+      };
+      for (size_t i = 0; i < sizeof boxes / sizeof *boxes; ++i) {
+        const BoxInfo& box = boxes[i];
+        if (!box.bytes.empty() &&
+            JXL_ENC_SUCCESS != JxlEncoderAddBox(enc, box.type, box.bytes.data(),
+                                                box.bytes.size(),
+                                                params.compress_boxes)) {
+          fprintf(stderr, "JxlEncoderAddBox() failed (%s).\n", box.type);
+          return false;
+        }
+      }
+      JxlEncoderCloseBoxes(enc);
+    }
+
+    for (size_t num_frame = 0; num_frame < ppf.frames.size(); ++num_frame) {
+      const jxl::extras::PackedFrame& pframe = ppf.frames[num_frame];
+      const jxl::extras::PackedImage& pimage = pframe.color;
+      JxlPixelFormat ppixelformat = pimage.format;
+      size_t num_interleaved_alpha =
+          (ppixelformat.num_channels - ppf.info.num_color_channels);
+      if (!SetupFrame(enc, settings, pframe.frame_info, params, ppf, num_frame,
+                      num_alpha_channels, num_interleaved_alpha, option_idx)) {
+        return false;
+      }
+      if (JXL_ENC_SUCCESS != JxlEncoderAddImageFrame(settings, &ppixelformat,
+                                                     pimage.pixels(),
+                                                     pimage.pixels_size)) {
+        fprintf(stderr, "JxlEncoderAddImageFrame() failed.\n");
+        return false;
+      }
+      // Only set extra channel buffer if it is provided non-interleaved.
+      for (size_t i = 0; i < pframe.extra_channels.size(); ++i) {
+        if (JXL_ENC_SUCCESS !=
+            JxlEncoderSetExtraChannelBuffer(settings, &ppixelformat,
+                                            pframe.extra_channels[i].pixels(),
+                                            pframe.extra_channels[i].stride *
+                                                pframe.extra_channels[i].ysize,
+                                            num_interleaved_alpha + i)) {
+          fprintf(stderr, "JxlEncoderSetExtraChannelBuffer() failed.\n");
+          return false;
+        }
+      }
+    }
+    for (size_t fi = 0; fi < ppf.chunked_frames.size(); ++fi) {
+      ChunkedPackedFrame& chunked_frame = ppf.chunked_frames[fi];
+      size_t num_interleaved_alpha =
+          (chunked_frame.format.num_channels - ppf.info.num_color_channels);
+      if (!SetupFrame(enc, settings, chunked_frame.frame_info, params, ppf, fi,
+                      num_alpha_channels, num_interleaved_alpha, option_idx)) {
+        return false;
+      }
+      const bool last_frame = fi + 1 == ppf.chunked_frames.size();
+      if (JXL_ENC_SUCCESS !=
+          JxlEncoderAddChunkedFrame(settings, last_frame,
+                                    chunked_frame.GetInputSource())) {
+        fprintf(stderr, "JxlEncoderAddChunkedFrame() failed.\n");
+        return false;
+      }
+    }
+  }
+  JxlEncoderCloseInput(enc);
+  if (params.HasOutputProcessor()) {
+    if (JXL_ENC_SUCCESS != JxlEncoderFlushInput(enc)) {
+      fprintf(stderr, "JxlEncoderAddChunkedFrame() failed.\n");
+      return false;
+    }
+  } else if (!ReadCompressedOutput(enc, compressed)) {
+    return false;
+  }
+  return true;
+}
+
+}  // namespace extras
+}  // namespace jxl
diff --git a/lib/extras/enc/jxl.h b/lib/extras/enc/jxl.h
new file mode 100644 (file)
index 0000000..b8ca5bd
--- /dev/null
@@ -0,0 +1,91 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_EXTRAS_ENC_JXL_H_
+#define LIB_EXTRAS_ENC_JXL_H_
+
+#include <jxl/encode.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/types.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/extras/packed_image.h"
+
+namespace jxl {
+namespace extras {
+
+struct JXLOption {
+  JXLOption(JxlEncoderFrameSettingId id, int64_t val, size_t frame_index)
+      : id(id), is_float(false), ival(val), frame_index(frame_index) {}
+  JXLOption(JxlEncoderFrameSettingId id, float val, size_t frame_index)
+      : id(id), is_float(true), fval(val), frame_index(frame_index) {}
+
+  JxlEncoderFrameSettingId id;
+  bool is_float;
+  union {
+    int64_t ival;
+    float fval;
+  };
+  size_t frame_index;
+};
+
+struct JXLCompressParams {
+  std::vector<JXLOption> options;
+  // Target butteraugli distance, 0.0 means lossless.
+  float distance = 1.0f;
+  float alpha_distance = 1.0f;
+  // If set to true, forces container mode.
+  bool use_container = false;
+  // Whether to enable/disable byte-exact jpeg reconstruction for jpeg inputs.
+  bool jpeg_store_metadata = true;
+  bool jpeg_strip_exif = false;
+  bool jpeg_strip_xmp = false;
+  bool jpeg_strip_jumbf = false;
+  // Whether to create brob boxes.
+  bool compress_boxes = true;
+  // Upper bound on the intensity level present in the image in nits (zero means
+  // that the library chooses a default).
+  float intensity_target = 0;
+  int already_downsampled = 1;
+  int upsampling_mode = -1;
+  // Overrides for bitdepth, codestream level and alpha premultiply.
+  size_t override_bitdepth = 0;
+  int32_t codestream_level = -1;
+  int32_t premultiply = -1;
+  // Override input buffer interpretation.
+  JxlBitDepth input_bitdepth = {JXL_BIT_DEPTH_FROM_PIXEL_FORMAT, 0, 0};
+  // If runner_opaque is set, the decoder uses this parallel runner.
+  JxlParallelRunner runner = JxlThreadParallelRunner;
+  void* runner_opaque = nullptr;
+  JxlEncoderOutputProcessor output_processor = {};
+  JxlDebugImageCallback debug_image = nullptr;
+  void* debug_image_opaque = nullptr;
+  JxlEncoderStats* stats = nullptr;
+  bool allow_expert_options = false;
+
+  void AddOption(JxlEncoderFrameSettingId id, int64_t val) {
+    options.emplace_back(JXLOption(id, val, 0));
+  }
+  void AddFloatOption(JxlEncoderFrameSettingId id, float val) {
+    options.emplace_back(JXLOption(id, val, 0));
+  }
+  bool HasOutputProcessor() const {
+    return (output_processor.get_buffer != nullptr &&
+            output_processor.release_buffer != nullptr &&
+            output_processor.set_finalized_position != nullptr);
+  }
+};
+
+bool EncodeImageJXL(const JXLCompressParams& params, const PackedPixelFile& ppf,
+                    const std::vector<uint8_t>* jpeg_bytes,
+                    std::vector<uint8_t>* compressed);
+
+}  // namespace extras
+}  // namespace jxl
+
+#endif  // LIB_EXTRAS_ENC_JXL_H_
index 1428e64..ae8cf13 100644 (file)
@@ -5,13 +5,12 @@
 
 #include "lib/extras/enc/npy.h"
 
-#include <stdio.h>
+#include <jxl/types.h>
 
 #include <sstream>
 #include <string>
 #include <vector>
 
-#include "jxl/types.h"
 #include "lib/extras/packed_image.h"
 
 namespace jxl {
index ef204ad..d4809e3 100644 (file)
@@ -5,13 +5,11 @@
 
 #include "lib/extras/enc/pgx.h"
 
-#include <stdio.h>
+#include <jxl/codestream_header.h>
 #include <string.h>
 
-#include "jxl/codestream_header.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/base/printf_macros.h"
 
 namespace jxl {
 namespace extras {
index 9b5f6cb..4183900 100644 (file)
@@ -5,7 +5,6 @@
 
 #include "lib/extras/enc/pnm.h"
 
-#include <stdio.h>
 #include <string.h>
 
 #include <string>
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/byte_order.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/file_io.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/dec_external_image.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_external_image.h"
 #include "lib/jxl/enc_image_bundle.h"
 #include "lib/jxl/fields.h"  // AllDefault
@@ -32,66 +28,7 @@ namespace {
 
 constexpr size_t kMaxHeaderSize = 200;
 
-Status EncodeHeader(const PackedImage& image, size_t bits_per_sample,
-                    bool little_endian, char* header, int* chars_written) {
-  size_t num_channels = image.format.num_channels;
-  bool is_gray = num_channels <= 2;
-  bool has_alpha = num_channels == 2 || num_channels == 4;
-  if (has_alpha) {  // PAM
-    if (bits_per_sample > 16) return JXL_FAILURE("PNM cannot have > 16 bits");
-    const uint32_t max_val = (1U << bits_per_sample) - 1;
-    *chars_written =
-        snprintf(header, kMaxHeaderSize,
-                 "P7\nWIDTH %" PRIuS "\nHEIGHT %" PRIuS
-                 "\nDEPTH %u\nMAXVAL %u\nTUPLTYPE %s\nENDHDR\n",
-                 image.xsize, image.ysize, is_gray ? 2 : 4, max_val,
-                 is_gray ? "GRAYSCALE_ALPHA" : "RGB_ALPHA");
-    JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
-                        kMaxHeaderSize);
-  } else if (bits_per_sample == 32) {  // PFM
-    const char type = is_gray ? 'f' : 'F';
-    const double scale = little_endian ? -1.0 : 1.0;
-    *chars_written =
-        snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%.1f\n",
-                 type, image.xsize, image.ysize, scale);
-    JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
-                        kMaxHeaderSize);
-  } else {  // PGM/PPM
-    if (bits_per_sample > 16) return JXL_FAILURE("PNM cannot have > 16 bits");
-    const uint32_t max_val = (1U << bits_per_sample) - 1;
-    const char type = is_gray ? '5' : '6';
-    *chars_written =
-        snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%u\n",
-                 type, image.xsize, image.ysize, max_val);
-    JXL_RETURN_IF_ERROR(static_cast<unsigned int>(*chars_written) <
-                        kMaxHeaderSize);
-  }
-  return true;
-}
-
-Status EncodeImagePNM(const PackedImage& image, size_t bits_per_sample,
-                      std::vector<uint8_t>* bytes) {
-  // Choose native for PFM; PGM/PPM require big-endian
-  bool is_little_endian = bits_per_sample > 16 && IsLittleEndian();
-  char header[kMaxHeaderSize];
-  int header_size = 0;
-  JXL_RETURN_IF_ERROR(EncodeHeader(image, bits_per_sample, is_little_endian,
-                                   header, &header_size));
-  bytes->resize(static_cast<size_t>(header_size) + image.pixels_size);
-  memcpy(bytes->data(), header, static_cast<size_t>(header_size));
-  const bool flipped_y = bits_per_sample == 32;  // PFMs are flipped
-  const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
-  uint8_t* out = bytes->data() + header_size;
-  for (size_t y = 0; y < image.ysize; ++y) {
-    size_t y_out = flipped_y ? image.ysize - 1 - y : y;
-    const uint8_t* row_in = &in[y * image.stride];
-    uint8_t* row_out = &out[y_out * image.stride];
-    memcpy(row_out, row_in, image.stride);
-  }
-  return true;
-}
-
-class PNMEncoder : public Encoder {
+class BasePNMEncoder : public Encoder {
  public:
   Status Encode(const PackedPixelFile& ppf, EncodedImage* encoded_image,
                 ThreadPool* pool = nullptr) const override {
@@ -106,8 +43,8 @@ class PNMEncoder : public Encoder {
     for (const auto& frame : ppf.frames) {
       JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
       encoded_image->bitstreams.emplace_back();
-      JXL_RETURN_IF_ERROR(EncodeImagePNM(frame.color, ppf.info.bits_per_sample,
-                                         &encoded_image->bitstreams.back()));
+      JXL_RETURN_IF_ERROR(
+          EncodeFrame(ppf, frame, &encoded_image->bitstreams.back()));
     }
     for (size_t i = 0; i < ppf.extra_channels_info.size(); ++i) {
       const auto& ec_info = ppf.extra_channels_info[i].ec_info;
@@ -115,85 +52,258 @@ class PNMEncoder : public Encoder {
       auto& ec_bitstreams = encoded_image->extra_channel_bitstreams.back();
       for (const auto& frame : ppf.frames) {
         ec_bitstreams.emplace_back();
-        JXL_RETURN_IF_ERROR(EncodeImagePNM(frame.extra_channels[i],
-                                           ec_info.bits_per_sample,
-                                           &ec_bitstreams.back()));
+        JXL_RETURN_IF_ERROR(EncodeExtraChannel(frame.extra_channels[i],
+                                               ec_info.bits_per_sample,
+                                               &ec_bitstreams.back()));
       }
     }
     return true;
   }
+
+ protected:
+  virtual Status EncodeFrame(const PackedPixelFile& ppf,
+                             const PackedFrame& frame,
+                             std::vector<uint8_t>* bytes) const = 0;
+  virtual Status EncodeExtraChannel(const PackedImage& image,
+                                    size_t bits_per_sample,
+                                    std::vector<uint8_t>* bytes) const = 0;
 };
 
+class PNMEncoder : public BasePNMEncoder {
+ public:
+  static const std::vector<JxlPixelFormat> kAcceptedFormats;
+
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    return kAcceptedFormats;
+  }
+
+  Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
+                     std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(frame.color, ppf.info.bits_per_sample, bytes);
+  }
+  Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
+                            std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(image, bits_per_sample, bytes);
+  }
+
+ private:
+  Status EncodeImage(const PackedImage& image, size_t bits_per_sample,
+                     std::vector<uint8_t>* bytes) const {
+    uint32_t maxval = (1u << bits_per_sample) - 1;
+    char type = image.format.num_channels == 1 ? '5' : '6';
+    char header[kMaxHeaderSize];
+    size_t header_size =
+        snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%u\n",
+                 type, image.xsize, image.ysize, maxval);
+    JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
+    bytes->resize(header_size + image.pixels_size);
+    memcpy(bytes->data(), header, header_size);
+    memcpy(bytes->data() + header_size,
+           reinterpret_cast<uint8_t*>(image.pixels()), image.pixels_size);
+    return true;
+  }
+};
+
+class PGMEncoder : public PNMEncoder {
+ public:
+  static const std::vector<JxlPixelFormat> kAcceptedFormats;
+
+  std::vector<JxlPixelFormat> AcceptedFormats() const override {
+    return kAcceptedFormats;
+  }
+};
+
+const std::vector<JxlPixelFormat> PGMEncoder::kAcceptedFormats = {
+    JxlPixelFormat{1, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
+    JxlPixelFormat{1, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
+
 class PPMEncoder : public PNMEncoder {
  public:
+  static const std::vector<JxlPixelFormat> kAcceptedFormats;
+
   std::vector<JxlPixelFormat> AcceptedFormats() const override {
-    std::vector<JxlPixelFormat> formats;
-    for (const uint32_t num_channels : {1, 2, 3, 4}) {
-      for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
-        for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
-          formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
-                                           /*data_type=*/data_type,
-                                           /*endianness=*/endianness,
-                                           /*align=*/0});
-        }
-      }
-    }
-    return formats;
+    return kAcceptedFormats;
   }
 };
 
-class PFMEncoder : public PNMEncoder {
+const std::vector<JxlPixelFormat> PPMEncoder::kAcceptedFormats = {
+    JxlPixelFormat{3, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0},
+    JxlPixelFormat{3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0}};
+
+const std::vector<JxlPixelFormat> PNMEncoder::kAcceptedFormats = [] {
+  std::vector<JxlPixelFormat> combined = PPMEncoder::kAcceptedFormats;
+  combined.insert(combined.end(), PGMEncoder::kAcceptedFormats.begin(),
+                  PGMEncoder::kAcceptedFormats.end());
+  return combined;
+}();
+
+class PFMEncoder : public BasePNMEncoder {
  public:
   std::vector<JxlPixelFormat> AcceptedFormats() const override {
     std::vector<JxlPixelFormat> formats;
     for (const uint32_t num_channels : {1, 3}) {
-      for (const JxlDataType data_type : {JXL_TYPE_FLOAT16, JXL_TYPE_FLOAT}) {
-        for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
-          formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
-                                           /*data_type=*/data_type,
-                                           /*endianness=*/endianness,
-                                           /*align=*/0});
-        }
+      for (JxlEndianness endianness : {JXL_BIG_ENDIAN, JXL_LITTLE_ENDIAN}) {
+        formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
+                                         /*data_type=*/JXL_TYPE_FLOAT,
+                                         /*endianness=*/endianness,
+                                         /*align=*/0});
       }
     }
     return formats;
   }
-};
+  Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
+                     std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(frame.color, bytes);
+  }
+  Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
+                            std::vector<uint8_t>* bytes) const override {
+    return EncodeImage(image, bytes);
+  }
 
-class PGMEncoder : public PPMEncoder {
- public:
-  std::vector<JxlPixelFormat> AcceptedFormats() const override {
-    std::vector<JxlPixelFormat> formats = PPMEncoder::AcceptedFormats();
-    for (auto it = formats.begin(); it != formats.end();) {
-      if (it->num_channels > 2) {
-        it = formats.erase(it);
-      } else {
-        ++it;
-      }
+ private:
+  Status EncodeImage(const PackedImage& image,
+                     std::vector<uint8_t>* bytes) const {
+    char type = image.format.num_channels == 1 ? 'f' : 'F';
+    double scale = image.format.endianness == JXL_LITTLE_ENDIAN ? -1.0 : 1.0;
+    char header[kMaxHeaderSize];
+    size_t header_size =
+        snprintf(header, kMaxHeaderSize, "P%c\n%" PRIuS " %" PRIuS "\n%.1f\n",
+                 type, image.xsize, image.ysize, scale);
+    JXL_RETURN_IF_ERROR(header_size < kMaxHeaderSize);
+    bytes->resize(header_size + image.pixels_size);
+    memcpy(bytes->data(), header, header_size);
+    const uint8_t* in = reinterpret_cast<const uint8_t*>(image.pixels());
+    uint8_t* out = bytes->data() + header_size;
+    for (size_t y = 0; y < image.ysize; ++y) {
+      size_t y_out = image.ysize - 1 - y;
+      const uint8_t* row_in = &in[y * image.stride];
+      uint8_t* row_out = &out[y_out * image.stride];
+      memcpy(row_out, row_in, image.stride);
     }
-    return formats;
+    return true;
   }
 };
 
-class PAMEncoder : public PPMEncoder {
+class PAMEncoder : public BasePNMEncoder {
  public:
   std::vector<JxlPixelFormat> AcceptedFormats() const override {
-    std::vector<JxlPixelFormat> formats = PPMEncoder::AcceptedFormats();
-    for (auto it = formats.begin(); it != formats.end();) {
-      if (it->num_channels != 2 && it->num_channels != 4) {
-        it = formats.erase(it);
-      } else {
-        ++it;
+    std::vector<JxlPixelFormat> formats;
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
+      for (const JxlDataType data_type : {JXL_TYPE_UINT8, JXL_TYPE_UINT16}) {
+        formats.push_back(JxlPixelFormat{/*num_channels=*/num_channels,
+                                         /*data_type=*/data_type,
+                                         /*endianness=*/JXL_BIG_ENDIAN,
+                                         /*align=*/0});
       }
     }
     return formats;
   }
-};
+  Status EncodeFrame(const PackedPixelFile& ppf, const PackedFrame& frame,
+                     std::vector<uint8_t>* bytes) const override {
+    const PackedImage& color = frame.color;
+    const auto& ec_info = ppf.extra_channels_info;
+    JXL_RETURN_IF_ERROR(frame.extra_channels.size() == ec_info.size());
+    for (const auto& ec : frame.extra_channels) {
+      if (ec.xsize != color.xsize || ec.ysize != color.ysize) {
+        return JXL_FAILURE("Extra channel and color size mismatch.");
+      }
+      if (ec.format.data_type != color.format.data_type ||
+          ec.format.endianness != color.format.endianness) {
+        return JXL_FAILURE("Extra channel and color format mismatch.");
+      }
+    }
+    if (ppf.info.alpha_bits &&
+        (ppf.info.bits_per_sample != ppf.info.alpha_bits)) {
+      return JXL_FAILURE("Alpha bit depth does not match image bit depth");
+    }
+    for (const auto& it : ec_info) {
+      if (it.ec_info.bits_per_sample != ppf.info.bits_per_sample) {
+        return JXL_FAILURE(
+            "Extra channel bit depth does not match image bit depth");
+      }
+    }
+    const char* kColorTypes[4] = {"GRAYSCALE", "GRAYSCALE_ALPHA", "RGB",
+                                  "RGB_ALPHA"};
+    uint32_t maxval = (1u << ppf.info.bits_per_sample) - 1;
+    uint32_t depth = color.format.num_channels + ec_info.size();
+    char header[kMaxHeaderSize];
+    size_t pos = 0;
+    pos += snprintf(header + pos, kMaxHeaderSize - pos,
+                    "P7\nWIDTH %" PRIuS "\nHEIGHT %" PRIuS
+                    "\nDEPTH %u\n"
+                    "MAXVAL %u\nTUPLTYPE %s\n",
+                    color.xsize, color.ysize, depth, maxval,
+                    kColorTypes[color.format.num_channels - 1]);
+    JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
+    for (const auto& info : ec_info) {
+      pos += snprintf(header + pos, kMaxHeaderSize - pos, "TUPLTYPE %s\n",
+                      ExtraChannelTypeName(info.ec_info.type).c_str());
+      JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
+    }
+    pos += snprintf(header + pos, kMaxHeaderSize - pos, "ENDHDR\n");
+    JXL_RETURN_IF_ERROR(pos < kMaxHeaderSize);
+    size_t total_size = color.pixels_size;
+    for (const auto& ec : frame.extra_channels) {
+      total_size += ec.pixels_size;
+    }
+    bytes->resize(pos + total_size);
+    memcpy(bytes->data(), header, pos);
+    // If we have no extra channels, just copy color pixel data over.
+    if (frame.extra_channels.empty()) {
+      memcpy(bytes->data() + pos, reinterpret_cast<uint8_t*>(color.pixels()),
+             color.pixels_size);
+      return true;
+    }
+    // Interleave color and extra channels.
+    const uint8_t* in = reinterpret_cast<const uint8_t*>(color.pixels());
+    std::vector<const uint8_t*> ec_in(frame.extra_channels.size());
+    for (size_t i = 0; i < frame.extra_channels.size(); ++i) {
+      ec_in[i] =
+          reinterpret_cast<const uint8_t*>(frame.extra_channels[i].pixels());
+    }
+    uint8_t* out = bytes->data() + pos;
+    size_t pwidth = PackedImage::BitsPerChannel(color.format.data_type) / 8;
+    for (size_t y = 0; y < color.ysize; ++y) {
+      for (size_t x = 0; x < color.xsize; ++x) {
+        memcpy(out, in, color.pixel_stride());
+        out += color.pixel_stride();
+        in += color.pixel_stride();
+        for (auto& p : ec_in) {
+          memcpy(out, p, pwidth);
+          out += pwidth;
+          p += pwidth;
+        }
+      }
+    }
+    return true;
+  }
+  Status EncodeExtraChannel(const PackedImage& image, size_t bits_per_sample,
+                            std::vector<uint8_t>* bytes) const override {
+    return true;
+  }
 
-Span<const uint8_t> MakeSpan(const char* str) {
-  return Span<const uint8_t>(reinterpret_cast<const uint8_t*>(str),
-                             strlen(str));
-}
+ private:
+  static std::string ExtraChannelTypeName(JxlExtraChannelType type) {
+    switch (type) {
+      case JXL_CHANNEL_ALPHA:
+        return std::string("Alpha");
+      case JXL_CHANNEL_DEPTH:
+        return std::string("Depth");
+      case JXL_CHANNEL_SPOT_COLOR:
+        return std::string("SpotColor");
+      case JXL_CHANNEL_SELECTION_MASK:
+        return std::string("SelectionMask");
+      case JXL_CHANNEL_BLACK:
+        return std::string("Black");
+      case JXL_CHANNEL_CFA:
+        return std::string("CFA");
+      case JXL_CHANNEL_THERMAL:
+        return std::string("Thermal");
+      default:
+        return std::string("UNKNOWN");
+    }
+  }
+};
 
 }  // namespace
 
@@ -201,6 +311,10 @@ std::unique_ptr<Encoder> GetPPMEncoder() {
   return jxl::make_unique<PPMEncoder>();
 }
 
+std::unique_ptr<Encoder> GetPNMEncoder() {
+  return jxl::make_unique<PNMEncoder>();
+}
+
 std::unique_ptr<Encoder> GetPFMEncoder() {
   return jxl::make_unique<PFMEncoder>();
 }
index 403208c..1e0020c 100644 (file)
@@ -19,6 +19,7 @@ namespace extras {
 
 std::unique_ptr<Encoder> GetPAMEncoder();
 std::unique_ptr<Encoder> GetPGMEncoder();
+std::unique_ptr<Encoder> GetPNMEncoder();
 std::unique_ptr<Encoder> GetPPMEncoder();
 std::unique_ptr<Encoder> GetPFMEncoder();
 
index 7d92655..aea6327 100644 (file)
@@ -23,7 +23,7 @@ void ResetExifOrientation(std::vector<uint8_t>& exif) {
     return;  // not a valid tiff header
   }
   t += 4;
-  uint32_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t));
+  uint64_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t));
   if (exif.size() < 12 + offset + 2 || offset < 8) return;
   t += offset - 4;
   uint16_t nb_tags = (bigendian ? LoadBE16(t) : LoadLE16(t));
index e39a080..1250055 100644 (file)
@@ -5,9 +5,9 @@
 
 #include "lib/extras/hlg.h"
 
-#include <cmath>
+#include <jxl/cms.h>
 
-#include "lib/jxl/enc_color_management.h"
+#include <cmath>
 
 namespace jxl {
 
@@ -19,11 +19,12 @@ float GetHlgGamma(const float peak_luminance, const float surround_luminance) {
 Status HlgOOTF(ImageBundle* ib, const float gamma, ThreadPool* pool) {
   ColorEncoding linear_rec2020;
   linear_rec2020.SetColorSpace(ColorSpace::kRGB);
-  linear_rec2020.primaries = Primaries::k2100;
-  linear_rec2020.white_point = WhitePoint::kD65;
-  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_RETURN_IF_ERROR(linear_rec2020.SetPrimariesType(Primaries::k2100));
+  JXL_RETURN_IF_ERROR(linear_rec2020.SetWhitePointType(WhitePoint::kD65));
+  linear_rec2020.Tf().SetTransferFunction(TransferFunction::kLinear);
   JXL_RETURN_IF_ERROR(linear_rec2020.CreateICC());
-  JXL_RETURN_IF_ERROR(ib->TransformTo(linear_rec2020, GetJxlCms(), pool));
+  JXL_RETURN_IF_ERROR(
+      ib->TransformTo(linear_rec2020, *JxlGetDefaultCms(), pool));
 
   JXL_RETURN_IF_ERROR(RunOnPool(
       pool, 0, ib->ysize(), ThreadPool::NoInit,
diff --git a/lib/extras/jpegli_test.cc b/lib/extras/jpegli_test.cc
new file mode 100644 (file)
index 0000000..0ebf0c1
--- /dev/null
@@ -0,0 +1,415 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if JPEGXL_ENABLE_JPEGLI
+
+#include "lib/extras/dec/jpegli.h"
+
+#include <jxl/color_encoding.h>
+#include <stdint.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/dec/jpg.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/enc/jpegli.h"
+#include "lib/extras/enc/jpg.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace extras {
+namespace {
+
+using test::Butteraugli3Norm;
+using test::ButteraugliDistance;
+using test::TestImage;
+
+Status ReadTestImage(const std::string& pathname, PackedPixelFile* ppf) {
+  const std::vector<uint8_t> encoded = jxl::test::ReadTestData(pathname);
+  ColorHints color_hints;
+  if (pathname.find(".ppm") != std::string::npos) {
+    color_hints.Add("color_space", "RGB_D65_SRG_Rel_SRG");
+  } else if (pathname.find(".pgm") != std::string::npos) {
+    color_hints.Add("color_space", "Gra_D65_Rel_SRG");
+  }
+  return DecodeBytes(Bytes(encoded), color_hints, ppf);
+}
+
+std::vector<uint8_t> GetAppData(const std::vector<uint8_t>& compressed) {
+  std::vector<uint8_t> result;
+  size_t pos = 2;  // After SOI
+  while (pos + 4 < compressed.size()) {
+    if (compressed[pos] != 0xff || compressed[pos + 1] < 0xe0 ||
+        compressed[pos + 1] > 0xf0) {
+      break;
+    }
+    size_t len = (compressed[pos + 2] << 8) + compressed[pos + 3] + 2;
+    if (pos + len > compressed.size()) {
+      break;
+    }
+    result.insert(result.end(), &compressed[pos], &compressed[pos] + len);
+    pos += len;
+  }
+  return result;
+}
+
+Status DecodeWithLibjpeg(const std::vector<uint8_t>& compressed,
+                         PackedPixelFile* ppf,
+                         const JPGDecompressParams* dparams = nullptr) {
+  return DecodeImageJPG(Bytes(compressed), ColorHints(), ppf,
+                        /*constraints=*/nullptr, dparams);
+}
+
+Status EncodeWithLibjpeg(const PackedPixelFile& ppf, int quality,
+                         std::vector<uint8_t>* compressed) {
+  std::unique_ptr<Encoder> encoder = GetJPEGEncoder();
+  encoder->SetOption("q", std::to_string(quality));
+  EncodedImage encoded;
+  JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded));
+  JXL_RETURN_IF_ERROR(!encoded.bitstreams.empty());
+  *compressed = std::move(encoded.bitstreams[0]);
+  return true;
+}
+
+std::string Description(const JxlColorEncoding& color_encoding) {
+  ColorEncoding c_enc;
+  JXL_CHECK(c_enc.FromExternal(color_encoding));
+  return Description(c_enc);
+}
+
+float BitsPerPixel(const PackedPixelFile& ppf,
+                   const std::vector<uint8_t>& compressed) {
+  const size_t num_pixels = ppf.info.xsize * ppf.info.ysize;
+  return compressed.size() * 8.0 / num_pixels;
+}
+
+TEST(JpegliTest, JpegliSRGBDecodeTest) {
+  TEST_LIBJPEG_SUPPORT();
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf0;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf0));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf0.color_encoding));
+  EXPECT_EQ(8, ppf0.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf1));
+  PackedPixelFile ppf2;
+  JpegDecompressParams dparams;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf2));
+  EXPECT_LT(ButteraugliDistance(ppf0, ppf2), ButteraugliDistance(ppf0, ppf1));
+}
+
+TEST(JpegliTest, JpegliGrayscaleDecodeTest) {
+  TEST_LIBJPEG_SUPPORT();
+  std::string testimage = "jxl/flower/flower_small.g.depth8.pgm";
+  PackedPixelFile ppf0;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf0));
+  EXPECT_EQ("Gra_D65_Rel_SRG", Description(ppf0.color_encoding));
+  EXPECT_EQ(8, ppf0.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf1));
+  PackedPixelFile ppf2;
+  JpegDecompressParams dparams;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf2));
+  EXPECT_LT(ButteraugliDistance(ppf0, ppf2), ButteraugliDistance(ppf0, ppf1));
+}
+
+TEST(JpegliTest, JpegliXYBEncodeTest) {
+  TEST_LIBJPEG_SUPPORT();
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = true;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(1.45f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.32f));
+}
+
+TEST(JpegliTest, JpegliDecodeTestLargeSmoothArea) {
+  TEST_LIBJPEG_SUPPORT();
+  TestImage t;
+  const size_t xsize = 2070;
+  const size_t ysize = 1063;
+  t.SetDimensions(xsize, ysize).SetChannels(3);
+  t.SetAllBitDepths(8).SetEndianness(JXL_NATIVE_ENDIAN);
+  TestImage::Frame frame = t.AddFrame();
+  frame.RandomFill();
+  // Create a large smooth area in the top half of the image. This is to test
+  // that the bias statistics calculation can handle many blocks with all-zero
+  // AC coefficients.
+  for (size_t y = 0; y < ysize / 2; ++y) {
+    for (size_t x = 0; x < xsize; ++x) {
+      for (size_t c = 0; c < 3; ++c) {
+        frame.SetValue(y, x, c, 0.5f);
+      }
+    }
+  }
+  const PackedPixelFile& ppf0 = t.ppf();
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  JpegDecompressParams dparams;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf1));
+  EXPECT_LT(ButteraugliDistance(ppf0, ppf1), 3.0f);
+}
+
+TEST(JpegliTest, JpegliYUVEncodeTest) {
+  TEST_LIBJPEG_SUPPORT();
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = false;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(1.7f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.32f));
+}
+
+TEST(JpegliTest, JpegliYUVChromaSubsamplingEncodeTest) {
+  TEST_LIBJPEG_SUPPORT();
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  for (const char* sampling : {"440", "422", "420"}) {
+    settings.xyb = false;
+    settings.chroma_subsampling = std::string(sampling);
+    ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+    PackedPixelFile ppf_out;
+    ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+    EXPECT_LE(BitsPerPixel(ppf_in, compressed), 1.55f);
+    EXPECT_LE(ButteraugliDistance(ppf_in, ppf_out), 1.82f);
+  }
+}
+
+TEST(JpegliTest, JpegliYUVEncodeTestNoAq) {
+  TEST_LIBJPEG_SUPPORT();
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = false;
+  settings.use_adaptive_quantization = false;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(1.85f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.25f));
+}
+
+TEST(JpegliTest, JpegliHDRRoundtripTest) {
+  std::string testimage = "jxl/hdr_room.png";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_202_Rel_HLG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(16, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.xyb = false;
+  ASSERT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  PackedPixelFile ppf_out;
+  JpegDecompressParams dparams;
+  dparams.output_data_type = JXL_TYPE_UINT16;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams, nullptr, &ppf_out));
+  EXPECT_THAT(BitsPerPixel(ppf_in, compressed), IsSlightlyBelow(2.95f));
+  EXPECT_THAT(ButteraugliDistance(ppf_in, ppf_out), IsSlightlyBelow(1.05f));
+}
+
+TEST(JpegliTest, JpegliSetAppData) {
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf_in;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf_in));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf_in.color_encoding));
+  EXPECT_EQ(8, ppf_in.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  JpegSettings settings;
+  settings.app_data = {0xff, 0xe3, 0, 4, 0, 1};
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(settings.app_data, GetAppData(compressed));
+
+  settings.app_data = {0xff, 0xe3, 0, 6, 0, 1, 2, 3, 0xff, 0xef, 0, 4, 0, 1};
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(settings.app_data, GetAppData(compressed));
+
+  settings.xyb = true;
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(0, memcmp(settings.app_data.data(), GetAppData(compressed).data(),
+                      settings.app_data.size()));
+
+  settings.xyb = false;
+  settings.app_data = {0};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xe0};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xe0, 0, 2};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xeb, 0, 4, 0};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xeb, 0, 4, 0, 1, 2, 3};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.app_data = {0xff, 0xab, 0, 4, 0, 1};
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+
+  settings.xyb = false;
+  settings.app_data = {
+      0xff, 0xeb, 0,    4,    0,    1,                       //
+      0xff, 0xe2, 0,    20,   0x49, 0x43, 0x43, 0x5F, 0x50,  //
+      0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00, 0,    1,     //
+      0,    0,    0,    0,                                   //
+  };
+  EXPECT_TRUE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+  EXPECT_EQ(settings.app_data, GetAppData(compressed));
+
+  settings.xyb = true;
+  EXPECT_FALSE(EncodeJpeg(ppf_in, settings, nullptr, &compressed));
+}
+
+struct TestConfig {
+  int num_colors;
+  int passes;
+  int dither;
+};
+
+class JpegliColorQuantTestParam : public ::testing::TestWithParam<TestConfig> {
+};
+
+TEST_P(JpegliColorQuantTestParam, JpegliColorQuantizeTest) {
+  TEST_LIBJPEG_SUPPORT();
+  TestConfig config = GetParam();
+  std::string testimage = "jxl/flower/flower_small.rgb.depth8.ppm";
+  PackedPixelFile ppf0;
+  ASSERT_TRUE(ReadTestImage(testimage, &ppf0));
+  EXPECT_EQ("RGB_D65_SRG_Rel_SRG", Description(ppf0.color_encoding));
+  EXPECT_EQ(8, ppf0.info.bits_per_sample);
+
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithLibjpeg(ppf0, 90, &compressed));
+
+  PackedPixelFile ppf1;
+  JPGDecompressParams dparams1;
+  dparams1.two_pass_quant = (config.passes == 2);
+  dparams1.num_colors = config.num_colors;
+  dparams1.dither_mode = config.dither;
+  ASSERT_TRUE(DecodeWithLibjpeg(compressed, &ppf1, &dparams1));
+
+  PackedPixelFile ppf2;
+  JpegDecompressParams dparams2;
+  dparams2.two_pass_quant = (config.passes == 2);
+  dparams2.num_colors = config.num_colors;
+  dparams2.dither_mode = config.dither;
+  ASSERT_TRUE(DecodeJpeg(compressed, dparams2, nullptr, &ppf2));
+
+  double dist1 = Butteraugli3Norm(ppf0, ppf1);
+  double dist2 = Butteraugli3Norm(ppf0, ppf2);
+  printf("distance: %f  vs %f\n", dist2, dist1);
+  if (config.passes == 1) {
+    if (config.num_colors == 16 && config.dither == 2) {
+      // TODO(szabadka) Fix this case.
+      EXPECT_LT(dist2, dist1 * 1.5);
+    } else {
+      EXPECT_LT(dist2, dist1 * 1.05);
+    }
+  } else if (config.num_colors > 64) {
+    // TODO(szabadka) Fix 2pass quantization for <= 64 colors.
+    EXPECT_LT(dist2, dist1 * 1.1);
+  } else if (config.num_colors > 32) {
+    EXPECT_LT(dist2, dist1 * 1.2);
+  } else {
+    EXPECT_LT(dist2, dist1 * 1.7);
+  }
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  for (int num_colors = 8; num_colors <= 256; num_colors *= 2) {
+    for (int passes = 1; passes <= 2; ++passes) {
+      for (int dither = 0; dither < 3; dither += passes) {
+        TestConfig config;
+        config.num_colors = num_colors;
+        config.passes = passes;
+        config.dither = dither;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  static constexpr const char* kDitherModeStr[] = {"No", "Ordered", "FS"};
+  os << c.passes << "pass";
+  os << c.num_colors << "colors";
+  os << kDitherModeStr[c.dither] << "dither";
+  return os;
+}
+
+std::string TestDescription(const testing::TestParamInfo<TestConfig>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(JpegliColorQuantTest,
+                                   JpegliColorQuantTestParam,
+                                   testing::ValuesIn(GenerateTests()),
+                                   TestDescription);
+
+}  // namespace
+}  // namespace extras
+}  // namespace jxl
+#endif  // JPEGXL_ENABLE_JPEGLI
similarity index 84%
rename from lib/jxl/enc_butteraugli_pnorm.cc
rename to lib/extras/metrics.cc
index fe5629d..4259d3c 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/extras/metrics.h"
 
 #include <math.h>
 #include <stdlib.h>
 #include <atomic>
 
 #undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "lib/jxl/enc_butteraugli_pnorm.cc"
+#define HWY_TARGET_INCLUDE "lib/extras/metrics.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
 HWY_BEFORE_NAMESPACE();
@@ -31,8 +30,6 @@ using hwy::HWY_NAMESPACE::Rebind;
 
 double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
                         double p) {
-  PROFILER_FUNC;
-
   const double onePerPixels = 1.0 / (distmap.ysize() * distmap.xsize());
   if (std::abs(p - 3.0) < 1E-6) {
     double sum1[3] = {0.0};
@@ -45,7 +42,7 @@ double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
     using T = float;
 #endif
     const HWY_FULL(T) d;
-    constexpr size_t N = MaxLanes(HWY_FULL(T)());
+    constexpr size_t N = MaxLanes(d);
     // Manually aligned storage to avoid asan crash on clang-7 due to
     // unaligned spill.
     HWY_ALIGN T sum_totals0[N] = {0};
@@ -126,10 +123,8 @@ double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
   }
 }
 
-// TODO(lode): take alpha into account when needed
-double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
-                        const JxlCmsInterface& cms) {
-  PROFILER_FUNC;
+void ComputeSumOfSquares(const ImageBundle& ib1, const ImageBundle& ib2,
+                         const JxlCmsInterface& cms, double sum_of_squares[3]) {
   // Convert to sRGB - closer to perception than linear.
   const Image3F* srgb1 = &ib1.color();
   Image3F copy1;
@@ -152,7 +147,6 @@ double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
   float yuvmatrix[3][3] = {{0.299, 0.587, 0.114},
                            {-0.14713, -0.28886, 0.436},
                            {0.615, -0.51499, -0.10001}};
-  double sum_of_squares[3] = {};
   for (size_t y = 0; y < srgb1->ysize(); ++y) {
     const float* JXL_RESTRICT row1[3];
     const float* JXL_RESTRICT row2[3];
@@ -177,15 +171,6 @@ double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
       }
     }
   }
-  // Weighted PSNR as in JPEG-XL: chroma counts 1/8.
-  const float weights[3] = {6.0f / 8, 1.0f / 8, 1.0f / 8};
-  // Avoid squaring the weight - 1/64 is too extreme.
-  double norm = 0;
-  for (size_t i = 0; i < 3; i++) {
-    norm += std::sqrt(sum_of_squares[i]) * weights[i];
-  }
-  // This function returns distance *squared*.
-  return norm * norm;
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -201,10 +186,38 @@ double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
   return HWY_DYNAMIC_DISPATCH(ComputeDistanceP)(distmap, params, p);
 }
 
-HWY_EXPORT(ComputeDistance2);
+HWY_EXPORT(ComputeSumOfSquares);
+
 double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
                         const JxlCmsInterface& cms) {
-  return HWY_DYNAMIC_DISPATCH(ComputeDistance2)(ib1, ib2, cms);
+  double sum_of_squares[3] = {};
+  HWY_DYNAMIC_DISPATCH(ComputeSumOfSquares)(ib1, ib2, cms, sum_of_squares);
+  // Weighted PSNR as in JPEG-XL: chroma counts 1/8.
+  const float weights[3] = {6.0f / 8, 1.0f / 8, 1.0f / 8};
+  // Avoid squaring the weight - 1/64 is too extreme.
+  double norm = 0;
+  for (size_t i = 0; i < 3; i++) {
+    norm += std::sqrt(sum_of_squares[i]) * weights[i];
+  }
+  // This function returns distance *squared*.
+  return norm * norm;
+}
+
+double ComputePSNR(const ImageBundle& ib1, const ImageBundle& ib2,
+                   const JxlCmsInterface& cms) {
+  if (!SameSize(ib1, ib2)) return 0.0;
+  double sum_of_squares[3] = {};
+  HWY_DYNAMIC_DISPATCH(ComputeSumOfSquares)(ib1, ib2, cms, sum_of_squares);
+  constexpr double kChannelWeights[3] = {6.0 / 8, 1.0 / 8, 1.0 / 8};
+  double avg_psnr = 0;
+  const size_t input_pixels = ib1.xsize() * ib1.ysize();
+  for (int i = 0; i < 3; ++i) {
+    const double rmse = std::sqrt(sum_of_squares[i] / input_pixels);
+    const double psnr =
+        sum_of_squares[i] == 0 ? 99.99 : (20 * std::log10(1 / rmse));
+    avg_psnr += kChannelWeights[i] * psnr;
+  }
+  return avg_psnr;
 }
 
 }  // namespace jxl
similarity index 74%
rename from lib/jxl/enc_butteraugli_pnorm.h
rename to lib/extras/metrics.h
index cf6872e..87a69a9 100644 (file)
@@ -3,8 +3,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#ifndef LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
-#define LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
+#ifndef LIB_EXTRAS_METRICS_H_
+#define LIB_EXTRAS_METRICS_H_
 
 #include <stdint.h>
 
@@ -20,6 +20,9 @@ double ComputeDistanceP(const ImageF& distmap, const ButteraugliParams& params,
 double ComputeDistance2(const ImageBundle& ib1, const ImageBundle& ib2,
                         const JxlCmsInterface& cms);
 
+double ComputePSNR(const ImageBundle& ib1, const ImageBundle& ib2,
+                   const JxlCmsInterface& cms);
+
 }  // namespace jxl
 
-#endif  // LIB_JXL_ENC_BUTTERAUGLI_PNORM_H_
+#endif  // LIB_EXTRAS_METRICS_H_
index 1296472..d3ba9ce 100644 (file)
@@ -9,20 +9,24 @@
 // Helper class for storing external (int or float, interleaved) images. This is
 // the common format used by other libraries and in the libjxl API.
 
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
+#include <jxl/types.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include <algorithm>
+#include <cmath>
 #include <memory>
+#include <set>
 #include <string>
 #include <vector>
 
-#include "jxl/codestream_header.h"
-#include "jxl/encode.h"
-#include "jxl/types.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/status.h"
 
 namespace jxl {
 namespace extras {
@@ -33,9 +37,26 @@ class PackedImage {
   PackedImage(size_t xsize, size_t ysize, const JxlPixelFormat& format)
       : PackedImage(xsize, ysize, format, CalcStride(format, xsize)) {}
 
+  PackedImage Copy() const {
+    PackedImage copy(xsize, ysize, format);
+    memcpy(reinterpret_cast<uint8_t*>(copy.pixels()),
+           reinterpret_cast<const uint8_t*>(pixels()), pixels_size);
+    return copy;
+  }
+
   // The interleaved pixels as defined in the storage format.
   void* pixels() const { return pixels_.get(); }
 
+  uint8_t* pixels(size_t y, size_t x, size_t c) const {
+    return (reinterpret_cast<uint8_t*>(pixels_.get()) + y * stride +
+            x * pixel_stride_ + c * bytes_per_channel_);
+  }
+
+  const uint8_t* const_pixels(size_t y, size_t x, size_t c) const {
+    return (reinterpret_cast<const uint8_t*>(pixels_.get()) + y * stride +
+            x * pixel_stride_ + c * bytes_per_channel_);
+  }
+
   // The image size in pixels.
   size_t xsize;
   size_t ysize;
@@ -47,10 +68,7 @@ class PackedImage {
   JxlPixelFormat format;
   size_t pixels_size;
 
-  size_t pixel_stride() const {
-    return (BitsPerChannel(format.data_type) * format.num_channels /
-            jxl::kBitsPerByte);
-  }
+  size_t pixel_stride() const { return pixel_stride_; }
 
   static size_t BitsPerChannel(JxlDataType data_type) {
     switch (data_type) {
@@ -67,6 +85,52 @@ class PackedImage {
     }
   }
 
+  float GetPixelValue(size_t y, size_t x, size_t c) const {
+    const uint8_t* data = const_pixels(y, x, c);
+    switch (format.data_type) {
+      case JXL_TYPE_UINT8:
+        return data[0] * (1.0f / 255);
+      case JXL_TYPE_UINT16: {
+        uint16_t val;
+        memcpy(&val, data, 2);
+        return (swap_endianness_ ? JXL_BSWAP16(val) : val) * (1.0f / 65535);
+      }
+      case JXL_TYPE_FLOAT: {
+        float val;
+        memcpy(&val, data, 4);
+        return swap_endianness_ ? BSwapFloat(val) : val;
+      }
+      default:
+        JXL_ABORT("Unhandled JxlDataType");
+    }
+  }
+
+  void SetPixelValue(size_t y, size_t x, size_t c, float val) {
+    uint8_t* data = pixels(y, x, c);
+    switch (format.data_type) {
+      case JXL_TYPE_UINT8:
+        data[0] = Clamp1(std::round(val * 255), 0.0f, 255.0f);
+        break;
+      case JXL_TYPE_UINT16: {
+        uint16_t val16 = Clamp1(std::round(val * 65535), 0.0f, 65535.0f);
+        if (swap_endianness_) {
+          val16 = JXL_BSWAP16(val16);
+        }
+        memcpy(data, &val16, 2);
+        break;
+      }
+      case JXL_TYPE_FLOAT: {
+        if (swap_endianness_) {
+          val = BSwapFloat(val);
+        }
+        memcpy(data, &val, 4);
+        break;
+      }
+      default:
+        JXL_ABORT("Unhandled JxlDataType");
+    }
+  }
+
  private:
   PackedImage(size_t xsize, size_t ysize, const JxlPixelFormat& format,
               size_t stride)
@@ -75,7 +139,11 @@ class PackedImage {
         stride(stride),
         format(format),
         pixels_size(ysize * stride),
-        pixels_(malloc(std::max<size_t>(1, pixels_size)), free) {}
+        pixels_(malloc(std::max<size_t>(1, pixels_size)), free) {
+    bytes_per_channel_ = BitsPerChannel(format.data_type) / jxl::kBitsPerByte;
+    pixel_stride_ = format.num_channels * bytes_per_channel_;
+    swap_endianness_ = SwapEndianness(format.endianness);
+  }
 
   static size_t CalcStride(const JxlPixelFormat& format, size_t xsize) {
     size_t stride = xsize * (BitsPerChannel(format.data_type) *
@@ -86,6 +154,9 @@ class PackedImage {
     return stride;
   }
 
+  size_t bytes_per_channel_;
+  size_t pixel_stride_;
+  bool swap_endianness_;
   std::unique_ptr<void, decltype(free)*> pixels_;
 };
 
@@ -98,6 +169,18 @@ class PackedFrame {
   template <typename... Args>
   explicit PackedFrame(Args&&... args) : color(std::forward<Args>(args)...) {}
 
+  PackedFrame Copy() const {
+    PackedFrame copy(color.xsize, color.ysize, color.format);
+    copy.frame_info = frame_info;
+    copy.name = name;
+    copy.color = color.Copy();
+    for (size_t i = 0; i < extra_channels.size(); ++i) {
+      PackedImage ec = extra_channels[i].Copy();
+      copy.extra_channels.emplace_back(std::move(ec));
+    }
+    return copy;
+  }
+
   // The Frame metadata.
   JxlFrameHeader frame_info = {};
   std::string name;
@@ -108,6 +191,85 @@ class PackedFrame {
   std::vector<PackedImage> extra_channels;
 };
 
+class ChunkedPackedFrame {
+ public:
+  typedef void (*ReadLine)(void* opaque, size_t xpos, size_t ypos, size_t xsize,
+                           uint8_t* buffer, size_t len);
+  ChunkedPackedFrame(size_t xsize, size_t ysize, const JxlPixelFormat& format,
+                     void* opaque, ReadLine read_line)
+      : xsize(xsize),
+        ysize(ysize),
+        format(format),
+        opaque_(opaque),
+        read_line_(read_line) {}
+
+  JxlChunkedFrameInputSource GetInputSource() {
+    return JxlChunkedFrameInputSource{this,
+                                      GetColorChannelsPixelFormat,
+                                      GetColorChannelDataAt,
+                                      GetExtraChannelPixelFormat,
+                                      GetExtraChannelDataAt,
+                                      ReleaseCurrentData};
+  }
+
+  // The Frame metadata.
+  JxlFrameHeader frame_info = {};
+  std::string name;
+
+  size_t xsize;
+  size_t ysize;
+  JxlPixelFormat format;
+
+ private:
+  static void GetColorChannelsPixelFormat(void* opaque,
+                                          JxlPixelFormat* pixel_format) {
+    ChunkedPackedFrame* self = reinterpret_cast<ChunkedPackedFrame*>(opaque);
+    *pixel_format = self->format;
+  }
+
+  static const void* GetColorChannelDataAt(void* opaque, size_t xpos,
+                                           size_t ypos, size_t xsize,
+                                           size_t ysize, size_t* row_offset) {
+    ChunkedPackedFrame* self = reinterpret_cast<ChunkedPackedFrame*>(opaque);
+    size_t bytes_per_channel =
+        PackedImage::BitsPerChannel(self->format.data_type) / jxl::kBitsPerByte;
+    size_t bytes_per_pixel = bytes_per_channel * self->format.num_channels;
+    *row_offset = xsize * bytes_per_pixel;
+    uint8_t* buffer = reinterpret_cast<uint8_t*>(malloc(ysize * (*row_offset)));
+    for (size_t y = 0; y < ysize; ++y) {
+      self->read_line_(self->opaque_, xpos, ypos + y, xsize,
+                       &buffer[y * (*row_offset)], *row_offset);
+    }
+    self->buffers_.insert(buffer);
+    return buffer;
+  }
+
+  static void GetExtraChannelPixelFormat(void* opaque, size_t ec_index,
+                                         JxlPixelFormat* pixel_format) {
+    JXL_ABORT("Not implemented");
+  }
+
+  static const void* GetExtraChannelDataAt(void* opaque, size_t ec_index,
+                                           size_t xpos, size_t ypos,
+                                           size_t xsize, size_t ysize,
+                                           size_t* row_offset) {
+    JXL_ABORT("Not implemented");
+  }
+
+  static void ReleaseCurrentData(void* opaque, const void* buffer) {
+    ChunkedPackedFrame* self = reinterpret_cast<ChunkedPackedFrame*>(opaque);
+    auto iter = self->buffers_.find(const_cast<void*>(buffer));
+    if (iter != self->buffers_.end()) {
+      free(*iter);
+      self->buffers_.erase(iter);
+    }
+  }
+
+  void* opaque_;
+  ReadLine read_line_;
+  std::set<void*> buffers_;
+};
+
 // Optional metadata associated with a file
 class PackedMetadata {
  public:
@@ -117,17 +279,18 @@ class PackedMetadata {
   std::vector<uint8_t> xmp;
 };
 
+// The extra channel metadata information.
+struct PackedExtraChannel {
+  JxlExtraChannelInfo ec_info;
+  size_t index;
+  std::string name;
+};
+
 // Helper class representing a JXL image file as decoded to pixels from the API.
 class PackedPixelFile {
  public:
   JxlBasicInfo info = {};
 
-  // The extra channel metadata information.
-  struct PackedExtraChannel {
-    JxlExtraChannelInfo ec_info;
-    size_t index;
-    std::string name;
-  };
   std::vector<PackedExtraChannel> extra_channels_info;
 
   // Color information of the decoded pixels.
@@ -139,9 +302,14 @@ class PackedPixelFile {
 
   std::unique_ptr<PackedFrame> preview_frame;
   std::vector<PackedFrame> frames;
+  mutable std::vector<ChunkedPackedFrame> chunked_frames;
 
   PackedMetadata metadata;
   PackedPixelFile() { JxlEncoderInitBasicInfo(&info); };
+
+  size_t num_frames() const {
+    return chunked_frames.empty() ? frames.size() : chunked_frames.size();
+  }
 };
 
 }  // namespace extras
index dcdd12a..1bc8f20 100644 (file)
@@ -5,17 +5,18 @@
 
 #include "lib/extras/packed_image_convert.h"
 
+#include <jxl/cms.h>
+#include <jxl/color_encoding.h>
+#include <jxl/types.h>
+
 #include <cstdint>
 
-#include "jxl/color_encoding.h"
-#include "jxl/types.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/dec_external_image.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_external_image.h"
 #include "lib/jxl/enc_image_bundle.h"
+#include "lib/jxl/luminance.h"
 
 namespace jxl {
 namespace extras {
@@ -58,10 +59,7 @@ Status ConvertPackedFrameToImageBundle(const JxlBasicInfo& info,
 
   JXL_RETURN_IF_ERROR(ConvertFromExternal(
       span, frame.color.xsize, frame.color.ysize, io.metadata.m.color_encoding,
-      frame.color.format.num_channels,
-      /*alpha_is_premultiplied=*/info.alpha_premultiplied,
-      frame_bits_per_sample, frame.color.format.endianness, pool, bundle,
-      /*float_in=*/float_in, /*align=*/0));
+      frame_bits_per_sample, frame.color.format, pool, bundle));
 
   bundle->extra_channels().resize(io.metadata.m.extra_channel_info.size());
   for (size_t i = 0; i < frame.extra_channels.size(); i++) {
@@ -84,7 +82,7 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
                ppf.info.exponent_bits_per_sample);
   }
 
-  const bool is_gray = ppf.info.num_color_channels == 1;
+  const bool is_gray = (ppf.info.num_color_channels == 1);
   JXL_ASSERT(ppf.info.num_color_channels == 1 ||
              ppf.info.num_color_channels == 3);
 
@@ -114,20 +112,24 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
 
   // Convert the color encoding.
   if (!ppf.icc.empty()) {
-    PaddedBytes icc;
-    icc.append(ppf.icc);
-    if (!io->metadata.m.color_encoding.SetICC(std::move(icc))) {
+    IccBytes icc = ppf.icc;
+    if (!io->metadata.m.color_encoding.SetICC(std::move(icc),
+                                              JxlGetDefaultCms())) {
       fprintf(stderr, "Warning: error setting ICC profile, assuming SRGB\n");
       io->metadata.m.color_encoding = ColorEncoding::SRGB(is_gray);
     } else {
+      if (io->metadata.m.color_encoding.IsCMYK()) {
+        // We expect gray or tri-color.
+        return JXL_FAILURE("Embedded ICC is CMYK");
+      }
       if (io->metadata.m.color_encoding.IsGray() != is_gray) {
         // E.g. JPG image has 3 channels, but gray ICC.
         return JXL_FAILURE("Embedded ICC does not match image color type");
       }
     }
   } else {
-    JXL_RETURN_IF_ERROR(ConvertExternalToInternalColorEncoding(
-        ppf.color_encoding, &io->metadata.m.color_encoding));
+    JXL_RETURN_IF_ERROR(
+        io->metadata.m.color_encoding.FromExternal(ppf.color_encoding));
     if (io->metadata.m.color_encoding.ICC().empty()) {
       return JXL_FAILURE("Failed to serialize ICC");
     }
@@ -140,8 +142,7 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
   io->blobs.xmp = ppf.metadata.xmp;
 
   // Append all other extra channels.
-  for (const PackedPixelFile::PackedExtraChannel& info :
-       ppf.extra_channels_info) {
+  for (const auto& info : ppf.extra_channels_info) {
     ExtraChannelInfo out;
     out.type = static_cast<jxl::ExtraChannel>(info.ec_info.type);
     out.bit_depth.bits_per_sample = info.ec_info.bits_per_sample;
@@ -171,14 +172,12 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
   }
 
   // Convert the pixels
-  io->dec_pixels = 0;
   io->frames.clear();
   for (const auto& frame : ppf.frames) {
     ImageBundle bundle(&io->metadata.m);
     JXL_RETURN_IF_ERROR(
         ConvertPackedFrameToImageBundle(ppf.info, frame, *io, pool, &bundle));
     io->frames.push_back(std::move(bundle));
-    io->dec_pixels += frame.color.xsize * frame.color.ysize;
   }
 
   if (ppf.info.exponent_bits_per_sample == 0) {
@@ -188,7 +187,7 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
   if (ppf.info.intensity_target != 0) {
     io->metadata.m.SetIntensityTarget(ppf.info.intensity_target);
   } else {
-    SetIntensityTarget(io);
+    SetIntensityTarget(&io->metadata.m);
   }
   io->CheckMetadata();
   return true;
@@ -221,6 +220,12 @@ Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
   ppf->info.exponent_bits_per_sample =
       io.metadata.m.bit_depth.exponent_bits_per_sample;
 
+  ppf->info.intensity_target = io.metadata.m.tone_mapping.intensity_target;
+  ppf->info.linear_below = io.metadata.m.tone_mapping.linear_below;
+  ppf->info.min_nits = io.metadata.m.tone_mapping.min_nits;
+  ppf->info.relative_to_max_display =
+      io.metadata.m.tone_mapping.relative_to_max_display;
+
   ppf->info.alpha_bits = io.metadata.m.GetAlphaBits();
   ppf->info.alpha_premultiplied = alpha_premultiplied;
 
@@ -239,7 +244,7 @@ Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
 
   // Convert the color encoding
   ppf->icc.assign(c_desired.ICC().begin(), c_desired.ICC().end());
-  ConvertInternalToExternalColorEncoding(c_desired, &ppf->color_encoding);
+  ppf->color_encoding = c_desired.ToExternal();
 
   // Convert the extra blobs
   ppf->metadata.exif = io.blobs.exif;
@@ -276,12 +281,8 @@ Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
     const ImageBundle* transformed;
     // TODO(firsching): handle the transform here.
     JXL_RETURN_IF_ERROR(TransformIfNeeded(*to_color_transform, c_desired,
-                                          GetJxlCms(), pool, &store,
+                                          *JxlGetDefaultCms(), pool, &store,
                                           &transformed));
-    size_t stride = ib.oriented_xsize() *
-                    (c_desired.Channels() * ppf->info.bits_per_sample) /
-                    kBitsPerByte;
-    PaddedBytes pixels(stride * ib.oriented_ysize());
 
     JXL_RETURN_IF_ERROR(ConvertToExternal(
         *transformed, bits_per_sample, float_out, format.num_channels,
index cada660..100adcc 100644 (file)
@@ -9,7 +9,8 @@
 // Helper functions to convert from the external image types to the internal
 // CodecInOut to help transitioning to the external types.
 
-#include "jxl/types.h"
+#include <jxl/types.h>
+
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
diff --git a/lib/extras/render_hdr.cc b/lib/extras/render_hdr.cc
deleted file mode 100644 (file)
index b247699..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/extras/render_hdr.h"
-
-#include "lib/extras/hlg.h"
-#include "lib/extras/tone_mapping.h"
-#include "lib/jxl/enc_color_management.h"
-
-namespace jxl {
-
-Status RenderHDR(CodecInOut* io, float display_nits, ThreadPool* pool) {
-  const ColorEncoding& original_color_encoding = io->metadata.m.color_encoding;
-  if (!(original_color_encoding.tf.IsPQ() ||
-        original_color_encoding.tf.IsHLG())) {
-    // Nothing to do.
-    return true;
-  }
-
-  if (original_color_encoding.tf.IsPQ()) {
-    JXL_RETURN_IF_ERROR(ToneMapTo({0, display_nits}, io, pool));
-    JXL_RETURN_IF_ERROR(GamutMap(io, /*preserve_saturation=*/0.1, pool));
-  } else {
-    const float intensity_target = io->metadata.m.IntensityTarget();
-    const float gamma_hlg_to_display = GetHlgGamma(display_nits);
-    // If the image is already in display space, we need to account for the
-    // already-applied OOTF.
-    const float gamma_display_to_display =
-        gamma_hlg_to_display / GetHlgGamma(intensity_target);
-    // Ensures that conversions to linear in HlgOOTF below will not themselves
-    // include the OOTF.
-    io->metadata.m.SetIntensityTarget(300);
-
-    bool need_gamut_mapping = false;
-    for (ImageBundle& ib : io->frames) {
-      const float gamma = ib.c_current().tf.IsHLG() ? gamma_hlg_to_display
-                                                    : gamma_display_to_display;
-      if (gamma < 1) need_gamut_mapping = true;
-      JXL_RETURN_IF_ERROR(HlgOOTF(&ib, gamma, pool));
-    }
-    io->metadata.m.SetIntensityTarget(display_nits);
-
-    if (need_gamut_mapping) {
-      JXL_RETURN_IF_ERROR(GamutMap(io, /*preserve_saturation=*/0.1, pool));
-    }
-  }
-
-  ColorEncoding rec2020_pq;
-  rec2020_pq.SetColorSpace(ColorSpace::kRGB);
-  rec2020_pq.white_point = WhitePoint::kD65;
-  rec2020_pq.primaries = Primaries::k2100;
-  rec2020_pq.tf.SetTransferFunction(TransferFunction::kPQ);
-  JXL_RETURN_IF_ERROR(rec2020_pq.CreateICC());
-  io->metadata.m.color_encoding = rec2020_pq;
-  return io->TransformTo(rec2020_pq, GetJxlCms(), pool);
-}
-
-}  // namespace jxl
diff --git a/lib/extras/render_hdr.h b/lib/extras/render_hdr.h
deleted file mode 100644 (file)
index 95127e0..0000000
+++ /dev/null
@@ -1,27 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_EXTRAS_RENDER_HDR_H_
-#define LIB_EXTRAS_RENDER_HDR_H_
-
-#include "lib/jxl/codec_in_out.h"
-
-namespace jxl {
-
-// If `io` has an original color space using PQ or HLG, this renders it
-// appropriately for a display with a peak luminance of `display_nits` and
-// converts the result to a Rec. 2020 / PQ image. Otherwise, leaves the image as
-// is.
-// PQ images are tone-mapped using the method described in Rep. ITU-R BT.2408-5
-// annex 5, while HLG images are rendered using the HLG OOTF with a gamma
-// appropriate for the given target luminance.
-// With a sufficiently bright SDR display, converting the output of this
-// function to an SDR colorspace may look decent.
-Status RenderHDR(CodecInOut* io, float display_nits,
-                 ThreadPool* pool = nullptr);
-
-}  // namespace jxl
-
-#endif  // LIB_EXTRAS_RENDER_HDR_H_
similarity index 50%
rename from lib/jxl/size_constraints.h
rename to lib/extras/size_constraints.h
index 20787b1..cf06f8c 100644 (file)
@@ -7,6 +7,9 @@
 #define LIB_JXL_SIZE_CONSTRAINTS_H_
 
 #include <cstdint>
+#include <type_traits>
+
+#include "lib/jxl/base/status.h"
 
 namespace jxl {
 
@@ -18,6 +21,23 @@ struct SizeConstraints {
   uint64_t dec_max_pixels = 0xFFFFFFFFu;  // Might be up to ~0ull
 };
 
+template <typename T,
+          class = typename std::enable_if<std::is_unsigned<T>::value>::type>
+Status VerifyDimensions(const SizeConstraints* constraints, T xs, T ys) {
+  if (!constraints) return true;
+
+  if (xs == 0 || ys == 0) return JXL_FAILURE("Empty image.");
+  if (xs > constraints->dec_max_xsize) return JXL_FAILURE("Image too wide.");
+  if (ys > constraints->dec_max_ysize) return JXL_FAILURE("Image too tall.");
+
+  const uint64_t num_pixels = static_cast<uint64_t>(xs) * ys;
+  if (num_pixels > constraints->dec_max_pixels) {
+    return JXL_FAILURE("Image too big.");
+  }
+
+  return true;
+}
+
 }  // namespace jxl
 
 #endif  // LIB_JXL_SIZE_CONSTRAINTS_H_
index 73d1b8f..d4f4175 100644 (file)
@@ -6,7 +6,6 @@
 #include "lib/extras/time.h"
 
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 
 #include <ctime>
index 1ed1b29..3d02695 100644 (file)
@@ -7,11 +7,13 @@
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "lib/extras/tone_mapping.cc"
+#include <jxl/cms.h>
+
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/dec_tone_mapping-inl.h"
-#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/cms/tone_mapping-inl.h"
+#include "lib/jxl/image_bundle.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -30,11 +32,12 @@ Status ToneMapFrame(const std::pair<float, float> display_nits,
 
   ColorEncoding linear_rec2020;
   linear_rec2020.SetColorSpace(ColorSpace::kRGB);
-  linear_rec2020.primaries = Primaries::k2100;
-  linear_rec2020.white_point = WhitePoint::kD65;
-  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_RETURN_IF_ERROR(linear_rec2020.SetPrimariesType(Primaries::k2100));
+  JXL_RETURN_IF_ERROR(linear_rec2020.SetWhitePointType(WhitePoint::kD65));
+  linear_rec2020.Tf().SetTransferFunction(TransferFunction::kLinear);
   JXL_RETURN_IF_ERROR(linear_rec2020.CreateICC());
-  JXL_RETURN_IF_ERROR(ib->TransformTo(linear_rec2020, GetJxlCms(), pool));
+  JXL_RETURN_IF_ERROR(
+      ib->TransformTo(linear_rec2020, *JxlGetDefaultCms(), pool));
 
   Rec2408ToneMapper<decltype(df)> tone_mapper(
       {ib->metadata()->tone_mapping.min_nits,
@@ -67,11 +70,12 @@ Status GamutMapFrame(ImageBundle* const ib, float preserve_saturation,
 
   ColorEncoding linear_rec2020;
   linear_rec2020.SetColorSpace(ColorSpace::kRGB);
-  linear_rec2020.primaries = Primaries::k2100;
-  linear_rec2020.white_point = WhitePoint::kD65;
-  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_RETURN_IF_ERROR(linear_rec2020.SetPrimariesType(Primaries::k2100));
+  JXL_RETURN_IF_ERROR(linear_rec2020.SetWhitePointType(WhitePoint::kD65));
+  linear_rec2020.Tf().SetTransferFunction(TransferFunction::kLinear);
   JXL_RETURN_IF_ERROR(linear_rec2020.CreateICC());
-  JXL_RETURN_IF_ERROR(ib->TransformTo(linear_rec2020, GetJxlCms(), pool));
+  JXL_RETURN_IF_ERROR(
+      ib->TransformTo(linear_rec2020, *JxlGetDefaultCms(), pool));
 
   JXL_RETURN_IF_ERROR(RunOnPool(
       pool, 0, ib->ysize(), ThreadPool::NoInit,
index 2f97b88..34cbdde 100644 (file)
@@ -6,39 +6,35 @@
 #include "benchmark/benchmark.h"
 #include "lib/extras/codec.h"
 #include "lib/extras/tone_mapping.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/testdata.h"
 
 namespace jxl {
 
 static void BM_ToneMapping(benchmark::State& state) {
-  CodecInOut image;
-  const PaddedBytes image_bytes = ReadTestData("jxl/flower/flower.png");
-  JXL_CHECK(SetFromBytes(Span<const uint8_t>(image_bytes), &image));
+  Image3F color(2268, 1512);
+  FillImage(0.5f, &color);
 
-  // Convert to linear Rec. 2020 so that `ToneMapTo` doesn't have to and we
-  // mainly measure the tone mapping itself.
+  // Use linear Rec. 2020 so that `ToneMapTo` doesn't have to convert to it and
+  // we mainly measure the tone mapping itself.
   ColorEncoding linear_rec2020;
   linear_rec2020.SetColorSpace(ColorSpace::kRGB);
-  linear_rec2020.primaries = Primaries::k2100;
-  linear_rec2020.white_point = WhitePoint::kD65;
-  linear_rec2020.tf.SetTransferFunction(TransferFunction::kLinear);
+  JXL_CHECK(linear_rec2020.SetPrimariesType(Primaries::k2100));
+  JXL_CHECK(linear_rec2020.SetWhitePointType(WhitePoint::kD65));
+  linear_rec2020.Tf().SetTransferFunction(TransferFunction::kLinear);
   JXL_CHECK(linear_rec2020.CreateICC());
-  JXL_CHECK(image.TransformTo(linear_rec2020, GetJxlCms()));
 
   for (auto _ : state) {
     state.PauseTiming();
     CodecInOut tone_mapping_input;
-    tone_mapping_input.SetFromImage(CopyImage(*image.Main().color()),
-                                    image.Main().c_current());
-    tone_mapping_input.metadata.m.SetIntensityTarget(
-        image.metadata.m.IntensityTarget());
+    Image3F color2(color.xsize(), color.ysize());
+    CopyImageTo(color, &color2);
+    tone_mapping_input.SetFromImage(std::move(color2), linear_rec2020);
+    tone_mapping_input.metadata.m.SetIntensityTarget(255);
     state.ResumeTiming();
 
     JXL_CHECK(ToneMapTo({0.1, 100}, &tone_mapping_input));
   }
 
-  state.SetItemsProcessed(state.iterations() * image.xsize() * image.ysize());
+  state.SetItemsProcessed(state.iterations() * color.xsize() * color.ysize());
 }
 BENCHMARK(BM_ToneMapping);
 
diff --git a/lib/include/jxl/butteraugli.h b/lib/include/jxl/butteraugli.h
deleted file mode 100644 (file)
index ba69a29..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
- *
- * Use of this source code is governed by a BSD-style
- * license that can be found in the LICENSE file.
- */
-
-/** @addtogroup libjxl_butteraugli
- * @{
- * @file butteraugli.h
- * @brief Butteraugli API for JPEG XL.
- */
-
-#ifndef JXL_BUTTERAUGLI_H_
-#define JXL_BUTTERAUGLI_H_
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-#include "jxl/jxl_export.h"
-#include "jxl/memory_manager.h"
-#include "jxl/parallel_runner.h"
-#include "jxl/types.h"
-
-/**
- * Opaque structure that holds a butteraugli API.
- *
- * Allocated and initialized with JxlButteraugliApiCreate().
- * Cleaned up and deallocated with JxlButteraugliApiDestroy().
- */
-typedef struct JxlButteraugliApiStruct JxlButteraugliApi;
-
-/**
- * Opaque structure that holds intermediary butteraugli results.
- *
- * Allocated and initialized with JxlButteraugliCompute().
- * Cleaned up and deallocated with JxlButteraugliResultDestroy().
- */
-typedef struct JxlButteraugliResultStruct JxlButteraugliResult;
-
-/**
- * Deinitializes and frees JxlButteraugliResult instance.
- *
- * @param result instance to be cleaned up and deallocated.
- */
-JXL_EXPORT void JxlButteraugliResultDestroy(JxlButteraugliResult* result);
-
-/**
- * Creates an instance of JxlButteraugliApi and initializes it.
- *
- * @p memory_manager will be used for all the library dynamic allocations made
- * from this instance. The parameter may be NULL, in which case the default
- * allocator will be used. See jxl/memory_manager.h for details.
- *
- * @param memory_manager custom allocator function. It may be NULL. The memory
- *        manager will be copied internally.
- * @return @c NULL if the instance can not be allocated or initialized
- * @return pointer to initialized JxlEncoder otherwise
- */
-JXL_EXPORT JxlButteraugliApi* JxlButteraugliApiCreate(
-    const JxlMemoryManager* memory_manager);
-
-/**
- * Set the parallel runner for multithreading.
- *
- * @param api api instance.
- * @param parallel_runner function pointer to runner for multithreading. A
- * multithreaded runner should be set to reach fast performance.
- * @param parallel_runner_opaque opaque pointer for parallel_runner.
- */
-JXL_EXPORT void JxlButteraugliApiSetParallelRunner(
-    JxlButteraugliApi* api, JxlParallelRunner parallel_runner,
-    void* parallel_runner_opaque);
-
-/**
- * Set the hf_asymmetry option for butteraugli.
- *
- * @param api api instance.
- * @param v new hf_asymmetry value.
- */
-JXL_EXPORT void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api,
-                                                float v);
-
-/**
- * Set the intensity_target option for butteraugli.
- *
- * @param api api instance.
- * @param v new intensity_target value.
- */
-JXL_EXPORT void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api,
-                                                    float v);
-
-/**
- * Deinitializes and frees JxlButteraugliApi instance.
- *
- * @param api instance to be cleaned up and deallocated.
- */
-JXL_EXPORT void JxlButteraugliApiDestroy(JxlButteraugliApi* api);
-
-/**
- * Computes intermediary butteraugli result between an original image and a
- * distortion.
- *
- * @param api api instance for this computation.
- * @param xsize width of the compared images.
- * @param ysize height of the compared images.
- * @param pixel_format_orig pixel format for original image.
- * @param buffer_orig pixel data for original image.
- * @param size_orig size of buffer_orig in bytes.
- * @param pixel_format_dist pixel format for distortion.
- * @param buffer_dist pixel data for distortion.
- * @param size_dist size of buffer_dist in bytes.
- * @return @c NULL if the results can not be computed or initialized.
- * @return pointer to initialized and computed intermediary result.
- */
-JXL_EXPORT JxlButteraugliResult* JxlButteraugliCompute(
-    const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize,
-    const JxlPixelFormat* pixel_format_orig, const void* buffer_orig,
-    size_t size_orig, const JxlPixelFormat* pixel_format_dist,
-    const void* buffer_dist, size_t size_dist);
-
-/**
- * Computes butteraugli max distance based on an intermediary butteraugli
- * result.
- *
- * @param result intermediary result instance.
- * @return max distance.
- */
-JXL_EXPORT float JxlButteraugliResultGetMaxDistance(
-    const JxlButteraugliResult* result);
-
-/**
- * Computes a butteraugli distance based on an intermediary butteraugli result.
- *
- * @param result intermediary result instance.
- * @param pnorm pnorm to calculate.
- * @return distance using the given pnorm.
- */
-JXL_EXPORT float JxlButteraugliResultGetDistance(
-    const JxlButteraugliResult* result, float pnorm);
-
-/**
- * Get a pointer to the distmap in the result.
- *
- * @param result intermediary result instance.
- * @param buffer will be set to the distmap. The distance value for (x,y) will
- * be available at buffer + y * row_stride + x.
- * @param row_stride will be set to the row stride of the distmap.
- */
-JXL_EXPORT void JxlButteraugliResultGetDistmap(
-    const JxlButteraugliResult* result, const float** buffer,
-    uint32_t* row_stride);
-
-#if defined(__cplusplus) || defined(c_plusplus)
-}
-#endif
-
-#endif /* JXL_BUTTERAUGLI_H_ */
-
-/** @}*/
diff --git a/lib/include/jxl/butteraugli_cxx.h b/lib/include/jxl/butteraugli_cxx.h
deleted file mode 100644 (file)
index 55efd74..0000000
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-/// @addtogroup libjxl_butteraugli
-/// @{
-///
-/// @file butteraugli_cxx.h
-/// @brief C++ header-only helper for @ref butteraugli.h.
-///
-/// There's no binary library associated with the header since this is a header
-/// only library.
-
-#ifndef JXL_BUTTERAUGLI_CXX_H_
-#define JXL_BUTTERAUGLI_CXX_H_
-
-#include <memory>
-
-#include "jxl/butteraugli.h"
-
-#if !(defined(__cplusplus) || defined(c_plusplus))
-#error "This a C++ only header. Use jxl/butteraugli.h from C sources."
-#endif
-
-/// Struct to call JxlButteraugliApiDestroy from the JxlButteraugliApiPtr
-/// unique_ptr.
-struct JxlButteraugliApiDestroyStruct {
-  /// Calls @ref JxlButteraugliApiDestroy() on the passed api.
-  void operator()(JxlButteraugliApi* api) { JxlButteraugliApiDestroy(api); }
-};
-
-/// std::unique_ptr<> type that calls JxlButteraugliApiDestroy() when releasing
-/// the pointer.
-///
-/// Use this helper type from C++ sources to ensure the api is destroyed and
-/// their internal resources released.
-typedef std::unique_ptr<JxlButteraugliApi, JxlButteraugliApiDestroyStruct>
-    JxlButteraugliApiPtr;
-
-/// Struct to call JxlButteraugliResultDestroy from the JxlButteraugliResultPtr
-/// unique_ptr.
-struct JxlButteraugliResultDestroyStruct {
-  /// Calls @ref JxlButteraugliResultDestroy() on the passed result object.
-  void operator()(JxlButteraugliResult* result) {
-    JxlButteraugliResultDestroy(result);
-  }
-};
-
-/// std::unique_ptr<> type that calls JxlButteraugliResultDestroy() when
-/// releasing the pointer.
-///
-/// Use this helper type from C++ sources to ensure the result object is
-/// destroyed and their internal resources released.
-typedef std::unique_ptr<JxlButteraugliResult, JxlButteraugliResultDestroyStruct>
-    JxlButteraugliResultPtr;
-
-#endif  // JXL_BUTTERAUGLI_CXX_H_
-
-/// @}
diff --git a/lib/include/jxl/cms.h b/lib/include/jxl/cms.h
new file mode 100644 (file)
index 0000000..6616a26
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef JXL_CMS_H_
+#define JXL_CMS_H_
+
+// ICC profiles and color space conversions.
+
+#include <jxl/cms_interface.h>
+#include <jxl/jxl_cms_export.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+JXL_CMS_EXPORT const JxlCmsInterface* JxlGetDefaultCms();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // JXL_CMS_H_
index fb852ee..56da9fe 100644 (file)
@@ -4,7 +4,7 @@
  * license that can be found in the LICENSE file.
  */
 
-/** @addtogroup libjxl_common
+/** @addtogroup libjxl_color
  * @{
  * @file cms_interface.h
  * @brief Interface to allow the injection of different color management systems
 #ifndef JXL_CMS_INTERFACE_H_
 #define JXL_CMS_INTERFACE_H_
 
-#include "jxl/color_encoding.h"
-#include "jxl/types.h"
+#include <jxl/color_encoding.h>
+#include <jxl/types.h>
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
 
+/** Parses an ICC profile and populates @p c and @p cmyk with the data.
+ *
+ * @param user_data JxlCmsInterface::set_fields_data passed as-is.
+ * @param icc_data the ICC data to parse.
+ * @param icc_size how many bytes of icc_data are valid.
+ * @param c a JxlColorEncoding to populate if applicable.
+ * @param cmyk a boolean to set to whether the colorspace is a CMYK colorspace.
+ * @return Whether the relevant fields in @p c were successfully populated.
+ */
+typedef JXL_BOOL (*jpegxl_cms_set_fields_from_icc_func)(void* user_data,
+                                                        const uint8_t* icc_data,
+                                                        size_t icc_size,
+                                                        JxlColorEncoding* c,
+                                                        JXL_BOOL* cmyk);
+
 /** Represents an input or output colorspace to a color transform, as a
  * serialized ICC profile. */
 typedef struct {
@@ -207,6 +222,11 @@ typedef void (*jpegxl_cms_destroy_func)(void*);
  * @enddot
  */
 typedef struct {
+  /** CMS-specific data that will be passed to @ref set_fields_from_icc. */
+  void* set_fields_data;
+  /** Populates a JxlColorEncoding from an ICC profile. */
+  jpegxl_cms_set_fields_from_icc_func set_fields_from_icc;
+
   /** CMS-specific data that will be passed to @ref init. */
   void* init_data;
   /** Prepares a colorspace transform as described in the documentation of @ref
index d126577..fb71484 100644 (file)
@@ -4,7 +4,7 @@
  * license that can be found in the LICENSE file.
  */
 
-/** @addtogroup libjxl_common
+/** @addtogroup libjxl_metadata
  * @{
  * @file codestream_header.h
  * @brief Definitions of structs and enums for the metadata from the JPEG XL
 #ifndef JXL_CODESTREAM_HEADER_H_
 #define JXL_CODESTREAM_HEADER_H_
 
+#include <jxl/types.h>
 #include <stddef.h>
 #include <stdint.h>
 
-#include "jxl/types.h"
-
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
@@ -71,15 +70,6 @@ typedef struct {
   uint32_t ysize;
 } JxlPreviewHeader;
 
-/** The intrinsic size header */
-typedef struct {
-  /** Intrinsic width in pixels */
-  uint32_t xsize;
-
-  /** Intrinsic height in pixels */
-  uint32_t ysize;
-} JxlIntrinsicSizeHeader;
-
 /** The codestream animation header, optionally present in the beginning of
  * the codestream, and if it is it applies to all animation frames, unlike
  * JxlFrameHeader which applies to an individual frame.
@@ -254,7 +244,7 @@ typedef struct {
    */
   uint32_t intrinsic_xsize;
 
-  /** Intrinsic heigth of the image.
+  /** Intrinsic height of the image.
    * The intrinsic size can be different from the actual size in pixels
    * (as given by xsize and ysize) and it denotes the recommended dimensions
    * for displaying the image, i.e. applications are advised to resample the
@@ -389,6 +379,8 @@ typedef struct {
   /** After blending, save the frame as reference frame with this ID (0-3).
    * Special case: if the frame duration is nonzero, ID 0 means "will not be
    * referenced in the future". This value is not used for the last frame.
+   * When encoding, ID 3 is reserved to frames that are generated internally by
+   * the encoder, and should not be used by applications.
    */
   uint32_t save_as_reference;
 } JxlLayerInfo;
index b16f6a0..928117e 100644 (file)
@@ -4,7 +4,7 @@
  * license that can be found in the LICENSE file.
  */
 
-/** @addtogroup libjxl_common
+/** @addtogroup libjxl_color
  * @{
  * @file color_encoding.h
  * @brief Color Encoding definitions used by JPEG XL.
index 66820bf..fa21866 100644 (file)
 #ifndef JXL_DECODE_H_
 #define JXL_DECODE_H_
 
+#include <jxl/cms_interface.h>
+#include <jxl/codestream_header.h>
+#include <jxl/color_encoding.h>
+#include <jxl/jxl_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
+#include <jxl/version.h>
 #include <stddef.h>
 #include <stdint.h>
 
-#include "jxl/codestream_header.h"
-#include "jxl/color_encoding.h"
-#include "jxl/jxl_export.h"
-#include "jxl/memory_manager.h"
-#include "jxl/parallel_runner.h"
-#include "jxl/types.h"
-#include "jxl/version.h"
-
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
@@ -167,16 +167,6 @@ typedef enum {
    */
   JXL_DEC_NEED_PREVIEW_OUT_BUFFER = 3,
 
-  /** The decoder is able to decode a DC image and requests setting a DC output
-   * buffer using @ref JxlDecoderSetDCOutBuffer. This occurs if @ref
-   * JXL_DEC_DC_IMAGE is requested and it is possible to decode a DC image from
-   * the codestream and the DC out buffer was not yet set. This event re-occurs
-   * for new frames if there are multiple animation frames.
-   * @deprecated The DC feature in this form will be removed. For progressive
-   * rendering, @ref JxlDecoderFlushImage should be used.
-   */
-  JXL_DEC_NEED_DC_OUT_BUFFER = 4,
-
   /** The decoder requests an output buffer to store the full resolution image,
    * which can be set with @ref JxlDecoderSetImageOutBuffer or with @ref
    * JxlDecoderSetImageOutCallback. This event re-occurs for new frames if
@@ -209,16 +199,6 @@ typedef enum {
   JXL_DEC_BASIC_INFO = 0x40,
 
   /** Informative event by @ref JxlDecoderProcessInput
-   * "JxlDecoderProcessInput": User extensions of the codestream header. This
-   * event occurs max once per image and always later than @ref
-   * JXL_DEC_BASIC_INFO and earlier than any pixel data.
-   *
-   * @deprecated The decoder no longer returns this, the header extensions,
-   * if any, are available at the JXL_DEC_BASIC_INFO event.
-   */
-  JXL_DEC_EXTENSIONS = 0x80,
-
-  /** Informative event by @ref JxlDecoderProcessInput
    * "JxlDecoderProcessInput": Color encoding or ICC profile from the
    * codestream header. This event occurs max once per image and always later
    * than @ref JXL_DEC_BASIC_INFO and earlier than any pixel data.
@@ -260,27 +240,11 @@ typedef enum {
   JXL_DEC_FRAME = 0x400,
 
   /** Informative event by @ref JxlDecoderProcessInput
-   * "JxlDecoderProcessInput": DC image, 8x8 sub-sampled frame, decoded. It is
-   * not guaranteed that the decoder will always return DC separately, but when
-   * it does it will do so before outputting the full frame. @ref
-   * JxlDecoderSetDCOutBuffer must be used after getting the basic image
-   * information to be able to get the DC pixels, if not this return status only
-   * indicates we're past this point in the codestream. This event occurs max
-   * once per frame and always later than @ref JXL_DEC_FRAME and other header
-   * events and earlier than full resolution pixel data.
-   *
-   * @deprecated The DC feature in this form will be removed. For progressive
-   * rendering, @ref JxlDecoderFlushImage should be used.
-   */
-  JXL_DEC_DC_IMAGE = 0x800,
-
-  /** Informative event by @ref JxlDecoderProcessInput
    * "JxlDecoderProcessInput": full frame (or layer, in case coalescing is
    * disabled) is decoded. @ref JxlDecoderSetImageOutBuffer must be used after
    * getting the basic image information to be able to get the image pixels, if
    * not this return status only indicates we're past this point in the
-   * codestream. This event occurs max once per frame and always later than @ref
-   * JXL_DEC_DC_IMAGE.
+   * codestream. This event occurs max once per frame.
    * In this case, @ref JxlDecoderReleaseInput will return all bytes from the
    * end of the frame (or if @ref JXL_DEC_JPEG_RECONSTRUCTION is subscribed to,
    * from the end of the last box that is needed for jpeg reconstruction) as
@@ -356,6 +320,33 @@ typedef enum {
   JXL_DEC_FRAME_PROGRESSION = 0x8000,
 } JxlDecoderStatus;
 
+/** Types of progressive detail.
+ * Setting a progressive detail with value N implies all progressive details
+ * with smaller or equal value. Currently only the following level of
+ * progressive detail is implemented:
+ *  - kDC (which implies kFrames)
+ *  - kLastPasses (which implies kDC and kFrames)
+ *  - kPasses (which implies kLastPasses, kDC and kFrames)
+ */
+typedef enum {
+  // after completed kRegularFrames
+  kFrames = 0,
+  // after completed DC (1:8)
+  kDC = 1,
+  // after completed AC passes that are the last pass for their resolution
+  // target.
+  kLastPasses = 2,
+  // after completed AC passes that are not the last pass for their resolution
+  // target.
+  kPasses = 3,
+  // during DC frame when lower resolution are completed (1:32, 1:16)
+  kDCProgressive = 4,
+  // after completed groups
+  kDCGroups = 5,
+  // after completed groups
+  kGroups = 6,
+} JxlProgressiveDetail;
+
 /** Rewinds decoder to the beginning. The same input must be given again from
  * the beginning of the file and the decoder will emit events from the beginning
  * again. When rewinding (as opposed to @ref JxlDecoderReset), the decoder can
@@ -404,7 +395,7 @@ JXL_EXPORT void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount);
 /**
  * Skips processing the current frame. Can be called after frame processing
  * already started, signaled by a @ref JXL_DEC_NEED_IMAGE_OUT_BUFFER event,
- * but before the corrsponding @ref JXL_DEC_FULL_IMAGE event. The next signaled
+ * but before the corresponding @ref JXL_DEC_FULL_IMAGE event. The next signaled
  * event will be another @ref JXL_DEC_FRAME, or @ref JXL_DEC_SUCCESS if there
  * are no more frames. If pixel data is required from the already processed part
  * of the frame, @ref JxlDecoderFlushImage must be called before this.
@@ -416,23 +407,6 @@ JXL_EXPORT void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount);
 JXL_EXPORT JxlDecoderStatus JxlDecoderSkipCurrentFrame(JxlDecoder* dec);
 
 /**
- * Get the default pixel format for this decoder.
- *
- * Requires that the decoder can produce JxlBasicInfo.
- *
- * @param dec @ref JxlDecoder to query when creating the recommended pixel
- *     format.
- * @param format JxlPixelFormat to populate with the recommended settings for
- *     the data loaded into this decoder.
- * @return @ref JXL_DEC_SUCCESS if no error, @ref JXL_DEC_NEED_MORE_INPUT if the
- *     basic info isn't yet available, and @ref JXL_DEC_ERROR otherwise.
- *
- * DEPRECATED: this function will be removed in the future.
- */
-JXL_EXPORT JXL_DEPRECATED JxlDecoderStatus
-JxlDecoderDefaultPixelFormat(const JxlDecoder* dec, JxlPixelFormat* format);
-
-/**
  * Set the parallel runner for multithreading. May only be set before starting
  * decoding.
  *
@@ -598,8 +572,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec,
  *     available and this informative event is subscribed to.
  * @return @ref JXL_DEC_PREVIEW_IMAGE when preview pixel information is
  *     available and output in the preview buffer.
- * @return @ref JXL_DEC_DC_IMAGE when DC pixel information (8x8 downscaled
- *     version of the image) is available and output is in the DC buffer.
  * @return @ref JXL_DEC_FULL_IMAGE when all pixel information at highest detail
  *     is available and has been output in the pixel buffer.
  */
@@ -743,17 +715,28 @@ typedef enum {
  *    represented, the ICC profile may be a close approximation. It is also not
  *    always feasible to deduce from an ICC profile which named color space it
  *    exactly represents, if any, as it can represent any arbitrary space.
+ *    HDR color spaces such as those using PQ and HLG are also potentially
+ *    problematic, in that: while ICC profiles can encode a transfer function
+ *    that happens to approximate those of PQ and HLG (HLG for only one given
+ *    system gamma at a time, and necessitating a 3D LUT if gamma is to be
+ *    different from 1), they cannot (before ICCv4.4) semantically signal that
+ *    this is the color space that they represent. Therefore, they will
+ *    typically not actually be interpreted as representing an HDR color space.
+ *    This is especially detrimental to PQ which will then be interpreted as if
+ *    the maximum signal value represented SDR white instead of 10000 cd/m^2,
+ *    meaning that the image will be displayed two orders of magnitude (5-7 EV)
+ *    too dim.
  *  - The JPEG XL image has an encoded structured color profile, and it
  *    indicates an unknown or xyb color space. In that case, @ref
  *    JxlDecoderGetColorAsICCProfile is not available.
  *
- * When rendering an image on a system that supports ICC profiles, @ref
- * JxlDecoderGetColorAsICCProfile should be used first. When rendering
- * for a specific color space, possibly indicated in the JPEG XL
- * image, @ref JxlDecoderGetColorAsEncodedProfile should be used first.
+ * When rendering an image on a system where ICC-based color management is used,
+ * @ref JxlDecoderGetColorAsICCProfile should generally be used first as it will
+ * return a ready-to-use profile (with the aforementioned caveat about HDR).
+ * When knowledge about the nominal color space is desired if available, @ref
+ * JxlDecoderGetColorAsEncodedProfile should be used first.
  *
  * @param dec decoder object
- * @param unused_format deprecated, can be NULL
  * @param target whether to get the original color profile from the metadata
  *     or the color profile of the decoded pixels.
  * @param color_encoding struct to copy the information into, or NULL to only
@@ -764,8 +747,8 @@ typedef enum {
  *     codestream.
  */
 JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
-    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
-    JxlColorProfileTarget target, JxlColorEncoding* color_encoding);
+    const JxlDecoder* dec, JxlColorProfileTarget target,
+    JxlColorEncoding* color_encoding);
 
 /**
  * Outputs the size in bytes of the ICC profile returned by @ref
@@ -779,7 +762,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
  * depending of what is encoded in the codestream.
  *
  * @param dec decoder object
- * @param unused_format deprecated, can be NULL
  * @param target whether to get the original color profile from the metadata
  *     or the color profile of the decoded pixels.
  * @param size variable to output the size into, or NULL to only check the
@@ -791,8 +773,7 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
  *     cannot be generated.
  */
 JXL_EXPORT JxlDecoderStatus JxlDecoderGetICCProfileSize(
-    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
-    JxlColorProfileTarget target, size_t* size);
+    const JxlDecoder* dec, JxlColorProfileTarget target, size_t* size);
 
 /**
  * Outputs ICC profile if available. The profile is only available if @ref
@@ -800,7 +781,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetICCProfileSize(
  * at least as many bytes as given by @ref JxlDecoderGetICCProfileSize.
  *
  * @param dec decoder object
- * @param unused_format deprecated, can be NULL
  * @param target whether to get the original color profile from the metadata
  *     or the color profile of the decoded pixels.
  * @param icc_profile buffer to copy the ICC profile into
@@ -811,36 +791,13 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetICCProfileSize(
  *     large enough.
  */
 JXL_EXPORT JxlDecoderStatus JxlDecoderGetColorAsICCProfile(
-    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
-    JxlColorProfileTarget target, uint8_t* icc_profile, size_t size);
+    const JxlDecoder* dec, JxlColorProfileTarget target, uint8_t* icc_profile,
+    size_t size);
 
-/** Sets the color profile to use for @ref JXL_COLOR_PROFILE_TARGET_DATA for the
- * special case when the decoder has a choice. This only has effect for a JXL
- * image where uses_original_profile is false. If uses_original_profile is true,
- * this setting is ignored and the decoder uses a profile related to the image.
- * No matter what, the @ref JXL_COLOR_PROFILE_TARGET_DATA must still be queried
- * to know the actual data format of the decoded pixels after decoding.
- *
- * The JXL decoder has no color management system built in, but can convert XYB
- * color to any of the ones supported by JxlColorEncoding. Note that if the
- * requested color encoding has a narrower gamut, or the white points differ,
- * then the resulting image can have significant color distortion.
- *
- * Can only be set after the @ref JXL_DEC_COLOR_ENCODING event occurred and
- * before any other event occurred, and can affect the result of @ref
- * JXL_COLOR_PROFILE_TARGET_DATA (but not of @ref
- * JXL_COLOR_PROFILE_TARGET_ORIGINAL), so should be used after getting @ref
- * JXL_COLOR_PROFILE_TARGET_ORIGINAL but before getting @ref
- * JXL_COLOR_PROFILE_TARGET_DATA. The color_encoding must be grayscale if
- * num_color_channels from the basic info is 1, RGB if num_color_channels from
- * the basic info is 3.
- *
- * If @ref JxlDecoderSetPreferredColorProfile is not used, then for images for
- * which uses_original_profile is false and with ICC color profile, the decoder
- * will choose linear sRGB for color images, linear grayscale for grayscale
- * images. This function only sets a preference, since for other images the
- * decoder has no choice what color profile to use, it is determined by the
- * image.
+/** Sets the desired output color profile of the decoded image by calling
+ * @ref JxlDecoderSetOutputColorProfile, passing on @c color_encoding and
+ * setting @c icc_data to NULL. See @ref JxlDecoderSetOutputColorProfile for
+ * details.
  *
  * @param dec decoder object
  * @param color_encoding the default color encoding to set
@@ -864,6 +821,68 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderSetDesiredIntensityTarget(
     JxlDecoder* dec, float desired_intensity_target);
 
 /**
+ * Sets the desired output color profile of the decoded image either from a
+ * color encoding or an ICC profile. Valid calls of this function have either @c
+ * color_encoding or @c icc_data set to NULL and @c icc_size must be 0 if and
+ * only if @c icc_data is NULL.
+ *
+ * Depending on whether a color management system (CMS) has been set the
+ * behavior is as follows:
+ *
+ * If a color management system (CMS) has been set with @ref JxlDecoderSetCms,
+ * and the CMS supports output to the desired color encoding or ICC profile,
+ * then it will provide the output in that color encoding or ICC profile. If the
+ * desired color encoding or the ICC is not supported, then an error will be
+ * returned.
+ *
+ * If no CMS has been set with @ref JxlDecoderSetCms, there are two cases:
+ *
+ * (1) Calling this function with a color encoding will convert XYB images to
+ * the desired color encoding. In this case, if the requested color encoding has
+ * a narrower gamut, or the white points differ, then the resulting image can
+ * have significant color distortion. Non-XYB images will not be converted to
+ * the desired color space.
+ *
+ * (2) Calling this function with an ICC profile will result in an error.
+ *
+ * If called with an ICC profile (after a call to @ref JxlDecoderSetCms), the
+ * ICC profile has to be a valid RGB or grayscale color profile.
+ *
+ * Can only be set after the @ref JXL_DEC_COLOR_ENCODING event occurred and
+ * before any other event occurred, and should be used before getting
+ * JXL_COLOR_PROFILE_TARGET_DATA.
+ *
+ * This function must not be called before JxlDecoderSetCms.
+ *
+ * @param dec decoder orbject
+ * @param color_encoding the output color encoding
+ * @param icc_data bytes of the icc profile
+ * @param icc_size size of the icc profile in bytes
+ * @return @ref JXL_DEC_SUCCESS if the color profile was set successfully, @ref
+ *     JXL_DEC_ERROR otherwise.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetOutputColorProfile(
+    JxlDecoder* dec, const JxlColorEncoding* color_encoding,
+    const uint8_t* icc_data, size_t icc_size);
+
+/**
+ * Sets the color management system (CMS) that will be used for color
+ * conversion (if applicable) during decoding. May only be set before starting
+ * decoding and must not be called after @ref JxlDecoderSetOutputColorProfile.
+ *
+ * See @ref JxlDecoderSetOutputColorProfile for how color conversions are done
+ * depending on whether or not a CMS has been set with @ref JxlDecoderSetCms.
+ *
+ * @param dec decoder object.
+ * @param cms structure representing a CMS implementation. See @ref
+ * JxlCmsInterface for more details.
+ */
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetCms(JxlDecoder* dec,
+                                             JxlCmsInterface cms);
+// TODO(firsching): add a function JxlDecoderSetDefaultCms() for setting a
+// default in case libjxl is build with a CMS.
+
+/**
  * Returns the minimum size in bytes of the preview image output pixel buffer
  * for the given format. This is the buffer for @ref
  * JxlDecoderSetPreviewOutBuffer. Requires the preview header information is
@@ -942,44 +961,6 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetExtraChannelBlendInfo(
     const JxlDecoder* dec, size_t index, JxlBlendInfo* blend_info);
 
 /**
- * Returns the minimum size in bytes of the DC image output buffer
- * for the given format. This is the buffer for @ref JxlDecoderSetDCOutBuffer.
- * Requires the basic image information is available in the decoder.
- *
- * @param dec decoder object
- * @param format format of pixels
- * @param size output value, buffer size in bytes
- * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
- *     information not available yet.
- *
- * @deprecated The DC feature in this form will be removed. Use @ref
- *     JxlDecoderFlushImage for progressive rendering.
- */
-JXL_EXPORT JXL_DEPRECATED JxlDecoderStatus JxlDecoderDCOutBufferSize(
-    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size);
-
-/**
- * Sets the buffer to write the lower resolution (8x8 sub-sampled) DC image
- * to. The size of the buffer must be at least as large as given by @ref
- * JxlDecoderDCOutBufferSize. The buffer follows the format described by
- * JxlPixelFormat. The DC image has dimensions ceil(xsize / 8) * ceil(ysize /
- * 8). The buffer is owned by the caller.
- *
- * @param dec decoder object
- * @param format format of pixels. Object owned by user and its contents are
- *     copied internally.
- * @param buffer buffer type to output the pixel data to
- * @param size size of buffer in bytes
- * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
- *     size too small.
- *
- * @deprecated The DC feature in this form will be removed. Use @ref
- *     JxlDecoderFlushImage for progressive rendering.
- */
-JXL_EXPORT JXL_DEPRECATED JxlDecoderStatus JxlDecoderSetDCOutBuffer(
-    JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size);
-
-/**
  * Returns the minimum size in bytes of the image output pixel buffer for the
  * given format. This is the buffer for @ref JxlDecoderSetImageOutBuffer.
  * Requires that the basic image information is available in the decoder in the
@@ -1308,7 +1289,7 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderSetDecompressBoxes(JxlDecoder* dec,
                                                          JXL_BOOL decompress);
 
 /**
- * Outputs the type of the current box, after a @ref JXL_DEC_BOX event occured,
+ * Outputs the type of the current box, after a @ref JXL_DEC_BOX event occurred,
  * as 4 characters without null termination character. In case of a compressed
  * "brob" box, this will return "brob" if the decompressed argument is
  * JXL_FALSE, or the underlying box type if the decompressed argument is
@@ -1438,6 +1419,21 @@ JXL_EXPORT size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec);
  */
 JXL_EXPORT JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec);
 
+/**
+ * Sets the bit depth of the output buffer or callback.
+ *
+ * Can be called after @ref JxlDecoderSetImageOutBuffer or @ref
+ * JxlDecoderSetImageOutCallback. For float pixel data types, only the default
+ * @ref JXL_BIT_DEPTH_FROM_PIXEL_FORMAT setting is supported.
+ *
+ * @param dec decoder object
+ * @param bit_depth the bit depth setting of the pixel output
+ * @return @ref JXL_DEC_SUCCESS on success, @ref JXL_DEC_ERROR on error, such as
+ *     incompatible custom bit depth and pixel data type.
+ */
+JXL_EXPORT JxlDecoderStatus
+JxlDecoderSetImageOutBitDepth(JxlDecoder* dec, const JxlBitDepth* bit_depth);
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
index ed5c393..3dd0d2a 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-/// @addtogroup libjxl_decoder
+/// @addtogroup libjxl_cpp
 /// @{
 ///
 /// @file decode_cxx.h
@@ -15,9 +15,9 @@
 #ifndef JXL_DECODE_CXX_H_
 #define JXL_DECODE_CXX_H_
 
-#include <memory>
+#include <jxl/decode.h>
 
-#include "jxl/decode.h"
+#include <memory>
 
 #if !(defined(__cplusplus) || defined(c_plusplus))
 #error "This a C++ only header. Use jxl/decode.h from C sources."
index 4813e3b..916cbc5 100644 (file)
 #ifndef JXL_ENCODE_H_
 #define JXL_ENCODE_H_
 
-#include "jxl/cms_interface.h"
-#include "jxl/codestream_header.h"
-#include "jxl/jxl_export.h"
-#include "jxl/memory_manager.h"
-#include "jxl/parallel_runner.h"
+#include <jxl/cms_interface.h>
+#include <jxl/codestream_header.h>
+#include <jxl/jxl_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/stats.h>
+#include <jxl/version.h>
+#include <stdint.h>
+
+#include "jxl/types.h"
 
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
@@ -50,10 +55,6 @@ typedef struct JxlEncoderStruct JxlEncoder;
  */
 typedef struct JxlEncoderFrameSettingsStruct JxlEncoderFrameSettings;
 
-/** DEPRECATED: Use JxlEncoderFrameSettings instead.
- */
-typedef JxlEncoderFrameSettings JxlEncoderOptions;
-
 /**
  * Return value for multiple encoder functions.
  */
@@ -71,13 +72,6 @@ typedef enum {
    */
   JXL_ENC_NEED_MORE_OUTPUT = 2,
 
-  /** DEPRECATED: the encoder does not return this status and there is no need
-   * to handle or expect it.
-   * Instead, JXL_ENC_ERROR is returned with error condition
-   * JXL_ENC_ERR_NOT_SUPPORTED.
-   */
-  JXL_ENC_NOT_SUPPORTED = 3,
-
 } JxlEncoderStatus;
 
 /**
@@ -338,6 +332,50 @@ typedef enum {
    */
   JXL_ENC_FRAME_SETTING_BROTLI_EFFORT = 32,
 
+  /** Enables or disables brotli compression of metadata boxes derived from
+   * a JPEG frame when using JxlEncoderAddJPEGFrame. This has no effect on boxes
+   * added using JxlEncoderAddBox.
+   * -1 = default, 0 = disable compression, 1 = enable compression.
+   */
+  JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES = 33,
+
+  /** Control what kind of buffering is used, when using chunked image frames.
+   * 0 = buffers everything, basically the same as non-streamed code path
+   (mainly for testing)
+   * 1 = can buffer internal data (the tokens)
+   * 2 = can buffer the output
+   * 3 = minimize buffer usage: streamed input and chunked output, writing TOC
+   last (will not work with progressive)
+
+   When the image dimensions is smaller than 2048 x 2048 all the options are the
+   same. Using 1, 2 or 3 can result increasingly in less compression density.
+   */
+  JXL_ENC_FRAME_SETTING_BUFFERING = 34,
+
+  /** Keep or discard Exif metadata boxes derived from a JPEG frame when using
+   * JxlEncoderAddJPEGFrame. This has no effect on boxes added using
+   * JxlEncoderAddBox. When JxlEncoderStoreJPEGMetadata is set to 1, this option
+   * cannot be set to 0. Even when Exif metadata is discarded, the orientation
+   * will still be applied. 0 = discard Exif metadata, 1 = keep Exif metadata
+   * (default).
+   */
+  JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF = 35,
+
+  /** Keep or discard XMP metadata boxes derived from a JPEG frame when using
+   * JxlEncoderAddJPEGFrame. This has no effect on boxes added using
+   * JxlEncoderAddBox. When JxlEncoderStoreJPEGMetadata is set to 1, this option
+   * cannot be set to 0. 0 = discard XMP metadata, 1 = keep XMP metadata
+   * (default).
+   */
+  JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP = 36,
+
+  /** Keep or discard JUMBF metadata boxes derived from a JPEG frame when using
+   * JxlEncoderAddJPEGFrame. This has no effect on boxes added using
+   * JxlEncoderAddBox. 0 = discard JUMBF metadata, 1 = keep JUMBF metadata
+   * (default).
+   */
+  JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF = 37,
+
   /** Enum value not to be used as an option. This value is added to force the
    * C compiler to have the enum to take a known size.
    */
@@ -423,6 +461,11 @@ JXL_EXPORT JxlEncoderError JxlEncoderGetError(JxlEncoder* enc);
  * When the return value is not JXL_ENC_ERROR or JXL_ENC_SUCCESS, the encoding
  * requires more JxlEncoderProcessOutput calls to continue.
  *
+ * The caller must guarantee that *avail_out >= 32 when calling
+ * JxlEncoderProcessOutput; otherwise, JXL_ENC_NEED_MORE_OUTPUT will be
+ * returned. It is guaranteed that, if *avail_out >= 32, at least one byte of
+ * output will be written.
+ *
  * This encodes the frames and/or boxes added so far. If the last frame or last
  * box has been added, @ref JxlEncoderCloseInput, @ref JxlEncoderCloseFrames
  * and/or @ref JxlEncoderCloseBoxes must be called before the next
@@ -515,6 +558,22 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameName(
     JxlEncoderFrameSettings* frame_settings, const char* frame_name);
 
 /**
+ * Sets the bit depth of the input buffer.
+ *
+ * For float pixel formats, only the default JXL_BIT_DEPTH_FROM_PIXEL_FORMAT
+ * setting is allowed, while for unsigned pixel formats,
+ * JXL_BIT_DEPTH_FROM_CODESTREAM setting is also allowed. See the comment on
+ * @ref JxlEncoderAddImageFrame for the effects of the bit depth setting.
+
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param bit_depth the bit depth setting of the pixel input
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameBitDepth(
+    JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth);
+
+/**
  * Sets the buffer to read JPEG encoded bytes from for the next frame to encode.
  *
  * If JxlEncoderSetBasicInfo has not yet been called, calling
@@ -555,15 +614,22 @@ JxlEncoderAddJPEGFrame(const JxlEncoderFrameSettings* frame_settings,
  * - JXL_TYPE_FLOAT, with nominal range 0..1
  *
  * Note: the sample data type in pixel_format is allowed to be different from
- * what is described in the JxlBasicInfo. The type in pixel_format describes the
- * format of the uncompressed pixel buffer. The bits_per_sample and
- * exponent_bits_per_sample in the JxlBasicInfo describes what will actually be
- * encoded in the JPEG XL codestream. For example, to encode a 12-bit image, you
- * would set bits_per_sample to 12, and you could use e.g. JXL_TYPE_UINT16
- * (where the values are rescaled to 16-bit, i.e. multiplied by 65535/4095) or
- * JXL_TYPE_FLOAT (where the values are rescaled to 0..1, i.e. multiplied
- * by 1.f/4095.f). While it is allowed, it is obviously not recommended to use a
- * pixel_format with lower precision than what is specified in the JxlBasicInfo.
+ * what is described in the JxlBasicInfo. The type in pixel_format, together
+ * with an optional @ref JxlBitDepth parameter set by @ref
+ * JxlEncoderSetFrameBitDepth describes the format of the uncompressed pixel
+ * buffer. The bits_per_sample and exponent_bits_per_sample in the JxlBasicInfo
+ * describes what will actually be encoded in the JPEG XL codestream.
+ * For example, to encode a 12-bit image, you would set bits_per_sample to 12,
+ * while the input frame buffer can be in the following formats:
+ *  - if pixel format is in JXL_TYPE_UINT16 with default bit depth setting
+ *    (i.e. JXL_BIT_DEPTH_FROM_PIXEL_FORMAT), input sample values are rescaled
+ *    to 16-bit, i.e. multiplied by 65535/4095;
+ *  - if pixel format is in JXL_TYPE_UINT16 with JXL_BIT_DEPTH_FROM_CODESTREAM
+ *    bit depth setting, input sample values are provided unscaled;
+ *  - if pixel format is in JXL_TYPE_FLOAT, input sample values are rescaled
+ *    to 0..1, i.e.  multiplied by 1.f/4095.f.
+ * While it is allowed, it is obviously not recommended to use a pixel_format
+ * with lower precision than what is specified in the JxlBasicInfo.
  *
  * We support interleaved channels as described by the JxlPixelFormat:
  * - single-channel data, e.g. grayscale
@@ -606,6 +672,264 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderAddImageFrame(
     const JxlPixelFormat* pixel_format, const void* buffer, size_t size);
 
 /**
+ * The JxlEncoderOutputProcessor structure provides an interface for the
+ * encoder's output processing. Users of the library, who want to do streaming
+ * encoding, should implement the required callbacks for buffering, writing,
+ * seeking (if supported), and setting a finalized position during the encoding
+ * process.
+ *
+ * At a high level, the processor can be in one of two states:
+ * - With an active buffer: This indicates that a buffer has been acquired using
+ *   `get_buffer` and encoded data can be written to it.
+ * - Without an active buffer: In this state, no data can be written. A new
+ * buffer must be acquired after releasing any previously active buffer.
+ *
+ * The library will not acquire more than one buffer at a given time.
+ *
+ * The state of the processor includes `position` and `finalized position`,
+ * which have the following meaning.
+ *
+ * - position: Represents the current position, in bytes, within the output
+ * stream where the encoded data will be written next. This position moves
+ * forward with each `release_buffer` call as data is written, and can also be
+ * adjusted through the optional seek callback, if provided. At this position
+ * the next write will occur.
+ *
+ * - finalized position:  A position in the output stream that ensures all bytes
+ * before this point are finalized and won't be changed by later writes.
+ *
+ * All fields but `seek` are required, `seek` is optional and can be NULL.
+ */
+struct JxlEncoderOutputProcessor {
+  /**
+   * Required.
+   * An opaque pointer that the client can use to store custom data.
+   * This data will be passed to the associated callback functions.
+   */
+  void* opaque;
+
+  /**
+   * Required.
+   * Acquires a buffer at the current position into which the library will write
+   * the output data.
+   *
+   * If the `size` argument points to 0 and the returned value is NULL, this
+   * will be interpreted as asking the output writing to stop. In such a case,
+   * the library will return an error. The client is expected to set the size of
+   * the returned buffer based on the suggested `size` when this function is
+   * called.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param size points to a suggested buffer size when called; must be set to
+   * the size of the returned buffer once the function returns.
+   * @return a pointer to the acquired buffer or NULL to indicate a stop
+   * condition.
+   */
+  void* (*get_buffer)(void* opaque, size_t* size);
+
+  /**
+   * Required.
+   * Notifies the user of library that the current buffer's data has been
+   * written and can be released. This function should advance the current
+   * position of the buffer by `written_bytes` number of bytes.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param written_bytes the number of bytes written to the buffer.
+   */
+  void (*release_buffer)(void* opaque, size_t written_bytes);
+
+  /**
+   * Optional, can be NULL
+   * Seeks to a specific position in the output. This function is optional and
+   * can be set to NULL if the output doesn't support seeking. Can only be done
+   * when there is no buffer. Cannot be used to seek before the finalized
+   * position.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param position the position to seek to, in bytes.
+   */
+  void (*seek)(void* opaque, uint64_t position);
+
+  /**
+   * Required.
+   * Sets a finalized position on the output data, at a specific position.
+   * Seeking will never request a position before the finalized position.
+   *
+   * Will only be called if there is no active buffer.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param finalized_position the position, in bytes, where the finalized
+   * position should be set.
+   */
+  void (*set_finalized_position)(void* opaque, uint64_t finalized_position);
+};
+
+/**
+ * Sets the output processor for the encoder. This processor determines how the
+ * encoder will handle buffering, writing, seeking (if supported), and
+ * setting a finalized position during the encoding process.
+ *
+ * This should not be used when using @ref JxlEncoderProcessOutput.
+ *
+ * @param enc encoder object.
+ * @param output_processor the struct containing the callbacks for managing
+ * output.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetOutputProcessor(
+    JxlEncoder* enc, struct JxlEncoderOutputProcessor output_processor);
+
+/**
+ * Flushes any buffered input in the encoder, ensuring that all available input
+ * data has been processed and written to the output.
+ *
+ * This function can only be used after @ref JxlEncoderSetOutputProcessor.
+ * Before making the last call to @ref JxlEncoderFlushInput, users should call
+ * @ref JxlEncoderCloseInput to signal the end of input data.
+ *
+ * This should not be used when using @ref JxlEncoderProcessOutput.
+ *
+ * @param enc encoder object.
+ * @return JXL_ENC_SUCCESS on success, JXL_ENC_ERROR on error.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderFlushInput(JxlEncoder* enc);
+
+/**
+ * This struct provides callback functions to pass pixel data in a streaming
+ * manner instead of requiring the entire frame data in memory at once.
+ */
+struct JxlChunkedFrameInputSource {
+  /**
+   * A pointer to any user-defined data or state. This can be used to pass
+   * information to the callback functions.
+   */
+  void* opaque;
+
+  /**
+   * Get the pixel format that color channel data will be provided in.
+   * When called, `pixel_format` points to a suggested pixel format; if
+   * color channel data can be given in this pixel format, processing might
+   * be more efficient.
+   *
+   * This function will be called exactly once, before any call to
+   * get_color_channel_at.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param pixel_format format for pixels
+   */
+  void (*get_color_channels_pixel_format)(void* opaque,
+                                          JxlPixelFormat* pixel_format);
+
+  /**
+   * Callback to retrieve a rectangle of color channel data at a specific
+   * location. It is guaranteed that xpos and ypos are multiples of 128. xsize,
+   * ysize will be multiples of 128, unless the resulting rectangle would be out
+   * of image bounds. Moreover, xsize and ysize will be at most 2048. The
+   * returned data will be assumed to be in the format returned by the
+   * (preceding) call to get_color_channels_pixel_format, except the `align`
+   * parameter of the pixel format will be ignored. Instead, the `i`-th row will
+   * be assumed to start at position `return_value + i * *row_offset`, with the
+   * value of `*row_offset` decided by the callee.
+   *
+   * Note that multiple calls to `get_color_channel_data_at` may happen before a
+   * call to `release_buffer`.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param xpos horizontal position for the data.
+   * @param ypos vertical position for the data.
+   * @param xsize horizontal size of the requested rectangle of data.
+   * @param ysize vertical size of the requested rectangle of data.
+   * @param row_offset pointer to a the byte offset between consecutive rows of
+   * the retrieved pixel data.
+   * @return pointer to the retrieved pixel data.
+   */
+  const void* (*get_color_channel_data_at)(void* opaque, size_t xpos,
+                                           size_t ypos, size_t xsize,
+                                           size_t ysize, size_t* row_offset);
+
+  /**
+   * Get the pixel format that extra channel data will be provided in.
+   * When called, `pixel_format` points to a suggested pixel format; if
+   * extra channel data can be given in this pixel format, processing might
+   * be more efficient.
+   *
+   * This function will be called exactly once per index, before any call to
+   * get_extra_channel_data_at with that given index.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param ec_index zero-indexed index of the extra channel
+   * @param pixel_format format for extra channel data
+   */
+  void (*get_extra_channel_pixel_format)(void* opaque, size_t ec_index,
+                                         JxlPixelFormat* pixel_format);
+
+  /**
+   * Callback to retrieve a rectangle of extra channel `ec_index` data at a
+   * specific location. It is guaranteed that xpos and ypos are multiples of
+   * 128. xsize, ysize will be multiples of 128, unless the resulting rectangle
+   * would be out of image bounds. Moreover, xsize and ysize will be at most
+   * 2048. The returned data will be assumed to be in the format returned by the
+   * (preceding) call to get_extra_channels_pixel_format_at with the
+   * corresponding extra channel index `ec_index`, except the `align` parameter
+   * of the pixel format will be ignored. Instead, the `i`-th row will be
+   * assumed to start at position `return_value + i * *row_offset`, with the
+   * value of `*row_offset` decided by the callee.
+   *
+   * Note that multiple calls to `get_extra_channel_data_at` may happen before a
+   * call to `release_buffer`.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param xpos horizontal position for the data.
+   * @param ypos vertical position for the data.
+   * @param xsize horizontal size of the requested rectangle of data.
+   * @param ysize vertical size of the requested rectangle of data.
+   * @param row_offset pointer to a the byte offset between consecutive rows of
+   * the retrieved pixel data.
+   * @return pointer to the retrieved pixel data.
+   */
+  const void* (*get_extra_channel_data_at)(void* opaque, size_t ec_index,
+                                           size_t xpos, size_t ypos,
+                                           size_t xsize, size_t ysize,
+                                           size_t* row_offset);
+
+  /**
+   * Releases the buffer `buf` (obtained through a call to
+   * `get_color_channel_data_at` or `get_extra_channel_data_at`). This function
+   * will be called exactly once per call to `get_color_channel_data_at` or
+   * `get_extra_channel_data_at`.
+   *
+   * @param opaque user supplied parameters to the callback
+   * @param buf pointer returned by `get_color_channel_data_at` or
+   * `get_extra_channel_data_at`
+   */
+  void (*release_buffer)(void* opaque, const void* buf);
+};
+
+/**
+ * @brief Adds a frame to the encoder using a chunked input source.
+ *
+ * This function gives a way to encode a frame by providing pixel data in a
+ * chunked or streaming manner, which can be especially useful when dealing with
+ * large images that may not fit entirely in memory or when trying to optimize
+ * memory usage. The input data is provided through callbacks defined in the
+ * `JxlChunkedFrameInputSource` struct. Once the frame data has been completely
+ * retrieved, this function will flush the input and close it if it is the last
+ * frame.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param is_last_frame indicates if this is the last frame.
+ * @param chunked_frame_input struct providing callback methods for retrieving
+ * pixel data in chunks.
+ *
+ * @return Returns a status indicating the success or failure of adding the
+ * frame.
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderAddChunkedFrame(
+    const JxlEncoderFrameSettings* frame_settings, JXL_BOOL is_last_frame,
+    struct JxlChunkedFrameInputSource chunked_frame_input);
+
+/**
  * Sets the buffer to read pixels from for an extra channel at a given index.
  * The index must be smaller than the num_extra_channels in the associated
  * JxlBasicInfo. Must call @ref JxlEncoderSetExtraChannelInfo before
@@ -851,6 +1175,26 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
                                                    const JxlBasicInfo* info);
 
 /**
+ * Sets the upsampling method the decoder will use in case there are frames
+ * with JXL_ENC_FRAME_SETTING_RESAMPLING set. This is useful in combination
+ * with the JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED option, to control the
+ * type of upsampling that will be used.
+ *
+ * @param enc encoder object.
+ * @param factor upsampling factor to configure (1, 2, 4 or 8; for 1 this
+ * function has no effect at all)
+ * @param mode upsampling mode to use for this upsampling:
+ * -1: default (good for photographic images, no signaling overhead)
+ * 0: nearest neighbor (good for pixel art)
+ * 1: 'pixel dots' (same as NN for 2x, diamond-shaped 'pixel dots' for 4x/8x)
+ * @return JXL_ENC_SUCCESS if the operation was successful,
+ * JXL_ENC_ERROR or JXL_ENC_NOT_SUPPORTED otherwise
+ */
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetUpsamplingMode(JxlEncoder* enc,
+                                                        const int64_t factor,
+                                                        const int64_t mode);
+
+/**
  * Initializes a JxlExtraChannelInfo struct to default values.
  * For forwards-compatibility, this function has to be called before values
  * are assigned to the struct fields.
@@ -1048,58 +1392,76 @@ JXL_EXPORT int JxlEncoderGetRequiredCodestreamLevel(const JxlEncoder* enc);
 JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameLossless(
     JxlEncoderFrameSettings* frame_settings, JXL_BOOL lossless);
 
-/** DEPRECATED: use JxlEncoderSetFrameLossless instead.
- */
-JXL_EXPORT JxlEncoderStatus
-JxlEncoderOptionsSetLossless(JxlEncoderFrameSettings*, JXL_BOOL);
-
 /**
- * @param frame_settings set of options and metadata for this frame. Also
- * includes reference to the encoder object.
- * @param effort the effort value to set.
- * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
- * otherwise.
+ * Sets the distance level for lossy compression: target max butteraugli
+ * distance, lower = higher quality. Range: 0 .. 25.
+ * 0.0 = mathematically lossless (however, use JxlEncoderSetFrameLossless
+ * instead to use true lossless, as setting distance to 0 alone is not the only
+ * requirement). 1.0 = visually lossless. Recommended range: 0.5 .. 3.0. Default
+ * value: 1.0.
  *
- * DEPRECATED: use JxlEncoderFrameSettingsSetOption(frame_settings,
- * JXL_ENC_FRAME_SETTING_EFFORT, effort) instead.
- */
-JXL_EXPORT JXL_DEPRECATED JxlEncoderStatus
-JxlEncoderOptionsSetEffort(JxlEncoderFrameSettings* frame_settings, int effort);
-
-/**
  * @param frame_settings set of options and metadata for this frame. Also
  * includes reference to the encoder object.
- * @param tier the decoding speed tier to set.
+ * @param distance the distance value to set.
  * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
  * otherwise.
- *
- * DEPRECATED: use JxlEncoderFrameSettingsSetOption(frame_settings,
- * JXL_ENC_FRAME_SETTING_DECODING_SPEED, tier) instead.
  */
-JXL_EXPORT JXL_DEPRECATED JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed(
-    JxlEncoderFrameSettings* frame_settings, int tier);
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameDistance(
+    JxlEncoderFrameSettings* frame_settings, float distance);
 
 /**
- * Sets the distance level for lossy compression: target max butteraugli
- * distance, lower = higher quality. Range: 0 .. 15.
- * 0.0 = mathematically lossless (however, use JxlEncoderSetFrameLossless
- * instead to use true lossless, as setting distance to 0 alone is not the only
- * requirement). 1.0 = visually lossless. Recommended range: 0.5 .. 3.0. Default
- * value: 1.0.
+ * Sets the distance level for lossy compression of extra channels.
+ * The distance is as in JxlEncoderSetFrameDistance (lower = higher quality).
+ * If not set, or if set to the special value -1, the distance that was set with
+ * JxlEncoderSetFrameDistance will be used.
  *
  * @param frame_settings set of options and metadata for this frame. Also
  * includes reference to the encoder object.
+ * @param index index of the extra channel to set a distance value for.
  * @param distance the distance value to set.
  * @return JXL_ENC_SUCCESS if the operation was successful, JXL_ENC_ERROR
  * otherwise.
  */
-JXL_EXPORT JxlEncoderStatus JxlEncoderSetFrameDistance(
-    JxlEncoderFrameSettings* frame_settings, float distance);
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelDistance(
+    JxlEncoderFrameSettings* frame_settings, size_t index, float distance);
 
-/** DEPRECATED: use JxlEncoderSetFrameDistance instead.
+/**
+ * Maps JPEG-style quality factor to distance.
+ *
+ * This function takes in input a JPEG-style quality factor `quality` and
+ * produces as output a `distance` value suitable to be used with @ref
+ * JxlEncoderSetFrameDistance and
+ * @ref JxlEncoderSetExtraChannelDistance.
+ *
+ * The `distance` value influences the level of compression, with lower values
+ * indicating higher quality:
+ * - 0.0 implies lossless compression (however, note that calling @ref
+ * JxlEncoderSetFrameLossless is required).
+ * - 1.0 represents a visually lossy compression, which is also the default
+ * setting.
+ *
+ * The `quality` parameter, ranging up to 100, is inversely related to
+ * 'distance':
+ * - A `quality` of 100.0 maps to a `distance` of 0.0 (lossless).
+ * - A `quality` of 90.0 corresponds to a `distance` of 1.0.
+ *
+ * Recommended Range:
+ * - `distance`: 0.5 to 3.0.
+ * - corresponding `quality`: approximately 96 to 68.
+ *
+ * Allowed Range:
+ * - `distance`: 0.0 to 25.0.
+ * - corresponding `quality`: 100.0 to 0.0.
+ *
+ * Note: the `quality` parameter has no consistent psychovisual meaning
+ * across different codecs and libraries. Using the mapping defined by @ref
+ * JxlEncoderDistanceFromQuality will result in a visual quality roughly
+ * equivalent to what would be obtained with `libjpeg-turbo` with the same
+ * `quality` parameter, but that is by no means guaranteed; do not assume that
+ * the same quality value will result in similar file sizes and image quality
+ * across different codecs.
  */
-JXL_EXPORT JXL_DEPRECATED JxlEncoderStatus
-JxlEncoderOptionsSetDistance(JxlEncoderFrameSettings*, float);
+JXL_EXPORT float JxlEncoderDistanceFromQuality(float quality);
 
 /**
  * Create a new set of encoder options, with all values initially copied from
@@ -1119,11 +1481,6 @@ JxlEncoderOptionsSetDistance(JxlEncoderFrameSettings*, float);
 JXL_EXPORT JxlEncoderFrameSettings* JxlEncoderFrameSettingsCreate(
     JxlEncoder* enc, const JxlEncoderFrameSettings* source);
 
-/** DEPRECATED: use JxlEncoderFrameSettingsCreate instead.
- */
-JXL_EXPORT JXL_DEPRECATED JxlEncoderFrameSettings* JxlEncoderOptionsCreate(
-    JxlEncoder*, const JxlEncoderFrameSettings*);
-
 /**
  * Sets a color encoding to be sRGB.
  *
@@ -1142,6 +1499,67 @@ JXL_EXPORT void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding,
 JXL_EXPORT void JxlColorEncodingSetToLinearSRGB(
     JxlColorEncoding* color_encoding, JXL_BOOL is_gray);
 
+/**
+ * Enables usage of expert options.
+ *
+ * At the moment, the only expert option is setting an effort value of 10,
+ * which gives the best compression for pixel-lossless modes but is very slow.
+ *
+ * @param enc encoder object.
+ */
+JXL_EXPORT void JxlEncoderAllowExpertOptions(JxlEncoder* enc);
+
+/**
+ * Function type for @ref JxlEncoderSetDebugImageCallback.
+ *
+ * The callback may be called simultaneously by different threads when using a
+ * threaded parallel runner, on different debug images.
+ *
+ * @param opaque optional user data, as given to @ref
+ *   JxlEncoderSetDebugImageCallback.
+ * @param label label of debug image, can be used in filenames
+ * @param xsize width of debug image
+ * @param ysize height of debug image
+ * @param color color encoding of debug image
+ * @param pixels pixel data of debug image as big-endian 16-bit unsigned
+ *   samples. The memory is not owned by the user, and is only valid during the
+ *   time the callback is running.
+ */
+typedef void (*JxlDebugImageCallback)(void* opaque, const char* label,
+                                      size_t xsize, size_t ysize,
+                                      const JxlColorEncoding* color,
+                                      const uint16_t* pixels);
+
+/**
+ * Sets the given debug image callback that will be used by the encoder to
+ * output various debug images during encoding.
+ *
+ * This only has any effect if the encoder was compiled with the appropriate
+ * debug build flags.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param callback used to return the debug image
+ * @param opaque user supplied parameter to the image callback
+ */
+JXL_EXPORT void JxlEncoderSetDebugImageCallback(
+    JxlEncoderFrameSettings* frame_settings, JxlDebugImageCallback callback,
+    void* opaque);
+
+/**
+ * Sets the given stats object for gathering various statistics during encoding.
+ *
+ * This only has any effect if the encoder was compiled with the appropriate
+ * debug build flags.
+ *
+ * @param frame_settings set of options and metadata for this frame. Also
+ * includes reference to the encoder object.
+ * @param stats object that can be used to query the gathered stats (created
+ *   by @ref JxlEncoderStatsCreate)
+ */
+JXL_EXPORT void JxlEncoderCollectStats(JxlEncoderFrameSettings* frame_settings,
+                                       JxlEncoderStats* stats);
+
 #if defined(__cplusplus) || defined(c_plusplus)
 }
 #endif
index 494c03c..1f69575 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-/// @addtogroup libjxl_encoder
+/// @addtogroup libjxl_cpp
 ///@{
 ///
 /// @file encode_cxx.h
@@ -15,9 +15,9 @@
 #ifndef JXL_ENCODE_CXX_H_
 #define JXL_ENCODE_CXX_H_
 
-#include <memory>
+#include <jxl/encode.h>
 
-#include "jxl/encode.h"
+#include <memory>
 
 #if !(defined(__cplusplus) || defined(c_plusplus))
 #error "This a C++ only header. Use jxl/encode.h from C sources."
index 45394e9..e71e0aa 100644 (file)
@@ -4,7 +4,7 @@
  * license that can be found in the LICENSE file.
  */
 
-/** @addtogroup libjxl_common
+/** @addtogroup libjxl_threads
  *  @{
  */
 /**
index f6344bd..a65015d 100644 (file)
 #ifndef JXL_RESIZABLE_PARALLEL_RUNNER_H_
 #define JXL_RESIZABLE_PARALLEL_RUNNER_H_
 
+#include <jxl/jxl_threads_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 
-#include "jxl/jxl_threads_export.h"
-#include "jxl/memory_manager.h"
-#include "jxl/parallel_runner.h"
-
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
index 9a310c8..8c617ae 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-/// @addtogroup libjxl_threads
+/// @addtogroup libjxl_cpp
 /// @{
 ///
 /// @file resizable_parallel_runner_cxx.h
@@ -16,9 +16,9 @@
 #ifndef JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_
 #define JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_
 
-#include <memory>
+#include <jxl/resizable_parallel_runner.h>
 
-#include "jxl/resizable_parallel_runner.h"
+#include <memory>
 
 #if !(defined(__cplusplus) || defined(c_plusplus))
 #error \
diff --git a/lib/include/jxl/stats.h b/lib/include/jxl/stats.h
new file mode 100644 (file)
index 0000000..c9359dc
--- /dev/null
@@ -0,0 +1,103 @@
+/* Copyright (c) the JPEG XL Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style
+ * license that can be found in the LICENSE file.
+ */
+
+/** @addtogroup libjxl_encoder
+ * @{
+ * @file stats.h
+ * @brief API to collect various statistics from JXL encoder.
+ */
+
+#ifndef JXL_STATS_H_
+#define JXL_STATS_H_
+
+#include <jxl/jxl_export.h>
+#include <stddef.h>
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+/**
+ * Opaque structure that holds the encoder statistics.
+ *
+ * Allocated and initialized with JxlEncoderStatsCreate().
+ * Cleaned up and deallocated with JxlEncoderStatsDestroy().
+ */
+typedef struct JxlEncoderStatsStruct JxlEncoderStats;
+
+/**
+ * Creates an instance of JxlEncoderStats and initializes it.
+ *
+ * @return pointer to initialized JxlEncoderStats instance
+ */
+JXL_EXPORT JxlEncoderStats* JxlEncoderStatsCreate(void);
+
+/**
+ * Deinitializes and frees JxlEncoderStats instance.
+ *
+ * @param stats instance to be cleaned up and deallocated. No-op if stats is
+ * null pointer.
+ */
+JXL_EXPORT void JxlEncoderStatsDestroy(JxlEncoderStats* stats);
+
+/** Data type for querying JxlEncoderStats object
+ */
+typedef enum {
+  JXL_ENC_STAT_HEADER_BITS,
+  JXL_ENC_STAT_TOC_BITS,
+  JXL_ENC_STAT_DICTIONARY_BITS,
+  JXL_ENC_STAT_SPLINES_BITS,
+  JXL_ENC_STAT_NOISE_BITS,
+  JXL_ENC_STAT_QUANT_BITS,
+  JXL_ENC_STAT_MODULAR_TREE_BITS,
+  JXL_ENC_STAT_MODULAR_GLOBAL_BITS,
+  JXL_ENC_STAT_DC_BITS,
+  JXL_ENC_STAT_MODULAR_DC_GROUP_BITS,
+  JXL_ENC_STAT_CONTROL_FIELDS_BITS,
+  JXL_ENC_STAT_COEF_ORDER_BITS,
+  JXL_ENC_STAT_AC_HISTOGRAM_BITS,
+  JXL_ENC_STAT_AC_BITS,
+  JXL_ENC_STAT_MODULAR_AC_GROUP_BITS,
+  JXL_ENC_STAT_NUM_SMALL_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT4X8_BLOCKS,
+  JXL_ENC_STAT_NUM_AFV_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT8_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT8X32_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT16_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT16X32_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT32_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT32X64_BLOCKS,
+  JXL_ENC_STAT_NUM_DCT64_BLOCKS,
+  JXL_ENC_STAT_NUM_BUTTERAUGLI_ITERS,
+  JXL_ENC_NUM_STATS,
+} JxlEncoderStatsKey;
+
+/** Returns the value of the statistics corresponding the given key.
+ *
+ * @param stats object that was passed to the encoder with a
+ *   @ref JxlEncoderCollectStats function
+ * @param key the particular statistics to query
+ *
+ * @return the value of the statistics
+ */
+JXL_EXPORT size_t JxlEncoderStatsGet(const JxlEncoderStats* stats,
+                                     JxlEncoderStatsKey key);
+
+/** Updates the values of the given stats object with that of an other.
+ *
+ * @param stats object whose values will be updated (usually added together)
+ * @param other stats object whose values will be merged with stats
+ */
+JXL_EXPORT void JxlEncoderStatsMerge(JxlEncoderStats* stats,
+                                     const JxlEncoderStats* other);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}
+#endif
+
+#endif /* JXL_STATS_H_ */
+
+/** @}*/
index 581ff73..6166fe7 100644 (file)
 #ifndef JXL_THREAD_PARALLEL_RUNNER_H_
 #define JXL_THREAD_PARALLEL_RUNNER_H_
 
+#include <jxl/jxl_threads_export.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 
-#include "jxl/jxl_threads_export.h"
-#include "jxl/memory_manager.h"
-#include "jxl/parallel_runner.h"
-
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
@@ -62,7 +60,7 @@ JXL_THREADS_EXPORT void JxlThreadParallelRunnerDestroy(void* runner_opaque);
 /** Returns a default num_worker_threads value for
  * JxlThreadParallelRunnerCreate.
  */
-JXL_THREADS_EXPORT size_t JxlThreadParallelRunnerDefaultNumWorkerThreads();
+JXL_THREADS_EXPORT size_t JxlThreadParallelRunnerDefaultNumWorkerThreads(void);
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }
index a71d18c..ba7e285 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-/// @addtogroup libjxl_threads
+/// @addtogroup libjxl_cpp
 /// @{
 ///
 /// @file thread_parallel_runner_cxx.h
@@ -15,9 +15,9 @@
 #ifndef JXL_THREAD_PARALLEL_RUNNER_CXX_H_
 #define JXL_THREAD_PARALLEL_RUNNER_CXX_H_
 
-#include <memory>
+#include <jxl/thread_parallel_runner.h>
 
-#include "jxl/thread_parallel_runner.h"
+#include <memory>
 
 #if !(defined(__cplusplus) || defined(c_plusplus))
 #error \
index 1f81978..a47216f 100644 (file)
 #ifndef JXL_TYPES_H_
 #define JXL_TYPES_H_
 
+#include <jxl/jxl_export.h>
 #include <stddef.h>
 #include <stdint.h>
 
-#include "jxl/jxl_export.h"
-
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
@@ -33,6 +32,8 @@ extern "C" {
 #define JXL_TRUE 1
 /** Portable @c false replacement. */
 #define JXL_FALSE 0
+/** Converts of bool-like value to either JXL_TRUE or JXL_FALSE. */
+#define TO_JXL_BOOL(C) (!!(C) ? JXL_TRUE : JXL_FALSE)
 
 /** Data type for the sample values per channel per pixel.
  */
@@ -55,14 +56,6 @@ typedef enum {
   JXL_TYPE_FLOAT16 = 5,
 } JxlDataType;
 
-/* DEPRECATED: bit-packed 1-bit data type. Use JXL_TYPE_UINT8 instead.
- */
-static const int JXL_DEPRECATED JXL_TYPE_BOOLEAN = 1;
-
-/* DEPRECATED: uint32_t data type. Use JXL_TYPE_FLOAT instead.
- */
-static const int JXL_DEPRECATED JXL_TYPE_UINT32 = 4;
-
 /** Ordering of multi-byte data.
  */
 typedef enum {
@@ -81,7 +74,6 @@ typedef enum {
  * for pixels. This is not necessarily the same as the data type encoded in the
  * codestream. The channels are interleaved per pixel. The pixels are
  * organized row by row, left to right, top to bottom.
- * TODO(lode): implement padding / alignment (row stride)
  * TODO(lode): support different channel orders if needed (RGB, BGR, ...)
  */
 typedef struct {
@@ -111,36 +103,47 @@ typedef struct {
   size_t align;
 } JxlPixelFormat;
 
-/** Data type holding the 4-character type name of an ISOBMFF box.
+/** Settings for the interpretation of UINT input and output buffers.
+ *  (buffers using a FLOAT data type are not affected by this)
  */
-typedef char JxlBoxType[4];
+typedef enum {
+  /** This is the default setting, where the encoder expects the input pixels
+   * to use the full range of the pixel format data type (e.g. for UINT16, the
+   * input range is 0 .. 65535 and the value 65535 is mapped to 1.0 when
+   * converting to float), and the decoder uses the full range to output
+   * pixels. If the bit depth in the basic info is different from this, the
+   * encoder expects the values to be rescaled accordingly (e.g. multiplied by
+   * 65535/4095 for a 12-bit image using UINT16 input data type). */
+  JXL_BIT_DEPTH_FROM_PIXEL_FORMAT = 0,
+
+  /** If this setting is selected, the encoder expects the input pixels to be
+   * in the range defined by the bits_per_sample value of the basic info (e.g.
+   * for 12-bit images using UINT16 input data types, the allowed range is
+   * 0 .. 4095 and the value 4095 is mapped to 1.0 when converting to float),
+   * and the decoder outputs pixels in this range. */
+  JXL_BIT_DEPTH_FROM_CODESTREAM = 1,
+
+  /** This setting can only be used in the decoder to select a custom range for
+   * pixel output */
+  JXL_BIT_DEPTH_CUSTOM = 2,
+} JxlBitDepthType;
+
+/** Data type for describing the interpretation of the input and output buffers
+ * in terms of the range of allowed input and output pixel values. */
+typedef struct {
+  /** Bit depth setting, see comment on @ref JxlBitDepthType */
+  JxlBitDepthType type;
+
+  /** Custom bits per sample */
+  uint32_t bits_per_sample;
+
+  /** Custom exponent bits per sample */
+  uint32_t exponent_bits_per_sample;
+} JxlBitDepth;
 
-/** Types of progressive detail.
- * Setting a progressive detail with value N implies all progressive details
- * with smaller or equal value. Currently only the following level of
- * progressive detail is implemented:
- *  - kDC (which implies kFrames)
- *  - kLastPasses (which implies kDC and kFrames)
- *  - kPasses (which implies kLastPasses, kDC and kFrames)
+/** Data type holding the 4-character type name of an ISOBMFF box.
  */
-typedef enum {
-  // after completed kRegularFrames
-  kFrames = 0,
-  // after completed DC (1:8)
-  kDC = 1,
-  // after completed AC passes that are the last pass for their resolution
-  // target.
-  kLastPasses = 2,
-  // after completed AC passes that are not the last pass for their resolution
-  // target.
-  kPasses = 3,
-  // during DC frame when lower resolution are completed (1:32, 1:16)
-  kDCProgressive = 4,
-  // after completed groups
-  kDCGroups = 5,
-  // after completed groups
-  kGroups = 6,
-} JxlProgressiveDetail;
+typedef char JxlBoxType[4];
 
 #if defined(__cplusplus) || defined(c_plusplus)
 }
diff --git a/lib/jpegli.cmake b/lib/jpegli.cmake
new file mode 100644 (file)
index 0000000..f06912f
--- /dev/null
@@ -0,0 +1,159 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include(jxl_lists.cmake)
+
+set(JPEGLI_INTERNAL_LIBS
+  hwy
+  Threads::Threads
+  ${ATOMICS_LIBRARIES}
+)
+
+# JPEGLIB setup
+set(BITS_IN_JSAMPLE 8)
+set(MEM_SRCDST_SUPPORTED 1)
+
+if(JPEGLI_LIBJPEG_LIBRARY_SOVERSION STREQUAL "62")
+  set(JPEG_LIB_VERSION 62)
+elseif(JPEGLI_LIBJPEG_LIBRARY_SOVERSION STREQUAL "7")
+  set(JPEG_LIB_VERSION 70)
+elseif(JPEGLI_LIBJPEG_LIBRARY_SOVERSION STREQUAL "8")
+  set(JPEG_LIB_VERSION 80)
+endif()
+
+configure_file(
+  ../third_party/libjpeg-turbo/jconfig.h.in include/jpegli/jconfig.h)
+configure_file(
+  ../third_party/libjpeg-turbo/jpeglib.h include/jpegli/jpeglib.h COPYONLY)
+configure_file(
+  ../third_party/libjpeg-turbo/jmorecfg.h include/jpegli/jmorecfg.h COPYONLY)
+
+add_library(jpegli-static STATIC EXCLUDE_FROM_ALL "${JPEGXL_INTERNAL_JPEGLI_SOURCES}")
+target_compile_options(jpegli-static PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
+target_compile_options(jpegli-static PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jpegli-static PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jpegli-static PRIVATE
+  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
+  "${JXL_HWY_INCLUDE_DIRS}"
+)
+target_include_directories(jpegli-static PUBLIC
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/jpegli>"
+)
+target_link_libraries(jpegli-static PUBLIC ${JPEGLI_INTERNAL_LIBS})
+
+#
+# Tests for jpegli-static
+#
+
+find_package(JPEG)
+if(JPEG_FOUND AND BUILD_TESTING)
+# TODO(eustas): merge into jxl_tests.cmake?
+
+add_library(jpegli_libjpeg_util-obj OBJECT
+  ${JPEGXL_INTERNAL_JPEGLI_LIBJPEG_HELPER_FILES}
+)
+target_include_directories(jpegli_libjpeg_util-obj PRIVATE
+  "${PROJECT_SOURCE_DIR}"
+  "${JPEG_INCLUDE_DIRS}"
+)
+target_compile_options(jpegli_libjpeg_util-obj PRIVATE
+  "${JPEGXL_INTERNAL_FLAGS}" "${JPEGXL_COVERAGE_FLAGS}")
+
+# Individual test binaries:
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
+foreach (TESTFILE IN LISTS JPEGXL_INTERNAL_JPEGLI_TESTS)
+  # The TESTNAME is the name without the extension or directory.
+  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
+  add_executable(${TESTNAME} ${TESTFILE}
+    $<TARGET_OBJECTS:jpegli_libjpeg_util-obj>
+    ${JPEGXL_INTERNAL_JPEGLI_TESTLIB_FILES}
+  )
+  target_compile_options(${TESTNAME} PRIVATE
+    ${JPEGXL_INTERNAL_FLAGS}
+    # Add coverage flags to the test binary so code in the private headers of
+    # the library is also instrumented when running tests that execute it.
+    ${JPEGXL_COVERAGE_FLAGS}
+  )
+  target_compile_definitions(${TESTNAME} PRIVATE
+    -DTEST_DATA_PATH="${JPEGXL_TEST_DATA_PATH}")
+  target_include_directories(${TESTNAME} PRIVATE
+    "${PROJECT_SOURCE_DIR}"
+    "${CMAKE_CURRENT_SOURCE_DIR}/include"
+    "${CMAKE_CURRENT_BINARY_DIR}/include"
+  )
+  target_link_libraries(${TESTNAME}
+    hwy
+    jpegli-static
+    gmock
+    GTest::GTest
+    GTest::Main
+    ${JPEG_LIBRARIES}
+  )
+  set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "${JPEGXL_COVERAGE_LINK_FLAGS}")
+  # Output test targets in the test directory.
+  set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
+  if (WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set_target_properties(${TESTNAME} PROPERTIES COMPILE_FLAGS "-Wno-error")
+  endif ()
+  # 240 seconds because some build types (e.g. coverage) can be quite slow.
+  gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 240)
+endforeach ()
+endif()
+
+#
+# Build libjpeg.so that links to libjpeg-static
+#
+
+if (JPEGXL_ENABLE_JPEGLI_LIBJPEG AND NOT APPLE AND NOT WIN32 AND NOT EMSCRIPTEN)
+add_library(jpegli-libjpeg-obj OBJECT "${JPEGXL_INTERNAL_JPEGLI_WRAPPER_SOURCES}")
+target_compile_options(jpegli-libjpeg-obj PRIVATE ${JPEGXL_INTERNAL_FLAGS})
+target_compile_options(jpegli-libjpeg-obj PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jpegli-libjpeg-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_include_directories(jpegli-libjpeg-obj PRIVATE
+  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/jpegli>"
+)
+target_compile_definitions(jpegli-libjpeg-obj PUBLIC
+  ${JPEGLI_LIBJPEG_OBJ_COMPILE_DEFINITIONS}
+)
+set(JPEGLI_LIBJPEG_INTERNAL_OBJECTS $<TARGET_OBJECTS:jpegli-libjpeg-obj>)
+
+file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/jpegli)
+add_library(jpeg SHARED ${JPEGLI_LIBJPEG_INTERNAL_OBJECTS})
+target_link_libraries(jpeg PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+target_link_libraries(jpeg PRIVATE jpegli-static)
+set_target_properties(jpeg PROPERTIES
+  VERSION ${JPEGLI_LIBJPEG_LIBRARY_VERSION}
+  SOVERSION ${JPEGLI_LIBJPEG_LIBRARY_SOVERSION}
+  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/jpegli"
+  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/jpegli")
+
+# Add a jpeg.version file as a version script to tag symbols with the
+# appropriate version number.
+set_target_properties(jpeg PROPERTIES
+  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jpegli/jpeg.version.${JPEGLI_LIBJPEG_LIBRARY_SOVERSION})
+set_property(TARGET jpeg APPEND_STRING PROPERTY
+  LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/jpegli/jpeg.version.${JPEGLI_LIBJPEG_LIBRARY_SOVERSION}")
+
+if (JPEGXL_INSTALL_JPEGLI_LIBJPEG)
+  install(TARGETS jpeg
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  install(
+    DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/include/jpegli/"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+endif()
+
+# This hides the default visibility symbols from static libraries bundled into
+# the shared library. In particular this prevents exposing symbols from hwy
+# in the shared library.
+if(LINKER_SUPPORT_EXCLUDE_LIBS)
+  set_property(TARGET jpeg APPEND_STRING PROPERTY
+    LINK_FLAGS " ${LINKER_EXCLUDE_LIBS_FLAG}")
+endif()
+endif()
diff --git a/lib/jpegli/README.md b/lib/jpegli/README.md
new file mode 100644 (file)
index 0000000..72f13af
--- /dev/null
@@ -0,0 +1,49 @@
+# Improved JPEG encoder and decoder implementation
+
+This subdirectory contains a JPEG encoder and decoder implementation that is
+API and ABI compatible with libjpeg62.
+
+## Building
+
+When building the parent libjxl project, two binaries, `tools/cjpegli` and
+`tools/djpegli` will be built, as well as a
+`lib/jpegli/libjpeg.so.62.3.0` shared library that can be used as a drop-in
+replacement for the system library with the same name.
+
+## Encoder improvements
+
+Improvements and new features used by the encoder include:
+
+* Support for 16-bit unsigned and 32-bit floating point input buffers.
+
+* Color space conversions, chroma subsampling and DCT are all done in floating
+  point precision, the conversion to integers happens first when producing
+  the final quantized DCT coefficients.
+
+* The desired quality can be indicated by a distance parameter that is
+  analogous to the distance parameter of JPEG XL. The quantization tables
+  are chosen based on the distance and the chroma subsampling mode, with
+  different positions in the quantization matrix scaling differently, and the
+  red and blue chrominance channels have separate quantization tables.
+
+* Adaptive dead-zone quantization. On noisy parts of the image, quantization
+  thresholds for zero coefficients are higher than on smoother parts of the
+  image.
+
+* Support for more efficient compression of JPEGs with an ICC profile
+  representing the XYB colorspace. These JPEGs will not be converted to the
+  YCbCr colorspace, but specialized quantization tables will be chosen for
+  the original X, Y, B channels.
+
+## Decoder improvements
+
+* Support for 16-bit unsigned and 32-bit floating point output buffers.
+
+* Non-zero DCT coefficients are dequantized to the expectation value of their
+  respective quantization intervals assuming a Laplacian distribution of the
+  original unquantized DCT coefficients.
+
+* After dequantization, inverse DCT, chroma upsampling and color space
+  conversions are all done in floating point precision, the conversion to
+  integer samples happens only in the final output phase (unless output to
+  floating point was requested).
diff --git a/lib/jpegli/adaptive_quantization.cc b/lib/jpegli/adaptive_quantization.cc
new file mode 100644 (file)
index 0000000..6a8c4d3
--- /dev/null
@@ -0,0 +1,562 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/adaptive_quantization.h"
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <string>
+#include <vector>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/adaptive_quantization.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::AbsDiff;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeft;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Sqrt;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::ZeroIfNegative;
+
+static constexpr float kInputScaling = 1.0f / 255.0f;
+
+// Primary template: default to actual division.
+template <typename T, class V>
+struct FastDivision {
+  HWY_INLINE V operator()(const V n, const V d) const { return n / d; }
+};
+// Partial specialization for float vectors.
+template <class V>
+struct FastDivision<float, V> {
+  // One Newton-Raphson iteration.
+  static HWY_INLINE V ReciprocalNR(const V x) {
+    const auto rcp = ApproximateReciprocal(x);
+    const auto sum = Add(rcp, rcp);
+    const auto x_rcp = Mul(x, rcp);
+    return NegMulAdd(x_rcp, rcp, sum);
+  }
+
+  V operator()(const V n, const V d) const {
+#if 1  // Faster on SKX
+    return Div(n, d);
+#else
+    return n * ReciprocalNR(d);
+#endif
+  }
+};
+
+// Approximates smooth functions via rational polynomials (i.e. dividing two
+// polynomials). Evaluates polynomials via Horner's scheme, which is faster than
+// Clenshaw recurrence for Chebyshev polynomials. LoadDup128 allows us to
+// specify constants (replicated 4x) independently of the lane count.
+template <size_t NP, size_t NQ, class D, class V, typename T>
+HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
+                                                     const T (&p)[NP],
+                                                     const T (&q)[NQ]) {
+  constexpr size_t kDegP = NP / 4 - 1;
+  constexpr size_t kDegQ = NQ / 4 - 1;
+  auto yp = LoadDup128(d, &p[kDegP * 4]);
+  auto yq = LoadDup128(d, &q[kDegQ * 4]);
+  // We use pointer arithmetic to refer to &p[(kDegP - n) * 4] to avoid a
+  // compiler warning that the index is out of bounds since we are already
+  // checking that it is not out of bounds with (kDegP >= n) and the access
+  // will be optimized away. Similarly with q and kDegQ.
+  HWY_FENCE;
+  if (kDegP >= 1) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 1) * 4)));
+  if (kDegQ >= 1) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 1) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 2) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 2) * 4)));
+  if (kDegQ >= 2) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 2) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 3) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 3) * 4)));
+  if (kDegQ >= 3) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 3) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 4) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 4) * 4)));
+  if (kDegQ >= 4) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 4) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 5) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 5) * 4)));
+  if (kDegQ >= 5) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 5) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 6) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 6) * 4)));
+  if (kDegQ >= 6) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 6) * 4)));
+  HWY_FENCE;
+  if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
+  if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
+
+  return FastDivision<T, V>()(yp, yq);
+}
+
+// Computes base-2 logarithm like std::log2. Undefined if negative / NaN.
+// L1 error ~3.9E-6
+template <class DF, class V>
+V FastLog2f(const DF df, V x) {
+  // 2,2 rational polynomial approximation of std::log1p(x) / std::log(2).
+  HWY_ALIGN const float p[4 * (2 + 1)] = {HWY_REP4(-1.8503833400518310E-06f),
+                                          HWY_REP4(1.4287160470083755E+00f),
+                                          HWY_REP4(7.4245873327820566E-01f)};
+  HWY_ALIGN const float q[4 * (2 + 1)] = {HWY_REP4(9.9032814277590719E-01f),
+                                          HWY_REP4(1.0096718572241148E+00f),
+                                          HWY_REP4(1.7409343003366853E-01f)};
+
+  const Rebind<int32_t, DF> di;
+  const auto x_bits = BitCast(di, x);
+
+  // Range reduction to [-1/3, 1/3] - 3 integer, 2 float ops
+  const auto exp_bits = Sub(x_bits, Set(di, 0x3f2aaaab));  // = 2/3
+  // Shifted exponent = log2; also used to clear mantissa.
+  const auto exp_shifted = ShiftRight<23>(exp_bits);
+  const auto mantissa = BitCast(df, Sub(x_bits, ShiftLeft<23>(exp_shifted)));
+  const auto exp_val = ConvertTo(df, exp_shifted);
+  return Add(EvalRationalPolynomial(df, Sub(mantissa, Set(df, 1.0f)), p, q),
+             exp_val);
+}
+
+// max relative error ~3e-7
+template <class DF, class V>
+V FastPow2f(const DF df, V x) {
+  const Rebind<int32_t, DF> di;
+  auto floorx = Floor(x);
+  auto exp =
+      BitCast(df, ShiftLeft<23>(Add(ConvertTo(di, floorx), Set(di, 127))));
+  auto frac = Sub(x, floorx);
+  auto num = Add(frac, Set(df, 1.01749063e+01));
+  num = MulAdd(num, frac, Set(df, 4.88687798e+01));
+  num = MulAdd(num, frac, Set(df, 9.85506591e+01));
+  num = Mul(num, exp);
+  auto den = MulAdd(frac, Set(df, 2.10242958e-01), Set(df, -2.22328856e-02));
+  den = MulAdd(den, frac, Set(df, -1.94414990e+01));
+  den = MulAdd(den, frac, Set(df, 9.85506633e+01));
+  return Div(num, den);
+}
+
+inline float FastPow2f(float f) {
+  HWY_CAPPED(float, 1) D;
+  return GetLane(FastPow2f(D, Set(D, f)));
+}
+
+// The following functions modulate an exponent (out_val) and return the updated
+// value. Their descriptor is limited to 8 lanes for 8x8 blocks.
+
+template <class D, class V>
+V ComputeMask(const D d, const V out_val) {
+  const auto kBase = Set(d, -0.74174993f);
+  const auto kMul4 = Set(d, 3.2353257320940401f);
+  const auto kMul2 = Set(d, 12.906028311180409f);
+  const auto kOffset2 = Set(d, 305.04035728311436f);
+  const auto kMul3 = Set(d, 5.0220313103171232f);
+  const auto kOffset3 = Set(d, 2.1925739705298404f);
+  const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
+  const auto kMul0 = Set(d, 0.74760422233706747f);
+  const auto k1 = Set(d, 1.0f);
+
+  // Avoid division by zero.
+  const auto v1 = Max(Mul(out_val, kMul0), Set(d, 1e-3f));
+  const auto v2 = Div(k1, Add(v1, kOffset2));
+  const auto v3 = Div(k1, MulAdd(v1, v1, kOffset3));
+  const auto v4 = Div(k1, MulAdd(v1, v1, kOffset4));
+  // TODO(jyrki):
+  // A log or two here could make sense. In butteraugli we have effectively
+  // log(log(x + C)) for this kind of use, as a single log is used in
+  // saturating visual masking and here the modulation values are exponential,
+  // another log would counter that.
+  return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
+}
+
+// mul and mul2 represent a scaling difference between jxl and butteraugli.
+static const float kSGmul = 226.0480446705883f;
+static const float kSGmul2 = 1.0f / 73.377132366608819f;
+static const float kLog2 = 0.693147181f;
+// Includes correction factor for std::log -> log2.
+static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
+static const float kSGVOffset = 7.14672470003f;
+
+template <bool invert, typename D, typename V>
+V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
+  // The opsin space in jxl is the cubic root of photons, i.e., v * v * v
+  // is related to the number of photons.
+  //
+  // SimpleGamma(v * v * v) is the psychovisual space in butteraugli.
+  // This ratio allows quantization to move from jxl's opsin space to
+  // butteraugli's log-gamma space.
+  static const float kEpsilon = 1e-2;
+  static const float kNumOffset = kEpsilon / kInputScaling / kInputScaling;
+  static const float kNumMul = kSGRetMul * 3 * kSGmul;
+  static const float kVOffset = (kSGVOffset * kLog2 + kEpsilon) / kInputScaling;
+  static const float kDenMul = kLog2 * kSGmul * kInputScaling * kInputScaling;
+
+  v = ZeroIfNegative(v);
+  const auto num_mul = Set(d, kNumMul);
+  const auto num_offset = Set(d, kNumOffset);
+  const auto den_offset = Set(d, kVOffset);
+  const auto den_mul = Set(d, kDenMul);
+
+  const auto v2 = Mul(v, v);
+
+  const auto num = MulAdd(num_mul, v2, num_offset);
+  const auto den = MulAdd(Mul(den_mul, v), v2, den_offset);
+  return invert ? Div(num, den) : Div(den, num);
+}
+
+template <bool invert = false>
+static float RatioOfDerivativesOfCubicRootToSimpleGamma(float v) {
+  using DScalar = HWY_CAPPED(float, 1);
+  auto vscalar = Load(DScalar(), &v);
+  return GetLane(
+      RatioOfDerivativesOfCubicRootToSimpleGamma<invert>(DScalar(), vscalar));
+}
+
+// TODO(veluca): this function computes an approximation of the derivative of
+// SimpleGamma with (f(x+eps)-f(x))/eps. Consider two-sided approximation or
+// exact derivatives. For reference, SimpleGamma was:
+/*
+template <typename D, typename V>
+V SimpleGamma(const D d, V v) {
+  // A simple HDR compatible gamma function.
+  const auto mul = Set(d, kSGmul);
+  const auto kRetMul = Set(d, kSGRetMul);
+  const auto kRetAdd = Set(d, kSGmul2 * -20.2789020414f);
+  const auto kVOffset = Set(d, kSGVOffset);
+
+  v *= mul;
+
+  // This should happen rarely, but may lead to a NaN, which is rather
+  // undesirable. Since negative photons don't exist we solve the NaNs by
+  // clamping here.
+  // TODO(veluca): with FastLog2f, this no longer leads to NaNs.
+  v = ZeroIfNegative(v);
+  return kRetMul * FastLog2f(d, v + kVOffset) + kRetAdd;
+}
+*/
+
+template <class D, class V>
+V GammaModulation(const D d, const size_t x, const size_t y,
+                  const RowBuffer<float>& input, const V out_val) {
+  static const float kBias = 0.16f / kInputScaling;
+  static const float kScale = kInputScaling / 64.0f;
+  auto overall_ratio = Zero(d);
+  const auto bias = Set(d, kBias);
+  const auto scale = Set(d, kScale);
+  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* const JXL_RESTRICT row_in = block_start + dy * input.stride();
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+      const auto iny = Add(Load(d, row_in + dx), bias);
+      const auto ratio_g =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/true>(d, iny);
+      overall_ratio = Add(overall_ratio, ratio_g);
+    }
+  }
+  overall_ratio = Mul(SumOfLanes(d, overall_ratio), scale);
+  // ideally -1.0, but likely optimal correction adds some entropy, so slightly
+  // less than that.
+  // ln(2) constant folded in because we want std::log but have FastLog2f.
+  const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f);
+  return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val);
+}
+
+// Change precision in 8x8 blocks that have high frequency content.
+template <class D, class V>
+V HfModulation(const D d, const size_t x, const size_t y,
+               const RowBuffer<float>& input, const V out_val) {
+  // Zero out the invalid differences for the rightmost value per row.
+  const Rebind<uint32_t, D> du;
+  HWY_ALIGN constexpr uint32_t kMaskRight[8] = {~0u, ~0u, ~0u, ~0u,
+                                                ~0u, ~0u, ~0u, 0};
+
+  auto sum = Zero(d);  // sum of absolute differences with right and below
+  static const float kSumCoeff = -2.0052193233688884f * kInputScaling / 112.0;
+  auto sumcoeff = Set(d, kSumCoeff);
+
+  const float* const JXL_RESTRICT block_start = input.Row(y) + x;
+  for (size_t dy = 0; dy < 8; ++dy) {
+    const float* JXL_RESTRICT row_in = block_start + dy * input.stride();
+    const float* JXL_RESTRICT row_in_next =
+        dy == 7 ? row_in : row_in + input.stride();
+
+    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
+      const auto p = Load(d, row_in + dx);
+      const auto pr = LoadU(d, row_in + dx + 1);
+      const auto mask = BitCast(d, Load(du, kMaskRight + dx));
+      sum = Add(sum, And(mask, AbsDiff(p, pr)));
+      const auto pd = Load(d, row_in_next + dx);
+      sum = Add(sum, AbsDiff(p, pd));
+    }
+  }
+
+  sum = SumOfLanes(d, sum);
+  return MulAdd(sum, sumcoeff, out_val);
+}
+
+void PerBlockModulations(const float y_quant_01, const RowBuffer<float>& input,
+                         const size_t yb0, const size_t yblen,
+                         RowBuffer<float>* aq_map) {
+  static const float kAcQuant = 0.841f;
+  float base_level = 0.48f * kAcQuant;
+  float kDampenRampStart = 9.0f;
+  float kDampenRampEnd = 65.0f;
+  float dampen = 1.0f;
+  if (y_quant_01 >= kDampenRampStart) {
+    dampen = 1.0f - ((y_quant_01 - kDampenRampStart) /
+                     (kDampenRampEnd - kDampenRampStart));
+    if (dampen < 0) {
+      dampen = 0;
+    }
+  }
+  const float mul = kAcQuant * dampen;
+  const float add = (1.0f - dampen) * base_level;
+  for (size_t iy = 0; iy < yblen; iy++) {
+    const size_t yb = yb0 + iy;
+    const size_t y = yb * 8;
+    float* const JXL_RESTRICT row_out = aq_map->Row(yb);
+    const HWY_CAPPED(float, 8) df;
+    for (size_t ix = 0; ix < aq_map->xsize(); ix++) {
+      size_t x = ix * 8;
+      auto out_val = Set(df, row_out[ix]);
+      out_val = ComputeMask(df, out_val);
+      out_val = HfModulation(df, x, y, input, out_val);
+      out_val = GammaModulation(df, x, y, input, out_val);
+      // We want multiplicative quantization field, so everything
+      // until this point has been modulating the exponent.
+      row_out[ix] = FastPow2f(GetLane(out_val) * 1.442695041f) * mul + add;
+    }
+  }
+}
+
+template <typename D, typename V>
+V MaskingSqrt(const D d, V v) {
+  static const float kLogOffset = 28;
+  static const float kMul = 211.50759899638012f;
+  const auto mul_v = Set(d, kMul * 1e8);
+  const auto offset_v = Set(d, kLogOffset);
+  return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
+}
+
+template <typename V>
+void Sort4(V& min0, V& min1, V& min2, V& min3) {
+  const auto tmp0 = Min(min0, min1);
+  const auto tmp1 = Max(min0, min1);
+  const auto tmp2 = Min(min2, min3);
+  const auto tmp3 = Max(min2, min3);
+  const auto tmp4 = Max(tmp0, tmp2);
+  const auto tmp5 = Min(tmp1, tmp3);
+  min0 = Min(tmp0, tmp2);
+  min1 = Min(tmp4, tmp5);
+  min2 = Max(tmp4, tmp5);
+  min3 = Max(tmp1, tmp3);
+}
+
+template <typename V>
+void UpdateMin4(const V v, V& min0, V& min1, V& min2, V& min3) {
+  const auto tmp0 = Max(min0, v);
+  const auto tmp1 = Max(min1, tmp0);
+  const auto tmp2 = Max(min2, tmp1);
+  min0 = Min(min0, v);
+  min1 = Min(min1, tmp0);
+  min2 = Min(min2, tmp1);
+  min3 = Min(min3, tmp2);
+}
+
+// Computes a linear combination of the 4 lowest values of the 3x3 neighborhood
+// of each pixel. Output is downsampled 2x.
+void FuzzyErosion(const RowBuffer<float>& pre_erosion, const size_t yb0,
+                  const size_t yblen, RowBuffer<float>* tmp,
+                  RowBuffer<float>* aq_map) {
+  int xsize_blocks = aq_map->xsize();
+  int xsize = pre_erosion.xsize();
+  HWY_FULL(float) d;
+  const auto mul0 = Set(d, 0.125f);
+  const auto mul1 = Set(d, 0.075f);
+  const auto mul2 = Set(d, 0.06f);
+  const auto mul3 = Set(d, 0.05f);
+  for (size_t iy = 0; iy < 2 * yblen; ++iy) {
+    size_t y = 2 * yb0 + iy;
+    const float* JXL_RESTRICT rowt = pre_erosion.Row(y - 1);
+    const float* JXL_RESTRICT rowm = pre_erosion.Row(y);
+    const float* JXL_RESTRICT rowb = pre_erosion.Row(y + 1);
+    float* row_out = tmp->Row(y);
+    for (int x = 0; x < xsize; x += Lanes(d)) {
+      int xm1 = x - 1;
+      int xp1 = x + 1;
+      auto min0 = LoadU(d, rowm + x);
+      auto min1 = LoadU(d, rowm + xm1);
+      auto min2 = LoadU(d, rowm + xp1);
+      auto min3 = LoadU(d, rowt + xm1);
+      Sort4(min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowt + x), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowt + xp1), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + xm1), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + x), min0, min1, min2, min3);
+      UpdateMin4(LoadU(d, rowb + xp1), min0, min1, min2, min3);
+      const auto v = Add(Add(Mul(mul0, min0), Mul(mul1, min1)),
+                         Add(Mul(mul2, min2), Mul(mul3, min3)));
+      Store(v, d, row_out + x);
+    }
+    if (iy % 2 == 1) {
+      const float* JXL_RESTRICT row_out0 = tmp->Row(y - 1);
+      float* JXL_RESTRICT aq_out = aq_map->Row(yb0 + iy / 2);
+      for (int bx = 0, x = 0; bx < xsize_blocks; ++bx, x += 2) {
+        aq_out[bx] =
+            (row_out[x] + row_out[x + 1] + row_out0[x] + row_out0[x + 1]);
+      }
+    }
+  }
+}
+
+void ComputePreErosion(const RowBuffer<float>& input, const size_t xsize,
+                       const size_t y0, const size_t ylen, int border,
+                       float* diff_buffer, RowBuffer<float>* pre_erosion) {
+  const size_t xsize_out = xsize / 4;
+  const size_t y0_out = y0 / 4;
+
+  // The XYB gamma is 3.0 to be able to decode faster with two muls.
+  // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
+  // We approximate the gamma difference by adding one cubic root into
+  // the adaptive quantization. This gives us a total gamma of 2.6666
+  // for quantization uses.
+  static const float match_gamma_offset = 0.019 / kInputScaling;
+
+  const HWY_CAPPED(float, 8) df;
+
+  static const float limit = 0.2f;
+  // Computes image (padded to multiple of 8x8) of local pixel differences.
+  // Subsample both directions by 4.
+  for (size_t iy = 0; iy < ylen; ++iy) {
+    size_t y = y0 + iy;
+    const float* row_in = input.Row(y);
+    const float* row_in1 = input.Row(y + 1);
+    const float* row_in2 = input.Row(y - 1);
+    float* JXL_RESTRICT row_out = diff_buffer;
+    const auto match_gamma_offset_v = Set(df, match_gamma_offset);
+    const auto quarter = Set(df, 0.25f);
+    for (size_t x = 0; x < xsize; x += Lanes(df)) {
+      const auto in = LoadU(df, row_in + x);
+      const auto in_r = LoadU(df, row_in + x + 1);
+      const auto in_l = LoadU(df, row_in + x - 1);
+      const auto in_t = LoadU(df, row_in2 + x);
+      const auto in_b = LoadU(df, row_in1 + x);
+      const auto base = Mul(quarter, Add(Add(in_r, in_l), Add(in_t, in_b)));
+      const auto gammacv =
+          RatioOfDerivativesOfCubicRootToSimpleGamma</*invert=*/false>(
+              df, Add(in, match_gamma_offset_v));
+      auto diff = Mul(gammacv, Sub(in, base));
+      diff = Mul(diff, diff);
+      diff = Min(diff, Set(df, limit));
+      diff = MaskingSqrt(df, diff);
+      if ((iy & 3) != 0) {
+        diff = Add(diff, LoadU(df, row_out + x));
+      }
+      StoreU(diff, df, row_out + x);
+    }
+    if (iy % 4 == 3) {
+      size_t y_out = y0_out + iy / 4;
+      float* row_dout = pre_erosion->Row(y_out);
+      for (size_t x = 0; x < xsize_out; x++) {
+        row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
+                       row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
+                      0.25f;
+      }
+      pre_erosion->PadRow(y_out, xsize_out, border);
+    }
+  }
+}
+
+}  // namespace
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+HWY_EXPORT(ComputePreErosion);
+HWY_EXPORT(FuzzyErosion);
+HWY_EXPORT(PerBlockModulations);
+
+namespace {
+
+static constexpr int kPreErosionBorder = 1;
+
+}  // namespace
+
+void ComputeAdaptiveQuantField(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  if (!m->use_adaptive_quantization) {
+    return;
+  }
+  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+  int y_quant_01 = cinfo->quant_tbl_ptrs[y_comp->quant_tbl_no]->quantval[1];
+  if (m->next_iMCU_row == 0) {
+    m->input_buffer[y_channel].CopyRow(-1, 0, 1);
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    size_t last_row = m->ysize_blocks * DCTSIZE - 1;
+    m->input_buffer[y_channel].CopyRow(last_row + 1, last_row, 1);
+  }
+  const RowBuffer<float>& input = m->input_buffer[y_channel];
+  const size_t xsize_blocks = y_comp->width_in_blocks;
+  const size_t xsize = xsize_blocks * DCTSIZE;
+  const size_t yb0 = m->next_iMCU_row * cinfo->max_v_samp_factor;
+  const size_t yblen = cinfo->max_v_samp_factor;
+  size_t y0 = yb0 * DCTSIZE;
+  size_t ylen = cinfo->max_v_samp_factor * DCTSIZE;
+  if (y0 == 0) {
+    ylen += 4;
+  } else {
+    y0 += 4;
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    ylen -= 4;
+  }
+  HWY_DYNAMIC_DISPATCH(ComputePreErosion)
+  (input, xsize, y0, ylen, kPreErosionBorder, m->diff_buffer, &m->pre_erosion);
+  if (y0 == 0) {
+    m->pre_erosion.CopyRow(-1, 0, kPreErosionBorder);
+  }
+  if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+    size_t last_row = m->ysize_blocks * 2 - 1;
+    m->pre_erosion.CopyRow(last_row + 1, last_row, kPreErosionBorder);
+  }
+  HWY_DYNAMIC_DISPATCH(FuzzyErosion)
+  (m->pre_erosion, yb0, yblen, &m->fuzzy_erosion_tmp, &m->quant_field);
+  HWY_DYNAMIC_DISPATCH(PerBlockModulations)
+  (y_quant_01, input, yb0, yblen, &m->quant_field);
+  for (int y = 0; y < cinfo->max_v_samp_factor; ++y) {
+    float* row = m->quant_field.Row(yb0 + y);
+    for (size_t x = 0; x < xsize_blocks; ++x) {
+      row[x] = std::max(0.0f, (0.6f / row[x]) - 1.0f);
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/adaptive_quantization.h b/lib/jpegli/adaptive_quantization.h
new file mode 100644 (file)
index 0000000..d8537e8
--- /dev/null
@@ -0,0 +1,17 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
+#define LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+void ComputeAdaptiveQuantField(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ADAPTIVE_QUANTIZATION_H_
diff --git a/lib/jpegli/bit_writer.cc b/lib/jpegli/bit_writer.cc
new file mode 100644 (file)
index 0000000..9788f35
--- /dev/null
@@ -0,0 +1,60 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/bit_writer.h"
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void JpegBitWriterInit(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  JpegBitWriter* bw = &m->bw;
+  size_t buffer_size = m->blocks_per_iMCU_row * (DCTSIZE2 * 16 + 8) + (1 << 16);
+  bw->cinfo = cinfo;
+  bw->data = Allocate<uint8_t>(cinfo, buffer_size, JPOOL_IMAGE);
+  bw->len = buffer_size;
+  bw->pos = 0;
+  bw->output_pos = 0;
+  bw->put_buffer = 0;
+  bw->free_bits = 64;
+  bw->healthy = true;
+}
+
+bool EmptyBitWriterBuffer(JpegBitWriter* bw) {
+  while (bw->output_pos < bw->pos) {
+    j_compress_ptr cinfo = bw->cinfo;
+    if (cinfo->dest->free_in_buffer == 0 &&
+        !(*cinfo->dest->empty_output_buffer)(cinfo)) {
+      return false;
+    }
+    size_t buflen = bw->pos - bw->output_pos;
+    size_t copylen = std::min<size_t>(cinfo->dest->free_in_buffer, buflen);
+    memcpy(cinfo->dest->next_output_byte, bw->data + bw->output_pos, copylen);
+    bw->output_pos += copylen;
+    cinfo->dest->free_in_buffer -= copylen;
+    cinfo->dest->next_output_byte += copylen;
+  }
+  bw->output_pos = bw->pos = 0;
+  return true;
+}
+
+void JumpToByteBoundary(JpegBitWriter* bw) {
+  size_t n_bits = bw->free_bits & 7u;
+  if (n_bits > 0) {
+    WriteBits(bw, n_bits, (1u << n_bits) - 1);
+  }
+  bw->put_buffer <<= bw->free_bits;
+  while (bw->free_bits <= 56) {
+    int c = (bw->put_buffer >> 56) & 0xFF;
+    EmitByte(bw, c);
+    bw->put_buffer <<= 8;
+    bw->free_bits += 8;
+  }
+  bw->put_buffer = 0;
+  bw->free_bits = 64;
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/bit_writer.h b/lib/jpegli/bit_writer.h
new file mode 100644 (file)
index 0000000..3adf1ea
--- /dev/null
@@ -0,0 +1,98 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_BIT_WRITER_H_
+#define LIB_JPEGLI_BIT_WRITER_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include "lib/jpegli/common.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+// Handles the packing of bits into output bytes.
+struct JpegBitWriter {
+  j_compress_ptr cinfo;
+  uint8_t* data;
+  size_t len;
+  size_t pos;
+  size_t output_pos;
+  uint64_t put_buffer;
+  int free_bits;
+  bool healthy;
+};
+
+void JpegBitWriterInit(j_compress_ptr cinfo);
+
+bool EmptyBitWriterBuffer(JpegBitWriter* bw);
+
+void JumpToByteBoundary(JpegBitWriter* bw);
+
+// Returns non-zero if and only if x has a zero byte, i.e. one of
+// x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero.
+static JXL_INLINE uint64_t HasZeroByte(uint64_t x) {
+  return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
+}
+
+/**
+ * Writes the given byte to the output, writes an extra zero if byte is 0xFF.
+ *
+ * This method is "careless" - caller must make sure that there is enough
+ * space in the output buffer. Emits up to 2 bytes to buffer.
+ */
+static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) {
+  bw->data[bw->pos++] = byte;
+  if (byte == 0xFF) bw->data[bw->pos++] = 0;
+}
+
+static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) {
+  // At this point we are ready to emit the bytes of put_buffer to the output.
+  // The JPEG format requires that after every 0xff byte in the entropy
+  // coded section, there is a zero byte, therefore we first check if any of
+  // the bytes of put_buffer is 0xFF.
+  if (HasZeroByte(~bw->put_buffer)) {
+    // We have a 0xFF byte somewhere, examine each byte and append a zero
+    // byte if necessary.
+    EmitByte(bw, (bw->put_buffer >> 56) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 48) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 40) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 32) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 24) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 16) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 8) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 0) & 0xFF);
+  } else {
+    // We don't have any 0xFF bytes, output all 8 bytes without checking.
+    StoreBE64(bw->put_buffer, bw->data + bw->pos);
+    bw->pos += 8;
+  }
+}
+
+static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) {
+  // This is an optimization; if everything goes well,
+  // then |nbits| is positive; if non-existing Huffman symbol is going to be
+  // encoded, its length should be zero; later encoder could check the
+  // "health" of JpegBitWriter.
+  if (nbits == 0) {
+    bw->healthy = false;
+    return;
+  }
+  bw->free_bits -= nbits;
+  if (bw->free_bits < 0) {
+    bw->put_buffer <<= (bw->free_bits + nbits);
+    bw->put_buffer |= (bits >> -bw->free_bits);
+    DischargeBitBuffer(bw);
+    bw->free_bits += 64;
+    bw->put_buffer = nbits;
+  }
+  bw->put_buffer <<= nbits;
+  bw->put_buffer |= bits;
+}
+
+}  // namespace jpegli
+#endif  // LIB_JPEGLI_BIT_WRITER_H_
diff --git a/lib/jpegli/bitstream.cc b/lib/jpegli/bitstream.cc
new file mode 100644 (file)
index 0000000..3448367
--- /dev/null
@@ -0,0 +1,452 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/bitstream.h"
+
+#include <cmath>
+
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+
+namespace jpegli {
+
+void WriteOutput(j_compress_ptr cinfo, const uint8_t* buf, size_t bufsize) {
+  size_t pos = 0;
+  while (pos < bufsize) {
+    if (cinfo->dest->free_in_buffer == 0 &&
+        !(*cinfo->dest->empty_output_buffer)(cinfo)) {
+      JPEGLI_ERROR("Destination suspension is not supported in markers.");
+    }
+    size_t len = std::min<size_t>(cinfo->dest->free_in_buffer, bufsize - pos);
+    memcpy(cinfo->dest->next_output_byte, buf + pos, len);
+    pos += len;
+    cinfo->dest->free_in_buffer -= len;
+    cinfo->dest->next_output_byte += len;
+  }
+}
+
+void WriteOutput(j_compress_ptr cinfo, const std::vector<uint8_t>& bytes) {
+  WriteOutput(cinfo, bytes.data(), bytes.size());
+}
+
+void WriteOutput(j_compress_ptr cinfo, std::initializer_list<uint8_t> bytes) {
+  WriteOutput(cinfo, bytes.begin(), bytes.size());
+}
+
+void EncodeAPP0(j_compress_ptr cinfo) {
+  WriteOutput(cinfo,
+              {0xff, 0xe0, 0, 16, 'J', 'F', 'I', 'F', '\0',
+               cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
+               cinfo->density_unit, static_cast<uint8_t>(cinfo->X_density >> 8),
+               static_cast<uint8_t>(cinfo->X_density & 0xff),
+               static_cast<uint8_t>(cinfo->Y_density >> 8),
+               static_cast<uint8_t>(cinfo->Y_density & 0xff), 0, 0});
+}
+
+void EncodeAPP14(j_compress_ptr cinfo) {
+  uint8_t color_transform = cinfo->jpeg_color_space == JCS_YCbCr  ? 1
+                            : cinfo->jpeg_color_space == JCS_YCCK ? 2
+                                                                  : 0;
+  WriteOutput(cinfo, {0xff, 0xee, 0, 14, 'A', 'd', 'o', 'b', 'e', 0, 100, 0, 0,
+                      0, 0, color_transform});
+}
+
+void WriteFileHeader(j_compress_ptr cinfo) {
+  WriteOutput(cinfo, {0xFF, 0xD8});  // SOI
+  if (cinfo->write_JFIF_header) {
+    EncodeAPP0(cinfo);
+  }
+  if (cinfo->write_Adobe_marker) {
+    EncodeAPP14(cinfo);
+  }
+}
+
+bool EncodeDQT(j_compress_ptr cinfo, bool write_all_tables) {
+  uint8_t data[4 + NUM_QUANT_TBLS * (1 + 2 * DCTSIZE2)];  // 520 bytes
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDB;
+  pos += 2;  // Length will be filled in later.
+
+  int send_table[NUM_QUANT_TBLS] = {};
+  if (write_all_tables) {
+    for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+      if (cinfo->quant_tbl_ptrs[i]) send_table[i] = 1;
+    }
+  } else {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      send_table[cinfo->comp_info[c].quant_tbl_no] = 1;
+    }
+  }
+
+  bool is_baseline = true;
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    if (!send_table[i]) continue;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[i];
+    if (quant_table == nullptr) {
+      JPEGLI_ERROR("Missing quant table %d", i);
+    }
+    int precision = 0;
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      if (quant_table->quantval[k] > 255) {
+        precision = 1;
+        is_baseline = false;
+      }
+    }
+    if (quant_table->sent_table) {
+      continue;
+    }
+    data[pos++] = (precision << 4) + i;
+    for (size_t j = 0; j < DCTSIZE2; ++j) {
+      int val_idx = kJPEGNaturalOrder[j];
+      int val = quant_table->quantval[val_idx];
+      if (val == 0) {
+        JPEGLI_ERROR("Invalid quantval 0.");
+      }
+      if (precision) {
+        data[pos++] = val >> 8;
+      }
+      data[pos++] = val & 0xFFu;
+    }
+    quant_table->sent_table = TRUE;
+  }
+  if (pos > 4) {
+    data[2] = (pos - 2) >> 8u;
+    data[3] = (pos - 2) & 0xFFu;
+    WriteOutput(cinfo, data, pos);
+  }
+  return is_baseline;
+}
+
+void EncodeSOF(j_compress_ptr cinfo, bool is_baseline) {
+  if (cinfo->data_precision != kJpegPrecision) {
+    is_baseline = false;
+    JPEGLI_ERROR("Unsupported data precision %d", cinfo->data_precision);
+  }
+  const uint8_t marker = cinfo->progressive_mode ? 0xc2
+                         : is_baseline           ? 0xc0
+                                                 : 0xc1;
+  const size_t n_comps = cinfo->num_components;
+  const size_t marker_len = 8 + 3 * n_comps;
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = marker;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = kJpegPrecision;
+  data[pos++] = cinfo->image_height >> 8u;
+  data[pos++] = cinfo->image_height & 0xFFu;
+  data[pos++] = cinfo->image_width >> 8u;
+  data[pos++] = cinfo->image_width & 0xFFu;
+  data[pos++] = n_comps;
+  for (size_t i = 0; i < n_comps; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    data[pos++] = comp->component_id;
+    data[pos++] = ((comp->h_samp_factor << 4u) | (comp->v_samp_factor));
+    const uint32_t quant_idx = comp->quant_tbl_no;
+    if (cinfo->quant_tbl_ptrs[quant_idx] == nullptr) {
+      JPEGLI_ERROR("Invalid component quant table index %u.", quant_idx);
+    }
+    data[pos++] = quant_idx;
+  }
+  WriteOutput(cinfo, data);
+}
+
+void WriteFrameHeader(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  bool is_baseline = EncodeDQT(cinfo, /*write_all_tables=*/false);
+  if (cinfo->progressive_mode || cinfo->arith_code ||
+      cinfo->data_precision != 8) {
+    is_baseline = false;
+  }
+  for (size_t i = 0; i < m->num_huffman_tables; ++i) {
+    int slot_id = m->slot_id_map[i];
+    if (slot_id > 0x11 || (slot_id > 0x01 && slot_id < 0x10)) {
+      is_baseline = false;
+    }
+  }
+  EncodeSOF(cinfo, is_baseline);
+}
+
+void EncodeDRI(j_compress_ptr cinfo) {
+  WriteOutput(cinfo, {0xFF, 0xDD, 0, 4,
+                      static_cast<uint8_t>(cinfo->restart_interval >> 8),
+                      static_cast<uint8_t>(cinfo->restart_interval & 0xFF)});
+}
+
+void EncodeDHT(j_compress_ptr cinfo, size_t offset, size_t num) {
+  jpeg_comp_master* m = cinfo->master;
+  size_t marker_len = 2;
+  for (size_t i = 0; i < num; ++i) {
+    const JHUFF_TBL& table = m->huffman_tables[offset + i];
+    if (table.sent_table) continue;
+    marker_len += kJpegHuffmanMaxBitLength + 1;
+    for (size_t j = 0; j <= kJpegHuffmanMaxBitLength; ++j) {
+      marker_len += table.bits[j];
+    }
+  }
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xC4;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  for (size_t i = 0; i < num; ++i) {
+    const JHUFF_TBL& table = m->huffman_tables[offset + i];
+    if (table.sent_table) continue;
+    size_t total_count = 0;
+    for (size_t i = 0; i <= kJpegHuffmanMaxBitLength; ++i) {
+      total_count += table.bits[i];
+    }
+    data[pos++] = m->slot_id_map[offset + i];
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      data[pos++] = table.bits[i];
+    }
+    for (size_t i = 0; i < total_count; ++i) {
+      data[pos++] = table.huffval[i];
+    }
+  }
+  if (marker_len > 2) {
+    WriteOutput(cinfo, data);
+  }
+}
+
+void EncodeSOS(j_compress_ptr cinfo, int scan_index) {
+  jpeg_comp_master* m = cinfo->master;
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const size_t marker_len = 6 + 2 * scan_info->comps_in_scan;
+  std::vector<uint8_t> data(marker_len + 2);
+  size_t pos = 0;
+  data[pos++] = 0xFF;
+  data[pos++] = 0xDA;
+  data[pos++] = marker_len >> 8u;
+  data[pos++] = marker_len & 0xFFu;
+  data[pos++] = scan_info->comps_in_scan;
+  for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+    int comp_idx = scan_info->component_index[i];
+    data[pos++] = cinfo->comp_info[comp_idx].component_id;
+    int dc_slot_id = m->slot_id_map[m->context_map[comp_idx]];
+    int ac_context = m->ac_ctx_offset[scan_index] + i;
+    int ac_slot_id = m->slot_id_map[m->context_map[ac_context]];
+    data[pos++] = (dc_slot_id << 4u) + (ac_slot_id - 16);
+  }
+  data[pos++] = scan_info->Ss;
+  data[pos++] = scan_info->Se;
+  data[pos++] = ((scan_info->Ah << 4u) | (scan_info->Al));
+  WriteOutput(cinfo, data);
+}
+
+void WriteScanHeader(j_compress_ptr cinfo, int scan_index) {
+  jpeg_comp_master* m = cinfo->master;
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  cinfo->restart_interval = m->scan_token_info[scan_index].restart_interval;
+  if (cinfo->restart_interval != m->last_restart_interval) {
+    EncodeDRI(cinfo);
+    m->last_restart_interval = cinfo->restart_interval;
+  }
+  size_t num_dht = 0;
+  if (scan_index == 0) {
+    // For the first scan we emit all DC and at most 4 AC Huffman codes.
+    for (size_t i = 0, num_ac = 0; i < m->num_huffman_tables; ++i) {
+      if (m->slot_id_map[i] >= 16 && num_ac++ >= 4) break;
+      ++num_dht;
+    }
+  } else if (scan_info->Ss > 0) {
+    // For multi-scan sequential and progressive DC scans we have already
+    // emitted all Huffman codes that we need before the first scan. For
+    // progressive AC scans we only need at most one new Huffman code.
+    if (m->context_map[m->ac_ctx_offset[scan_index]] == m->next_dht_index) {
+      num_dht = 1;
+    }
+  }
+  if (num_dht > 0) {
+    EncodeDHT(cinfo, m->next_dht_index, num_dht);
+    m->next_dht_index += num_dht;
+  }
+  EncodeSOS(cinfo, scan_index);
+}
+
+void WriteBlock(const int32_t* JXL_RESTRICT symbols,
+                const int32_t* JXL_RESTRICT extra_bits, const int num_nonzeros,
+                const bool emit_eob,
+                const HuffmanCodeTable* JXL_RESTRICT dc_code,
+                const HuffmanCodeTable* JXL_RESTRICT ac_code,
+                JpegBitWriter* JXL_RESTRICT bw) {
+  int symbol = symbols[0];
+  WriteBits(bw, dc_code->depth[symbol], dc_code->code[symbol] | extra_bits[0]);
+  for (int i = 1; i < num_nonzeros; ++i) {
+    symbol = symbols[i];
+    if (symbol > 255) {
+      WriteBits(bw, ac_code->depth[0xf0], ac_code->code[0xf0]);
+      symbol -= 256;
+      if (symbol > 255) {
+        WriteBits(bw, ac_code->depth[0xf0], ac_code->code[0xf0]);
+        symbol -= 256;
+        if (symbol > 255) {
+          WriteBits(bw, ac_code->depth[0xf0], ac_code->code[0xf0]);
+          symbol -= 256;
+        }
+      }
+    }
+    WriteBits(bw, ac_code->depth[symbol],
+              ac_code->code[symbol] | extra_bits[i]);
+  }
+  if (emit_eob) {
+    WriteBits(bw, ac_code->depth[0], ac_code->code[0]);
+  }
+}
+
+namespace {
+
+static JXL_INLINE void EmitMarker(JpegBitWriter* bw, int marker) {
+  bw->data[bw->pos++] = 0xFF;
+  bw->data[bw->pos++] = marker;
+}
+
+void WriteTokens(j_compress_ptr cinfo, int scan_index, JpegBitWriter* bw) {
+  jpeg_comp_master* m = cinfo->master;
+  HuffmanCodeTable* coding_tables = &m->coding_tables[0];
+  int next_restart_marker = 0;
+  const ScanTokenInfo& sti = m->scan_token_info[scan_index];
+  size_t num_token_arrays = m->cur_token_array + 1;
+  size_t total_tokens = 0;
+  size_t restart_idx = 0;
+  size_t next_restart = sti.restarts[restart_idx];
+  uint8_t* context_map = m->context_map;
+  for (size_t i = 0; i < num_token_arrays; ++i) {
+    Token* tokens = m->token_arrays[i].tokens;
+    size_t num_tokens = m->token_arrays[i].num_tokens;
+    if (sti.token_offset < total_tokens + num_tokens &&
+        total_tokens < sti.token_offset + sti.num_tokens) {
+      size_t start_ix =
+          total_tokens < sti.token_offset ? sti.token_offset - total_tokens : 0;
+      size_t end_ix = std::min(sti.token_offset + sti.num_tokens - total_tokens,
+                               num_tokens);
+      size_t cycle_len = bw->len / 8;
+      size_t next_cycle = cycle_len;
+      for (size_t i = start_ix; i < end_ix; ++i) {
+        if (total_tokens + i == next_restart) {
+          JumpToByteBoundary(bw);
+          EmitMarker(bw, 0xD0 + next_restart_marker);
+          next_restart_marker += 1;
+          next_restart_marker &= 0x7;
+          next_restart = sti.restarts[++restart_idx];
+        }
+        Token t = tokens[i];
+        const HuffmanCodeTable* code = &coding_tables[context_map[t.context]];
+        WriteBits(bw, code->depth[t.symbol], code->code[t.symbol] | t.bits);
+        if (--next_cycle == 0) {
+          if (!EmptyBitWriterBuffer(bw)) {
+            JPEGLI_ERROR(
+                "Output suspension is not supported in "
+                "finish_compress");
+          }
+          next_cycle = cycle_len;
+        }
+      }
+    }
+    total_tokens += num_tokens;
+  }
+}
+
+void WriteACRefinementTokens(j_compress_ptr cinfo, int scan_index,
+                             JpegBitWriter* bw) {
+  jpeg_comp_master* m = cinfo->master;
+  const ScanTokenInfo& sti = m->scan_token_info[scan_index];
+  const uint8_t context = m->ac_ctx_offset[scan_index];
+  const HuffmanCodeTable* code = &m->coding_tables[m->context_map[context]];
+  size_t cycle_len = bw->len / 64;
+  size_t next_cycle = cycle_len;
+  size_t refbit_idx = 0;
+  size_t eobrun_idx = 0;
+  size_t restart_idx = 0;
+  size_t next_restart = sti.restarts[restart_idx];
+  int next_restart_marker = 0;
+  for (size_t i = 0; i < sti.num_tokens; ++i) {
+    if (i == next_restart) {
+      JumpToByteBoundary(bw);
+      EmitMarker(bw, 0xD0 + next_restart_marker);
+      next_restart_marker += 1;
+      next_restart_marker &= 0x7;
+      next_restart = sti.restarts[++restart_idx];
+    }
+    RefToken t = sti.tokens[i];
+    int symbol = t.symbol & 253;
+    uint16_t bits = 0;
+    if ((symbol & 1) == 0) {
+      int r = symbol >> 4;
+      if (r > 0 && r < 15) {
+        bits = sti.eobruns[eobrun_idx++];
+      }
+    } else {
+      bits = (t.symbol >> 1) & 1;
+    }
+    WriteBits(bw, code->depth[symbol], code->code[symbol] | bits);
+    for (int j = 0; j < t.refbits; ++j) {
+      WriteBits(bw, 1, sti.refbits[refbit_idx++]);
+    }
+    if (--next_cycle == 0) {
+      if (!EmptyBitWriterBuffer(bw)) {
+        JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+      }
+      next_cycle = cycle_len;
+    }
+  }
+}
+
+void WriteDCRefinementBits(j_compress_ptr cinfo, int scan_index,
+                           JpegBitWriter* bw) {
+  jpeg_comp_master* m = cinfo->master;
+  const ScanTokenInfo& sti = m->scan_token_info[scan_index];
+  size_t restart_idx = 0;
+  size_t next_restart = sti.restarts[restart_idx];
+  int next_restart_marker = 0;
+  size_t cycle_len = bw->len * 4;
+  size_t next_cycle = cycle_len;
+  size_t refbit_idx = 0;
+  for (size_t i = 0; i < sti.num_tokens; ++i) {
+    if (i == next_restart) {
+      JumpToByteBoundary(bw);
+      EmitMarker(bw, 0xD0 + next_restart_marker);
+      next_restart_marker += 1;
+      next_restart_marker &= 0x7;
+      next_restart = sti.restarts[++restart_idx];
+    }
+    WriteBits(bw, 1, sti.refbits[refbit_idx++]);
+    if (--next_cycle == 0) {
+      if (!EmptyBitWriterBuffer(bw)) {
+        JPEGLI_ERROR(
+            "Output suspension is not supported in "
+            "finish_compress");
+      }
+      next_cycle = cycle_len;
+    }
+  }
+}
+
+}  // namespace
+
+void WriteScanData(j_compress_ptr cinfo, int scan_index) {
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  JpegBitWriter* bw = &cinfo->master->bw;
+  if (scan_info->Ah == 0) {
+    WriteTokens(cinfo, scan_index, bw);
+  } else if (scan_info->Ss > 0) {
+    WriteACRefinementTokens(cinfo, scan_index, bw);
+  } else {
+    WriteDCRefinementBits(cinfo, scan_index, bw);
+  }
+  if (!bw->healthy) {
+    JPEGLI_ERROR("Unknown Huffman coded symbol found in scan %d", scan_index);
+  }
+  JumpToByteBoundary(bw);
+  if (!EmptyBitWriterBuffer(bw)) {
+    JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+  }
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/bitstream.h b/lib/jpegli/bitstream.h
new file mode 100644 (file)
index 0000000..aa54c73
--- /dev/null
@@ -0,0 +1,44 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_BITSTREAM_H_
+#define LIB_JPEGLI_BITSTREAM_H_
+
+#include <initializer_list>
+#include <vector>
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void WriteOutput(j_compress_ptr cinfo, const uint8_t* buf, size_t bufsize);
+void WriteOutput(j_compress_ptr cinfo, const std::vector<uint8_t>& bytes);
+void WriteOutput(j_compress_ptr cinfo, std::initializer_list<uint8_t> bytes);
+
+void EncodeAPP0(j_compress_ptr cinfo);
+void EncodeAPP14(j_compress_ptr cinfo);
+void WriteFileHeader(j_compress_ptr cinfo);
+
+// Returns true of only baseline 8-bit tables are used.
+bool EncodeDQT(j_compress_ptr cinfo, bool write_all_tables);
+void EncodeSOF(j_compress_ptr cinfo, bool is_baseline);
+void WriteFrameHeader(j_compress_ptr cinfo);
+
+void EncodeDRI(j_compress_ptr cinfo);
+void EncodeDHT(j_compress_ptr cinfo, size_t offset, size_t num);
+void EncodeSOS(j_compress_ptr cinfo, int scan_index);
+void WriteScanHeader(j_compress_ptr cinfo, int scan_index);
+
+void WriteBlock(const int32_t* JXL_RESTRICT symbols,
+                const int32_t* JXL_RESTRICT extra_bits, const int num_nonzeros,
+                const bool emit_eob,
+                const HuffmanCodeTable* JXL_RESTRICT dc_code,
+                const HuffmanCodeTable* JXL_RESTRICT ac_code,
+                JpegBitWriter* JXL_RESTRICT bw);
+void WriteScanData(j_compress_ptr cinfo, int scan_index);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_BITSTREAM_H_
diff --git a/lib/jpegli/color_quantize.cc b/lib/jpegli/color_quantize.cc
new file mode 100644 (file)
index 0000000..e8357e2
--- /dev/null
@@ -0,0 +1,533 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/color_quantize.h"
+
+#include <cmath>
+#include <limits>
+#include <unordered_map>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+
+namespace jpegli {
+
+namespace {
+
+static constexpr int kNumColorCellBits[kMaxComponents] = {3, 4, 3, 3};
+static constexpr int kCompW[kMaxComponents] = {2, 3, 1, 1};
+
+int Pow(int a, int b) {
+  int r = 1;
+  for (int i = 0; i < b; ++i) {
+    r *= a;
+  }
+  return r;
+}
+
+int ComponentOrder(j_decompress_ptr cinfo, int i) {
+  if (cinfo->out_color_components == 3) {
+    return i < 2 ? 1 - i : i;
+  }
+  return i;
+}
+
+int GetColorComponent(int i, int N) {
+  return (i * 255 + (N - 1) / 2) / (N - 1);
+}
+
+}  // namespace
+
+void ChooseColorMap1Pass(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  int components = cinfo->out_color_components;
+  int desired = std::min(cinfo->desired_number_of_colors, 256);
+  int num = 1;
+  while (Pow(num + 1, components) <= desired) {
+    ++num;
+  }
+  if (num == 1) {
+    JPEGLI_ERROR("Too few colors (%d) in requested colormap", desired);
+  }
+  int actual = Pow(num, components);
+  for (int i = 0; i < components; ++i) {
+    m->num_colors_[i] = num;
+  }
+  while (actual < desired) {
+    int total = actual;
+    for (int i = 0; i < components; ++i) {
+      int c = ComponentOrder(cinfo, i);
+      int new_total = (actual / m->num_colors_[c]) * (m->num_colors_[c] + 1);
+      if (new_total <= desired) {
+        ++m->num_colors_[c];
+        actual = new_total;
+      }
+    }
+    if (actual == total) {
+      break;
+    }
+  }
+  cinfo->actual_number_of_colors = actual;
+  cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+      reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE, actual, components);
+  int next_color[kMaxComponents] = {0};
+  for (int i = 0; i < actual; ++i) {
+    for (int c = 0; c < components; ++c) {
+      cinfo->colormap[c][i] =
+          GetColorComponent(next_color[c], m->num_colors_[c]);
+    }
+    int c = components - 1;
+    while (c > 0 && next_color[c] + 1 == m->num_colors_[c]) {
+      next_color[c--] = 0;
+    }
+    ++next_color[c];
+  }
+  if (!m->colormap_lut_) {
+    m->colormap_lut_ = Allocate<uint8_t>(cinfo, components * 256, JPOOL_IMAGE);
+  }
+  int stride = actual;
+  for (int c = 0; c < components; ++c) {
+    int N = m->num_colors_[c];
+    stride /= N;
+    for (int i = 0; i < 256; ++i) {
+      int index = ((2 * i - 1) * (N - 1) + 254) / 510;
+      m->colormap_lut_[c * 256 + i] = index * stride;
+    }
+  }
+}
+
+namespace {
+
+// 2^13 priority levels for the PQ seems to be a good compromise between
+// accuracy, running time and stack space usage.
+static const int kMaxPriority = 1 << 13;
+static const int kMaxLevel = 3;
+
+// This function is used in the multi-resolution grid to be able to compute
+// the keys for the different resolutions by just shifting the first key.
+inline int InterlaceBitsRGB(uint8_t r, uint8_t g, uint8_t b) {
+  int z = 0;
+  for (int i = 0; i < 7; ++i) {
+    z += (r >> 5) & 4;
+    z += (g >> 6) & 2;
+    z += (b >> 7);
+    z <<= 3;
+    r <<= 1;
+    g <<= 1;
+    b <<= 1;
+  }
+  z += (r >> 5) & 4;
+  z += (g >> 6) & 2;
+  z += (b >> 7);
+  return z;
+}
+
+// This function will compute the actual priorities of the colors based on
+// the current distance from the palette, the population count and the signals
+// from the multi-resolution grid.
+inline int Priority(int d, int n, const int* density, const int* radius) {
+  int p = d * n;
+  for (int level = 0; level < kMaxLevel; ++level) {
+    if (d > radius[level]) {
+      p += density[level] * (d - radius[level]);
+    }
+  }
+  return std::min(kMaxPriority - 1, p >> 4);
+}
+
+inline int ColorIntQuadDistanceRGB(uint8_t r1, uint8_t g1, uint8_t b1,
+                                   uint8_t r2, uint8_t g2, uint8_t b2) {
+  // weights for the intensity calculation
+  static constexpr int ired = 2;
+  static constexpr int igreen = 5;
+  static constexpr int iblue = 1;
+  // normalization factor for the intensity calculation (2^ishift)
+  static constexpr int ishift = 3;
+  const int rd = r1 - r2;
+  const int gd = g1 - g2;
+  const int bd = b1 - b2;
+  const int id = ired * rd + igreen * gd + iblue * bd;
+  return rd * rd + gd * gd + bd * bd + ((id * id) >> (2 * ishift));
+}
+
+inline int ScaleQuadDistanceRGB(int d) {
+  return static_cast<int>(sqrt(d * 0.25) + 0.5);
+}
+
+// The function updates the minimal distances, the clustering and the
+// quantization error after the insertion of the new color into the palette.
+void AddToRGBPalette(const uint8_t* red, const uint8_t* green,
+                     const uint8_t* blue,
+                     const int* count,  // histogram of colors
+                     const int index,   // index of color to be added
+                     const int k,       // size of current palette
+                     const int n,       // number of colors
+                     int* dist,         // array of distances from palette
+                     int* cluster,      // mapping of color indices to palette
+                     int* center,       // the inverse mapping
+                     int64_t* error) {  // measure of the quantization error
+  center[k] = index;
+  cluster[index] = k;
+  *error -=
+      static_cast<int64_t>(dist[index]) * static_cast<int64_t>(count[index]);
+  dist[index] = 0;
+  for (int j = 0; j < n; ++j) {
+    if (dist[j] > 0) {
+      const int d = ColorIntQuadDistanceRGB(
+          red[index], green[index], blue[index], red[j], green[j], blue[j]);
+      if (d < dist[j]) {
+        *error += static_cast<int64_t>((d - dist[j])) *
+                  static_cast<int64_t>(count[j]);
+        dist[j] = d;
+        cluster[j] = k;
+      }
+    }
+  }
+}
+
+struct RGBPixelHasher {
+  // A quick but good-enough hash to get 24 bits of RGB into the lower 12 bits.
+  size_t operator()(uint32_t a) const { return (a ^ (a >> 12)) * 0x9e3779b9; }
+};
+
+struct WangHasher {
+  // Thomas Wang's Hash.  Nearly perfect and still quite fast.  Above (for
+  // pixels) we use a simpler hash because the number of hash calls is
+  // proportional to the number of pixels and that hash dominates; we want the
+  // cost to be minimal and we start with a large table.  We can use a better
+  // hash for the histogram since the number of hash calls is proportional to
+  // the number of unique colors in the image, which is hopefully much smaller.
+  // Note that the difference is slight; e.g. replacing RGBPixelHasher with
+  // WangHasher only slows things down by 5% on an Opteron.
+  size_t operator()(uint32_t a) const {
+    a = (a ^ 61) ^ (a >> 16);
+    a = a + (a << 3);
+    a = a ^ (a >> 4);
+    a = a * 0x27d4eb2d;
+    a = a ^ (a >> 15);
+    return a;
+  }
+};
+
+// Build an index of all the different colors in the input
+// image. To do this we map the 24 bit RGB representation of the colors
+// to a unique integer index assigned to the different colors in order of
+// appearance in the image.  Return the number of unique colors found.
+// The colors are pre-quantized to 3 * 6 bits precision.
+static int BuildRGBColorIndex(const uint8_t* const image, int const num_pixels,
+                              int* const count, uint8_t* const red,
+                              uint8_t* const green, uint8_t* const blue) {
+  // Impossible because rgb are in the low 24 bits, and the upper 8 bits is 0.
+  const uint32_t impossible_pixel_value = 0x10000000;
+  std::unordered_map<uint32_t, int, RGBPixelHasher> index_map(1 << 12);
+  std::unordered_map<uint32_t, int, RGBPixelHasher>::iterator index_map_lookup;
+  const uint8_t* imagep = &image[0];
+  uint32_t prev_pixel = impossible_pixel_value;
+  int index = 0;
+  int n = 0;
+  for (int i = 0; i < num_pixels; ++i) {
+    uint8_t r = ((*imagep++) & 0xfc) + 2;
+    uint8_t g = ((*imagep++) & 0xfc) + 2;
+    uint8_t b = ((*imagep++) & 0xfc) + 2;
+    uint32_t pixel = (b << 16) | (g << 8) | r;
+    if (pixel != prev_pixel) {
+      prev_pixel = pixel;
+      index_map_lookup = index_map.find(pixel);
+      if (index_map_lookup != index_map.end()) {
+        index = index_map_lookup->second;
+      } else {
+        index_map[pixel] = index = n++;
+        red[index] = r;
+        green[index] = g;
+        blue[index] = b;
+      }
+    }
+    ++count[index];
+  }
+  return n;
+}
+
+}  // namespace
+
+void ChooseColorMap2Pass(j_decompress_ptr cinfo) {
+  if (cinfo->out_color_space != JCS_RGB) {
+    JPEGLI_ERROR("Two-pass quantizer must use RGB output color space.");
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  const size_t num_pixels = cinfo->output_width * cinfo->output_height;
+  const int max_color_count = std::max<size_t>(num_pixels, 1u << 18);
+  const int max_palette_size = cinfo->desired_number_of_colors;
+  std::unique_ptr<uint8_t[]> red(new uint8_t[max_color_count]);
+  std::unique_ptr<uint8_t[]> green(new uint8_t[max_color_count]);
+  std::unique_ptr<uint8_t[]> blue(new uint8_t[max_color_count]);
+  std::vector<int> count(max_color_count, 0);
+  // number of colors
+  int n = BuildRGBColorIndex(m->pixels_, num_pixels, &count[0], &red[0],
+                             &green[0], &blue[0]);
+
+  std::vector<int> dist(n, std::numeric_limits<int>::max());
+  std::vector<int> cluster(n);
+  std::vector<bool> in_palette(n, false);
+  int center[256];
+  int k = 0;  // palette size
+  const int count_threshold = (num_pixels * 4) / max_palette_size;
+  static constexpr int kAveragePixelErrorThreshold = 1;
+  const int64_t error_threshold = num_pixels * kAveragePixelErrorThreshold;
+  int64_t error = 0;  // quantization error
+
+  int max_count = 0;
+  int winner = 0;
+  for (int i = 0; i < n; ++i) {
+    if (count[i] > max_count) {
+      max_count = count[i];
+      winner = i;
+    }
+    if (!in_palette[i] && count[i] > count_threshold) {
+      AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], i, k++, n,
+                      &dist[0], &cluster[0], &center[0], &error);
+      in_palette[i] = true;
+    }
+  }
+  if (k == 0) {
+    AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], winner, k++, n,
+                    &dist[0], &cluster[0], &center[0], &error);
+    in_palette[winner] = true;
+  }
+
+  // Calculation of the multi-resolution density grid.
+  std::vector<int> density(n * kMaxLevel);
+  std::vector<int> radius(n * kMaxLevel);
+  std::unordered_map<uint32_t, int, WangHasher> histogram[kMaxLevel];
+  for (int level = 0; level < kMaxLevel; ++level) {
+    // This value is never used because key = InterlaceBitsRGB(...) >> 6
+  }
+
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      const int key = InterlaceBitsRGB(red[i], green[i], blue[i]) >> 6;
+      for (int level = 0; level < kMaxLevel; ++level) {
+        histogram[level][key >> (3 * level)] += count[i];
+      }
+    }
+  }
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      for (int level = 0; level < kMaxLevel; ++level) {
+        const int mask = (4 << level) - 1;
+        const int rd = std::max(red[i] & mask, mask - (red[i] & mask));
+        const int gd = std::max(green[i] & mask, mask - (green[i] & mask));
+        const int bd = std::max(blue[i] & mask, mask - (blue[i] & mask));
+        radius[i * kMaxLevel + level] =
+            ScaleQuadDistanceRGB(ColorIntQuadDistanceRGB(0, 0, 0, rd, gd, bd));
+      }
+      const int key = InterlaceBitsRGB(red[i], green[i], blue[i]) >> 6;
+      if (kMaxLevel > 0) {
+        density[i * kMaxLevel] = histogram[0][key] - count[i];
+      }
+      for (int level = 1; level < kMaxLevel; ++level) {
+        density[i * kMaxLevel + level] =
+            (histogram[level][key >> (3 * level)] -
+             histogram[level - 1][key >> (3 * level - 3)]);
+      }
+    }
+  }
+
+  // Calculate the initial error now that the palette has been initialized.
+  error = 0;
+  for (int i = 0; i < n; ++i) {
+    error += static_cast<int64_t>(dist[i]) * static_cast<int64_t>(count[i]);
+  }
+
+  std::unique_ptr<std::vector<int>[]> bucket_array(
+      new std::vector<int>[kMaxPriority]);
+  int top_priority = -1;
+  for (int i = 0; i < n; ++i) {
+    if (!in_palette[i]) {
+      int priority = Priority(ScaleQuadDistanceRGB(dist[i]), count[i],
+                              &density[i * kMaxLevel], &radius[i * kMaxLevel]);
+      bucket_array[priority].push_back(i);
+      top_priority = std::max(priority, top_priority);
+    }
+  }
+  double error_accum = 0;
+  while (top_priority >= 0 && k < max_palette_size) {
+    if (error < error_threshold) {
+      error_accum += std::min(error_threshold, error_threshold - error);
+      if (error_accum >= 10 * error_threshold) {
+        break;
+      }
+    }
+    int i = bucket_array[top_priority].back();
+    int priority = Priority(ScaleQuadDistanceRGB(dist[i]), count[i],
+                            &density[i * kMaxLevel], &radius[i * kMaxLevel]);
+    if (priority < top_priority) {
+      bucket_array[priority].push_back(i);
+    } else {
+      AddToRGBPalette(&red[0], &green[0], &blue[0], &count[0], i, k++, n,
+                      &dist[0], &cluster[0], &center[0], &error);
+    }
+    bucket_array[top_priority].pop_back();
+    while (top_priority >= 0 && bucket_array[top_priority].empty()) {
+      --top_priority;
+    }
+  }
+
+  cinfo->actual_number_of_colors = k;
+  cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+      reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE, k, 3);
+  for (int i = 0; i < k; ++i) {
+    int index = center[i];
+    cinfo->colormap[0][i] = red[index];
+    cinfo->colormap[1][i] = green[index];
+    cinfo->colormap[2][i] = blue[index];
+  }
+}
+
+namespace {
+
+void FindCandidatesForCell(j_decompress_ptr cinfo, int ncomp, int cell[],
+                           std::vector<uint8_t>* candidates) {
+  int cell_min[kMaxComponents];
+  int cell_max[kMaxComponents];
+  int cell_center[kMaxComponents];
+  for (int c = 0; c < ncomp; ++c) {
+    cell_min[c] = cell[c] << (8 - kNumColorCellBits[c]);
+    cell_max[c] = cell_min[c] + (1 << (8 - kNumColorCellBits[c])) - 1;
+    cell_center[c] = (cell_min[c] + cell_max[c]) >> 1;
+  }
+  int min_maxdist = std::numeric_limits<int>::max();
+  int mindist[256];
+  for (int i = 0; i < cinfo->actual_number_of_colors; ++i) {
+    int dmin = 0;
+    int dmax = 0;
+    for (int c = 0; c < ncomp; ++c) {
+      int palette_c = cinfo->colormap[c][i];
+      int dminc = 0, dmaxc;
+      if (palette_c < cell_min[c]) {
+        dminc = cell_min[c] - palette_c;
+        dmaxc = cell_max[c] - palette_c;
+      } else if (palette_c > cell_max[c]) {
+        dminc = palette_c - cell_max[c];
+        dmaxc = palette_c - cell_min[c];
+      } else if (palette_c > cell_center[c]) {
+        dmaxc = palette_c - cell_min[c];
+      } else {
+        dmaxc = cell_max[c] - palette_c;
+      }
+      dminc *= kCompW[c];
+      dmaxc *= kCompW[c];
+      dmin += dminc * dminc;
+      dmax += dmaxc * dmaxc;
+    }
+    mindist[i] = dmin;
+    min_maxdist = std::min(dmax, min_maxdist);
+  }
+  for (int i = 0; i < cinfo->actual_number_of_colors; ++i) {
+    if (mindist[i] < min_maxdist) {
+      candidates->push_back(i);
+    }
+  }
+}
+
+}  // namespace
+
+void CreateInverseColorMap(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  int ncomp = cinfo->out_color_components;
+  int num_cells = 1;
+  for (int c = 0; c < ncomp; ++c) {
+    num_cells *= (1 << kNumColorCellBits[c]);
+  }
+  m->candidate_lists_.resize(num_cells);
+
+  int next_cell[kMaxComponents] = {0};
+  for (int i = 0; i < num_cells; ++i) {
+    m->candidate_lists_[i].clear();
+    FindCandidatesForCell(cinfo, ncomp, next_cell, &m->candidate_lists_[i]);
+    int c = ncomp - 1;
+    while (c > 0 && next_cell[c] + 1 == (1 << kNumColorCellBits[c])) {
+      next_cell[c--] = 0;
+    }
+    ++next_cell[c];
+  }
+  m->regenerate_inverse_colormap_ = false;
+}
+
+int LookupColorIndex(j_decompress_ptr cinfo, JSAMPLE* pixel) {
+  jpeg_decomp_master* m = cinfo->master;
+  int num_channels = cinfo->out_color_components;
+  int index = 0;
+  if (m->quant_mode_ == 1) {
+    for (int c = 0; c < num_channels; ++c) {
+      index += m->colormap_lut_[c * 256 + pixel[c]];
+    }
+  } else {
+    size_t cell_idx = 0;
+    size_t stride = 1;
+    for (int c = num_channels - 1; c >= 0; --c) {
+      cell_idx += (pixel[c] >> (8 - kNumColorCellBits[c])) * stride;
+      stride <<= kNumColorCellBits[c];
+    }
+    JXL_ASSERT(cell_idx < m->candidate_lists_.size());
+    int mindist = std::numeric_limits<int>::max();
+    const auto& candidates = m->candidate_lists_[cell_idx];
+    for (uint8_t i : candidates) {
+      int dist = 0;
+      for (int c = 0; c < num_channels; ++c) {
+        int d = (cinfo->colormap[c][i] - pixel[c]) * kCompW[c];
+        dist += d * d;
+      }
+      if (dist < mindist) {
+        mindist = dist;
+        index = i;
+      }
+    }
+  }
+  JXL_ASSERT(index < cinfo->actual_number_of_colors);
+  return index;
+}
+
+void CreateOrderedDitherTables(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  static constexpr size_t kDitherSize = 4;
+  static constexpr size_t kDitherMask = kDitherSize - 1;
+  static constexpr float kBaseDitherMatrix[] = {
+      0,  8,  2,  10,  //
+      12, 4,  14, 6,   //
+      3,  11, 1,  9,   //
+      15, 7,  13, 5,   //
+  };
+  m->dither_size_ = kDitherSize;
+  m->dither_mask_ = kDitherMask;
+  size_t ncells = m->dither_size_ * m->dither_size_;
+  for (int c = 0; c < cinfo->out_color_components; ++c) {
+    float spread = 1.0f / (m->num_colors_[c] - 1);
+    float mul = spread / ncells;
+    float offset = 0.5f * spread;
+    if (m->dither_[c] == nullptr) {
+      m->dither_[c] = Allocate<float>(cinfo, ncells, JPOOL_IMAGE_ALIGNED);
+    }
+    for (size_t idx = 0; idx < ncells; ++idx) {
+      m->dither_[c][idx] = kBaseDitherMatrix[idx] * mul - offset;
+    }
+  }
+}
+
+void InitFSDitherState(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->out_color_components; ++c) {
+    if (m->error_row_[c] == nullptr) {
+      m->error_row_[c] =
+          Allocate<float>(cinfo, cinfo->output_width, JPOOL_IMAGE_ALIGNED);
+      m->error_row_[c + kMaxComponents] =
+          Allocate<float>(cinfo, cinfo->output_width, JPOOL_IMAGE_ALIGNED);
+    }
+    memset(m->error_row_[c], 0.0, cinfo->output_width * sizeof(float));
+    memset(m->error_row_[c + kMaxComponents], 0.0,
+           cinfo->output_width * sizeof(float));
+  }
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/color_quantize.h b/lib/jpegli/color_quantize.h
new file mode 100644 (file)
index 0000000..3dda1d8
--- /dev/null
@@ -0,0 +1,27 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COLOR_QUANTIZE_H_
+#define LIB_JPEGLI_COLOR_QUANTIZE_H_
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+void ChooseColorMap1Pass(j_decompress_ptr cinfo);
+
+void ChooseColorMap2Pass(j_decompress_ptr cinfo);
+
+void CreateInverseColorMap(j_decompress_ptr cinfo);
+
+void CreateOrderedDitherTables(j_decompress_ptr cinfo);
+
+void InitFSDitherState(j_decompress_ptr cinfo);
+
+int LookupColorIndex(j_decompress_ptr cinfo, JSAMPLE* pixel);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COLOR_QUANTIZE_H_
diff --git a/lib/jpegli/color_transform.cc b/lib/jpegli/color_transform.cc
new file mode 100644 (file)
index 0000000..020a6fd
--- /dev/null
@@ -0,0 +1,281 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/color_transform.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/color_transform.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Sub;
+
+void YCbCrToRGB(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto crcr = Set(df, 1.402f);
+  const auto cgcb = Set(df, -0.114f * 1.772f / 0.587f);
+  const auto cgcr = Set(df, -0.299f * 1.402f / 0.587f);
+  const auto cbcb = Set(df, 1.772f);
+
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    const auto y_vec = Load(df, row0 + x);
+    const auto cb_vec = Load(df, row1 + x);
+    const auto cr_vec = Load(df, row2 + x);
+    const auto r_vec = MulAdd(crcr, cr_vec, y_vec);
+    const auto g_vec = MulAdd(cgcr, cr_vec, MulAdd(cgcb, cb_vec, y_vec));
+    const auto b_vec = MulAdd(cbcb, cb_vec, y_vec);
+    Store(r_vec, df, row0 + x);
+    Store(g_vec, df, row1 + x);
+    Store(b_vec, df, row2 + x);
+  }
+}
+
+void YCCKToCMYK(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+  YCbCrToRGB(row, xsize);
+  const auto offset = Set(df, -1.0f / 255.0f);
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    Store(Sub(offset, Load(df, row0 + x)), df, row0 + x);
+    Store(Sub(offset, Load(df, row1 + x)), df, row1 + x);
+    Store(Sub(offset, Load(df, row2 + x)), df, row2 + x);
+  }
+}
+
+void RGBToYCbCr(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+  // Full-range BT.601 as defined by JFIF Clause 7:
+  // https://www.itu.int/rec/T-REC-T.871-201105-I/en
+  const auto c128 = Set(df, 128.0f);
+  const auto kR = Set(df, 0.299f);  // NTSC luma
+  const auto kG = Set(df, 0.587f);
+  const auto kB = Set(df, 0.114f);
+  const auto kAmpR = Set(df, 0.701f);
+  const auto kAmpB = Set(df, 0.886f);
+  const auto kDiffR = Add(kAmpR, kR);
+  const auto kDiffB = Add(kAmpB, kB);
+  const auto kNormR = Div(Set(df, 1.0f), (Add(kAmpR, Add(kG, kB))));
+  const auto kNormB = Div(Set(df, 1.0f), (Add(kR, Add(kG, kAmpB))));
+
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    const auto r = Load(df, row0 + x);
+    const auto g = Load(df, row1 + x);
+    const auto b = Load(df, row2 + x);
+    const auto r_base = Mul(r, kR);
+    const auto r_diff = Mul(r, kDiffR);
+    const auto g_base = Mul(g, kG);
+    const auto b_base = Mul(b, kB);
+    const auto b_diff = Mul(b, kDiffB);
+    const auto y_base = Add(r_base, Add(g_base, b_base));
+    const auto cb_vec = MulAdd(Sub(b_diff, y_base), kNormB, c128);
+    const auto cr_vec = MulAdd(Sub(r_diff, y_base), kNormR, c128);
+    Store(y_base, df, row0 + x);
+    Store(cb_vec, df, row1 + x);
+    Store(cr_vec, df, row2 + x);
+  }
+}
+
+void CMYKToYCCK(float* row[kMaxComponents], size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  float* JXL_RESTRICT row0 = row[0];
+  float* JXL_RESTRICT row1 = row[1];
+  float* JXL_RESTRICT row2 = row[2];
+  const auto unity = Set(df, 255.0f);
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    Store(Sub(unity, Load(df, row0 + x)), df, row0 + x);
+    Store(Sub(unity, Load(df, row1 + x)), df, row1 + x);
+    Store(Sub(unity, Load(df, row2 + x)), df, row2 + x);
+  }
+  RGBToYCbCr(row, xsize);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(CMYKToYCCK);
+HWY_EXPORT(YCCKToCMYK);
+HWY_EXPORT(YCbCrToRGB);
+HWY_EXPORT(RGBToYCbCr);
+
+bool CheckColorSpaceComponents(int num_components, J_COLOR_SPACE colorspace) {
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      return num_components == 1;
+    case JCS_RGB:
+    case JCS_YCbCr:
+    case JCS_EXT_RGB:
+    case JCS_EXT_BGR:
+      return num_components == 3;
+    case JCS_CMYK:
+    case JCS_YCCK:
+    case JCS_EXT_RGBX:
+    case JCS_EXT_BGRX:
+    case JCS_EXT_XBGR:
+    case JCS_EXT_XRGB:
+    case JCS_EXT_RGBA:
+    case JCS_EXT_BGRA:
+    case JCS_EXT_ABGR:
+    case JCS_EXT_ARGB:
+      return num_components == 4;
+    default:
+      // Unrecognized colorspaces can have any number of channels, since no
+      // color transform will be performed on them.
+      return true;
+  }
+}
+
+void NullTransform(float* row[kMaxComponents], size_t len) {}
+
+void GrayscaleToRGB(float* row[kMaxComponents], size_t len) {
+  memcpy(row[1], row[0], len * sizeof(row[1][0]));
+  memcpy(row[2], row[0], len * sizeof(row[2][0]));
+}
+
+void GrayscaleToYCbCr(float* row[kMaxComponents], size_t len) {
+  memset(row[1], 0, len * sizeof(row[1][0]));
+  memset(row[2], 0, len * sizeof(row[2][0]));
+}
+
+void ChooseColorTransform(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  if (!CheckColorSpaceComponents(cinfo->input_components,
+                                 cinfo->in_color_space)) {
+    JPEGLI_ERROR("Invalid number of input components %d for colorspace %d",
+                 cinfo->input_components, cinfo->in_color_space);
+  }
+  if (!CheckColorSpaceComponents(cinfo->num_components,
+                                 cinfo->jpeg_color_space)) {
+    JPEGLI_ERROR("Invalid number of components %d for colorspace %d",
+                 cinfo->num_components, cinfo->jpeg_color_space);
+  }
+  if (cinfo->jpeg_color_space == cinfo->in_color_space) {
+    if (cinfo->num_components != cinfo->input_components) {
+      JPEGLI_ERROR("Input/output components mismatch:  %d vs %d",
+                   cinfo->input_components, cinfo->num_components);
+    }
+    // No color transform requested.
+    m->color_transform = NullTransform;
+    return;
+  }
+
+  if (cinfo->in_color_space == JCS_RGB && m->xyb_mode) {
+    JPEGLI_ERROR("Color transform on XYB colorspace is not supported.");
+  }
+
+  m->color_transform = nullptr;
+  if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+    if (cinfo->in_color_space == JCS_RGB) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+    } else if (cinfo->in_color_space == JCS_YCbCr ||
+               cinfo->in_color_space == JCS_YCCK) {
+      // Since the first luminance channel is the grayscale version of the
+      // image, nothing to do here
+      m->color_transform = NullTransform;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_RGB) {
+    if (cinfo->in_color_space == JCS_GRAYSCALE) {
+      m->color_transform = GrayscaleToRGB;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr) {
+    if (cinfo->in_color_space == JCS_RGB) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+    } else if (cinfo->in_color_space == JCS_GRAYSCALE) {
+      m->color_transform = GrayscaleToYCbCr;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCCK) {
+    if (cinfo->in_color_space == JCS_CMYK) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(CMYKToYCCK);
+    }
+  }
+
+  if (m->color_transform == nullptr) {
+    // TODO(szabadka) Support more color transforms.
+    JPEGLI_ERROR("Unsupported color transform %d -> %d", cinfo->in_color_space,
+                 cinfo->jpeg_color_space);
+  }
+}
+
+void ChooseColorTransform(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!CheckColorSpaceComponents(cinfo->out_color_components,
+                                 cinfo->out_color_space)) {
+    JPEGLI_ERROR("Invalid number of output components %d for colorspace %d",
+                 cinfo->out_color_components, cinfo->out_color_space);
+  }
+  if (!CheckColorSpaceComponents(cinfo->num_components,
+                                 cinfo->jpeg_color_space)) {
+    JPEGLI_ERROR("Invalid number of components %d for colorspace %d",
+                 cinfo->num_components, cinfo->jpeg_color_space);
+  }
+  if (cinfo->jpeg_color_space == cinfo->out_color_space) {
+    if (cinfo->num_components != cinfo->out_color_components) {
+      JPEGLI_ERROR("Input/output components mismatch:  %d vs %d",
+                   cinfo->num_components, cinfo->out_color_components);
+    }
+    // No color transform requested.
+    m->color_transform = NullTransform;
+    return;
+  }
+
+  m->color_transform = nullptr;
+  if (cinfo->jpeg_color_space == JCS_GRAYSCALE) {
+    if (cinfo->out_color_space == JCS_RGB) {
+      m->color_transform = GrayscaleToRGB;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_RGB) {
+    if (cinfo->out_color_space == JCS_GRAYSCALE) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(RGBToYCbCr);
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr) {
+    if (cinfo->out_color_space == JCS_RGB) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(YCbCrToRGB);
+    } else if (cinfo->out_color_space == JCS_GRAYSCALE) {
+      m->color_transform = NullTransform;
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCCK) {
+    if (cinfo->out_color_space == JCS_CMYK) {
+      m->color_transform = HWY_DYNAMIC_DISPATCH(YCCKToCMYK);
+    }
+  }
+
+  if (m->color_transform == nullptr) {
+    // TODO(szabadka) Support more color transforms.
+    JPEGLI_ERROR("Unsupported color transform %d -> %d",
+                 cinfo->jpeg_color_space, cinfo->out_color_space);
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/color_transform.h b/lib/jpegli/color_transform.h
new file mode 100644 (file)
index 0000000..8d58f88
--- /dev/null
@@ -0,0 +1,20 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COLOR_TRANSFORM_H_
+#define LIB_JPEGLI_COLOR_TRANSFORM_H_
+
+#include "lib/jpegli/common.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void ChooseColorTransform(j_compress_ptr cinfo);
+
+void ChooseColorTransform(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COLOR_TRANSFORM_H_
diff --git a/lib/jpegli/common.cc b/lib/jpegli/common.cc
new file mode 100644 (file)
index 0000000..5f34372
--- /dev/null
@@ -0,0 +1,59 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/common.h"
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/memory_manager.h"
+
+void jpegli_abort(j_common_ptr cinfo) {
+  if (cinfo->mem == nullptr) return;
+  for (int pool_id = 0; pool_id < JPOOL_NUMPOOLS; ++pool_id) {
+    if (pool_id == JPOOL_PERMANENT) continue;
+    (*cinfo->mem->free_pool)(cinfo, pool_id);
+  }
+  if (cinfo->is_decompressor) {
+    cinfo->global_state = jpegli::kDecStart;
+  } else {
+    cinfo->global_state = jpegli::kEncStart;
+  }
+}
+
+void jpegli_destroy(j_common_ptr cinfo) {
+  if (cinfo->mem == nullptr) return;
+  (*cinfo->mem->self_destruct)(cinfo);
+  if (cinfo->is_decompressor) {
+    cinfo->global_state = jpegli::kDecNull;
+    delete reinterpret_cast<j_decompress_ptr>(cinfo)->master;
+  } else {
+    cinfo->global_state = jpegli::kEncNull;
+  }
+}
+
+JQUANT_TBL* jpegli_alloc_quant_table(j_common_ptr cinfo) {
+  JQUANT_TBL* table = jpegli::Allocate<JQUANT_TBL>(cinfo, 1);
+  table->sent_table = FALSE;
+  return table;
+}
+
+JHUFF_TBL* jpegli_alloc_huff_table(j_common_ptr cinfo) {
+  JHUFF_TBL* table = jpegli::Allocate<JHUFF_TBL>(cinfo, 1);
+  table->sent_table = FALSE;
+  return table;
+}
+
+int jpegli_bytes_per_sample(JpegliDataType data_type) {
+  switch (data_type) {
+    case JPEGLI_TYPE_UINT8:
+      return 1;
+    case JPEGLI_TYPE_UINT16:
+      return 2;
+    case JPEGLI_TYPE_FLOAT:
+      return 4;
+    default:
+      return 0;
+  }
+}
diff --git a/lib/jpegli/common.h b/lib/jpegli/common.h
new file mode 100644 (file)
index 0000000..42487f2
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains the C API of the common encoder/decoder part of libjpegli
+// library, which is based on the C API of libjpeg, with the function names
+// changed from jpeg_* to jpegli_*, while compressor and decompressor object
+// definitions are included directly from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_COMMON_H_
+#define LIB_JPEGLI_COMMON_H_
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+/* clang-format on */
+
+#include "lib/jpegli/types.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+struct jpeg_error_mgr* jpegli_std_error(struct jpeg_error_mgr* err);
+
+void jpegli_abort(j_common_ptr cinfo);
+
+void jpegli_destroy(j_common_ptr cinfo);
+
+JQUANT_TBL* jpegli_alloc_quant_table(j_common_ptr cinfo);
+
+JHUFF_TBL* jpegli_alloc_huff_table(j_common_ptr cinfo);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_COMMON_H_
diff --git a/lib/jpegli/common_internal.h b/lib/jpegli/common_internal.h
new file mode 100644 (file)
index 0000000..248d315
--- /dev/null
@@ -0,0 +1,150 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_COMMON_INTERNAL_H_
+#define LIB_JPEGLI_COMMON_INTERNAL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <hwy/aligned_allocator.h>
+
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/simd.h"
+#include "lib/jxl/base/compiler_specific.h"  // for ssize_t
+#include "lib/jxl/base/status.h"             // for JXL_CHECK
+
+namespace jpegli {
+
+enum State {
+  kDecNull,
+  kDecStart,
+  kDecInHeader,
+  kDecHeaderDone,
+  kDecProcessMarkers,
+  kDecProcessScan,
+  kEncNull,
+  kEncStart,
+  kEncHeader,
+  kEncReadImage,
+  kEncWriteCoeffs,
+};
+
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+template <typename T1, typename T2>
+constexpr inline T1 RoundUpTo(T1 a, T2 b) {
+  return DivCeil(a, b) * b;
+}
+
+constexpr size_t kDCTBlockSize = 64;
+// This is set to the same value as MAX_COMPS_IN_SCAN, because that is the
+// maximum number of channels the libjpeg-turbo decoder can decode.
+constexpr int kMaxComponents = 4;
+constexpr int kMaxQuantTables = 4;
+constexpr int kJpegPrecision = 8;
+constexpr int kMaxHuffmanTables = 4;
+constexpr size_t kJpegHuffmanMaxBitLength = 16;
+constexpr int kJpegHuffmanAlphabetSize = 256;
+constexpr int kJpegDCAlphabetSize = 12;
+constexpr int kMaxDHTMarkers = 512;
+constexpr int kMaxDimPixels = 65535;
+constexpr uint8_t kApp1 = 0xE1;
+constexpr uint8_t kApp2 = 0xE2;
+const uint8_t kIccProfileTag[12] = "ICC_PROFILE";
+const uint8_t kExifTag[6] = "Exif\0";
+const uint8_t kXMPTag[29] = "http://ns.adobe.com/xap/1.0/";
+
+/* clang-format off */
+constexpr uint32_t kJPEGNaturalOrder[80] = {
+  0,   1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+  // extra entries for safety in decoder
+  63, 63, 63, 63, 63, 63, 63, 63,
+  63, 63, 63, 63, 63, 63, 63, 63
+};
+
+constexpr uint32_t kJPEGZigZagOrder[64] = {
+  0,   1,  5,  6, 14, 15, 27, 28,
+  2,   4,  7, 13, 16, 26, 29, 42,
+  3,   8, 12, 17, 25, 30, 41, 43,
+  9,  11, 18, 24, 31, 40, 44, 53,
+  10, 19, 23, 32, 39, 45, 52, 54,
+  20, 22, 33, 38, 46, 51, 55, 60,
+  21, 34, 37, 47, 50, 56, 59, 61,
+  35, 36, 48, 49, 57, 58, 62, 63
+};
+/* clang-format on */
+
+template <typename T>
+class RowBuffer {
+ public:
+  template <typename CInfoType>
+  void Allocate(CInfoType cinfo, size_t num_rows, size_t rowsize) {
+    size_t vec_size = std::max(VectorSize(), sizeof(T));
+    JXL_CHECK(vec_size % sizeof(T) == 0);
+    size_t alignment = std::max<size_t>(HWY_ALIGNMENT, vec_size);
+    size_t min_memstride = alignment + rowsize * sizeof(T) + vec_size;
+    size_t memstride = RoundUpTo(min_memstride, alignment);
+    xsize_ = rowsize;
+    ysize_ = num_rows;
+    stride_ = memstride / sizeof(T);
+    offset_ = alignment / sizeof(T);
+    data_ = ::jpegli::Allocate<T>(cinfo, ysize_ * stride_, JPOOL_IMAGE_ALIGNED);
+  }
+
+  T* Row(ssize_t y) const {
+    return &data_[((ysize_ + y) % ysize_) * stride_ + offset_];
+  }
+
+  size_t xsize() const { return xsize_; };
+  size_t ysize() const { return ysize_; };
+  size_t stride() const { return stride_; }
+
+  void PadRow(size_t y, size_t from, int border) {
+    float* row = Row(y);
+    for (int offset = -border; offset < 0; ++offset) {
+      row[offset] = row[0];
+    }
+    float last_val = row[from - 1];
+    for (size_t x = from; x < xsize_ + border; ++x) {
+      row[x] = last_val;
+    }
+  }
+
+  void CopyRow(ssize_t dst_row, ssize_t src_row, int border) {
+    memcpy(Row(dst_row) - border, Row(src_row) - border,
+           (xsize_ + 2 * border) * sizeof(T));
+  }
+
+  void FillRow(ssize_t y, T val, size_t len) {
+    T* row = Row(y);
+    for (size_t x = 0; x < len; ++x) {
+      row[x] = val;
+    }
+  }
+
+ private:
+  size_t xsize_;
+  size_t ysize_;
+  size_t stride_;
+  size_t offset_;
+  T* data_;
+};
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_COMMON_INTERNAL_H_
diff --git a/lib/jpegli/dct-inl.h b/lib/jpegli/dct-inl.h
new file mode 100644 (file)
index 0000000..1cbe704
--- /dev/null
@@ -0,0 +1,258 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JPEGLI_DCT_INL_H_
+#undef LIB_JPEGLI_DCT_INL_H_
+#else
+#define LIB_JPEGLI_DCT_INL_H_
+#endif
+
+#include "lib/jpegli/transpose-inl.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::DemoteTo;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Round;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+
+template <size_t N>
+void AddReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
+                float* JXL_RESTRICT aout) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N; i++) {
+    auto in1 = Load(d8, ain1 + i * 8);
+    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
+    Store(Add(in1, in2), d8, aout + i * 8);
+  }
+}
+
+template <size_t N>
+void SubReverse(const float* JXL_RESTRICT ain1, const float* JXL_RESTRICT ain2,
+                float* JXL_RESTRICT aout) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N; i++) {
+    auto in1 = Load(d8, ain1 + i * 8);
+    auto in2 = Load(d8, ain2 + (N - i - 1) * 8);
+    Store(Sub(in1, in2), d8, aout + i * 8);
+  }
+}
+
+template <size_t N>
+void B(float* JXL_RESTRICT coeff) {
+  HWY_CAPPED(float, 8) d8;
+  constexpr float kSqrt2 = 1.41421356237f;
+  auto sqrt2 = Set(d8, kSqrt2);
+  auto in1 = Load(d8, coeff);
+  auto in2 = Load(d8, coeff + 8);
+  Store(MulAdd(in1, sqrt2, in2), d8, coeff);
+  for (size_t i = 1; i + 1 < N; i++) {
+    auto in1 = Load(d8, coeff + i * 8);
+    auto in2 = Load(d8, coeff + (i + 1) * 8);
+    Store(Add(in1, in2), d8, coeff + i * 8);
+  }
+}
+
+// Ideally optimized away by compiler (except the multiply).
+template <size_t N>
+void InverseEvenOdd(const float* JXL_RESTRICT ain, float* JXL_RESTRICT aout) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N / 2; i++) {
+    auto in1 = Load(d8, ain + i * 8);
+    Store(in1, d8, aout + 2 * i * 8);
+  }
+  for (size_t i = N / 2; i < N; i++) {
+    auto in1 = Load(d8, ain + i * 8);
+    Store(in1, d8, aout + (2 * (i - N / 2) + 1) * 8);
+  }
+}
+
+// Constants for DCT implementation. Generated by the following snippet:
+// for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;
+
+template <>
+struct WcMultipliers<4> {
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,
+      1.3065629648763764,
+  };
+};
+
+template <>
+struct WcMultipliers<8> {
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,
+      0.6013448869350453,
+      0.8999762231364156,
+      2.5629154477415055,
+  };
+};
+
+constexpr float WcMultipliers<4>::kMultipliers[];
+constexpr float WcMultipliers<8>::kMultipliers[];
+
+// Invoked on full vector.
+template <size_t N>
+void Multiply(float* JXL_RESTRICT coeff) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < N / 2; i++) {
+    auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
+    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
+    Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
+  }
+}
+
+void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+                   size_t off, float* JXL_RESTRICT coeff) {
+  HWY_CAPPED(float, 8) d8;
+  for (size_t i = 0; i < 8; i++) {
+    Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
+  }
+}
+
+void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
+                          size_t off) {
+  HWY_CAPPED(float, 8) d8;
+  auto mul = Set(d8, 1.0f / 8);
+  for (size_t i = 0; i < 8; i++) {
+    StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
+  }
+}
+
+template <size_t N>
+struct DCT1DImpl;
+
+template <>
+struct DCT1DImpl<1> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
+};
+
+template <>
+struct DCT1DImpl<2> {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+    HWY_CAPPED(float, 8) d8;
+    auto in1 = Load(d8, mem);
+    auto in2 = Load(d8, mem + 8);
+    Store(Add(in1, in2), d8, mem);
+    Store(Sub(in1, in2), d8, mem + 8);
+  }
+};
+
+template <size_t N>
+struct DCT1DImpl {
+  void operator()(float* JXL_RESTRICT mem) {
+    HWY_ALIGN float tmp[N * 8];
+    AddReverse<N / 2>(mem, mem + N * 4, tmp);
+    DCT1DImpl<N / 2>()(tmp);
+    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
+    Multiply<N>(tmp);
+    DCT1DImpl<N / 2>()(tmp + N * 4);
+    B<N / 2>(tmp + N * 4);
+    InverseEvenOdd<N>(tmp, mem);
+  }
+};
+
+void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
+           float* JXL_RESTRICT output) {
+  HWY_CAPPED(float, 8) d8;
+  HWY_ALIGN float tmp[64];
+  for (size_t i = 0; i < 8; i += Lanes(d8)) {
+    // TODO(veluca): consider removing the temporary memory here (as is done in
+    // IDCT), if it turns out that some compilers don't optimize away the loads
+    // and this is performance-critical.
+    LoadFromBlock(pixels, pixels_stride, i, tmp);
+    DCT1DImpl<8>()(tmp);
+    StoreToBlockAndScale(tmp, output, i);
+  }
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels(
+    const float* JXL_RESTRICT pixels, size_t pixels_stride,
+    float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) {
+  DCT1D(pixels, pixels_stride, scratch_space);
+  Transpose8x8Block(scratch_space, coefficients);
+  DCT1D(coefficients, 8, scratch_space);
+  Transpose8x8Block(scratch_space, coefficients);
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
+                                                            int16_t* out) {
+  Rebind<int16_t, DI> di16;
+  Store(DemoteTo(di16, ival), di16, out);
+}
+
+static JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
+                                                            int32_t* out) {
+  DI di;
+  Store(ival, di, out);
+}
+
+template <typename T>
+void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
+                   const float* zero_bias_offset, const float* zero_bias_mul,
+                   T* block) {
+  D d;
+  DI di;
+  const auto aq_mul = Set(d, aq_strength);
+  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
+    const auto val = Load(d, dct + k);
+    const auto q = Load(d, qmc + k);
+    const auto qval = Mul(val, q);
+    const auto zb_offset = Load(d, zero_bias_offset + k);
+    const auto zb_mul = Load(d, zero_bias_mul + k);
+    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
+    const auto nzero_mask = Ge(Abs(qval), threshold);
+    const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
+    StoreQuantizedValue(ival, block + k);
+  }
+}
+
+template <typename T>
+void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
+                             const float* JXL_RESTRICT qmc,
+                             int16_t last_dc_coeff, float aq_strength,
+                             const float* zero_bias_offset,
+                             const float* zero_bias_mul,
+                             float* JXL_RESTRICT tmp, T* block) {
+  float* JXL_RESTRICT dct = tmp;
+  float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
+  TransformFromPixels(pixels, stride, dct, scratch_space);
+  QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block);
+  // Center DC values around zero.
+  static constexpr float kDCBias = 128.0f;
+  const float dc = (dct[0] - kDCBias) * qmc[0];
+  float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0];
+  if (std::abs(dc - last_dc_coeff) < dc_threshold) {
+    block[0] = last_dc_coeff;
+  } else {
+    block[0] = std::round(dc);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JPEGLI_DCT_INL_H_
diff --git a/lib/jpegli/decode.cc b/lib/jpegli/decode.cc
new file mode 100644 (file)
index 0000000..758babe
--- /dev/null
@@ -0,0 +1,1028 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+
+#include <string.h>
+
+#include <vector>
+
+#include "lib/jpegli/color_quantize.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/decode_marker.h"
+#include "lib/jpegli/decode_scan.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/render.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+
+void InitializeImage(j_decompress_ptr cinfo) {
+  cinfo->restart_interval = 0;
+  cinfo->saw_JFIF_marker = FALSE;
+  cinfo->JFIF_major_version = 1;
+  cinfo->JFIF_minor_version = 1;
+  cinfo->density_unit = 0;
+  cinfo->X_density = 1;
+  cinfo->Y_density = 1;
+  cinfo->saw_Adobe_marker = FALSE;
+  cinfo->Adobe_transform = 0;
+  cinfo->CCIR601_sampling = FALSE;  // not used
+  cinfo->marker_list = nullptr;
+  cinfo->comp_info = nullptr;
+  cinfo->input_scan_number = 0;
+  cinfo->input_iMCU_row = 0;
+  cinfo->output_scan_number = 0;
+  cinfo->output_iMCU_row = 0;
+  cinfo->output_scanline = 0;
+  cinfo->unread_marker = 0;
+  cinfo->coef_bits = nullptr;
+  // We set all these to zero since we don't yet support arithmetic coding.
+  memset(cinfo->arith_dc_L, 0, sizeof(cinfo->arith_dc_L));
+  memset(cinfo->arith_dc_U, 0, sizeof(cinfo->arith_dc_U));
+  memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K));
+  // Initialize the private fields.
+  jpeg_decomp_master* m = cinfo->master;
+  m->input_buffer_.clear();
+  m->input_buffer_pos_ = 0;
+  m->codestream_bits_ahead_ = 0;
+  m->is_multiscan_ = false;
+  m->found_soi_ = false;
+  m->found_dri_ = false;
+  m->found_sof_ = false;
+  m->found_eoi_ = false;
+  m->icc_index_ = 0;
+  m->icc_total_ = 0;
+  m->icc_profile_.clear();
+  memset(m->dc_huff_lut_, 0, sizeof(m->dc_huff_lut_));
+  memset(m->ac_huff_lut_, 0, sizeof(m->ac_huff_lut_));
+  // Initialize the values to an invalid symbol so that we can recognize it
+  // when reading the bit stream using a Huffman code with space > 0.
+  for (size_t i = 0; i < kAllHuffLutSize; ++i) {
+    m->dc_huff_lut_[i].bits = 0;
+    m->dc_huff_lut_[i].value = 0xffff;
+    m->ac_huff_lut_[i].bits = 0;
+    m->ac_huff_lut_[i].value = 0xffff;
+  }
+  m->colormap_lut_ = nullptr;
+  m->pixels_ = nullptr;
+  m->scanlines_ = nullptr;
+  m->regenerate_inverse_colormap_ = true;
+  for (int i = 0; i < kMaxComponents; ++i) {
+    m->dither_[i] = nullptr;
+    m->error_row_[i] = nullptr;
+  }
+  m->output_passes_done_ = 0;
+  m->xoffset_ = 0;
+  m->dequant_ = nullptr;
+}
+
+void InitializeDecompressParams(j_decompress_ptr cinfo) {
+  cinfo->jpeg_color_space = JCS_UNKNOWN;
+  cinfo->out_color_space = JCS_UNKNOWN;
+  cinfo->scale_num = 1;
+  cinfo->scale_denom = 1;
+  cinfo->output_gamma = 0.0f;
+  cinfo->buffered_image = FALSE;
+  cinfo->raw_data_out = FALSE;
+  cinfo->dct_method = JDCT_DEFAULT;
+  cinfo->do_fancy_upsampling = TRUE;
+  cinfo->do_block_smoothing = TRUE;
+  cinfo->quantize_colors = FALSE;
+  cinfo->dither_mode = JDITHER_FS;
+  cinfo->two_pass_quantize = TRUE;
+  cinfo->desired_number_of_colors = 256;
+  cinfo->enable_1pass_quant = FALSE;
+  cinfo->enable_external_quant = FALSE;
+  cinfo->enable_2pass_quant = FALSE;
+  cinfo->actual_number_of_colors = 0;
+  cinfo->colormap = nullptr;
+}
+
+void InitProgressMonitor(j_decompress_ptr cinfo, bool coef_only) {
+  if (!cinfo->progress) return;
+  jpeg_decomp_master* m = cinfo->master;
+  int nc = cinfo->num_components;
+  int estimated_num_scans =
+      cinfo->progressive_mode ? 2 + 3 * nc : (m->is_multiscan_ ? nc : 1);
+  cinfo->progress->pass_limit = cinfo->total_iMCU_rows * estimated_num_scans;
+  cinfo->progress->pass_counter = 0;
+  if (coef_only) {
+    cinfo->progress->total_passes = 1;
+  } else {
+    int input_passes = !cinfo->buffered_image && m->is_multiscan_ ? 1 : 0;
+    bool two_pass_quant = cinfo->quantize_colors && !cinfo->colormap &&
+                          cinfo->two_pass_quantize && cinfo->enable_2pass_quant;
+    cinfo->progress->total_passes = input_passes + (two_pass_quant ? 2 : 1);
+  }
+  cinfo->progress->completed_passes = 0;
+}
+
+void InitProgressMonitorForOutput(j_decompress_ptr cinfo) {
+  if (!cinfo->progress) return;
+  jpeg_decomp_master* m = cinfo->master;
+  int passes_per_output = cinfo->enable_2pass_quant ? 2 : 1;
+  int output_passes_left = cinfo->buffered_image && !m->found_eoi_ ? 2 : 1;
+  cinfo->progress->total_passes =
+      m->output_passes_done_ + passes_per_output * output_passes_left;
+  cinfo->progress->completed_passes = m->output_passes_done_;
+}
+
+void ProgressMonitorInputPass(j_decompress_ptr cinfo) {
+  if (!cinfo->progress) return;
+  cinfo->progress->pass_counter =
+      ((cinfo->input_scan_number - 1) * cinfo->total_iMCU_rows +
+       cinfo->input_iMCU_row);
+  if (cinfo->progress->pass_counter > cinfo->progress->pass_limit) {
+    cinfo->progress->pass_limit =
+        cinfo->input_scan_number * cinfo->total_iMCU_rows;
+  }
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void ProgressMonitorOutputPass(j_decompress_ptr cinfo) {
+  if (!cinfo->progress) return;
+  jpeg_decomp_master* m = cinfo->master;
+  int input_passes = !cinfo->buffered_image && m->is_multiscan_ ? 1 : 0;
+  cinfo->progress->pass_counter = cinfo->output_scanline;
+  cinfo->progress->pass_limit = cinfo->output_height;
+  cinfo->progress->completed_passes = input_passes + m->output_passes_done_;
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void BuildHuffmanLookupTable(j_decompress_ptr cinfo, JHUFF_TBL* table,
+                             HuffmanTableEntry* huff_lut) {
+  uint32_t counts[kJpegHuffmanMaxBitLength + 1] = {};
+  counts[0] = 0;
+  int total_count = 0;
+  int space = 1 << kJpegHuffmanMaxBitLength;
+  int max_depth = 1;
+  for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+    int count = table->bits[i];
+    if (count != 0) {
+      max_depth = i;
+    }
+    counts[i] = count;
+    total_count += count;
+    space -= count * (1 << (kJpegHuffmanMaxBitLength - i));
+  }
+  uint32_t values[kJpegHuffmanAlphabetSize + 1] = {};
+  uint8_t values_seen[256] = {0};
+  for (int i = 0; i < total_count; ++i) {
+    int value = table->huffval[i];
+    if (values_seen[value]) {
+      return JPEGLI_ERROR("Duplicate Huffman code value %d", value);
+    }
+    values_seen[value] = 1;
+    values[i] = value;
+  }
+  // Add an invalid symbol that will have the all 1 code.
+  ++counts[max_depth];
+  values[total_count] = kJpegHuffmanAlphabetSize;
+  space -= (1 << (kJpegHuffmanMaxBitLength - max_depth));
+  if (space < 0) {
+    JPEGLI_ERROR("Invalid Huffman code lengths.");
+  } else if (space > 0 && huff_lut[0].value != 0xffff) {
+    // Re-initialize the values to an invalid symbol so that we can recognize
+    // it when reading the bit stream using a Huffman code with space > 0.
+    for (int i = 0; i < kJpegHuffmanLutSize; ++i) {
+      huff_lut[i].bits = 0;
+      huff_lut[i].value = 0xffff;
+    }
+  }
+  BuildJpegHuffmanTable(&counts[0], &values[0], huff_lut);
+}
+
+void PrepareForScan(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int comp_idx = cinfo->cur_comp_info[i]->component_index;
+    int* prev_coef_bits = cinfo->coef_bits[comp_idx + cinfo->num_components];
+    for (int k = std::min(cinfo->Ss, 1); k <= std::max(cinfo->Se, 9); k++) {
+      prev_coef_bits[k] =
+          (cinfo->input_scan_number > 0) ? cinfo->coef_bits[comp_idx][k] : 0;
+    }
+    for (int k = cinfo->Ss; k <= cinfo->Se; ++k) {
+      cinfo->coef_bits[comp_idx][k] = cinfo->Al;
+    }
+  }
+  AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                           /*is_dc=*/false);
+  AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                           /*is_dc=*/true);
+  // Check that all the Huffman tables needed for this scan are defined and
+  // build derived lookup tables.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    if (cinfo->Ss == 0) {
+      int dc_tbl_idx = cinfo->cur_comp_info[i]->dc_tbl_no;
+      JHUFF_TBL* table = cinfo->dc_huff_tbl_ptrs[dc_tbl_idx];
+      HuffmanTableEntry* huff_lut =
+          &m->dc_huff_lut_[dc_tbl_idx * kJpegHuffmanLutSize];
+      if (!table) {
+        return JPEGLI_ERROR("DC Huffman table %d not found", dc_tbl_idx);
+      }
+      BuildHuffmanLookupTable(cinfo, table, huff_lut);
+    }
+    if (cinfo->Se > 0) {
+      int ac_tbl_idx = cinfo->cur_comp_info[i]->ac_tbl_no;
+      JHUFF_TBL* table = cinfo->ac_huff_tbl_ptrs[ac_tbl_idx];
+      HuffmanTableEntry* huff_lut =
+          &m->ac_huff_lut_[ac_tbl_idx * kJpegHuffmanLutSize];
+      if (!table) {
+        return JPEGLI_ERROR("AC Huffman table %d not found", ac_tbl_idx);
+      }
+      BuildHuffmanLookupTable(cinfo, table, huff_lut);
+    }
+  }
+  // Copy quantization tables into comp_info.
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    if (comp->quant_table == nullptr) {
+      comp->quant_table = Allocate<JQUANT_TBL>(cinfo, 1, JPOOL_IMAGE);
+      memcpy(comp->quant_table, cinfo->quant_tbl_ptrs[comp->quant_tbl_no],
+             sizeof(JQUANT_TBL));
+    }
+  }
+  if (cinfo->comps_in_scan == 1) {
+    const auto& comp = *cinfo->cur_comp_info[0];
+    cinfo->MCUs_per_row = DivCeil(cinfo->image_width * comp.h_samp_factor,
+                                  cinfo->max_h_samp_factor * DCTSIZE);
+    cinfo->MCU_rows_in_scan = DivCeil(cinfo->image_height * comp.v_samp_factor,
+                                      cinfo->max_v_samp_factor * DCTSIZE);
+    m->mcu_rows_per_iMCU_row_ = cinfo->cur_comp_info[0]->v_samp_factor;
+  } else {
+    cinfo->MCU_rows_in_scan = cinfo->total_iMCU_rows;
+    cinfo->MCUs_per_row = m->iMCU_cols_;
+    m->mcu_rows_per_iMCU_row_ = 1;
+    size_t mcu_size = 0;
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      jpeg_component_info* comp = cinfo->cur_comp_info[i];
+      mcu_size += comp->h_samp_factor * comp->v_samp_factor;
+    }
+    if (mcu_size > D_MAX_BLOCKS_IN_MCU) {
+      JPEGLI_ERROR("MCU size too big");
+    }
+  }
+  memset(m->last_dc_coeff_, 0, sizeof(m->last_dc_coeff_));
+  m->restarts_to_go_ = cinfo->restart_interval;
+  m->next_restart_marker_ = 0;
+  m->eobrun_ = -1;
+  m->scan_mcu_row_ = 0;
+  m->scan_mcu_col_ = 0;
+  m->codestream_bits_ahead_ = 0;
+  ++cinfo->input_scan_number;
+  cinfo->input_iMCU_row = 0;
+  PrepareForiMCURow(cinfo);
+  cinfo->global_state = kDecProcessScan;
+}
+
+int ConsumeInput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state == kDecProcessScan && m->streaming_mode_ &&
+      cinfo->input_iMCU_row > cinfo->output_iMCU_row) {
+    // Prevent input from getting ahead of output in streaming mode.
+    return JPEG_SUSPENDED;
+  }
+  jpeg_source_mgr* src = cinfo->src;
+  int status;
+  for (;;) {
+    const uint8_t* data;
+    size_t len;
+    if (m->input_buffer_.empty()) {
+      data = cinfo->src->next_input_byte;
+      len = cinfo->src->bytes_in_buffer;
+    } else {
+      data = &m->input_buffer_[m->input_buffer_pos_];
+      len = m->input_buffer_.size() - m->input_buffer_pos_;
+    }
+    size_t pos = 0;
+    if (cinfo->global_state == kDecProcessScan) {
+      status = ProcessScan(cinfo, data, len, &pos, &m->codestream_bits_ahead_);
+    } else {
+      status = ProcessMarkers(cinfo, data, len, &pos);
+    }
+    if (m->input_buffer_.empty()) {
+      cinfo->src->next_input_byte += pos;
+      cinfo->src->bytes_in_buffer -= pos;
+    } else {
+      m->input_buffer_pos_ += pos;
+      size_t bytes_left = m->input_buffer_.size() - m->input_buffer_pos_;
+      if (bytes_left <= src->bytes_in_buffer) {
+        src->next_input_byte += (src->bytes_in_buffer - bytes_left);
+        src->bytes_in_buffer = bytes_left;
+        m->input_buffer_.clear();
+        m->input_buffer_pos_ = 0;
+      }
+    }
+    if (status == kHandleRestart) {
+      JXL_DASSERT(m->input_buffer_.size() <=
+                  m->input_buffer_pos_ + src->bytes_in_buffer);
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      if (cinfo->unread_marker == 0xd0 + m->next_restart_marker_) {
+        cinfo->unread_marker = 0;
+      } else {
+        if (!(*cinfo->src->resync_to_restart)(cinfo, m->next_restart_marker_)) {
+          return JPEG_SUSPENDED;
+        }
+      }
+      m->next_restart_marker_ += 1;
+      m->next_restart_marker_ &= 0x7;
+      m->restarts_to_go_ = cinfo->restart_interval;
+      if (cinfo->unread_marker != 0) {
+        JPEGLI_WARN("Failed to resync to next restart marker, skipping scan.");
+        return JPEG_SCAN_COMPLETED;
+      }
+      continue;
+    }
+    if (status == kHandleMarkerProcessor) {
+      JXL_DASSERT(m->input_buffer_.size() <=
+                  m->input_buffer_pos_ + src->bytes_in_buffer);
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      if (!(*GetMarkerProcessor(cinfo))(cinfo)) {
+        return JPEG_SUSPENDED;
+      }
+      cinfo->unread_marker = 0;
+      continue;
+    }
+    if (status != kNeedMoreInput) {
+      break;
+    }
+    if (m->input_buffer_.empty()) {
+      JXL_DASSERT(m->input_buffer_pos_ == 0);
+      m->input_buffer_.assign(src->next_input_byte,
+                              src->next_input_byte + src->bytes_in_buffer);
+    }
+    if (!(*cinfo->src->fill_input_buffer)(cinfo)) {
+      m->input_buffer_.clear();
+      m->input_buffer_pos_ = 0;
+      return JPEG_SUSPENDED;
+    }
+    if (src->bytes_in_buffer == 0) {
+      JPEGLI_ERROR("Empty input.");
+    }
+    m->input_buffer_.insert(m->input_buffer_.end(), src->next_input_byte,
+                            src->next_input_byte + src->bytes_in_buffer);
+  }
+  if (status == JPEG_SCAN_COMPLETED) {
+    cinfo->global_state = kDecProcessMarkers;
+  } else if (status == JPEG_REACHED_SOS) {
+    if (cinfo->global_state == kDecInHeader) {
+      cinfo->global_state = kDecHeaderDone;
+    } else {
+      PrepareForScan(cinfo);
+    }
+  }
+  return status;
+}
+
+bool IsInputReady(j_decompress_ptr cinfo) {
+  if (cinfo->master->found_eoi_) {
+    return true;
+  }
+  if (cinfo->input_scan_number > cinfo->output_scan_number) {
+    return true;
+  }
+  if (cinfo->input_scan_number < cinfo->output_scan_number) {
+    return false;
+  }
+  if (cinfo->input_iMCU_row == cinfo->total_iMCU_rows) {
+    return true;
+  }
+  return cinfo->input_iMCU_row >
+         cinfo->output_iMCU_row + (cinfo->master->streaming_mode_ ? 0 : 2);
+}
+
+bool ReadOutputPass(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->pixels_) {
+    size_t stride = cinfo->out_color_components * cinfo->output_width;
+    size_t num_samples = cinfo->output_height * stride;
+    m->pixels_ = Allocate<uint8_t>(cinfo, num_samples, JPOOL_IMAGE);
+    m->scanlines_ =
+        Allocate<JSAMPROW>(cinfo, cinfo->output_height, JPOOL_IMAGE);
+    for (size_t i = 0; i < cinfo->output_height; ++i) {
+      m->scanlines_[i] = &m->pixels_[i * stride];
+    }
+  }
+  size_t num_output_rows = 0;
+  while (num_output_rows < cinfo->output_height) {
+    if (IsInputReady(cinfo)) {
+      ProgressMonitorOutputPass(cinfo);
+      ProcessOutput(cinfo, &num_output_rows, m->scanlines_,
+                    cinfo->output_height);
+    } else if (ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return false;
+    }
+  }
+  cinfo->output_scanline = 0;
+  cinfo->output_iMCU_row = 0;
+  return true;
+}
+
+boolean PrepareQuantizedOutput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->raw_data_out) {
+    JPEGLI_ERROR("Color quantization is not supported in raw data mode.");
+  }
+  if (m->output_data_type_ != JPEGLI_TYPE_UINT8) {
+    JPEGLI_ERROR("Color quantization must use 8-bit mode.");
+  }
+  if (cinfo->colormap) {
+    m->quant_mode_ = 3;
+  } else if (cinfo->two_pass_quantize && cinfo->enable_2pass_quant) {
+    m->quant_mode_ = 2;
+  } else if (cinfo->enable_1pass_quant) {
+    m->quant_mode_ = 1;
+  } else {
+    JPEGLI_ERROR("Invalid quantization mode change");
+  }
+  if (m->quant_mode_ > 1 && cinfo->dither_mode == JDITHER_ORDERED) {
+    cinfo->dither_mode = JDITHER_FS;
+  }
+  if (m->quant_mode_ == 1) {
+    ChooseColorMap1Pass(cinfo);
+  } else if (m->quant_mode_ == 2) {
+    m->quant_pass_ = 0;
+    if (!ReadOutputPass(cinfo)) {
+      return FALSE;
+    }
+    ChooseColorMap2Pass(cinfo);
+  }
+  if (m->quant_mode_ == 2 ||
+      (m->quant_mode_ == 3 && m->regenerate_inverse_colormap_)) {
+    CreateInverseColorMap(cinfo);
+  }
+  if (cinfo->dither_mode == JDITHER_ORDERED) {
+    CreateOrderedDitherTables(cinfo);
+  } else if (cinfo->dither_mode == JDITHER_FS) {
+    InitFSDitherState(cinfo);
+  }
+  m->quant_pass_ = 1;
+  return TRUE;
+}
+
+void AllocateCoefficientBuffer(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  j_common_ptr comptr = reinterpret_cast<j_common_ptr>(cinfo);
+  jvirt_barray_ptr* coef_arrays = jpegli::Allocate<jvirt_barray_ptr>(
+      cinfo, cinfo->num_components, JPOOL_IMAGE);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t height_in_blocks =
+        m->streaming_mode_ ? comp->v_samp_factor : comp->height_in_blocks;
+    coef_arrays[c] = (*cinfo->mem->request_virt_barray)(
+        comptr, JPOOL_IMAGE, TRUE, comp->width_in_blocks, height_in_blocks,
+        comp->v_samp_factor);
+  }
+  cinfo->master->coef_arrays = coef_arrays;
+  (*cinfo->mem->realize_virt_arrays)(comptr);
+}
+
+void AllocateOutputBuffers(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  size_t iMCU_width = cinfo->max_h_samp_factor * m->min_scaled_dct_size;
+  size_t output_stride = m->iMCU_cols_ * iMCU_width;
+  m->need_context_rows_ = false;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) {
+      m->need_context_rows_ = true;
+    }
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const auto& comp = cinfo->comp_info[c];
+    size_t cheight = comp.v_samp_factor * m->scaled_dct_size[c];
+    int downsampled_width = output_stride / m->h_factor[c];
+    m->raw_height_[c] = cinfo->total_iMCU_rows * cheight;
+    if (m->need_context_rows_) {
+      cheight *= 3;
+    }
+    m->raw_output_[c].Allocate(cinfo, cheight, downsampled_width);
+  }
+  int num_all_components =
+      std::max(cinfo->out_color_components, cinfo->num_components);
+  for (int c = 0; c < num_all_components; ++c) {
+    m->render_output_[c].Allocate(cinfo, cinfo->max_v_samp_factor,
+                                  output_stride);
+  }
+  m->idct_scratch_ = Allocate<float>(cinfo, 5 * DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  // Padding for horizontal chroma upsampling.
+  constexpr size_t kPaddingLeft = 64;
+  constexpr size_t kPaddingRight = 64;
+  m->upsample_scratch_ = Allocate<float>(
+      cinfo, output_stride + kPaddingLeft + kPaddingRight, JPOOL_IMAGE_ALIGNED);
+  size_t bytes_per_sample = jpegli_bytes_per_sample(m->output_data_type_);
+  size_t bytes_per_pixel = cinfo->out_color_components * bytes_per_sample;
+  size_t scratch_stride = RoundUpTo(output_stride, HWY_ALIGNMENT);
+  m->output_scratch_ = Allocate<uint8_t>(
+      cinfo, bytes_per_pixel * scratch_stride, JPOOL_IMAGE_ALIGNED);
+  m->smoothing_scratch_ =
+      Allocate<int16_t>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  size_t coeffs_per_block = cinfo->num_components * DCTSIZE2;
+  m->nonzeros_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  m->sumabs_ = Allocate<int>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  m->biases_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  m->dequant_ = Allocate<float>(cinfo, coeffs_per_block, JPOOL_IMAGE_ALIGNED);
+  memset(m->dequant_, 0, coeffs_per_block * sizeof(float));
+}
+
+}  // namespace jpegli
+
+void jpegli_CreateDecompress(j_decompress_ptr cinfo, int version,
+                             size_t structsize) {
+  cinfo->mem = nullptr;
+  if (structsize != sizeof(*cinfo)) {
+    JPEGLI_ERROR("jpeg_decompress_struct has wrong size.");
+  }
+  jpegli::InitMemoryManager(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->is_decompressor = TRUE;
+  cinfo->progress = nullptr;
+  cinfo->src = nullptr;
+  for (int i = 0; i < NUM_QUANT_TBLS; i++) {
+    cinfo->quant_tbl_ptrs[i] = nullptr;
+  }
+  for (int i = 0; i < NUM_HUFF_TBLS; i++) {
+    cinfo->dc_huff_tbl_ptrs[i] = nullptr;
+    cinfo->ac_huff_tbl_ptrs[i] = nullptr;
+  }
+  cinfo->global_state = jpegli::kDecStart;
+  cinfo->sample_range_limit = nullptr;  // not used
+  cinfo->rec_outbuf_height = 1;         // output works with any buffer height
+  cinfo->master = new jpeg_decomp_master;
+  jpeg_decomp_master* m = cinfo->master;
+  for (int i = 0; i < 16; ++i) {
+    m->app_marker_parsers[i] = nullptr;
+  }
+  m->com_marker_parser = nullptr;
+  memset(m->markers_to_save_, 0, sizeof(m->markers_to_save_));
+  jpegli::InitializeDecompressParams(cinfo);
+  jpegli::InitializeImage(cinfo);
+}
+
+void jpegli_destroy_decompress(j_decompress_ptr cinfo) {
+  jpegli_destroy(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void jpegli_abort_decompress(j_decompress_ptr cinfo) {
+  jpegli_abort(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void jpegli_save_markers(j_decompress_ptr cinfo, int marker_code,
+                         unsigned int length_limit) {
+  // TODO(szabadka) Limit our memory usage by taking into account length_limit.
+  jpeg_decomp_master* m = cinfo->master;
+  if (marker_code < 0xe0) {
+    JPEGLI_ERROR("jpegli_save_markers: invalid marker code %d", marker_code);
+  }
+  m->markers_to_save_[marker_code - 0xe0] = 1;
+}
+
+void jpegli_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                                 jpeg_marker_parser_method routine) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (marker_code == 0xfe) {
+    m->com_marker_parser = routine;
+  } else if (marker_code >= 0xe0 && marker_code <= 0xef) {
+    m->app_marker_parsers[marker_code - 0xe0] = routine;
+  } else {
+    JPEGLI_ERROR("jpegli_set_marker_processor: invalid marker code %d",
+                 marker_code);
+  }
+}
+
+int jpegli_consume_input(j_decompress_ptr cinfo) {
+  if (cinfo->global_state == jpegli::kDecStart) {
+    (*cinfo->err->reset_error_mgr)(reinterpret_cast<j_common_ptr>(cinfo));
+    (*cinfo->src->init_source)(cinfo);
+    jpegli::InitializeDecompressParams(cinfo);
+    jpegli::InitializeImage(cinfo);
+    cinfo->global_state = jpegli::kDecInHeader;
+  }
+  if (cinfo->global_state == jpegli::kDecHeaderDone) {
+    return JPEG_REACHED_SOS;
+  }
+  if (cinfo->master->found_eoi_) {
+    return JPEG_REACHED_EOI;
+  }
+  if (cinfo->global_state == jpegli::kDecInHeader ||
+      cinfo->global_state == jpegli::kDecProcessMarkers ||
+      cinfo->global_state == jpegli::kDecProcessScan) {
+    return jpegli::ConsumeInput(cinfo);
+  }
+  JPEGLI_ERROR("Unexpected state %d", cinfo->global_state);
+  return JPEG_REACHED_EOI;  // return value does not matter
+}
+
+int jpegli_read_header(j_decompress_ptr cinfo, boolean require_image) {
+  if (cinfo->global_state != jpegli::kDecStart &&
+      cinfo->global_state != jpegli::kDecInHeader) {
+    JPEGLI_ERROR("jpegli_read_header: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->src == nullptr) {
+    JPEGLI_ERROR("Missing source.");
+  }
+  for (;;) {
+    int retcode = jpegli_consume_input(cinfo);
+    if (retcode == JPEG_SUSPENDED) {
+      return retcode;
+    } else if (retcode == JPEG_REACHED_SOS) {
+      break;
+    } else if (retcode == JPEG_REACHED_EOI) {
+      if (require_image) {
+        JPEGLI_ERROR("jpegli_read_header: unexpected EOI marker.");
+      }
+      jpegli_abort_decompress(cinfo);
+      return JPEG_HEADER_TABLES_ONLY;
+    }
+  };
+  return JPEG_HEADER_OK;
+}
+
+boolean jpegli_read_icc_profile(j_decompress_ptr cinfo, JOCTET** icc_data_ptr,
+                                unsigned int* icc_data_len) {
+  if (cinfo->global_state == jpegli::kDecStart ||
+      cinfo->global_state == jpegli::kDecInHeader) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (icc_data_ptr == nullptr || icc_data_len == nullptr) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: invalid output buffer");
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->icc_profile_.empty()) {
+    *icc_data_ptr = nullptr;
+    *icc_data_len = 0;
+    return FALSE;
+  }
+  *icc_data_len = m->icc_profile_.size();
+  *icc_data_ptr = (JOCTET*)malloc(*icc_data_len);
+  if (*icc_data_ptr == nullptr) {
+    JPEGLI_ERROR("jpegli_read_icc_profile: Out of memory");
+  }
+  memcpy(*icc_data_ptr, m->icc_profile_.data(), *icc_data_len);
+  return TRUE;
+}
+
+void jpegli_core_output_dimensions(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->found_sof_) {
+    JPEGLI_ERROR("No SOF marker found.");
+  }
+  if (cinfo->raw_data_out) {
+    if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+      JPEGLI_ERROR("Output scaling is not supported in raw output mode");
+    }
+  }
+  if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+    int dctsize = 16;
+    while (cinfo->scale_num * DCTSIZE <= cinfo->scale_denom * (dctsize - 1)) {
+      --dctsize;
+    }
+    m->min_scaled_dct_size = dctsize;
+    cinfo->output_width =
+        jpegli::DivCeil(cinfo->image_width * dctsize, DCTSIZE);
+    cinfo->output_height =
+        jpegli::DivCeil(cinfo->image_height * dctsize, DCTSIZE);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->scaled_dct_size[c] = m->min_scaled_dct_size;
+    }
+  } else {
+    cinfo->output_width = cinfo->image_width;
+    cinfo->output_height = cinfo->image_height;
+    m->min_scaled_dct_size = DCTSIZE;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      m->scaled_dct_size[c] = DCTSIZE;
+    }
+  }
+}
+
+void jpegli_calc_output_dimensions(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  jpegli_core_output_dimensions(cinfo);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    m->h_factor[c] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[c] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+  }
+  if (cinfo->scale_num != 1 || cinfo->scale_denom != 1) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      // Prefer IDCT scaling over 2x upsampling.
+      while (m->scaled_dct_size[c] < DCTSIZE && (m->v_factor[c] % 2) == 0 &&
+             (m->h_factor[c] % 2) == 0) {
+        m->scaled_dct_size[c] *= 2;
+        m->v_factor[c] /= 2;
+        m->h_factor[c] /= 2;
+      }
+    }
+  }
+  if (cinfo->out_color_space == JCS_GRAYSCALE) {
+    cinfo->out_color_components = 1;
+  } else if (cinfo->out_color_space == JCS_RGB ||
+             cinfo->out_color_space == JCS_YCbCr) {
+    cinfo->out_color_components = 3;
+  } else if (cinfo->out_color_space == JCS_CMYK ||
+             cinfo->out_color_space == JCS_YCCK) {
+    cinfo->out_color_components = 4;
+  } else {
+    cinfo->out_color_components = cinfo->num_components;
+  }
+  cinfo->output_components =
+      cinfo->quantize_colors ? 1 : cinfo->out_color_components;
+  cinfo->rec_outbuf_height = 1;
+}
+
+boolean jpegli_has_multiple_scans(j_decompress_ptr cinfo) {
+  if (cinfo->input_scan_number == 0) {
+    JPEGLI_ERROR("No SOS marker found.");
+  }
+  return cinfo->master->is_multiscan_;
+}
+
+boolean jpegli_input_complete(j_decompress_ptr cinfo) {
+  return cinfo->master->found_eoi_;
+}
+
+boolean jpegli_start_decompress(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state == jpegli::kDecHeaderDone) {
+    m->streaming_mode_ = !m->is_multiscan_ && !cinfo->buffered_image &&
+                         (!cinfo->quantize_colors || !cinfo->two_pass_quantize);
+    jpegli::AllocateCoefficientBuffer(cinfo);
+    jpegli_calc_output_dimensions(cinfo);
+    jpegli::PrepareForScan(cinfo);
+    if (cinfo->quantize_colors) {
+      if (cinfo->colormap != nullptr) {
+        cinfo->enable_external_quant = TRUE;
+      } else if (cinfo->two_pass_quantize &&
+                 cinfo->out_color_space == JCS_RGB) {
+        cinfo->enable_2pass_quant = TRUE;
+      } else {
+        cinfo->enable_1pass_quant = TRUE;
+      }
+    }
+    jpegli::InitProgressMonitor(cinfo, /*coef_only=*/false);
+    jpegli::AllocateOutputBuffers(cinfo);
+    if (cinfo->buffered_image == TRUE) {
+      cinfo->output_scan_number = 0;
+      return TRUE;
+    }
+  } else if (!m->is_multiscan_) {
+    JPEGLI_ERROR("jpegli_start_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (m->is_multiscan_) {
+    if (cinfo->global_state != jpegli::kDecProcessScan &&
+        cinfo->global_state != jpegli::kDecProcessMarkers) {
+      JPEGLI_ERROR("jpegli_start_decompress: unexpected state %d",
+                   cinfo->global_state);
+    }
+    while (!m->found_eoi_) {
+      jpegli::ProgressMonitorInputPass(cinfo);
+      if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+        return FALSE;
+      }
+    }
+  }
+  cinfo->output_scan_number = cinfo->input_scan_number;
+  jpegli::PrepareForOutput(cinfo);
+  if (cinfo->quantize_colors) {
+    return jpegli::PrepareQuantizedOutput(cinfo);
+  } else {
+    return TRUE;
+  }
+}
+
+boolean jpegli_start_output(j_decompress_ptr cinfo, int scan_number) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_start_output: buffered image mode was not set");
+  }
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_start_output: unexpected state %d",
+                 cinfo->global_state);
+  }
+  cinfo->output_scan_number = std::max(1, scan_number);
+  if (m->found_eoi_) {
+    cinfo->output_scan_number =
+        std::min(cinfo->output_scan_number, cinfo->input_scan_number);
+  }
+  jpegli::InitProgressMonitorForOutput(cinfo);
+  jpegli::PrepareForOutput(cinfo);
+  if (cinfo->quantize_colors) {
+    return jpegli::PrepareQuantizedOutput(cinfo);
+  } else {
+    return TRUE;
+  }
+}
+
+boolean jpegli_finish_output(j_decompress_ptr cinfo) {
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_finish_output: buffered image mode was not set");
+  }
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_finish_output: unexpected state %d",
+                 cinfo->global_state);
+  }
+  // Advance input to the start of the next scan, or to the end of input.
+  while (cinfo->input_scan_number <= cinfo->output_scan_number &&
+         !cinfo->master->found_eoi_) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return FALSE;
+    }
+  }
+  return TRUE;
+}
+
+JDIMENSION jpegli_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                                 JDIMENSION max_lines) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_read_scanlines: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->buffered_image) {
+    if (cinfo->output_scan_number == 0) {
+      JPEGLI_ERROR(
+          "jpegli_read_scanlines: "
+          "jpegli_start_output() was not called");
+    }
+  } else if (m->is_multiscan_ && !m->found_eoi_) {
+    JPEGLI_ERROR(
+        "jpegli_read_scanlines: "
+        "jpegli_start_decompress() did not finish");
+  }
+  if (cinfo->output_scanline + max_lines > cinfo->output_height) {
+    max_lines = cinfo->output_height - cinfo->output_scanline;
+  }
+  jpegli::ProgressMonitorOutputPass(cinfo);
+  size_t num_output_rows = 0;
+  while (num_output_rows < max_lines) {
+    if (jpegli::IsInputReady(cinfo)) {
+      jpegli::ProcessOutput(cinfo, &num_output_rows, scanlines, max_lines);
+    } else if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      break;
+    }
+  }
+  return num_output_rows;
+}
+
+JDIMENSION jpegli_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines) {
+  // TODO(szabadka) Skip the IDCT for skipped over blocks.
+  return jpegli_read_scanlines(cinfo, nullptr, num_lines);
+}
+
+void jpegli_crop_scanline(j_decompress_ptr cinfo, JDIMENSION* xoffset,
+                          JDIMENSION* width) {
+  jpeg_decomp_master* m = cinfo->master;
+  if ((cinfo->global_state != jpegli::kDecProcessScan &&
+       cinfo->global_state != jpegli::kDecProcessMarkers) ||
+      cinfo->output_scanline != 0) {
+    JPEGLI_ERROR("jpegli_crop_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (cinfo->raw_data_out) {
+    JPEGLI_ERROR("Output cropping is not supported in raw data mode");
+  }
+  if (xoffset == nullptr || width == nullptr || *width == 0 ||
+      *xoffset + *width > cinfo->output_width) {
+    JPEGLI_ERROR("jpegli_crop_scanline: Invalid arguments");
+  }
+  // TODO(szabadka) Skip the IDCT for skipped over blocks.
+  size_t xend = *xoffset + *width;
+  size_t iMCU_width = m->min_scaled_dct_size * cinfo->max_h_samp_factor;
+  *xoffset = (*xoffset / iMCU_width) * iMCU_width;
+  *width = xend - *xoffset;
+  cinfo->master->xoffset_ = *xoffset;
+  cinfo->output_width = *width;
+}
+
+JDIMENSION jpegli_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                JDIMENSION max_lines) {
+  if ((cinfo->global_state != jpegli::kDecProcessScan &&
+       cinfo->global_state != jpegli::kDecProcessMarkers) ||
+      !cinfo->raw_data_out) {
+    JPEGLI_ERROR("jpegli_read_raw_data: unexpected state %d",
+                 cinfo->global_state);
+  }
+  size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+  if (max_lines < iMCU_height) {
+    JPEGLI_ERROR("jpegli_read_raw_data: output buffer too small");
+  }
+  jpegli::ProgressMonitorOutputPass(cinfo);
+  while (!jpegli::IsInputReady(cinfo)) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return 0;
+    }
+  }
+  if (cinfo->output_iMCU_row < cinfo->total_iMCU_rows) {
+    jpegli::ProcessRawOutput(cinfo, data);
+    return iMCU_height;
+  }
+  return 0;
+}
+
+jvirt_barray_ptr* jpegli_read_coefficients(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  m->streaming_mode_ = false;
+  if (!cinfo->buffered_image && cinfo->global_state == jpegli::kDecHeaderDone) {
+    jpegli::AllocateCoefficientBuffer(cinfo);
+    jpegli_calc_output_dimensions(cinfo);
+    jpegli::InitProgressMonitor(cinfo, /*coef_only=*/true);
+    jpegli::PrepareForScan(cinfo);
+  }
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_read_coefficients: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image) {
+    while (!m->found_eoi_) {
+      jpegli::ProgressMonitorInputPass(cinfo);
+      if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+        return nullptr;
+      }
+    }
+    cinfo->output_scanline = cinfo->output_height;
+  }
+  return m->coef_arrays;
+}
+
+boolean jpegli_finish_decompress(j_decompress_ptr cinfo) {
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_finish_decompress: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image && cinfo->output_scanline < cinfo->output_height) {
+    JPEGLI_ERROR("Incomplete output");
+  }
+  while (!cinfo->master->found_eoi_) {
+    if (jpegli::ConsumeInput(cinfo) == JPEG_SUSPENDED) {
+      return FALSE;
+    }
+  }
+  (*cinfo->src->term_source)(cinfo);
+  jpegli_abort_decompress(cinfo);
+  return TRUE;
+}
+
+boolean jpegli_resync_to_restart(j_decompress_ptr cinfo, int desired) {
+  JPEGLI_WARN("Invalid restart marker found: 0x%02x vs 0x%02x.",
+              cinfo->unread_marker, 0xd0 + desired);
+  // This is a trivial implementation, we just let the decoder skip the entire
+  // scan and attempt to render the partial input.
+  return TRUE;
+}
+
+void jpegli_new_colormap(j_decompress_ptr cinfo) {
+  if (cinfo->global_state != jpegli::kDecProcessScan &&
+      cinfo->global_state != jpegli::kDecProcessMarkers) {
+    JPEGLI_ERROR("jpegli_new_colormap: unexpected state %d",
+                 cinfo->global_state);
+  }
+  if (!cinfo->buffered_image) {
+    JPEGLI_ERROR("jpegli_new_colormap: not in  buffered image mode");
+  }
+  if (!cinfo->enable_external_quant) {
+    JPEGLI_ERROR("external colormap quantizer was not enabled");
+  }
+  if (!cinfo->quantize_colors || cinfo->colormap == nullptr) {
+    JPEGLI_ERROR("jpegli_new_colormap: not in external colormap mode");
+  }
+  cinfo->master->regenerate_inverse_colormap_ = true;
+}
+
+void jpegli_set_output_format(j_decompress_ptr cinfo, JpegliDataType data_type,
+                              JpegliEndianness endianness) {
+  switch (data_type) {
+    case JPEGLI_TYPE_UINT8:
+    case JPEGLI_TYPE_UINT16:
+    case JPEGLI_TYPE_FLOAT:
+      cinfo->master->output_data_type_ = data_type;
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported data type %d", data_type);
+  }
+  switch (endianness) {
+    case JPEGLI_NATIVE_ENDIAN:
+      cinfo->master->swap_endianness_ = false;
+      break;
+    case JPEGLI_LITTLE_ENDIAN:
+      cinfo->master->swap_endianness_ = !IsLittleEndian();
+      break;
+    case JPEGLI_BIG_ENDIAN:
+      cinfo->master->swap_endianness_ = IsLittleEndian();
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported endianness %d", endianness);
+  }
+}
diff --git a/lib/jpegli/decode.h b/lib/jpegli/decode.h
new file mode 100644 (file)
index 0000000..9800ebf
--- /dev/null
@@ -0,0 +1,106 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains the C API of the decoder part of the libjpegli library,
+// which is based on the C API of libjpeg, with the function names changed from
+// jpeg_* to jpegli_*, while decompressor object definitions are included
+// directly from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_DECODE_H_
+#define LIB_JPEGLI_DECODE_H_
+
+#include "lib/jpegli/common.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define jpegli_create_decompress(cinfo)              \
+  jpegli_CreateDecompress((cinfo), JPEG_LIB_VERSION, \
+                          (size_t)sizeof(struct jpeg_decompress_struct))
+
+void jpegli_CreateDecompress(j_decompress_ptr cinfo, int version,
+                             size_t structsize);
+
+void jpegli_stdio_src(j_decompress_ptr cinfo, FILE *infile);
+
+void jpegli_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                    unsigned long insize);
+
+int jpegli_read_header(j_decompress_ptr cinfo, boolean require_image);
+
+boolean jpegli_start_decompress(j_decompress_ptr cinfo);
+
+JDIMENSION jpegli_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                                 JDIMENSION max_lines);
+
+JDIMENSION jpegli_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+
+void jpegli_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                          JDIMENSION *width);
+
+boolean jpegli_finish_decompress(j_decompress_ptr cinfo);
+
+JDIMENSION jpegli_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                                JDIMENSION max_lines);
+
+jvirt_barray_ptr *jpegli_read_coefficients(j_decompress_ptr cinfo);
+
+boolean jpegli_has_multiple_scans(j_decompress_ptr cinfo);
+
+boolean jpegli_start_output(j_decompress_ptr cinfo, int scan_number);
+
+boolean jpegli_finish_output(j_decompress_ptr cinfo);
+
+boolean jpegli_input_complete(j_decompress_ptr cinfo);
+
+int jpegli_consume_input(j_decompress_ptr cinfo);
+
+#if JPEG_LIB_VERSION >= 80
+void jpegli_core_output_dimensions(j_decompress_ptr cinfo);
+#endif
+void jpegli_calc_output_dimensions(j_decompress_ptr cinfo);
+
+void jpegli_save_markers(j_decompress_ptr cinfo, int marker_code,
+                         unsigned int length_limit);
+
+void jpegli_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                                 jpeg_marker_parser_method routine);
+
+boolean jpegli_resync_to_restart(j_decompress_ptr cinfo, int desired);
+
+boolean jpegli_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                                unsigned int *icc_data_len);
+
+void jpegli_abort_decompress(j_decompress_ptr cinfo);
+
+void jpegli_destroy_decompress(j_decompress_ptr cinfo);
+
+void jpegli_new_colormap(j_decompress_ptr cinfo);
+
+//
+// New API functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+void jpegli_set_output_format(j_decompress_ptr cinfo, JpegliDataType data_type,
+                              JpegliEndianness endianness);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_DECODE_H_
diff --git a/lib/jpegli/decode_api_test.cc b/lib/jpegli/decode_api_test.cc
new file mode 100644 (file)
index 0000000..c48b937
--- /dev/null
@@ -0,0 +1,1304 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+static constexpr size_t kNumSourceBuffers = 4;
+
+// Custom source manager that refills the input buffer in chunks, simulating
+// a file reader with a fixed buffer size.
+class SourceManager {
+ public:
+  SourceManager(const uint8_t* data, size_t len, size_t max_chunk_size)
+      : data_(data), len_(len), max_chunk_size_(max_chunk_size) {
+    pub_.skip_input_data = skip_input_data;
+    pub_.resync_to_restart = jpegli_resync_to_restart;
+    pub_.term_source = term_source;
+    pub_.init_source = init_source;
+    pub_.fill_input_buffer = fill_input_buffer;
+    if (max_chunk_size_ == 0) max_chunk_size_ = len;
+    buffers_.resize(kNumSourceBuffers, std::vector<uint8_t>(max_chunk_size_));
+    Reset();
+  }
+
+  void Reset() {
+    pub_.next_input_byte = nullptr;
+    pub_.bytes_in_buffer = 0;
+    pos_ = 0;
+    chunk_idx_ = 0;
+  }
+
+  ~SourceManager() {
+    EXPECT_EQ(0, pub_.bytes_in_buffer);
+    EXPECT_EQ(len_, pos_);
+  }
+
+ private:
+  jpeg_source_mgr pub_;
+  const uint8_t* data_;
+  size_t len_;
+  size_t chunk_idx_;
+  size_t pos_;
+  size_t max_chunk_size_;
+  std::vector<std::vector<uint8_t>> buffers_;
+
+  static void init_source(j_decompress_ptr cinfo) {}
+
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (src->pos_ < src->len_) {
+      size_t chunk_size = std::min(src->len_ - src->pos_, src->max_chunk_size_);
+      size_t next_idx = ++src->chunk_idx_ % kNumSourceBuffers;
+      uint8_t* next_buffer = src->buffers_[next_idx].data();
+      memcpy(next_buffer, src->data_ + src->pos_, chunk_size);
+      src->pub_.next_input_byte = next_buffer;
+      src->pub_.bytes_in_buffer = chunk_size;
+    } else {
+      src->pub_.next_input_byte = kFakeEoiMarker;
+      src->pub_.bytes_in_buffer = 2;
+      src->len_ += 2;
+    }
+    src->pos_ += src->pub_.bytes_in_buffer;
+    return TRUE;
+  }
+
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (num_bytes <= 0) {
+      return;
+    }
+    if (src->pub_.bytes_in_buffer >= static_cast<size_t>(num_bytes)) {
+      src->pub_.bytes_in_buffer -= num_bytes;
+      src->pub_.next_input_byte += num_bytes;
+    } else {
+      src->pos_ += num_bytes - src->pub_.bytes_in_buffer;
+      src->pub_.bytes_in_buffer = 0;
+    }
+  }
+
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+uint8_t markers_seen[kMarkerSequenceLen];
+size_t num_markers_seen = 0;
+
+uint8_t get_next_byte(j_decompress_ptr cinfo) {
+  if (cinfo->src->bytes_in_buffer == 0) {
+    (*cinfo->src->fill_input_buffer)(cinfo);
+  }
+  cinfo->src->bytes_in_buffer--;
+  return *cinfo->src->next_input_byte++;
+}
+
+boolean test_marker_processor(j_decompress_ptr cinfo) {
+  markers_seen[num_markers_seen] = cinfo->unread_marker;
+  size_t marker_len = (get_next_byte(cinfo) << 8) + get_next_byte(cinfo);
+  EXPECT_EQ(2 + ((num_markers_seen + 2) % sizeof(kMarkerData)), marker_len);
+  if (marker_len > 2) {
+    (*cinfo->src->skip_input_data)(cinfo, marker_len - 2);
+  }
+  ++num_markers_seen;
+  return TRUE;
+}
+
+void ReadOutputImage(const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     TestImage* output) {
+  JDIMENSION xoffset = 0;
+  JDIMENSION yoffset = 0;
+  JDIMENSION xsize_cropped = cinfo->output_width;
+  JDIMENSION ysize_cropped = cinfo->output_height;
+  if (dparams.crop_output) {
+    xoffset = xsize_cropped = cinfo->output_width / 3;
+    yoffset = ysize_cropped = cinfo->output_height / 3;
+    jpegli_crop_scanline(cinfo, &xoffset, &xsize_cropped);
+  }
+  output->ysize = ysize_cropped;
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->out_color_components;
+  output->data_type = dparams.data_type;
+  output->endianness = dparams.endianness;
+  size_t bytes_per_sample = jpegli_bytes_per_sample(dparams.data_type);
+  if (cinfo->raw_data_out) {
+    output->color_space = cinfo->jpeg_color_space;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize * bytes_per_sample);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+  } else {
+    output->color_space = cinfo->out_color_space;
+    output->AllocatePixels();
+  }
+  size_t total_output_lines = 0;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    size_t max_lines;
+    size_t num_output_lines;
+    if (cinfo->raw_data_out) {
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      EXPECT_EQ(cinfo->output_scanline, cinfo->output_iMCU_row * iMCU_height);
+      max_lines = iMCU_height;
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      num_output_lines = jpegli_read_raw_data(cinfo, &data[0], max_lines);
+    } else {
+      size_t max_output_lines = dparams.max_output_lines;
+      if (max_output_lines == 0) max_output_lines = cinfo->output_height;
+      if (cinfo->output_scanline < yoffset) {
+        max_lines = yoffset - cinfo->output_scanline;
+        num_output_lines = jpegli_skip_scanlines(cinfo, max_lines);
+      } else if (cinfo->output_scanline >= yoffset + ysize_cropped) {
+        max_lines = cinfo->output_height - cinfo->output_scanline;
+        num_output_lines = jpegli_skip_scanlines(cinfo, max_lines);
+      } else {
+        size_t lines_left = yoffset + ysize_cropped - cinfo->output_scanline;
+        max_lines = std::min<size_t>(max_output_lines, lines_left);
+        size_t stride = cinfo->output_width * cinfo->out_color_components *
+                        bytes_per_sample;
+        std::vector<JSAMPROW> scanlines(max_lines);
+        for (size_t i = 0; i < max_lines; ++i) {
+          size_t yidx = cinfo->output_scanline - yoffset + i;
+          scanlines[i] = &output->pixels[yidx * stride];
+        }
+        num_output_lines =
+            jpegli_read_scanlines(cinfo, &scanlines[0], max_lines);
+        if (cinfo->quantize_colors) {
+          for (size_t i = 0; i < num_output_lines; ++i) {
+            UnmapColors(scanlines[i], cinfo->output_width,
+                        cinfo->out_color_components, cinfo->colormap,
+                        cinfo->actual_number_of_colors);
+          }
+        }
+      }
+    }
+    total_output_lines += num_output_lines;
+    EXPECT_EQ(total_output_lines, cinfo->output_scanline);
+    EXPECT_EQ(num_output_lines, max_lines);
+  }
+  EXPECT_EQ(cinfo->total_iMCU_rows,
+            DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE));
+}
+
+struct TestConfig {
+  std::string fn;
+  std::string fn_desc;
+  TestImage input;
+  CompressParams jparams;
+  DecompressParams dparams;
+  bool compare_to_orig = false;
+  float max_tolerance_factor = 1.01f;
+  float max_rms_dist = 1.0f;
+  float max_diff = 35.0f;
+};
+
+std::vector<uint8_t> GetTestJpegData(TestConfig& config) {
+  std::vector<uint8_t> compressed;
+  if (!config.fn.empty()) {
+    compressed = ReadTestData(config.fn.c_str());
+  } else {
+    GeneratePixels(&config.input);
+    JXL_CHECK(EncodeWithJpegli(config.input, config.jparams, &compressed));
+  }
+  if (config.dparams.size_factor < 1.0f) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  return compressed;
+}
+
+void TestAPINonBuffered(const CompressParams& jparams,
+                        const DecompressParams& dparams,
+                        const TestImage& expected_output,
+                        j_decompress_ptr cinfo, TestImage* output) {
+  if (jparams.add_marker) {
+    jpegli_save_markers(cinfo, kSpecialMarker0, 0xffff);
+    jpegli_save_markers(cinfo, kSpecialMarker1, 0xffff);
+    num_markers_seen = 0;
+    jpegli_set_marker_processor(cinfo, 0xe6, test_marker_processor);
+    jpegli_set_marker_processor(cinfo, 0xe7, test_marker_processor);
+    jpegli_set_marker_processor(cinfo, 0xe8, test_marker_processor);
+  }
+  if (!jparams.icc.empty()) {
+    jpegli_save_markers(cinfo, JPEG_APP0 + 2, 0xffff);
+  }
+  jpegli_read_header(cinfo, /*require_image=*/TRUE);
+  if (jparams.add_marker) {
+    EXPECT_EQ(num_markers_seen, kMarkerSequenceLen);
+    EXPECT_EQ(0, memcmp(markers_seen, kMarkerSequence, num_markers_seen));
+  }
+  if (!jparams.icc.empty()) {
+    uint8_t* icc_data = nullptr;
+    unsigned int icc_len;
+    JXL_CHECK(jpegli_read_icc_profile(cinfo, &icc_data, &icc_len));
+    JXL_CHECK(icc_data);
+    EXPECT_EQ(0, memcmp(jparams.icc.data(), icc_data, icc_len));
+    free(icc_data);
+  }
+  // Check that jpegli_calc_output_dimensions can be called multiple times
+  // even with different parameters.
+  if (!cinfo->raw_data_out) {
+    cinfo->scale_num = 1;
+    cinfo->scale_denom = 2;
+  }
+  jpegli_calc_output_dimensions(cinfo);
+  SetDecompressParams(dparams, cinfo);
+  jpegli_set_output_format(cinfo, dparams.data_type, dparams.endianness);
+  VerifyHeader(jparams, cinfo);
+  jpegli_calc_output_dimensions(cinfo);
+  EXPECT_LE(expected_output.xsize, cinfo->output_width);
+  if (!dparams.crop_output) {
+    EXPECT_EQ(expected_output.xsize, cinfo->output_width);
+  }
+  if (dparams.output_mode == COEFFICIENTS) {
+    jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(cinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    CopyCoefficients(cinfo, coef_arrays, output);
+  } else {
+    jpegli_start_decompress(cinfo);
+    VerifyScanHeader(jparams, cinfo);
+    ReadOutputImage(dparams, cinfo, output);
+  }
+  jpegli_finish_decompress(cinfo);
+}
+
+void TestAPIBuffered(const CompressParams& jparams,
+                     const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     std::vector<TestImage>* output_progression) {
+  EXPECT_EQ(JPEG_REACHED_SOS,
+            jpegli_read_header(cinfo, /*require_image=*/TRUE));
+  cinfo->buffered_image = TRUE;
+  SetDecompressParams(dparams, cinfo);
+  jpegli_set_output_format(cinfo, dparams.data_type, dparams.endianness);
+  VerifyHeader(jparams, cinfo);
+  EXPECT_TRUE(jpegli_start_decompress(cinfo));
+  // start decompress should not read the whole input in buffered image mode
+  EXPECT_FALSE(jpegli_input_complete(cinfo));
+  bool has_multiple_scans = jpegli_has_multiple_scans(cinfo);
+  EXPECT_EQ(0, cinfo->output_scan_number);
+  int sos_marker_cnt = 1;  // read_header reads the first SOS marker
+  while (!jpegli_input_complete(cinfo)) {
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    if (dparams.skip_scans && (cinfo->input_scan_number % 2) != 1) {
+      int result = JPEG_SUSPENDED;
+      while (result != JPEG_REACHED_SOS && result != JPEG_REACHED_EOI) {
+        result = jpegli_consume_input(cinfo);
+      }
+      if (result == JPEG_REACHED_SOS) ++sos_marker_cnt;
+      continue;
+    }
+    SetScanDecompressParams(dparams, cinfo, cinfo->input_scan_number);
+    EXPECT_TRUE(jpegli_start_output(cinfo, cinfo->input_scan_number));
+    // start output sets output_scan_number, but does not change
+    // input_scan_number
+    EXPECT_EQ(cinfo->output_scan_number, cinfo->input_scan_number);
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    VerifyScanHeader(jparams, cinfo);
+    TestImage output;
+    ReadOutputImage(dparams, cinfo, &output);
+    output_progression->emplace_back(std::move(output));
+    // read scanlines/read raw data does not change input/output scan number
+    EXPECT_EQ(cinfo->input_scan_number, sos_marker_cnt);
+    EXPECT_EQ(cinfo->output_scan_number, cinfo->input_scan_number);
+    EXPECT_TRUE(jpegli_finish_output(cinfo));
+    ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(cinfo, coef_arrays, &output_progression->back());
+    }
+  }
+  jpegli_finish_decompress(cinfo);
+  if (dparams.size_factor == 1.0f) {
+    EXPECT_EQ(has_multiple_scans, cinfo->input_scan_number > 1);
+  }
+}
+
+TEST(DecodeAPITest, ReuseCinfo) {
+  TestImage input, output, expected;
+  std::vector<TestImage> output_progression, expected_output_progression;
+  CompressParams jparams;
+  DecompressParams dparams;
+  std::vector<uint8_t> compressed;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    input.xsize = 129;
+    input.ysize = 73;
+    GeneratePixels(&input);
+    for (int h_samp : {2, 1}) {
+      for (int v_samp : {2, 1}) {
+        for (int progr : {0, 2}) {
+          jparams.h_sampling = {h_samp, 1, 1};
+          jparams.v_sampling = {v_samp, 1, 1};
+          jparams.progressive_mode = progr;
+          printf(
+              "Generating input with %dx%d chroma subsampling "
+              "progressive level %d\n",
+              h_samp, v_samp, progr);
+          JXL_CHECK(EncodeWithJpegli(input, jparams, &compressed));
+          for (JpegIOMode output_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+            for (bool crop : {true, false}) {
+              if (crop && output_mode != PIXELS) continue;
+              for (int scale_num : {1, 2, 3, 4, 7, 8, 13, 16}) {
+                if (scale_num != 8 && output_mode != PIXELS) continue;
+                int scale_denom = 8;
+                while (scale_num % 2 == 0 && scale_denom % 2 == 0) {
+                  scale_num /= 2;
+                  scale_denom /= 2;
+                }
+                printf("Decoding with output mode %d output scaling %d/%d %s\n",
+                       output_mode, scale_num, scale_denom,
+                       crop ? "with cropped output" : "");
+                dparams.output_mode = output_mode;
+                dparams.scale_num = scale_num;
+                dparams.scale_denom = scale_denom;
+                expected.Clear();
+                DecodeWithLibjpeg(jparams, dparams, compressed, &expected);
+                output.Clear();
+                cinfo.buffered_image = false;
+                cinfo.raw_data_out = false;
+                cinfo.scale_num = cinfo.scale_denom = 1;
+                SourceManager src(compressed.data(), compressed.size(),
+                                  1u << 12);
+                cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+                jpegli_read_header(&cinfo, /*require_image=*/TRUE);
+                jpegli_abort_decompress(&cinfo);
+                src.Reset();
+                TestAPINonBuffered(jparams, dparams, expected, &cinfo, &output);
+                float max_rms = output_mode == COEFFICIENTS ? 0.0f : 1.0f;
+                if (scale_num == 1 && scale_denom == 8 && h_samp != v_samp) {
+                  max_rms = 5.0f;  // libjpeg does not do fancy upsampling
+                }
+                VerifyOutputImage(expected, output, max_rms);
+                printf("Decoding in buffered image mode\n");
+                expected_output_progression.clear();
+                DecodeAllScansWithLibjpeg(jparams, dparams, compressed,
+                                          &expected_output_progression);
+                output_progression.clear();
+                src.Reset();
+                TestAPIBuffered(jparams, dparams, &cinfo, &output_progression);
+                JXL_CHECK(output_progression.size() ==
+                          expected_output_progression.size());
+                for (size_t i = 0; i < output_progression.size(); ++i) {
+                  const TestImage& output = output_progression[i];
+                  const TestImage& expected = expected_output_progression[i];
+                  VerifyOutputImage(expected, output, max_rms);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+std::vector<TestConfig> GenerateBasicConfigs() {
+  std::vector<TestConfig> all_configs;
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      TestConfig config;
+      config.input.xsize = 257 + samp * 37;
+      config.input.ysize = 265 + (progr / 2) * 17;
+      config.jparams.h_sampling = {samp, 1, 1};
+      config.jparams.v_sampling = {samp, 1, 1};
+      config.jparams.progressive_mode = progr;
+      GeneratePixels(&config.input);
+      all_configs.push_back(config);
+    }
+  }
+  return all_configs;
+}
+
+TEST(DecodeAPITest, ReuseCinfoSameMemSource) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      jpegli_mem_src(&cinfo, buffer, buffer_size);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        TestAPINonBuffered(all_configs[i].jparams, DecompressParams(),
+                           all_configs[i].input, &cinfo, &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i], 2.35f);
+  }
+  if (buffer) free(buffer);
+}
+
+TEST(DecodeAPITest, ReuseCinfoSameStdSource) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  FILE* tmpf = tmpfile();
+  JXL_CHECK(tmpf);
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_stdio_dest(&cinfo, tmpf);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  rewind(tmpf);
+  std::vector<TestImage> all_outputs(all_configs.size());
+  {
+    jpeg_decompress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      jpegli_stdio_src(&cinfo, tmpf);
+      for (size_t i = 0; i < all_configs.size(); ++i) {
+        TestAPINonBuffered(all_configs[i].jparams, DecompressParams(),
+                           all_configs[i].input, &cinfo, &all_outputs[i]);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    VerifyOutputImage(all_configs[i].input, all_outputs[i], 2.35f);
+  }
+  fclose(tmpf);
+}
+
+TEST(DecodeAPITest, AbbreviatedStreams) {
+  uint8_t* table_stream = nullptr;
+  unsigned long table_stream_size = 0;
+  uint8_t* data_stream = nullptr;
+  unsigned long data_stream_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &table_stream, &table_stream_size);
+      cinfo.input_components = 3;
+      cinfo.in_color_space = JCS_RGB;
+      jpegli_set_defaults(&cinfo);
+      jpegli_write_tables(&cinfo);
+      jpegli_mem_dest(&cinfo, &data_stream, &data_stream_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.optimize_coding = FALSE;
+      jpegli_set_progressive_level(&cinfo, 0);
+      jpegli_start_compress(&cinfo, FALSE);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    EXPECT_LT(data_stream_size, 50);
+    jpegli_destroy_compress(&cinfo);
+  }
+  {
+    jpeg_decompress_struct cinfo = {};
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_decompress(&cinfo);
+      jpegli_mem_src(&cinfo, table_stream, table_stream_size);
+      jpegli_read_header(&cinfo, FALSE);
+      jpegli_mem_src(&cinfo, data_stream, data_stream_size);
+      jpegli_read_header(&cinfo, TRUE);
+      EXPECT_EQ(1, cinfo.image_width);
+      EXPECT_EQ(1, cinfo.image_height);
+      EXPECT_EQ(3, cinfo.num_components);
+      jpegli_start_decompress(&cinfo);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_read_scanlines(&cinfo, row, 1);
+      EXPECT_EQ(0, image[0]);
+      EXPECT_EQ(0, image[1]);
+      EXPECT_EQ(0, image[2]);
+      jpegli_finish_decompress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_decompress(&cinfo);
+  }
+  if (table_stream) free(table_stream);
+  if (data_stream) free(data_stream);
+}
+
+class DecodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(DecodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  if (dparams.skip_scans) return;
+  const std::vector<uint8_t> compressed = GetTestJpegData(config);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    TestAPINonBuffered(config.jparams, dparams, output1, &cinfo, &output0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  if (config.compare_to_orig) {
+    double rms0 = DistanceRms(config.input, output0);
+    double rms1 = DistanceRms(config.input, output1);
+    printf("rms: %f  vs  %f\n", rms0, rms1);
+    EXPECT_LE(rms0, rms1 * config.max_tolerance_factor);
+  } else {
+    VerifyOutputImage(output0, output1, config.max_rms_dist, config.max_diff);
+  }
+}
+
+class DecodeAPITestParamBuffered : public ::testing::TestWithParam<TestConfig> {
+};
+
+TEST_P(DecodeAPITestParamBuffered, TestAPI) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  const std::vector<uint8_t> compressed = GetTestJpegData(config);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size);
+
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+
+  std::vector<TestImage> output_progression0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    TestAPIBuffered(config.jparams, dparams, &cinfo, &output_progression0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  ASSERT_EQ(output_progression0.size(), output_progression1.size());
+  for (size_t i = 0; i < output_progression0.size(); ++i) {
+    const TestImage& output = output_progression0[i];
+    const TestImage& expected = output_progression1[i];
+    if (config.compare_to_orig) {
+      double rms0 = DistanceRms(config.input, output);
+      double rms1 = DistanceRms(config.input, expected);
+      printf("rms: %f  vs  %f\n", rms0, rms1);
+      EXPECT_LE(rms0, rms1 * config.max_tolerance_factor);
+    } else {
+      VerifyOutputImage(expected, output, config.max_rms_dist, config.max_diff);
+    }
+  }
+}
+
+std::vector<TestConfig> GenerateTests(bool buffered) {
+  std::vector<TestConfig> all_tests;
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
+        {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+        {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+    });
+    for (size_t i = 0; i < (buffered ? 1u : testfiles.size()); ++i) {
+      TestConfig config;
+      config.fn = testfiles[i].first;
+      config.fn_desc = testfiles[i].second;
+      for (size_t chunk_size : {0, 1, 64, 65536}) {
+        config.dparams.chunk_size = chunk_size;
+        for (size_t max_output_lines : {0, 1, 8, 16}) {
+          config.dparams.max_output_lines = max_output_lines;
+          config.dparams.output_mode = PIXELS;
+          all_tests.push_back(config);
+        }
+        {
+          config.dparams.max_output_lines = 16;
+          config.dparams.output_mode = RAW_DATA;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower_small.q85_444_non_interleaved.jpg",
+         "Q85YUV444NonInterleaved"},
+        {"jxl/flower/flower_small.q85_420_non_interleaved.jpg",
+         "Q85YUV420NonInterleaved"},
+        {"jxl/flower/flower_small.q85_444_partially_interleaved.jpg",
+         "Q85YUV444PartiallyInterleaved"},
+        {"jxl/flower/flower_small.q85_420_partially_interleaved.jpg",
+         "Q85YUV420PartiallyInterleaved"},
+        {"jxl/flower/flower.png.im_q85_422.jpg", "Q85YUV422"},
+        {"jxl/flower/flower.png.im_q85_440.jpg", "Q85YUV440"},
+        {"jxl/flower/flower.png.im_q85_444_1x2.jpg", "Q85YUV444_1x2"},
+        {"jxl/flower/flower.png.im_q85_asymmetric.jpg", "Q85Asymmetric"},
+        {"jxl/flower/flower.png.im_q85_gray.jpg", "Q85Gray"},
+        {"jxl/flower/flower.png.im_q85_luma_subsample.jpg", "Q85LumaSubsample"},
+        {"jxl/flower/flower.png.im_q85_rgb.jpg", "Q85RGB"},
+        {"jxl/flower/flower.png.im_q85_rgb_subsample_blue.jpg",
+         "Q85RGBSubsampleBlue"},
+        {"jxl/flower/flower_small.cmyk.jpg", "CMYK"},
+    });
+    for (size_t i = 0; i < (buffered ? 4u : testfiles.size()); ++i) {
+      for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+        TestConfig config;
+        config.fn = testfiles[i].first;
+        config.fn_desc = testfiles[i].second;
+        config.dparams.output_mode = output_mode;
+        all_tests.push_back(config);
+      }
+    }
+  }
+
+  // Tests for common chroma subsampling and output modes.
+  for (JpegIOMode output_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+    for (int h_samp : {1, 2}) {
+      for (int v_samp : {1, 2}) {
+        for (bool fancy : {true, false}) {
+          if (!fancy && (output_mode != PIXELS || h_samp * v_samp == 1)) {
+            continue;
+          }
+          TestConfig config;
+          config.dparams.output_mode = output_mode;
+          config.dparams.do_fancy_upsampling = fancy;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h_samp, 1, 1};
+          config.jparams.v_sampling = {v_samp, 1, 1};
+          if (output_mode == COEFFICIENTS) {
+            config.max_rms_dist = 0.0f;
+          }
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+
+  // Tests for partial input.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+    for (int progr : {0, 1, 3}) {
+      for (int samp : {1, 2}) {
+        for (bool skip_scans : {false, true}) {
+          if (skip_scans && (progr != 1 || size_factor < 0.5f)) continue;
+          for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+            TestConfig config;
+            config.input.xsize = 517;
+            config.input.ysize = 523;
+            config.jparams.h_sampling = {samp, 1, 1};
+            config.jparams.v_sampling = {samp, 1, 1};
+            config.jparams.progressive_mode = progr;
+            config.dparams.size_factor = size_factor;
+            config.dparams.output_mode = output_mode;
+            config.dparams.skip_scans = skip_scans;
+            // The last partially available block can behave differently.
+            // TODO(szabadka) Figure out if we can make the behaviour more
+            // similar.
+            config.max_rms_dist = samp == 1 ? 1.75f : 3.0f;
+            config.max_diff = 255.0f;
+            all_tests.push_back(config);
+          }
+        }
+      }
+    }
+  }
+
+  // Tests for block smoothing.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f, 1.0f}) {
+    for (int samp : {1, 2}) {
+      for (bool skip_scans : {false, true}) {
+        if (skip_scans && size_factor < 0.3f) continue;
+        TestConfig config;
+        config.input.xsize = 517;
+        config.input.ysize = 523;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = 2;
+        config.dparams.size_factor = size_factor;
+        config.dparams.do_block_smoothing = true;
+        config.dparams.skip_scans = skip_scans;
+        // libjpeg does smoothing for incomplete scans differently at
+        // the border between current and previous scans.
+        config.max_rms_dist = 8.0f;
+        config.max_diff = 255.0f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+
+  // Test for switching output color quantization modes between scans.
+  if (buffered) {
+    TestConfig config;
+    config.jparams.progressive_mode = 2;
+    config.dparams.quantize_colors = true;
+    config.dparams.scan_params = {
+        {3, JDITHER_NONE, CQUANT_1PASS},  {4, JDITHER_ORDERED, CQUANT_1PASS},
+        {5, JDITHER_FS, CQUANT_1PASS},    {6, JDITHER_NONE, CQUANT_EXTERNAL},
+        {8, JDITHER_NONE, CQUANT_REUSE},  {9, JDITHER_NONE, CQUANT_EXTERNAL},
+        {10, JDITHER_NONE, CQUANT_2PASS}, {11, JDITHER_NONE, CQUANT_REUSE},
+        {12, JDITHER_NONE, CQUANT_2PASS}, {13, JDITHER_FS, CQUANT_2PASS},
+    };
+    config.compare_to_orig = true;
+    config.max_tolerance_factor = 1.04f;
+    all_tests.push_back(config);
+  }
+
+  if (buffered) {
+    return all_tests;
+  }
+
+  // Tests for output color quantization.
+  for (int num_colors : {8, 64, 256}) {
+    for (ColorQuantMode mode : {CQUANT_1PASS, CQUANT_EXTERNAL, CQUANT_2PASS}) {
+      if (mode == CQUANT_EXTERNAL && num_colors != 256) continue;
+      for (J_DITHER_MODE dither : {JDITHER_NONE, JDITHER_ORDERED, JDITHER_FS}) {
+        if (mode == CQUANT_EXTERNAL && dither != JDITHER_NONE) continue;
+        if (mode != CQUANT_1PASS && dither == JDITHER_ORDERED) continue;
+        for (bool crop : {false, true}) {
+          for (bool scale : {false, true}) {
+            for (bool samp : {false, true}) {
+              if ((num_colors != 256) && (crop || scale || samp)) {
+                continue;
+              }
+              if (mode == CQUANT_2PASS && crop) continue;
+              TestConfig config;
+              config.input.xsize = 1024;
+              config.input.ysize = 768;
+              config.dparams.quantize_colors = true;
+              config.dparams.desired_number_of_colors = num_colors;
+              config.dparams.scan_params = {{kLastScan, dither, mode}};
+              config.dparams.crop_output = crop;
+              if (scale) {
+                config.dparams.scale_num = 7;
+                config.dparams.scale_denom = 8;
+              }
+              if (samp) {
+                config.jparams.h_sampling = {2, 1, 1};
+                config.jparams.v_sampling = {2, 1, 1};
+              }
+              if (!scale && !crop) {
+                config.compare_to_orig = true;
+                if (dither != JDITHER_NONE) {
+                  config.max_tolerance_factor = 1.05f;
+                }
+                if (mode == CQUANT_2PASS &&
+                    (num_colors == 8 || dither == JDITHER_FS)) {
+                  // TODO(szabadka) Lower this bound.
+                  config.max_tolerance_factor = 1.5f;
+                }
+              } else {
+                // We only test for buffer overflows, etc.
+                config.max_rms_dist = 100.0f;
+                config.max_diff = 255.0f;
+              }
+              all_tests.push_back(config);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Tests for output formats.
+  for (JpegliDataType type :
+       {JPEGLI_TYPE_UINT8, JPEGLI_TYPE_UINT16, JPEGLI_TYPE_FLOAT}) {
+    for (JpegliEndianness endianness :
+         {JPEGLI_NATIVE_ENDIAN, JPEGLI_LITTLE_ENDIAN, JPEGLI_BIG_ENDIAN}) {
+      if (type == JPEGLI_TYPE_UINT8 && endianness != JPEGLI_NATIVE_ENDIAN) {
+        continue;
+      }
+      for (int channels = 1; channels <= 4; ++channels) {
+        TestConfig config;
+        config.dparams.data_type = type;
+        config.dparams.endianness = endianness;
+        config.input.color_space = JCS_UNKNOWN;
+        config.input.components = channels;
+        config.dparams.set_out_color_space = true;
+        config.dparams.out_color_space = JCS_UNKNOWN;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  // Test for output cropping.
+  {
+    TestConfig config;
+    config.dparams.crop_output = true;
+    all_tests.push_back(config);
+  }
+  // Tests for color transforms.
+  for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_GRAYSCALE}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.input.color_space = JCS_GRAYSCALE;
+    config.dparams.set_out_color_space = true;
+    config.dparams.out_color_space = out_color_space;
+    all_tests.push_back(config);
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr}) {
+    for (J_COLOR_SPACE out_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+      if (jpeg_color_space == JCS_RGB && out_color_space == JCS_YCbCr) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.dparams.set_out_color_space = true;
+      config.dparams.out_color_space = out_color_space;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (J_COLOR_SPACE out_color_space : {JCS_CMYK, JCS_YCCK}) {
+      if (jpeg_color_space == JCS_CMYK && out_color_space == JCS_YCCK) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = JCS_CMYK;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.dparams.set_out_color_space = true;
+      config.dparams.out_color_space = out_color_space;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for progressive levels.
+  for (int p = 0; p < 3 + NumTestScanScripts(); ++p) {
+    TestConfig config;
+    config.jparams.progressive_mode = p;
+    all_tests.push_back(config);
+  }
+  // Tests for RST markers.
+  for (size_t r : {1, 17, 1024}) {
+    for (size_t chunk_size : {1, 65536}) {
+      for (int progr : {0, 2}) {
+        TestConfig config;
+        config.dparams.chunk_size = chunk_size;
+        config.jparams.progressive_mode = progr;
+        config.jparams.restart_interval = r;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (size_t rr : {1, 3, 8, 100}) {
+    TestConfig config;
+    config.jparams.restart_in_rows = rr;
+    all_tests.push_back(config);
+  }
+  // Tests for custom quantization tables.
+  for (int type : {0, 1, 10, 100, 10000}) {
+    for (int scale : {1, 50, 100, 200, 500}) {
+      for (bool add_raw : {false, true}) {
+        for (bool baseline : {true, false}) {
+          if (!baseline && (add_raw || type * scale < 25500)) continue;
+          TestConfig config;
+          config.input.xsize = 64;
+          config.input.ysize = 64;
+          CustomQuantTable table;
+          table.table_type = type;
+          table.scale_factor = scale;
+          table.force_baseline = baseline;
+          table.add_raw = add_raw;
+          table.Generate();
+          config.jparams.quant_tables.push_back(table);
+          config.jparams.quant_indexes = {0, 0, 0};
+          config.compare_to_orig = true;
+          config.max_tolerance_factor = 1.02;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    if (qidx == 3) continue;
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                    (qidx >> 0) & 1};
+    all_tests.push_back(config);
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (int slot_idx = 0; slot_idx < 2; ++slot_idx) {
+      if (qidx == 0 && slot_idx == 0) continue;
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      CustomQuantTable table;
+      table.slot_idx = slot_idx;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+      all_tests.push_back(config);
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (bool xyb : {false, true}) {
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.xyb_mode = xyb;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      {
+        CustomQuantTable table;
+        table.slot_idx = 0;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      {
+        CustomQuantTable table;
+        table.slot_idx = 1;
+        table.table_type = 20;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      config.compare_to_orig = true;
+      all_tests.push_back(config);
+    }
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.xyb_mode = xyb;
+    config.jparams.quant_indexes = {0, 1, 2};
+    {
+      CustomQuantTable table;
+      table.slot_idx = 0;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 1;
+      table.table_type = 20;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 2;
+      table.table_type = 30;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    config.compare_to_orig = true;
+    all_tests.push_back(config);
+  }
+  // Tests for fixed (and custom) prefix codes.
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr}) {
+    for (bool flat_dc_luma : {false, true}) {
+      TestConfig config;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.jparams.progressive_mode = 0;
+      config.jparams.optimize_coding = 0;
+      config.jparams.use_flat_dc_luma_code = flat_dc_luma;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (bool flat_dc_luma : {false, true}) {
+      TestConfig config;
+      config.input.color_space = JCS_CMYK;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.jparams.progressive_mode = 0;
+      config.jparams.optimize_coding = 0;
+      config.jparams.use_flat_dc_luma_code = flat_dc_luma;
+      all_tests.push_back(config);
+    }
+  }
+  // Test for jpeg without DHT marker.
+  {
+    TestConfig config;
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.jparams.omit_standard_tables = true;
+    all_tests.push_back(config);
+  }
+  // Test for custom component ids.
+  {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 128;
+    config.jparams.comp_ids = {7, 17, 177};
+    all_tests.push_back(config);
+  }
+  // Tests for JFIF/Adobe markers.
+  for (int override_JFIF : {-1, 0, 1}) {
+    for (int override_Adobe : {-1, 0, 1}) {
+      if (override_JFIF == -1 && override_Adobe == -1) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 128;
+      config.jparams.override_JFIF = override_JFIF;
+      config.jparams.override_Adobe = override_Adobe;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for small images.
+  for (int xsize : {1, 7, 8, 9, 15, 16, 17}) {
+    for (int ysize : {1, 7, 8, 9, 15, 16, 17}) {
+      TestConfig config;
+      config.input.xsize = xsize;
+      config.input.ysize = ysize;
+      all_tests.push_back(config);
+    }
+  }
+  // Tests for custom marker processor.
+  for (size_t chunk_size : {0, 1, 64, 65536}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.dparams.chunk_size = chunk_size;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  // Tests for icc profile decoding.
+  for (size_t icc_size : {728, 70000, 1000000}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.jparams.icc.resize(icc_size);
+    for (size_t i = 0; i < icc_size; ++i) {
+      config.jparams.icc[i] = (i * 17) & 0xff;
+    }
+    all_tests.push_back(config);
+  }
+  // Tests for unusual sampling factors.
+  for (int h0_samp : {1, 2, 3, 4}) {
+    for (int v0_samp : {1, 2, 3, 4}) {
+      for (int dxb = 0; dxb < h0_samp; ++dxb) {
+        for (int dyb = 0; dyb < v0_samp; ++dyb) {
+          for (int dx = 0; dx < 2; ++dx) {
+            for (int dy = 0; dy < 2; ++dy) {
+              TestConfig config;
+              config.input.xsize = 128 + dyb * 8 + dy;
+              config.input.ysize = 256 + dxb * 8 + dx;
+              config.jparams.progressive_mode = 2;
+              config.jparams.h_sampling = {h0_samp, 1, 1};
+              config.jparams.v_sampling = {v0_samp, 1, 1};
+              config.compare_to_orig = true;
+              all_tests.push_back(config);
+            }
+          }
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 4}) {
+    for (int v0_samp : {1, 2, 4}) {
+      for (int h2_samp : {1, 2, 4}) {
+        for (int v2_samp : {1, 2, 4}) {
+          TestConfig config;
+          config.input.xsize = 137;
+          config.input.ysize = 75;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.compare_to_orig = true;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 3}) {
+    for (int v0_samp : {1, 3}) {
+      for (int h2_samp : {1, 3}) {
+        for (int v2_samp : {1, 3}) {
+          TestConfig config;
+          config.input.xsize = 205;
+          config.input.ysize = 99;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  // Tests for output scaling.
+  for (int scale_num = 1; scale_num <= 16; ++scale_num) {
+    if (scale_num == 8) continue;
+    for (bool crop : {false, true}) {
+      for (int samp : {1, 2}) {
+        for (int progr : {0, 2}) {
+          TestConfig config;
+          config.jparams.h_sampling = {samp, 1, 1};
+          config.jparams.v_sampling = {samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          config.dparams.scale_num = scale_num;
+          config.dparams.scale_denom = 8;
+          config.dparams.crop_output = crop;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::string QuantMode(ColorQuantMode mode) {
+  switch (mode) {
+    case CQUANT_1PASS:
+      return "1pass";
+    case CQUANT_EXTERNAL:
+      return "External";
+    case CQUANT_2PASS:
+      return "2pass";
+    case CQUANT_REUSE:
+      return "Reuse";
+  }
+  return "";
+}
+
+std::string DitherMode(J_DITHER_MODE mode) {
+  switch (mode) {
+    case JDITHER_NONE:
+      return "No";
+    case JDITHER_ORDERED:
+      return "Ordered";
+    case JDITHER_FS:
+      return "FS";
+  }
+  return "";
+}
+
+std::ostream& operator<<(std::ostream& os, const DecompressParams& dparams) {
+  if (dparams.chunk_size == 0) {
+    os << "CompleteInput";
+  } else {
+    os << "InputChunks" << dparams.chunk_size;
+  }
+  if (dparams.size_factor < 1.0f) {
+    os << "Partial" << static_cast<int>(dparams.size_factor * 100) << "p";
+  }
+  if (dparams.max_output_lines == 0) {
+    os << "CompleteOutput";
+  } else {
+    os << "OutputLines" << dparams.max_output_lines;
+  }
+  if (dparams.output_mode == RAW_DATA) {
+    os << "RawDataOut";
+  } else if (dparams.output_mode == COEFFICIENTS) {
+    os << "CoeffsOut";
+  }
+  os << IOMethodName(dparams.data_type, dparams.endianness);
+  if (dparams.set_out_color_space) {
+    os << "OutColor" << ColorSpaceName((J_COLOR_SPACE)dparams.out_color_space);
+  }
+  if (dparams.crop_output) {
+    os << "Crop";
+  }
+  if (dparams.do_block_smoothing) {
+    os << "BlockSmoothing";
+  }
+  if (!dparams.do_fancy_upsampling) {
+    os << "NoFancyUpsampling";
+  }
+  if (dparams.scale_num != 1 || dparams.scale_denom != 1) {
+    os << "Scale" << dparams.scale_num << "_" << dparams.scale_denom;
+  }
+  if (dparams.quantize_colors) {
+    os << "Quant" << dparams.desired_number_of_colors << "colors";
+    for (size_t i = 0; i < dparams.scan_params.size(); ++i) {
+      if (i > 0) os << "_";
+      const auto& sparam = dparams.scan_params[i];
+      os << QuantMode(sparam.color_quant_mode);
+      os << DitherMode((J_DITHER_MODE)sparam.dither_mode) << "Dither";
+    }
+  }
+  if (dparams.skip_scans) {
+    os << "SkipScans";
+  }
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  if (!c.fn.empty()) {
+    os << c.fn_desc;
+  } else {
+    os << c.input;
+  }
+  os << c.jparams;
+  os << c.dparams;
+  return os;
+}
+
+std::string TestDescription(const testing::TestParamInfo<TestConfig>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(DecodeAPITest, DecodeAPITestParam,
+                                testing::ValuesIn(GenerateTests(false)),
+                                TestDescription);
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(DecodeAPITestBuffered,
+                                DecodeAPITestParamBuffered,
+                                testing::ValuesIn(GenerateTests(true)),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/decode_internal.h b/lib/jpegli/decode_internal.h
new file mode 100644 (file)
index 0000000..ed7baa3
--- /dev/null
@@ -0,0 +1,151 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_INTERNAL_H_
+#define LIB_JPEGLI_DECODE_INTERNAL_H_
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <vector>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/huffman.h"
+
+namespace jpegli {
+
+static constexpr int kNeedMoreInput = 100;
+static constexpr int kHandleRestart = 101;
+static constexpr int kHandleMarkerProcessor = 102;
+static constexpr int kProcessNextMarker = 103;
+static constexpr size_t kAllHuffLutSize = NUM_HUFF_TBLS * kJpegHuffmanLutSize;
+
+typedef int16_t coeff_t;
+
+// State of the decoder that has to be saved before decoding one MCU in case
+// we run out of the bitstream.
+struct MCUCodingState {
+  coeff_t last_dc_coeff[kMaxComponents];
+  int eobrun;
+  coeff_t coeffs[D_MAX_BLOCKS_IN_MCU * DCTSIZE2];
+};
+
+}  // namespace jpegli
+
+// Use this forward-declared libjpeg struct to hold all our private variables.
+// TODO(szabadka) Remove variables that have a corresponding version in cinfo.
+struct jpeg_decomp_master {
+  //
+  // Input handling state.
+  //
+  std::vector<uint8_t> input_buffer_;
+  size_t input_buffer_pos_;
+  // Number of bits after codestream_pos_ that were already processed.
+  size_t codestream_bits_ahead_;
+  bool streaming_mode_;
+
+  // Coefficient buffers
+  jvirt_barray_ptr* coef_arrays;
+  JBLOCKARRAY coeff_rows[jpegli::kMaxComponents];
+
+  //
+  // Marker data processing state.
+  //
+  bool found_soi_;
+  bool found_dri_;
+  bool found_sof_;
+  bool found_eoi_;
+  size_t icc_index_;
+  size_t icc_total_;
+  std::vector<uint8_t> icc_profile_;
+  jpegli::HuffmanTableEntry dc_huff_lut_[jpegli::kAllHuffLutSize];
+  jpegli::HuffmanTableEntry ac_huff_lut_[jpegli::kAllHuffLutSize];
+  uint8_t markers_to_save_[32];
+  jpeg_marker_parser_method app_marker_parsers[16];
+  jpeg_marker_parser_method com_marker_parser;
+  // Whether this jpeg has multiple scans (progressive or non-interleaved
+  // sequential).
+  bool is_multiscan_;
+
+  // Fields defined by SOF marker.
+  size_t iMCU_cols_;
+  int h_factor[jpegli::kMaxComponents];
+  int v_factor[jpegli::kMaxComponents];
+
+  // Initialized at strat of frame.
+  uint16_t scan_progression_[jpegli::kMaxComponents][DCTSIZE2];
+
+  //
+  // Per scan state.
+  //
+  size_t scan_mcu_row_;
+  size_t scan_mcu_col_;
+  size_t mcu_rows_per_iMCU_row_;
+  jpegli::coeff_t last_dc_coeff_[jpegli::kMaxComponents];
+  int eobrun_;
+  int restarts_to_go_;
+  int next_restart_marker_;
+
+  jpegli::MCUCodingState mcu_;
+
+  //
+  // Rendering state.
+  //
+  int output_passes_done_;
+  JpegliDataType output_data_type_ = JPEGLI_TYPE_UINT8;
+  bool swap_endianness_ = false;
+  size_t xoffset_;
+  bool need_context_rows_;
+
+  int min_scaled_dct_size;
+  int scaled_dct_size[jpegli::kMaxComponents];
+
+  size_t raw_height_[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float> raw_output_[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float> render_output_[jpegli::kMaxComponents];
+
+  void (*inverse_transform[jpegli::kMaxComponents])(
+      const int16_t* JXL_RESTRICT qblock, const float* JXL_RESTRICT dequant,
+      const float* JXL_RESTRICT biases, float* JXL_RESTRICT scratch_space,
+      float* JXL_RESTRICT output, size_t output_stride, size_t dctsize);
+
+  void (*color_transform)(float* row[jpegli::kMaxComponents], size_t len);
+
+  float* idct_scratch_;
+  float* upsample_scratch_;
+  uint8_t* output_scratch_;
+  int16_t* smoothing_scratch_;
+  float* dequant_;
+  // 1 = 1pass, 2 = 2pass, 3 = external
+  int quant_mode_;
+  int quant_pass_;
+  int num_colors_[jpegli::kMaxComponents];
+  uint8_t* colormap_lut_;
+  uint8_t* pixels_;
+  JSAMPARRAY scanlines_;
+  std::vector<std::vector<uint8_t>> candidate_lists_;
+  bool regenerate_inverse_colormap_;
+  float* dither_[jpegli::kMaxComponents];
+  float* error_row_[2 * jpegli::kMaxComponents];
+  size_t dither_size_;
+  size_t dither_mask_;
+
+  // Per channel and per frequency statistics about the number of nonzeros and
+  // the sum of coefficient absolute values, used in dequantization bias
+  // computation.
+  int* nonzeros_;
+  int* sumabs_;
+  size_t num_processed_blocks_[jpegli::kMaxComponents];
+  float* biases_;
+#define SAVED_COEFS 10
+  // This holds the coef_bits of the scan before the current scan,
+  // i.e. the bottom half when rendering incomplete scans.
+  int (*coef_bits_latch)[SAVED_COEFS];
+  int (*prev_coef_bits_latch)[SAVED_COEFS];
+  bool apply_smoothing;
+};
+
+#endif  // LIB_JPEGLI_DECODE_INTERNAL_H_
diff --git a/lib/jpegli/decode_marker.cc b/lib/jpegli/decode_marker.cc
new file mode 100644 (file)
index 0000000..c5c5790
--- /dev/null
@@ -0,0 +1,588 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode_marker.h"
+
+#include <string.h>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/printf_macros.h"
+
+namespace jpegli {
+namespace {
+
+constexpr int kMaxDimPixels = 65535;
+constexpr uint8_t kIccProfileTag[12] = "ICC_PROFILE";
+
+// Macros for commonly used error conditions.
+
+#define JPEG_VERIFY_LEN(n)                                      \
+  if (pos + (n) > len) {                                        \
+    return JPEGLI_ERROR("Unexpected end of marker: pos=%" PRIuS \
+                        " need=%d len=%" PRIuS,                 \
+                        pos, static_cast<int>(n), len);         \
+  }
+
+#define JPEG_VERIFY_INPUT(var, low, high)                               \
+  if ((var) < (low) || (var) > (high)) {                                \
+    return JPEGLI_ERROR("Invalid " #var ": %d", static_cast<int>(var)); \
+  }
+
+#define JPEG_VERIFY_MARKER_END()                                  \
+  if (pos != len) {                                               \
+    return JPEGLI_ERROR("Invalid marker length: declared=%" PRIuS \
+                        " actual=%" PRIuS,                        \
+                        len, pos);                                \
+  }
+
+inline int ReadUint8(const uint8_t* data, size_t* pos) {
+  return data[(*pos)++];
+}
+
+inline int ReadUint16(const uint8_t* data, size_t* pos) {
+  int v = (data[*pos] << 8) + data[*pos + 1];
+  *pos += 2;
+  return v;
+}
+
+void ProcessSOF(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->found_soi_) {
+    JPEGLI_ERROR("Unexpected SOF marker.");
+  }
+  if (m->found_sof_) {
+    JPEGLI_ERROR("Duplicate SOF marker.");
+  }
+  m->found_sof_ = true;
+  cinfo->progressive_mode = (cinfo->unread_marker == 0xc2);
+  cinfo->arith_code = 0;
+  size_t pos = 2;
+  JPEG_VERIFY_LEN(6);
+  cinfo->data_precision = ReadUint8(data, &pos);
+  cinfo->image_height = ReadUint16(data, &pos);
+  cinfo->image_width = ReadUint16(data, &pos);
+  cinfo->num_components = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->data_precision, kJpegPrecision, kJpegPrecision);
+  JPEG_VERIFY_INPUT(cinfo->image_height, 1, kMaxDimPixels);
+  JPEG_VERIFY_INPUT(cinfo->image_width, 1, kMaxDimPixels);
+  JPEG_VERIFY_INPUT(cinfo->num_components, 1, kMaxComponents);
+  JPEG_VERIFY_LEN(3 * cinfo->num_components);
+  cinfo->comp_info = jpegli::Allocate<jpeg_component_info>(
+      cinfo, cinfo->num_components, JPOOL_IMAGE);
+
+  // Read sampling factors and quant table index for each component.
+  uint8_t ids_seen[256] = {0};
+  cinfo->max_h_samp_factor = 1;
+  cinfo->max_v_samp_factor = 1;
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    comp->component_index = i;
+    const int id = ReadUint8(data, &pos);
+    if (ids_seen[id]) {  // (cf. section B.2.2, syntax of Ci)
+      JPEGLI_ERROR("Duplicate ID %d in SOF.", id);
+    }
+    ids_seen[id] = 1;
+    comp->component_id = id;
+    int factor = ReadUint8(data, &pos);
+    int h_samp_factor = factor >> 4;
+    int v_samp_factor = factor & 0xf;
+    JPEG_VERIFY_INPUT(h_samp_factor, 1, MAX_SAMP_FACTOR);
+    JPEG_VERIFY_INPUT(v_samp_factor, 1, MAX_SAMP_FACTOR);
+    comp->h_samp_factor = h_samp_factor;
+    comp->v_samp_factor = v_samp_factor;
+    cinfo->max_h_samp_factor =
+        std::max(cinfo->max_h_samp_factor, h_samp_factor);
+    cinfo->max_v_samp_factor =
+        std::max(cinfo->max_v_samp_factor, v_samp_factor);
+    int quant_tbl_idx = ReadUint8(data, &pos);
+    JPEG_VERIFY_INPUT(quant_tbl_idx, 0, NUM_QUANT_TBLS - 1);
+    comp->quant_tbl_no = quant_tbl_idx;
+    if (cinfo->quant_tbl_ptrs[quant_tbl_idx] == nullptr) {
+      JPEGLI_ERROR("Quantization table with index %u not found", quant_tbl_idx);
+    }
+    comp->quant_table = nullptr;  // will be allocated after SOS marker
+  }
+  JPEG_VERIFY_MARKER_END();
+
+  // Set the input colorspace based on the markers we have seen and set
+  // default output colorspace.
+  if (cinfo->num_components == 1) {
+    cinfo->jpeg_color_space = JCS_GRAYSCALE;
+    cinfo->out_color_space = JCS_GRAYSCALE;
+  } else if (cinfo->num_components == 3) {
+    if (cinfo->saw_JFIF_marker) {
+      cinfo->jpeg_color_space = JCS_YCbCr;
+    } else if (cinfo->saw_Adobe_marker) {
+      cinfo->jpeg_color_space =
+          cinfo->Adobe_transform == 0 ? JCS_RGB : JCS_YCbCr;
+    } else {
+      cinfo->jpeg_color_space = JCS_YCbCr;
+      if (cinfo->comp_info[0].component_id == 'R' &&  //
+          cinfo->comp_info[1].component_id == 'G' &&  //
+          cinfo->comp_info[2].component_id == 'B') {
+        cinfo->jpeg_color_space = JCS_RGB;
+      }
+    }
+    cinfo->out_color_space = JCS_RGB;
+  } else if (cinfo->num_components == 4) {
+    if (cinfo->saw_Adobe_marker) {
+      cinfo->jpeg_color_space =
+          cinfo->Adobe_transform == 0 ? JCS_CMYK : JCS_YCCK;
+    } else {
+      cinfo->jpeg_color_space = JCS_CMYK;
+    }
+    cinfo->out_color_space = JCS_CMYK;
+  }
+
+  // We have checked above that none of the sampling factors are 0, so the max
+  // sampling factors can not be 0.
+  cinfo->total_iMCU_rows =
+      DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE);
+  m->iMCU_cols_ =
+      DivCeil(cinfo->image_width, cinfo->max_h_samp_factor * DCTSIZE);
+  // Compute the block dimensions for each component.
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    if (cinfo->max_h_samp_factor % comp->h_samp_factor != 0 ||
+        cinfo->max_v_samp_factor % comp->v_samp_factor != 0) {
+      JPEGLI_ERROR("Non-integral subsampling ratios.");
+    }
+    m->h_factor[i] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[i] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    comp->downsampled_width = DivCeil(cinfo->image_width, m->h_factor[i]);
+    comp->downsampled_height = DivCeil(cinfo->image_height, m->v_factor[i]);
+    comp->width_in_blocks = DivCeil(comp->downsampled_width, DCTSIZE);
+    comp->height_in_blocks = DivCeil(comp->downsampled_height, DCTSIZE);
+  }
+  memset(m->scan_progression_, 0, sizeof(m->scan_progression_));
+}
+
+void ProcessSOS(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->found_sof_) {
+    JPEGLI_ERROR("Unexpected SOS marker.");
+  }
+  size_t pos = 2;
+  JPEG_VERIFY_LEN(1);
+  cinfo->comps_in_scan = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->comps_in_scan, 1, cinfo->num_components);
+  JPEG_VERIFY_INPUT(cinfo->comps_in_scan, 1, MAX_COMPS_IN_SCAN);
+
+  JPEG_VERIFY_LEN(2 * cinfo->comps_in_scan);
+  bool is_interleaved = (cinfo->comps_in_scan > 1);
+  uint8_t ids_seen[256] = {0};
+  cinfo->blocks_in_MCU = 0;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int id = ReadUint8(data, &pos);
+    if (ids_seen[id]) {  // (cf. section B.2.3, regarding CSj)
+      return JPEGLI_ERROR("Duplicate ID %d in SOS.", id);
+    }
+    ids_seen[id] = 1;
+    jpeg_component_info* comp = nullptr;
+    for (int j = 0; j < cinfo->num_components; ++j) {
+      if (cinfo->comp_info[j].component_id == id) {
+        comp = &cinfo->comp_info[j];
+        cinfo->cur_comp_info[i] = comp;
+      }
+    }
+    if (!comp) {
+      return JPEGLI_ERROR("SOS marker: Could not find component with id %d",
+                          id);
+    }
+    int c = ReadUint8(data, &pos);
+    comp->dc_tbl_no = c >> 4;
+    comp->ac_tbl_no = c & 0xf;
+    JPEG_VERIFY_INPUT(comp->dc_tbl_no, 0, 3);
+    JPEG_VERIFY_INPUT(comp->ac_tbl_no, 0, 3);
+    comp->MCU_width = is_interleaved ? comp->h_samp_factor : 1;
+    comp->MCU_height = is_interleaved ? comp->v_samp_factor : 1;
+    comp->MCU_blocks = comp->MCU_width * comp->MCU_height;
+    if (cinfo->blocks_in_MCU + comp->MCU_blocks > D_MAX_BLOCKS_IN_MCU) {
+      JPEGLI_ERROR("Too many blocks in MCU.");
+    }
+    for (int j = 0; j < comp->MCU_blocks; ++j) {
+      cinfo->MCU_membership[cinfo->blocks_in_MCU++] = i;
+    }
+  }
+  JPEG_VERIFY_LEN(3);
+  cinfo->Ss = ReadUint8(data, &pos);
+  cinfo->Se = ReadUint8(data, &pos);
+  JPEG_VERIFY_INPUT(cinfo->Ss, 0, 63);
+  JPEG_VERIFY_INPUT(cinfo->Se, cinfo->Ss, 63);
+  int c = ReadUint8(data, &pos);
+  cinfo->Ah = c >> 4;
+  cinfo->Al = c & 0xf;
+  JPEG_VERIFY_MARKER_END();
+
+  if (cinfo->input_scan_number == 0) {
+    m->is_multiscan_ = (cinfo->comps_in_scan < cinfo->num_components ||
+                        cinfo->progressive_mode);
+  }
+  if (cinfo->Ah != 0 && cinfo->Al != cinfo->Ah - 1) {
+    // section G.1.1.1.2 : Successive approximation control only improves
+    // by one bit at a time.
+    JPEGLI_ERROR("Invalid progressive parameters: Al=%d Ah=%d", cinfo->Al,
+                 cinfo->Ah);
+  }
+  if (!cinfo->progressive_mode) {
+    cinfo->Ss = 0;
+    cinfo->Se = 63;
+    cinfo->Ah = 0;
+    cinfo->Al = 0;
+  }
+  const uint16_t scan_bitmask =
+      cinfo->Ah == 0 ? (0xffff << cinfo->Al) : (1u << cinfo->Al);
+  const uint16_t refinement_bitmask = (1 << cinfo->Al) - 1;
+  if (!cinfo->coef_bits) {
+    cinfo->coef_bits =
+        Allocate<int[DCTSIZE2]>(cinfo, cinfo->num_components * 2, JPOOL_IMAGE);
+    m->coef_bits_latch =
+        Allocate<int[SAVED_COEFS]>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+    m->prev_coef_bits_latch =
+        Allocate<int[SAVED_COEFS]>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      for (int i = 0; i < DCTSIZE2; ++i) {
+        cinfo->coef_bits[c][i] = -1;
+        if (i < SAVED_COEFS) {
+          m->coef_bits_latch[c][i] = -1;
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    int comp_idx = cinfo->cur_comp_info[i]->component_index;
+    for (int k = cinfo->Ss; k <= cinfo->Se; ++k) {
+      if (m->scan_progression_[comp_idx][k] & scan_bitmask) {
+        return JPEGLI_ERROR(
+            "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u",
+            comp_idx, k, m->scan_progression_[i][k], scan_bitmask);
+      }
+      if (m->scan_progression_[comp_idx][k] & refinement_bitmask) {
+        return JPEGLI_ERROR(
+            "Invalid scan order, a more refined scan was already done: "
+            "component=%d k=%d prev_mask=%u cur_mask=%u",
+            comp_idx, k, m->scan_progression_[i][k], scan_bitmask);
+      }
+      m->scan_progression_[comp_idx][k] |= scan_bitmask;
+    }
+  }
+  if (cinfo->Al > 10) {
+    return JPEGLI_ERROR("Scan parameter Al=%d is not supported.", cinfo->Al);
+  }
+}
+
+// Reads the Define Huffman Table (DHT) marker segment and builds the Huffman
+// decoding table in either dc_huff_lut_ or ac_huff_lut_, depending on the type
+// and solt_id of Huffman code being read.
+void ProcessDHT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  size_t pos = 2;
+  if (pos == len) {
+    return JPEGLI_ERROR("DHT marker: no Huffman table found");
+  }
+  while (pos < len) {
+    JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength);
+    // The index of the Huffman code in the current set of Huffman codes. For AC
+    // component Huffman codes, 0x10 is added to the index.
+    int slot_id = ReadUint8(data, &pos);
+    int huffman_index = slot_id;
+    int is_ac_table = (slot_id & 0x10) != 0;
+    JHUFF_TBL** table;
+    if (is_ac_table) {
+      huffman_index -= 0x10;
+      JPEG_VERIFY_INPUT(huffman_index, 0, NUM_HUFF_TBLS - 1);
+      table = &cinfo->ac_huff_tbl_ptrs[huffman_index];
+    } else {
+      JPEG_VERIFY_INPUT(huffman_index, 0, NUM_HUFF_TBLS - 1);
+      table = &cinfo->dc_huff_tbl_ptrs[huffman_index];
+    }
+    if (*table == nullptr) {
+      *table = jpegli_alloc_huff_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    int total_count = 0;
+    for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      int count = ReadUint8(data, &pos);
+      (*table)->bits[i] = count;
+      total_count += count;
+    }
+    if (is_ac_table) {
+      JPEG_VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize);
+    } else {
+      // Allow symbols up to 15 here, we check later whether any invalid symbols
+      // are actually decoded.
+      // TODO(szabadka) Make sure decoder works (does not crash) with up to
+      // 15-nbits DC symbols and then increase kJpegDCAlphabetSize.
+      JPEG_VERIFY_INPUT(total_count, 0, 16);
+    }
+    JPEG_VERIFY_LEN(total_count);
+    for (int i = 0; i < total_count; ++i) {
+      int value = ReadUint8(data, &pos);
+      if (!is_ac_table) {
+        JPEG_VERIFY_INPUT(value, 0, 15);
+      }
+      (*table)->huffval[i] = value;
+    }
+    for (int i = total_count; i < kJpegHuffmanAlphabetSize; ++i) {
+      (*table)->huffval[i] = 0;
+    }
+  }
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessDQT(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->found_sof_) {
+    JPEGLI_ERROR("Updating quant tables between scans is not supported.");
+  }
+  size_t pos = 2;
+  if (pos == len) {
+    return JPEGLI_ERROR("DQT marker: no quantization table found");
+  }
+  while (pos < len) {
+    JPEG_VERIFY_LEN(1);
+    int quant_table_index = ReadUint8(data, &pos);
+    int precision = quant_table_index >> 4;
+    JPEG_VERIFY_INPUT(precision, 0, 1);
+    quant_table_index &= 0xf;
+    JPEG_VERIFY_INPUT(quant_table_index, 0, NUM_QUANT_TBLS - 1);
+    JPEG_VERIFY_LEN((precision + 1) * DCTSIZE2);
+
+    if (cinfo->quant_tbl_ptrs[quant_table_index] == nullptr) {
+      cinfo->quant_tbl_ptrs[quant_table_index] =
+          jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_table_index];
+
+    for (size_t i = 0; i < DCTSIZE2; ++i) {
+      int quant_val =
+          precision ? ReadUint16(data, &pos) : ReadUint8(data, &pos);
+      JPEG_VERIFY_INPUT(quant_val, 1, 65535);
+      quant_table->quantval[kJPEGNaturalOrder[i]] = quant_val;
+    }
+  }
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessDNL(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  // Ignore marker.
+}
+
+void ProcessDRI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->found_dri_) {
+    return JPEGLI_ERROR("Duplicate DRI marker.");
+  }
+  m->found_dri_ = true;
+  size_t pos = 2;
+  JPEG_VERIFY_LEN(2);
+  cinfo->restart_interval = ReadUint16(data, &pos);
+  JPEG_VERIFY_MARKER_END();
+}
+
+void ProcessAPP(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  const uint8_t marker = cinfo->unread_marker;
+  const uint8_t* payload = data + 2;
+  size_t payload_size = len - 2;
+  if (marker == 0xE0) {
+    if (payload_size >= 14 && memcmp(payload, "JFIF", 4) == 0) {
+      cinfo->saw_JFIF_marker = TRUE;
+      cinfo->JFIF_major_version = payload[5];
+      cinfo->JFIF_minor_version = payload[6];
+      cinfo->density_unit = payload[7];
+      cinfo->X_density = (payload[8] << 8) + payload[9];
+      cinfo->Y_density = (payload[10] << 8) + payload[11];
+    }
+  } else if (marker == 0xEE) {
+    if (payload_size >= 12 && memcmp(payload, "Adobe", 5) == 0) {
+      cinfo->saw_Adobe_marker = TRUE;
+      cinfo->Adobe_transform = payload[11];
+    }
+  } else if (marker == 0xE2) {
+    if (payload_size >= sizeof(kIccProfileTag) &&
+        memcmp(payload, kIccProfileTag, sizeof(kIccProfileTag)) == 0) {
+      payload += sizeof(kIccProfileTag);
+      payload_size -= sizeof(kIccProfileTag);
+      if (payload_size < 2) {
+        return JPEGLI_ERROR("ICC chunk is too small.");
+      }
+      uint8_t index = payload[0];
+      uint8_t total = payload[1];
+      ++m->icc_index_;
+      if (m->icc_index_ != index) {
+        return JPEGLI_ERROR("Invalid ICC chunk order.");
+      }
+      if (total == 0) {
+        return JPEGLI_ERROR("Invalid ICC chunk total.");
+      }
+      if (m->icc_total_ == 0) {
+        m->icc_total_ = total;
+      } else if (m->icc_total_ != total) {
+        return JPEGLI_ERROR("Invalid ICC chunk total.");
+      }
+      if (m->icc_index_ > m->icc_total_) {
+        return JPEGLI_ERROR("Invalid ICC chunk index.");
+      }
+      m->icc_profile_.insert(m->icc_profile_.end(), payload + 2,
+                             payload + payload_size);
+    }
+  }
+}
+
+void ProcessCOM(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  // Ignore marker.
+}
+
+void ProcessSOI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->found_soi_) {
+    JPEGLI_ERROR("Duplicate SOI marker");
+  }
+  m->found_soi_ = true;
+}
+
+void ProcessEOI(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  cinfo->master->found_eoi_ = true;
+}
+
+void SaveMarker(j_decompress_ptr cinfo, const uint8_t* data, size_t len) {
+  const uint8_t marker = cinfo->unread_marker;
+  const uint8_t* payload = data + 2;
+  size_t payload_size = len - 2;
+
+  // Insert new saved marker to the head of the list.
+  jpeg_saved_marker_ptr next = cinfo->marker_list;
+  cinfo->marker_list =
+      jpegli::Allocate<jpeg_marker_struct>(cinfo, 1, JPOOL_IMAGE);
+  cinfo->marker_list->next = next;
+  cinfo->marker_list->marker = marker;
+  cinfo->marker_list->original_length = payload_size;
+  cinfo->marker_list->data_length = payload_size;
+  cinfo->marker_list->data =
+      jpegli::Allocate<uint8_t>(cinfo, payload_size, JPOOL_IMAGE);
+  memcpy(cinfo->marker_list->data, payload, payload_size);
+}
+
+uint8_t ProcessNextMarker(j_decompress_ptr cinfo, const uint8_t* const data,
+                          const size_t len, size_t* pos) {
+  jpeg_decomp_master* m = cinfo->master;
+  size_t num_skipped = 0;
+  uint8_t marker = cinfo->unread_marker;
+  if (marker == 0) {
+    // kIsValidMarker[i] == 1 means (0xc0 + i) is a valid marker.
+    static const uint8_t kIsValidMarker[] = {
+        1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+    };
+    // Skip bytes between markers.
+    while (*pos + 1 < len && (data[*pos] != 0xff || data[*pos + 1] < 0xc0 ||
+                              !kIsValidMarker[data[*pos + 1] - 0xc0])) {
+      ++(*pos);
+      ++num_skipped;
+    }
+    if (*pos + 2 > len) {
+      return kNeedMoreInput;
+    }
+    marker = data[*pos + 1];
+    if (num_skipped > 0) {
+      if (m->found_soi_) {
+        JPEGLI_WARN("Skipped %d bytes before marker 0x%02x", (int)num_skipped,
+                    marker);
+      } else {
+        JPEGLI_ERROR("Did not find SOI marker.");
+      }
+    }
+    *pos += 2;
+    cinfo->unread_marker = marker;
+  }
+  if (!m->found_soi_ && marker != 0xd8) {
+    JPEGLI_ERROR("Did not find SOI marker.");
+  }
+  if (GetMarkerProcessor(cinfo)) {
+    return kHandleMarkerProcessor;
+  }
+  const uint8_t* marker_data = &data[*pos];
+  size_t marker_len = 0;
+  if (marker != 0xd8 && marker != 0xd9) {
+    if (*pos + 2 > len) {
+      return kNeedMoreInput;
+    }
+    marker_len += (data[*pos] << 8) + data[*pos + 1];
+    if (marker_len < 2) {
+      JPEGLI_ERROR("Invalid marker length");
+    }
+    if (*pos + marker_len > len) {
+      // TODO(szabadka) Limit our memory usage by using the skip_input_data
+      // source manager callback on APP markers that are not saved.
+      return kNeedMoreInput;
+    }
+    if (marker >= 0xe0 && m->markers_to_save_[marker - 0xe0]) {
+      SaveMarker(cinfo, marker_data, marker_len);
+    }
+  }
+  if (marker == 0xc0 || marker == 0xc1 || marker == 0xc2) {
+    ProcessSOF(cinfo, marker_data, marker_len);
+  } else if (marker == 0xc4) {
+    ProcessDHT(cinfo, marker_data, marker_len);
+  } else if (marker == 0xda) {
+    ProcessSOS(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdb) {
+    ProcessDQT(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdc) {
+    ProcessDNL(cinfo, marker_data, marker_len);
+  } else if (marker == 0xdd) {
+    ProcessDRI(cinfo, marker_data, marker_len);
+  } else if (marker >= 0xe0 && marker <= 0xef) {
+    ProcessAPP(cinfo, marker_data, marker_len);
+  } else if (marker == 0xfe) {
+    ProcessCOM(cinfo, marker_data, marker_len);
+  } else if (marker == 0xd8) {
+    ProcessSOI(cinfo, marker_data, marker_len);
+  } else if (marker == 0xd9) {
+    ProcessEOI(cinfo, marker_data, marker_len);
+  } else {
+    JPEGLI_ERROR("Unexpected marker 0x%x", marker);
+  }
+  *pos += marker_len;
+  cinfo->unread_marker = 0;
+  if (marker == 0xda) {
+    return JPEG_REACHED_SOS;
+  } else if (marker == 0xd9) {
+    return JPEG_REACHED_EOI;
+  }
+  return kProcessNextMarker;
+}
+
+}  // namespace
+
+jpeg_marker_parser_method GetMarkerProcessor(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  uint8_t marker = cinfo->unread_marker;
+  jpeg_marker_parser_method callback = nullptr;
+  if (marker >= 0xe0 && marker <= 0xef) {
+    callback = m->app_marker_parsers[marker - 0xe0];
+  } else if (marker == 0xfe) {
+    callback = m->com_marker_parser;
+  }
+  return callback;
+}
+
+int ProcessMarkers(j_decompress_ptr cinfo, const uint8_t* const data,
+                   const size_t len, size_t* pos) {
+  for (;;) {
+    int status = ProcessNextMarker(cinfo, data, len, pos);
+    if (status != kProcessNextMarker) {
+      return status;
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/decode_marker.h b/lib/jpegli/decode_marker.h
new file mode 100644 (file)
index 0000000..fb24b3e
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_MARKER_H_
+#define LIB_JPEGLI_DECODE_MARKER_H_
+
+#include <stdint.h>
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+// Reads the available input in the source manager's input buffer until either
+// the end of the next SOS marker or the end of the input.
+// The corresponding fields of cinfo are updated with the processed input data.
+// Upon return, the input buffer will be at the start or at the end of a marker
+// data segment (inter-marker data is allowed).
+// Return value is one of:
+//   * JPEG_SUSPENDED, if the current input buffer ends before the next SOS or
+//       EOI marker. Input buffer refill is handled by the caller;
+//   * JPEG_REACHED_SOS, if the next SOS marker is found;
+//   * JPEG_REACHED_EOR, if the end of the input is found.
+int ProcessMarkers(j_decompress_ptr cinfo, const uint8_t* const data,
+                   const size_t len, size_t* pos);
+
+jpeg_marker_parser_method GetMarkerProcessor(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DECODE_MARKER_H_
diff --git a/lib/jpegli/decode_scan.cc b/lib/jpegli/decode_scan.cc
new file mode 100644 (file)
index 0000000..05b1f37
--- /dev/null
@@ -0,0 +1,566 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode_scan.h"
+
+#include <string.h>
+
+#include <hwy/base.h>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+// Max 14 block per MCU (when 1 channel is subsampled)
+// Max 64 nonzero coefficients per block
+// Max 16 symbol bits plus 11 extra bits per nonzero symbol
+// Max 2 bytes per 8 bits (worst case is all bytes are escaped 0xff)
+constexpr int kMaxMCUByteSize = 6048;
+
+// Helper structure to read bits from the entropy coded data segment.
+struct BitReaderState {
+  BitReaderState(const uint8_t* data, const size_t len, size_t pos)
+      : data_(data), len_(len), start_pos_(pos) {
+    Reset(pos);
+  }
+
+  void Reset(size_t pos) {
+    pos_ = pos;
+    val_ = 0;
+    bits_left_ = 0;
+    next_marker_pos_ = len_;
+    FillBitWindow();
+  }
+
+  // Returns the next byte and skips the 0xff/0x00 escape sequences.
+  uint8_t GetNextByte() {
+    if (pos_ >= next_marker_pos_) {
+      ++pos_;
+      return 0;
+    }
+    uint8_t c = data_[pos_++];
+    if (c == 0xff) {
+      uint8_t escape = pos_ < len_ ? data_[pos_] : 0;
+      if (escape == 0) {
+        ++pos_;
+      } else {
+        // 0xff was followed by a non-zero byte, which means that we found the
+        // start of the next marker segment.
+        next_marker_pos_ = pos_ - 1;
+      }
+    }
+    return c;
+  }
+
+  void FillBitWindow() {
+    if (bits_left_ <= 16) {
+      while (bits_left_ <= 56) {
+        val_ <<= 8;
+        val_ |= (uint64_t)GetNextByte();
+        bits_left_ += 8;
+      }
+    }
+  }
+
+  int ReadBits(int nbits) {
+    FillBitWindow();
+    uint64_t val = (val_ >> (bits_left_ - nbits)) & ((1ULL << nbits) - 1);
+    bits_left_ -= nbits;
+    return val;
+  }
+
+  // Sets *pos to the next stream position, and *bit_pos to the bit position
+  // within the next byte where parsing should continue.
+  // Returns false if the stream ended too early.
+  bool FinishStream(size_t* pos, size_t* bit_pos) {
+    *bit_pos = (8 - (bits_left_ & 7)) & 7;
+    // Give back some bytes that we did not use.
+    int unused_bytes_left = DivCeil(bits_left_, 8);
+    while (unused_bytes_left-- > 0) {
+      --pos_;
+      // If we give back a 0 byte, we need to check if it was a 0xff/0x00 escape
+      // sequence, and if yes, we need to give back one more byte.
+      if (((pos_ == len_ && pos_ == next_marker_pos_) ||
+           (pos_ > 0 && pos_ < next_marker_pos_ && data_[pos_] == 0)) &&
+          (data_[pos_ - 1] == 0xff)) {
+        --pos_;
+      }
+    }
+    if (pos_ >= next_marker_pos_) {
+      *pos = next_marker_pos_;
+      if (pos_ > next_marker_pos_ || *bit_pos > 0) {
+        // Data ran out before the scan was complete.
+        return false;
+      }
+    }
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* data_;
+  const size_t len_;
+  size_t pos_;
+  uint64_t val_;
+  int bits_left_;
+  size_t next_marker_pos_;
+  size_t start_pos_;
+};
+
+// Returns the next Huffman-coded symbol.
+int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) {
+  int nbits;
+  br->FillBitWindow();
+  int val = (br->val_ >> (br->bits_left_ - 8)) & 0xff;
+  table += val;
+  nbits = table->bits - 8;
+  if (nbits > 0) {
+    br->bits_left_ -= 8;
+    table += table->value;
+    val = (br->val_ >> (br->bits_left_ - nbits)) & ((1 << nbits) - 1);
+    table += val;
+  }
+  br->bits_left_ -= table->bits;
+  return table->value;
+}
+
+/**
+ * Returns the DC diff or AC value for extra bits value x and prefix code s.
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.1 – Difference magnitude categories for DC coding
+ *  SSSS | DIFF values
+ * ------+--------------------------
+ *     0 | 0
+ *     1 | –1, 1
+ *     2 | –3, –2, 2, 3
+ *     3 | –7..–4, 4..7
+ * ......|..........................
+ *    11 | –2047..–1024, 1024..2047
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * Table F.2 – Categories assigned to coefficient values
+ * [ Same as Table F.1, but does not include SSSS equal to 0 and 11]
+ *
+ *
+ * CCITT Rec. T.81 (1992 E)
+ * F.1.2.1.1 Structure of DC code table
+ * For each category,... additional bits... appended... to uniquely identify
+ * which difference... occurred... When DIFF is positive... SSSS... bits of DIFF
+ * are appended. When DIFF is negative... SSSS... bits of (DIFF – 1) are
+ * appended... Most significant bit... is 0 for negative differences and 1 for
+ * positive differences.
+ *
+ * In other words the upper half of extra bits range represents DIFF as is.
+ * The lower half represents the negative DIFFs with an offset.
+ */
+int HuffExtend(int x, int s) {
+  JXL_DASSERT(s >= 1);
+  int half = 1 << (s - 1);
+  if (x >= half) {
+    JXL_DASSERT(x < (1 << s));
+    return x;
+  } else {
+    return x - (1 << s) + 1;
+  }
+}
+
+// Decodes one 8x8 block of DCT coefficients from the bit stream.
+bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
+                    const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, BitReaderState* br, coeff_t* last_dc_coeff,
+                    coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    int s = ReadSymbol(dc_huff, br);
+    if (s >= kJpegDCAlphabetSize) {
+      return false;
+    }
+    int diff = 0;
+    if (s > 0) {
+      int bits = br->ReadBits(s);
+      diff = HuffExtend(bits, s);
+    }
+    int coeff = diff + *last_dc_coeff;
+    const int dc_coeff = coeff * Am;
+    coeffs[0] = dc_coeff;
+    // TODO(eustas): is there a more elegant / explicit way to check this?
+    if (dc_coeff != coeffs[0]) {
+      return false;
+    }
+    *last_dc_coeff = coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  if (*eobrun > 0) {
+    --(*eobrun);
+    return true;
+  }
+  for (int k = Ss; k <= Se; k++) {
+    int sr = ReadSymbol(ac_huff, br);
+    if (sr >= kJpegHuffmanAlphabetSize) {
+      return false;
+    }
+    int r = sr >> 4;
+    int s = sr & 15;
+    if (s > 0) {
+      k += r;
+      if (k > Se) {
+        return false;
+      }
+      if (s + Al >= kJpegDCAlphabetSize) {
+        return false;
+      }
+      int bits = br->ReadBits(s);
+      int coeff = HuffExtend(bits, s);
+      coeffs[kJPEGNaturalOrder[k]] = coeff * Am;
+    } else if (r == 15) {
+      k += 15;
+    } else {
+      *eobrun = 1 << r;
+      if (r > 0) {
+        if (!eobrun_allowed) {
+          return false;
+        }
+        *eobrun += br->ReadBits(r);
+      }
+      break;
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
+                    int* eobrun, BitReaderState* br, coeff_t* coeffs) {
+  // Nowadays multiplication is even faster than variable shift.
+  int Am = 1 << Al;
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    int s = br->ReadBits(1);
+    coeff_t dc_coeff = coeffs[0];
+    dc_coeff |= s * Am;
+    coeffs[0] = dc_coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  int p1 = Am;
+  int m1 = -Am;
+  int k = Ss;
+  int r;
+  int s;
+  bool in_zero_run = false;
+  if (*eobrun <= 0) {
+    for (; k <= Se; k++) {
+      s = ReadSymbol(ac_huff, br);
+      if (s >= kJpegHuffmanAlphabetSize) {
+        return false;
+      }
+      r = s >> 4;
+      s &= 15;
+      if (s) {
+        if (s != 1) {
+          return false;
+        }
+        s = br->ReadBits(1) ? p1 : m1;
+        in_zero_run = false;
+      } else {
+        if (r != 15) {
+          *eobrun = 1 << r;
+          if (r > 0) {
+            if (!eobrun_allowed) {
+              return false;
+            }
+            *eobrun += br->ReadBits(r);
+          }
+          break;
+        }
+        in_zero_run = true;
+      }
+      do {
+        coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+        if (thiscoef != 0) {
+          if (br->ReadBits(1)) {
+            if ((thiscoef & p1) == 0) {
+              if (thiscoef >= 0) {
+                thiscoef += p1;
+              } else {
+                thiscoef += m1;
+              }
+            }
+          }
+          coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+        } else {
+          if (--r < 0) {
+            break;
+          }
+        }
+        k++;
+      } while (k <= Se);
+      if (s) {
+        if (k > Se) {
+          return false;
+        }
+        coeffs[kJPEGNaturalOrder[k]] = s;
+      }
+    }
+  }
+  if (in_zero_run) {
+    return false;
+  }
+  if (*eobrun > 0) {
+    for (; k <= Se; k++) {
+      coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+      if (thiscoef != 0) {
+        if (br->ReadBits(1)) {
+          if ((thiscoef & p1) == 0) {
+            if (thiscoef >= 0) {
+              thiscoef += p1;
+            } else {
+              thiscoef += m1;
+            }
+          }
+        }
+        coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+      }
+    }
+  }
+  --(*eobrun);
+  return true;
+}
+
+void SaveMCUCodingState(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  memcpy(m->mcu_.last_dc_coeff, m->last_dc_coeff_, sizeof(m->last_dc_coeff_));
+  m->mcu_.eobrun = m->eobrun_;
+  size_t offset = 0;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    int c = comp->component_index;
+    size_t block_x = m->scan_mcu_col_ * comp->MCU_width;
+    for (int iy = 0; iy < comp->MCU_height; ++iy) {
+      size_t block_y = m->scan_mcu_row_ * comp->MCU_height + iy;
+      size_t biy = block_y % comp->v_samp_factor;
+      if (block_y >= comp->height_in_blocks) {
+        continue;
+      }
+      size_t nblocks =
+          std::min<size_t>(comp->MCU_width, comp->width_in_blocks - block_x);
+      size_t ncoeffs = nblocks * DCTSIZE2;
+      coeff_t* coeffs = &m->coeff_rows[c][biy][block_x][0];
+      memcpy(&m->mcu_.coeffs[offset], coeffs, ncoeffs * sizeof(coeffs[0]));
+      offset += ncoeffs;
+    }
+  }
+}
+
+void RestoreMCUCodingState(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  memcpy(m->last_dc_coeff_, m->mcu_.last_dc_coeff, sizeof(m->last_dc_coeff_));
+  m->eobrun_ = m->mcu_.eobrun;
+  size_t offset = 0;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    int c = comp->component_index;
+    size_t block_x = m->scan_mcu_col_ * comp->MCU_width;
+    for (int iy = 0; iy < comp->MCU_height; ++iy) {
+      size_t block_y = m->scan_mcu_row_ * comp->MCU_height + iy;
+      size_t biy = block_y % comp->v_samp_factor;
+      if (block_y >= comp->height_in_blocks) {
+        continue;
+      }
+      size_t nblocks =
+          std::min<size_t>(comp->MCU_width, comp->width_in_blocks - block_x);
+      size_t ncoeffs = nblocks * DCTSIZE2;
+      coeff_t* coeffs = &m->coeff_rows[c][biy][block_x][0];
+      memcpy(coeffs, &m->mcu_.coeffs[offset], ncoeffs * sizeof(coeffs[0]));
+      offset += ncoeffs;
+    }
+  }
+}
+
+bool FinishScan(j_decompress_ptr cinfo, const uint8_t* data, const size_t len,
+                size_t* pos, size_t* bit_pos) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (m->eobrun_ > 0) {
+    JPEGLI_ERROR("End-of-block run too long.");
+  }
+  m->eobrun_ = -1;
+  memset(m->last_dc_coeff_, 0, sizeof(m->last_dc_coeff_));
+  if (*bit_pos == 0) {
+    return true;
+  }
+  if (data[*pos] == 0xff) {
+    // After last br.FinishStream we checked that there is at least 2 bytes
+    // in the buffer.
+    JXL_DASSERT(*pos + 1 < len);
+    // br.FinishStream would have detected an early marker.
+    JXL_DASSERT(data[*pos + 1] == 0);
+    *pos += 2;
+  } else {
+    *pos += 1;
+  }
+  *bit_pos = 0;
+  return true;
+}
+
+}  // namespace
+
+void PrepareForiMCURow(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+    const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+    int c = comp->component_index;
+    int by0 = cinfo->input_iMCU_row * comp->v_samp_factor;
+    int block_rows_left = comp->height_in_blocks - by0;
+    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+    int offset = m->streaming_mode_ ? 0 : by0;
+    m->coeff_rows[c] = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset,
+        max_block_rows, true);
+  }
+}
+
+int ProcessScan(j_decompress_ptr cinfo, const uint8_t* const data,
+                const size_t len, size_t* pos, size_t* bit_pos) {
+  if (len == 0) {
+    return kNeedMoreInput;
+  }
+  jpeg_decomp_master* m = cinfo->master;
+  for (;;) {
+    // Handle the restart intervals.
+    if (cinfo->restart_interval > 0 && m->restarts_to_go_ == 0) {
+      if (!FinishScan(cinfo, data, len, pos, bit_pos)) {
+        return kNeedMoreInput;
+      }
+      // Go to the next marker, warn if we had to skip any data.
+      size_t num_skipped = 0;
+      while (*pos + 1 < len && (data[*pos] != 0xff || data[*pos + 1] == 0 ||
+                                data[*pos + 1] == 0xff)) {
+        ++(*pos);
+        ++num_skipped;
+      }
+      if (num_skipped > 0) {
+        JPEGLI_WARN("Skipped %d bytes before restart marker", (int)num_skipped);
+      }
+      if (*pos + 2 > len) {
+        return kNeedMoreInput;
+      }
+      cinfo->unread_marker = data[*pos + 1];
+      *pos += 2;
+      return kHandleRestart;
+    }
+
+    size_t start_pos = *pos;
+    BitReaderState br(data, len, start_pos);
+    if (*bit_pos > 0) {
+      br.ReadBits(*bit_pos);
+    }
+    if (start_pos + kMaxMCUByteSize > len) {
+      SaveMCUCodingState(cinfo);
+    }
+
+    // Decode one MCU.
+    HWY_ALIGN_MAX coeff_t sink_block[DCTSIZE2];
+    bool scan_ok = true;
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      const jpeg_component_info* comp = cinfo->cur_comp_info[i];
+      int c = comp->component_index;
+      const HuffmanTableEntry* dc_lut =
+          &m->dc_huff_lut_[comp->dc_tbl_no * kJpegHuffmanLutSize];
+      const HuffmanTableEntry* ac_lut =
+          &m->ac_huff_lut_[comp->ac_tbl_no * kJpegHuffmanLutSize];
+      for (int iy = 0; iy < comp->MCU_height; ++iy) {
+        size_t block_y = m->scan_mcu_row_ * comp->MCU_height + iy;
+        int biy = block_y % comp->v_samp_factor;
+        for (int ix = 0; ix < comp->MCU_width; ++ix) {
+          size_t block_x = m->scan_mcu_col_ * comp->MCU_width + ix;
+          coeff_t* coeffs;
+          if (block_x >= comp->width_in_blocks ||
+              block_y >= comp->height_in_blocks) {
+            // Note that it is OK that sink_block is uninitialized because
+            // it will never be used in any branches, even in the RefineDCTBlock
+            // case, because only DC scans can be interleaved and we don't use
+            // the zero-ness of the DC coeff in the DC refinement code-path.
+            coeffs = sink_block;
+          } else {
+            coeffs = &m->coeff_rows[c][biy][block_x][0];
+          }
+          if (cinfo->Ah == 0) {
+            if (!DecodeDCTBlock(dc_lut, ac_lut, cinfo->Ss, cinfo->Se, cinfo->Al,
+                                &m->eobrun_, &br,
+                                &m->last_dc_coeff_[comp->component_index],
+                                coeffs)) {
+              scan_ok = false;
+            }
+          } else {
+            if (!RefineDCTBlock(ac_lut, cinfo->Ss, cinfo->Se, cinfo->Al,
+                                &m->eobrun_, &br, coeffs)) {
+              scan_ok = false;
+            }
+          }
+        }
+      }
+    }
+    size_t new_pos;
+    size_t new_bit_pos;
+    bool stream_ok = br.FinishStream(&new_pos, &new_bit_pos);
+    if (new_pos + 2 > len) {
+      // If reading stopped within the last two bytes, we have to request more
+      // input even if FinishStream() returned true, since the Huffman code
+      // reader could have peaked ahead some bits past the current input chunk
+      // and thus the last prefix code length could have been wrong. We can do
+      // this because a valid JPEG bit stream has two extra bytes at the end.
+      RestoreMCUCodingState(cinfo);
+      return kNeedMoreInput;
+    }
+    *pos = new_pos;
+    *bit_pos = new_bit_pos;
+    if (!stream_ok) {
+      // We hit a marker during parsing.
+      JXL_DASSERT(data[*pos] == 0xff);
+      JXL_DASSERT(data[*pos + 1] != 0);
+      RestoreMCUCodingState(cinfo);
+      JPEGLI_WARN("Incomplete scan detected.");
+      return JPEG_SCAN_COMPLETED;
+    }
+    if (!scan_ok) {
+      JPEGLI_ERROR("Failed to decode DCT block");
+    }
+    if (m->restarts_to_go_ > 0) {
+      --m->restarts_to_go_;
+    }
+    ++m->scan_mcu_col_;
+    if (m->scan_mcu_col_ == cinfo->MCUs_per_row) {
+      ++m->scan_mcu_row_;
+      m->scan_mcu_col_ = 0;
+      if (m->scan_mcu_row_ == cinfo->MCU_rows_in_scan) {
+        if (!FinishScan(cinfo, data, len, pos, bit_pos)) {
+          return kNeedMoreInput;
+        }
+        break;
+      } else if ((m->scan_mcu_row_ % m->mcu_rows_per_iMCU_row_) == 0) {
+        // Current iMCU row is done.
+        break;
+      }
+    }
+  }
+  ++cinfo->input_iMCU_row;
+  if (cinfo->input_iMCU_row < cinfo->total_iMCU_rows) {
+    PrepareForiMCURow(cinfo);
+    return JPEG_ROW_COMPLETED;
+  }
+  return JPEG_SCAN_COMPLETED;
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/decode_scan.h b/lib/jpegli/decode_scan.h
new file mode 100644 (file)
index 0000000..1d7b18f
--- /dev/null
@@ -0,0 +1,31 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DECODE_SCAN_H_
+#define LIB_JPEGLI_DECODE_SCAN_H_
+
+#include <stdint.h>
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+// Reads the available input in the source manager's input buffer until the end
+// of the next iMCU row.
+// The corresponding fields of cinfo are updated with the processed input data.
+// Upon return, the input buffer will be at the start of an MCU, or at the end
+// of the scan.
+// Return value is one of:
+//   * JPEG_SUSPENDED, if the input buffer ends before the end of an iMCU row;
+//   * JPEG_ROW_COMPLETED, if the next iMCU row (but not the scan) is reached;
+//   * JPEG_SCAN_COMPLETED, if the end of the scan is reached.
+int ProcessScan(j_decompress_ptr cinfo, const uint8_t* const data,
+                const size_t len, size_t* pos, size_t* bit_pos);
+
+void PrepareForiMCURow(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DECODE_SCAN_H_
diff --git a/lib/jpegli/destination_manager.cc b/lib/jpegli/destination_manager.cc
new file mode 100644 (file)
index 0000000..9bc269f
--- /dev/null
@@ -0,0 +1,148 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h>
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+
+namespace jpegli {
+
+constexpr size_t kDestBufferSize = 64 << 10;
+
+struct StdioDestinationManager {
+  jpeg_destination_mgr pub;
+  FILE* f;
+  uint8_t* buffer;
+
+  static void init_destination(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    dest->pub.next_output_byte = dest->buffer;
+    dest->pub.free_in_buffer = kDestBufferSize;
+  }
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    if (fwrite(dest->buffer, 1, kDestBufferSize, dest->f) != kDestBufferSize) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+    dest->pub.next_output_byte = dest->buffer;
+    dest->pub.free_in_buffer = kDestBufferSize;
+    return TRUE;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<StdioDestinationManager*>(cinfo->dest);
+    size_t bytes_left = kDestBufferSize - dest->pub.free_in_buffer;
+    if (bytes_left &&
+        fwrite(dest->buffer, 1, bytes_left, dest->f) != bytes_left) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+    fflush(dest->f);
+    if (ferror(dest->f)) {
+      JPEGLI_ERROR("Failed to write to output stream.");
+    }
+  }
+};
+
+struct MemoryDestinationManager {
+  jpeg_destination_mgr pub;
+  // Output buffer supplied by the application
+  uint8_t** output;
+  unsigned long* output_size;
+  // Output buffer allocated by us.
+  uint8_t* temp_buffer;
+  // Current output buffer (either application supplied or allocated by us).
+  uint8_t* current_buffer;
+  size_t buffer_size;
+
+  static void init_destination(j_compress_ptr cinfo) {}
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<MemoryDestinationManager*>(cinfo->dest);
+    uint8_t* next_buffer =
+        reinterpret_cast<uint8_t*>(malloc(dest->buffer_size * 2));
+    memcpy(next_buffer, dest->current_buffer, dest->buffer_size);
+    if (dest->temp_buffer != nullptr) {
+      free(dest->temp_buffer);
+    }
+    dest->temp_buffer = next_buffer;
+    dest->current_buffer = next_buffer;
+    *dest->output = next_buffer;
+    *dest->output_size = dest->buffer_size;
+    dest->pub.next_output_byte = next_buffer + dest->buffer_size;
+    dest->pub.free_in_buffer = dest->buffer_size;
+    dest->buffer_size *= 2;
+    return TRUE;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    auto dest = reinterpret_cast<MemoryDestinationManager*>(cinfo->dest);
+    *dest->output_size = dest->buffer_size - dest->pub.free_in_buffer;
+  }
+};
+
+}  // namespace jpegli
+
+void jpegli_stdio_dest(j_compress_ptr cinfo, FILE* outfile) {
+  if (outfile == nullptr) {
+    JPEGLI_ERROR("jpegli_stdio_dest: Invalid destination.");
+  }
+  if (cinfo->dest && cinfo->dest->init_destination !=
+                         jpegli::StdioDestinationManager::init_destination) {
+    JPEGLI_ERROR("jpegli_stdio_dest: a different dest manager was already set");
+  }
+  if (!cinfo->dest) {
+    cinfo->dest = reinterpret_cast<jpeg_destination_mgr*>(
+        jpegli::Allocate<jpegli::StdioDestinationManager>(cinfo, 1));
+  }
+  auto dest = reinterpret_cast<jpegli::StdioDestinationManager*>(cinfo->dest);
+  dest->f = outfile;
+  dest->buffer = jpegli::Allocate<uint8_t>(cinfo, jpegli::kDestBufferSize);
+  dest->pub.next_output_byte = dest->buffer;
+  dest->pub.free_in_buffer = jpegli::kDestBufferSize;
+  dest->pub.init_destination =
+      jpegli::StdioDestinationManager::init_destination;
+  dest->pub.empty_output_buffer =
+      jpegli::StdioDestinationManager::empty_output_buffer;
+  dest->pub.term_destination =
+      jpegli::StdioDestinationManager::term_destination;
+}
+
+void jpegli_mem_dest(j_compress_ptr cinfo, unsigned char** outbuffer,
+                     unsigned long* outsize) {
+  if (outbuffer == nullptr || outsize == nullptr) {
+    JPEGLI_ERROR("jpegli_mem_dest: Invalid destination.");
+  }
+  if (cinfo->dest && cinfo->dest->init_destination !=
+                         jpegli::MemoryDestinationManager::init_destination) {
+    JPEGLI_ERROR("jpegli_mem_dest: a different dest manager was already set");
+  }
+  if (!cinfo->dest) {
+    auto dest = jpegli::Allocate<jpegli::MemoryDestinationManager>(cinfo, 1);
+    dest->temp_buffer = nullptr;
+    cinfo->dest = reinterpret_cast<jpeg_destination_mgr*>(dest);
+  }
+  auto dest = reinterpret_cast<jpegli::MemoryDestinationManager*>(cinfo->dest);
+  dest->pub.init_destination =
+      jpegli::MemoryDestinationManager::init_destination;
+  dest->pub.empty_output_buffer =
+      jpegli::MemoryDestinationManager::empty_output_buffer;
+  dest->pub.term_destination =
+      jpegli::MemoryDestinationManager::term_destination;
+  dest->output = outbuffer;
+  dest->output_size = outsize;
+  if (*outbuffer == nullptr || *outsize == 0) {
+    dest->temp_buffer =
+        reinterpret_cast<uint8_t*>(malloc(jpegli::kDestBufferSize));
+    *outbuffer = dest->temp_buffer;
+    *outsize = jpegli::kDestBufferSize;
+  }
+  dest->current_buffer = *outbuffer;
+  dest->buffer_size = *outsize;
+  dest->pub.next_output_byte = dest->current_buffer;
+  dest->pub.free_in_buffer = dest->buffer_size;
+}
diff --git a/lib/jpegli/downsample.cc b/lib/jpegli/downsample.cc
new file mode 100644 (file)
index 0000000..df2c156
--- /dev/null
@@ -0,0 +1,356 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/downsample.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/downsample.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_CAPPED(float, 8);
+constexpr D d;
+
+void DownsampleRow2x1(const float* row_in, size_t len, float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 2;
+  const auto mul = Set(d, 0.5f);
+  Vec<D> v0, v1;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved2(d, row_in + 2 * x, v0, v1);
+    Store(Mul(mul, Add(v0, v1)), d, row_out + x);
+  }
+}
+
+void DownsampleRow3x1(const float* row_in, size_t len, float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 3;
+  const auto mul = Set(d, 1.0f / 3);
+  Vec<D> v0, v1, v2;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved3(d, row_in + 3 * x, v0, v1, v2);
+    Store(Mul(mul, Add(Add(v0, v1), v2)), d, row_out + x);
+  }
+}
+
+void DownsampleRow4x1(const float* row_in, size_t len, float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 4;
+  const auto mul = Set(d, 0.25f);
+  Vec<D> v0, v1, v2, v3;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved4(d, row_in + 4 * x, v0, v1, v2, v3);
+    Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
+  }
+}
+
+void Downsample2x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow2x1(rows_in[0], len, row_out);
+}
+
+void Downsample3x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, row_out);
+}
+
+void Downsample4x1(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, row_out);
+}
+
+void Downsample1x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const auto mul = Set(d, 0.5f);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  for (size_t x = 0; x < len; x += N) {
+    Store(Mul(mul, Add(Load(d, row0 + x), Load(d, row1 + x))), d, row_out + x);
+  }
+}
+
+void Downsample2x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const size_t len_out = len / 2;
+  const auto mul = Set(d, 0.25f);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  Vec<D> v0, v1, v2, v3;
+  for (size_t x = 0; x < len_out; x += N) {
+    LoadInterleaved2(d, row0 + 2 * x, v0, v1);
+    LoadInterleaved2(d, row1 + 2 * x, v2, v3);
+    Store(Mul(mul, Add(Add(v0, v1), Add(v2, v3))), d, row_out + x);
+  }
+}
+
+void Downsample3x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
+  Downsample1x2(rows_in, len / 3, row_out);
+}
+
+void Downsample4x2(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
+  Downsample1x2(rows_in, len / 4, row_out);
+}
+
+void Downsample1x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const auto mul = Set(d, 1.0f / 3);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  float* row2 = rows_in[2];
+  for (size_t x = 0; x < len; x += N) {
+    const auto in0 = Load(d, row0 + x);
+    const auto in1 = Load(d, row1 + x);
+    const auto in2 = Load(d, row2 + x);
+    Store(Mul(mul, Add(Add(in0, in1), in2)), d, row_out + x);
+  }
+}
+
+void Downsample2x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow2x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow2x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow2x1(rows_in[2], len, rows_in[2]);
+  Downsample1x3(rows_in, len / 2, row_out);
+}
+
+void Downsample3x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow3x1(rows_in[2], len, rows_in[2]);
+  Downsample1x3(rows_in, len / 3, row_out);
+}
+
+void Downsample4x3(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow4x1(rows_in[2], len, rows_in[2]);
+  Downsample1x3(rows_in, len / 4, row_out);
+}
+
+void Downsample1x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  const size_t N = Lanes(d);
+  const auto mul = Set(d, 0.25f);
+  float* row0 = rows_in[0];
+  float* row1 = rows_in[1];
+  float* row2 = rows_in[2];
+  float* row3 = rows_in[3];
+  for (size_t x = 0; x < len; x += N) {
+    const auto in0 = Load(d, row0 + x);
+    const auto in1 = Load(d, row1 + x);
+    const auto in2 = Load(d, row2 + x);
+    const auto in3 = Load(d, row3 + x);
+    Store(Mul(mul, Add(Add(in0, in1), Add(in2, in3))), d, row_out + x);
+  }
+}
+
+void Downsample2x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow2x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow2x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow2x1(rows_in[2], len, rows_in[2]);
+  DownsampleRow2x1(rows_in[3], len, rows_in[3]);
+  Downsample1x4(rows_in, len / 2, row_out);
+}
+
+void Downsample3x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow3x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow3x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow3x1(rows_in[2], len, rows_in[2]);
+  DownsampleRow3x1(rows_in[3], len, rows_in[3]);
+  Downsample1x4(rows_in, len / 3, row_out);
+}
+
+void Downsample4x4(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                   float* row_out) {
+  DownsampleRow4x1(rows_in[0], len, rows_in[0]);
+  DownsampleRow4x1(rows_in[1], len, rows_in[1]);
+  DownsampleRow4x1(rows_in[2], len, rows_in[2]);
+  DownsampleRow4x1(rows_in[3], len, rows_in[3]);
+  Downsample1x4(rows_in, len / 4, row_out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(Downsample1x2);
+HWY_EXPORT(Downsample1x3);
+HWY_EXPORT(Downsample1x4);
+HWY_EXPORT(Downsample2x1);
+HWY_EXPORT(Downsample2x2);
+HWY_EXPORT(Downsample2x3);
+HWY_EXPORT(Downsample2x4);
+HWY_EXPORT(Downsample3x1);
+HWY_EXPORT(Downsample3x2);
+HWY_EXPORT(Downsample3x3);
+HWY_EXPORT(Downsample3x4);
+HWY_EXPORT(Downsample4x1);
+HWY_EXPORT(Downsample4x2);
+HWY_EXPORT(Downsample4x3);
+HWY_EXPORT(Downsample4x4);
+
+void NullDownsample(float* rows_in[MAX_SAMP_FACTOR], size_t len,
+                    float* row_out) {}
+
+void ChooseDownsampleMethods(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    m->downsample_method[c] = nullptr;
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    if (v_factor == 1) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = NullDownsample;
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x1);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x1);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x1);
+      }
+    } else if (v_factor == 2) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
+      }
+    } else if (v_factor == 3) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x2);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x2);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x2);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x2);
+      }
+    } else if (v_factor == 4) {
+      if (h_factor == 1) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample1x4);
+      } else if (h_factor == 2) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample2x4);
+      } else if (h_factor == 3) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample3x4);
+      } else if (h_factor == 4) {
+        m->downsample_method[c] = HWY_DYNAMIC_DISPATCH(Downsample4x4);
+      }
+    }
+    if (m->downsample_method[c] == nullptr) {
+      JPEGLI_ERROR("Unsupported downsampling ratio %dx%d", h_factor, v_factor);
+    }
+  }
+}
+
+void DownsampleInputBuffer(j_compress_ptr cinfo) {
+  if (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1) {
+    return;
+  }
+  jpeg_comp_master* m = cinfo->master;
+  const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  const size_t y0 = m->next_iMCU_row * iMCU_height;
+  const size_t y1 = y0 + iMCU_height;
+  const size_t xsize_padded = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    const int h_factor = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    const int v_factor = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    if (h_factor == 1 && v_factor == 1) {
+      continue;
+    }
+    auto& input = *m->smooth_input[c];
+    auto& output = *m->raw_data[c];
+    const size_t yout0 = y0 / v_factor;
+    float* rows_in[MAX_SAMP_FACTOR];
+    for (size_t yin = y0, yout = yout0; yin < y1; yin += v_factor, ++yout) {
+      for (int iy = 0; iy < v_factor; ++iy) {
+        rows_in[iy] = input.Row(yin + iy);
+      }
+      float* row_out = output.Row(yout);
+      (*m->downsample_method[c])(rows_in, xsize_padded, row_out);
+    }
+  }
+}
+
+void ApplyInputSmoothing(j_compress_ptr cinfo) {
+  if (!cinfo->smoothing_factor) {
+    return;
+  }
+  jpeg_comp_master* m = cinfo->master;
+  const float kW1 = cinfo->smoothing_factor / 1024.0;
+  const float kW0 = 1.0f - 8.0f * kW1;
+  const size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  const ssize_t y0 = m->next_iMCU_row * iMCU_height;
+  const ssize_t y1 = y0 + iMCU_height;
+  const ssize_t xsize_padded = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; c++) {
+    auto& input = m->input_buffer[c];
+    auto& output = *m->smooth_input[c];
+    if (m->next_iMCU_row == 0) {
+      input.CopyRow(-1, 0, 1);
+    }
+    if (m->next_iMCU_row + 1 == cinfo->total_iMCU_rows) {
+      size_t last_row = m->ysize_blocks * DCTSIZE - 1;
+      input.CopyRow(last_row + 1, last_row, 1);
+    }
+    // TODO(szabadka) SIMDify this.
+    for (ssize_t y = y0; y < y1; ++y) {
+      const float* row_t = input.Row(y - 1);
+      const float* row_m = input.Row(y);
+      const float* row_b = input.Row(y + 1);
+      float* row_out = output.Row(y);
+      for (ssize_t x = 0; x < xsize_padded; ++x) {
+        float val_tl = row_t[x - 1];
+        float val_tm = row_t[x];
+        float val_tr = row_t[x + 1];
+        float val_ml = row_m[x - 1];
+        float val_mm = row_m[x];
+        float val_mr = row_m[x + 1];
+        float val_bl = row_b[x - 1];
+        float val_bm = row_b[x];
+        float val_br = row_b[x + 1];
+        float val1 = (val_tl + val_tm + val_tr + val_ml + val_mr + val_bl +
+                      val_bm + val_br);
+        row_out[x] = val_mm * kW0 + val1 * kW1;
+      }
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/downsample.h b/lib/jpegli/downsample.h
new file mode 100644 (file)
index 0000000..3ccf069
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_DOWNSAMPLE_H_
+#define LIB_JPEGLI_DOWNSAMPLE_H_
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+void ChooseDownsampleMethods(j_compress_ptr cinfo);
+
+void DownsampleInputBuffer(j_compress_ptr cinfo);
+
+void ApplyInputSmoothing(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_DOWNSAMPLE_H_
diff --git a/lib/jpegli/encode.cc b/lib/jpegli/encode.cc
new file mode 100644 (file)
index 0000000..8a106e2
--- /dev/null
@@ -0,0 +1,1253 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode.h"
+
+#include <cmath>
+#include <initializer_list>
+#include <vector>
+
+#include "lib/jpegli/adaptive_quantization.h"
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/bitstream.h"
+#include "lib/jpegli/color_transform.h"
+#include "lib/jpegli/downsample.h"
+#include "lib/jpegli/encode_finish.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/encode_streaming.h"
+#include "lib/jpegli/entropy_coding.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jpegli/input.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/quant.h"
+
+namespace jpegli {
+
+constexpr size_t kMaxBytesInMarker = 65533;
+
+void CheckState(j_compress_ptr cinfo, int state) {
+  if (cinfo->global_state != state) {
+    JPEGLI_ERROR("Unexpected global state %d [expected %d]",
+                 cinfo->global_state, state);
+  }
+}
+
+void CheckState(j_compress_ptr cinfo, int state1, int state2) {
+  if (cinfo->global_state != state1 && cinfo->global_state != state2) {
+    JPEGLI_ERROR("Unexpected global state %d [expected %d or %d]",
+                 cinfo->global_state, state1, state2);
+  }
+}
+
+//
+// Parameter setup
+//
+
+// Initialize cinfo fields that are not dependent on input image. This is shared
+// between jpegli_CreateCompress() and jpegli_set_defaults()
+void InitializeCompressParams(j_compress_ptr cinfo) {
+  cinfo->data_precision = 8;
+  cinfo->num_scans = 0;
+  cinfo->scan_info = nullptr;
+  cinfo->raw_data_in = FALSE;
+  cinfo->arith_code = FALSE;
+  cinfo->optimize_coding = FALSE;
+  cinfo->CCIR601_sampling = FALSE;
+  cinfo->smoothing_factor = 0;
+  cinfo->dct_method = JDCT_FLOAT;
+  cinfo->restart_interval = 0;
+  cinfo->restart_in_rows = 0;
+  cinfo->write_JFIF_header = FALSE;
+  cinfo->JFIF_major_version = 1;
+  cinfo->JFIF_minor_version = 1;
+  cinfo->density_unit = 0;
+  cinfo->X_density = 1;
+  cinfo->Y_density = 1;
+#if JPEG_LIB_VERSION >= 70
+  cinfo->scale_num = 1;
+  cinfo->scale_denom = 1;
+  cinfo->do_fancy_downsampling = FALSE;
+  cinfo->min_DCT_h_scaled_size = DCTSIZE;
+  cinfo->min_DCT_v_scaled_size = DCTSIZE;
+#endif
+  cinfo->master->psnr_target = 0.0f;
+  cinfo->master->psnr_tolerance = 0.01f;
+  cinfo->master->min_distance = 0.1f;
+  cinfo->master->max_distance = 25.0f;
+}
+
+float LinearQualityToDistance(int scale_factor) {
+  scale_factor = std::min(5000, std::max(0, scale_factor));
+  int quality =
+      scale_factor < 100 ? 100 - scale_factor / 2 : 5000 / scale_factor;
+  return jpegli_quality_to_distance(quality);
+}
+
+template <typename T>
+void SetSentTableFlag(T** table_ptrs, size_t num, boolean val) {
+  for (size_t i = 0; i < num; ++i) {
+    if (table_ptrs[i]) table_ptrs[i]->sent_table = val;
+  }
+}
+
+//
+// Compressor initialization
+//
+
+struct ProgressiveScan {
+  int Ss, Se, Ah, Al;
+  bool interleaved;
+};
+
+void SetDefaultScanScript(j_compress_ptr cinfo) {
+  int level = cinfo->master->progressive_level;
+  std::vector<ProgressiveScan> progressive_mode;
+  bool interleave_dc =
+      (cinfo->max_h_samp_factor == 1 && cinfo->max_v_samp_factor == 1);
+  if (level == 0) {
+    progressive_mode.push_back({0, 63, 0, 0, true});
+  } else if (level == 1) {
+    progressive_mode.push_back({0, 0, 0, 0, interleave_dc});
+    progressive_mode.push_back({1, 63, 0, 1, false});
+    progressive_mode.push_back({1, 63, 1, 0, false});
+  } else {
+    progressive_mode.push_back({0, 0, 0, 0, interleave_dc});
+    progressive_mode.push_back({1, 2, 0, 0, false});
+    progressive_mode.push_back({3, 63, 0, 2, false});
+    progressive_mode.push_back({3, 63, 2, 1, false});
+    progressive_mode.push_back({3, 63, 1, 0, false});
+  }
+
+  cinfo->script_space_size = 0;
+  for (const auto& scan : progressive_mode) {
+    int comps = scan.interleaved ? MAX_COMPS_IN_SCAN : 1;
+    cinfo->script_space_size += DivCeil(cinfo->num_components, comps);
+  }
+  cinfo->script_space =
+      Allocate<jpeg_scan_info>(cinfo, cinfo->script_space_size);
+
+  jpeg_scan_info* next_scan = cinfo->script_space;
+  for (const auto& scan : progressive_mode) {
+    int comps = scan.interleaved ? MAX_COMPS_IN_SCAN : 1;
+    for (int c = 0; c < cinfo->num_components; c += comps) {
+      next_scan->Ss = scan.Ss;
+      next_scan->Se = scan.Se;
+      next_scan->Ah = scan.Ah;
+      next_scan->Al = scan.Al;
+      next_scan->comps_in_scan = std::min(comps, cinfo->num_components - c);
+      for (int j = 0; j < next_scan->comps_in_scan; ++j) {
+        next_scan->component_index[j] = c + j;
+      }
+      ++next_scan;
+    }
+  }
+  JXL_ASSERT(next_scan - cinfo->script_space == cinfo->script_space_size);
+  cinfo->scan_info = cinfo->script_space;
+  cinfo->num_scans = cinfo->script_space_size;
+}
+
+void ValidateScanScript(j_compress_ptr cinfo) {
+  // Mask of coefficient bits defined by the scan script, for each component
+  // and coefficient index.
+  uint16_t comp_mask[kMaxComponents][DCTSIZE2] = {};
+  static constexpr int kMaxRefinementBit = 10;
+
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info& si = cinfo->scan_info[i];
+    if (si.comps_in_scan < 1 || si.comps_in_scan > MAX_COMPS_IN_SCAN) {
+      JPEGLI_ERROR("Invalid number of components in scan %d", si.comps_in_scan);
+    }
+    int last_ci = -1;
+    for (int j = 0; j < si.comps_in_scan; ++j) {
+      int ci = si.component_index[j];
+      if (ci < 0 || ci >= cinfo->num_components) {
+        JPEGLI_ERROR("Invalid component index %d in scan", ci);
+      } else if (ci == last_ci) {
+        JPEGLI_ERROR("Duplicate component index %d in scan", ci);
+      } else if (ci < last_ci) {
+        JPEGLI_ERROR("Out of order component index %d in scan", ci);
+      }
+      last_ci = ci;
+    }
+    if (si.Ss < 0 || si.Se < si.Ss || si.Se >= DCTSIZE2) {
+      JPEGLI_ERROR("Invalid spectral range %d .. %d in scan", si.Ss, si.Se);
+    }
+    if (si.Ah < 0 || si.Al < 0 || si.Al > kMaxRefinementBit) {
+      JPEGLI_ERROR("Invalid refinement bits %d/%d", si.Ah, si.Al);
+    }
+    if (!cinfo->progressive_mode) {
+      if (si.Ss != 0 || si.Se != DCTSIZE2 - 1 || si.Ah != 0 || si.Al != 0) {
+        JPEGLI_ERROR("Invalid scan for sequential mode");
+      }
+    } else {
+      if (si.Ss == 0 && si.Se != 0) {
+        JPEGLI_ERROR("DC and AC together in progressive scan");
+      }
+    }
+    if (si.Ss != 0 && si.comps_in_scan != 1) {
+      JPEGLI_ERROR("Interleaved AC only scan.");
+    }
+    for (int j = 0; j < si.comps_in_scan; ++j) {
+      int ci = si.component_index[j];
+      if (si.Ss != 0 && comp_mask[ci][0] == 0) {
+        JPEGLI_ERROR("AC before DC in component %d of scan", ci);
+      }
+      for (int k = si.Ss; k <= si.Se; ++k) {
+        if (comp_mask[ci][k] == 0) {
+          if (si.Ah != 0) {
+            JPEGLI_ERROR("Invalid first scan refinement bit");
+          }
+          comp_mask[ci][k] = ((0xffff << si.Al) & 0xffff);
+        } else {
+          if (comp_mask[ci][k] != ((0xffff << si.Ah) & 0xffff) ||
+              si.Al != si.Ah - 1) {
+            JPEGLI_ERROR("Invalid refinement bit progression.");
+          }
+          comp_mask[ci][k] |= 1 << si.Al;
+        }
+      }
+    }
+    if (si.comps_in_scan > 1) {
+      size_t mcu_size = 0;
+      for (int j = 0; j < si.comps_in_scan; ++j) {
+        int ci = si.component_index[j];
+        jpeg_component_info* comp = &cinfo->comp_info[ci];
+        mcu_size += comp->h_samp_factor * comp->v_samp_factor;
+      }
+      if (mcu_size > C_MAX_BLOCKS_IN_MCU) {
+        JPEGLI_ERROR("MCU size too big");
+      }
+    }
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      if (comp_mask[c][k] != 0xffff) {
+        JPEGLI_ERROR("Incomplete scan of component %d and frequency %d", c, k);
+      }
+    }
+  }
+}
+
+void ProcessCompressionParams(j_compress_ptr cinfo) {
+  if (cinfo->dest == nullptr) {
+    JPEGLI_ERROR("Missing destination.");
+  }
+  if (cinfo->image_width < 1 || cinfo->image_height < 1 ||
+      cinfo->input_components < 1) {
+    JPEGLI_ERROR("Empty input image.");
+  }
+  if (cinfo->image_width > static_cast<int>(JPEG_MAX_DIMENSION) ||
+      cinfo->image_height > static_cast<int>(JPEG_MAX_DIMENSION) ||
+      cinfo->input_components > static_cast<int>(kMaxComponents)) {
+    JPEGLI_ERROR("Input image too big.");
+  }
+  if (cinfo->num_components < 1 ||
+      cinfo->num_components > static_cast<int>(kMaxComponents)) {
+    JPEGLI_ERROR("Invalid number of components.");
+  }
+  if (cinfo->data_precision != kJpegPrecision) {
+    JPEGLI_ERROR("Invalid data precision");
+  }
+  if (cinfo->arith_code) {
+    JPEGLI_ERROR("Arithmetic coding is not implemented.");
+  }
+  if (cinfo->CCIR601_sampling) {
+    JPEGLI_ERROR("CCIR601 sampling is not implemented.");
+  }
+  if (cinfo->restart_interval > 65535u) {
+    JPEGLI_ERROR("Restart interval too big");
+  }
+  if (cinfo->smoothing_factor < 0 || cinfo->smoothing_factor > 100) {
+    JPEGLI_ERROR("Invalid smoothing factor %d", cinfo->smoothing_factor);
+  }
+  jpeg_comp_master* m = cinfo->master;
+  cinfo->max_h_samp_factor = cinfo->max_v_samp_factor = 1;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    if (comp->component_index != c) {
+      JPEGLI_ERROR("Invalid component index");
+    }
+    for (int j = 0; j < c; ++j) {
+      if (cinfo->comp_info[j].component_id == comp->component_id) {
+        JPEGLI_ERROR("Duplicate component id %d", comp->component_id);
+      }
+    }
+    if (comp->h_samp_factor <= 0 || comp->v_samp_factor <= 0 ||
+        comp->h_samp_factor > MAX_SAMP_FACTOR ||
+        comp->v_samp_factor > MAX_SAMP_FACTOR) {
+      JPEGLI_ERROR("Invalid sampling factor %d x %d", comp->h_samp_factor,
+                   comp->v_samp_factor);
+    }
+    cinfo->max_h_samp_factor =
+        std::max(comp->h_samp_factor, cinfo->max_h_samp_factor);
+    cinfo->max_v_samp_factor =
+        std::max(comp->v_samp_factor, cinfo->max_v_samp_factor);
+  }
+  if (cinfo->num_components == 1 &&
+      (cinfo->max_h_samp_factor != 1 || cinfo->max_v_samp_factor != 1)) {
+    JPEGLI_ERROR("Sampling is not supported for simgle component image.");
+  }
+  size_t iMCU_width = DCTSIZE * cinfo->max_h_samp_factor;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  size_t total_iMCU_cols = DivCeil(cinfo->image_width, iMCU_width);
+  cinfo->total_iMCU_rows = DivCeil(cinfo->image_height, iMCU_height);
+  m->xsize_blocks = total_iMCU_cols * cinfo->max_h_samp_factor;
+  m->ysize_blocks = cinfo->total_iMCU_rows * cinfo->max_v_samp_factor;
+
+  size_t blocks_per_iMCU = 0;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    if (cinfo->max_h_samp_factor % comp->h_samp_factor != 0 ||
+        cinfo->max_v_samp_factor % comp->v_samp_factor != 0) {
+      JPEGLI_ERROR("Non-integral sampling ratios are not supported.");
+    }
+    m->h_factor[c] = cinfo->max_h_samp_factor / comp->h_samp_factor;
+    m->v_factor[c] = cinfo->max_v_samp_factor / comp->v_samp_factor;
+    comp->downsampled_width = DivCeil(cinfo->image_width, m->h_factor[c]);
+    comp->downsampled_height = DivCeil(cinfo->image_height, m->v_factor[c]);
+    comp->width_in_blocks = DivCeil(comp->downsampled_width, DCTSIZE);
+    comp->height_in_blocks = DivCeil(comp->downsampled_height, DCTSIZE);
+    blocks_per_iMCU += comp->h_samp_factor * comp->v_samp_factor;
+  }
+  m->blocks_per_iMCU_row = total_iMCU_cols * blocks_per_iMCU;
+  // Disable adaptive quantization for subsampled luma channel.
+  int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+  jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+  if (y_comp->h_samp_factor != cinfo->max_h_samp_factor ||
+      y_comp->v_samp_factor != cinfo->max_v_samp_factor) {
+    m->use_adaptive_quantization = false;
+  }
+  if (cinfo->scan_info == nullptr) {
+    SetDefaultScanScript(cinfo);
+  }
+  cinfo->progressive_mode =
+      cinfo->scan_info->Ss != 0 || cinfo->scan_info->Se != DCTSIZE2 - 1;
+  ValidateScanScript(cinfo);
+  m->scan_token_info =
+      Allocate<ScanTokenInfo>(cinfo, cinfo->num_scans, JPOOL_IMAGE);
+  memset(m->scan_token_info, 0, cinfo->num_scans * sizeof(ScanTokenInfo));
+  m->ac_ctx_offset = Allocate<uint8_t>(cinfo, cinfo->num_scans, JPOOL_IMAGE);
+  size_t num_ac_contexts = 0;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info* scan_info = &cinfo->scan_info[i];
+    m->ac_ctx_offset[i] = 4 + num_ac_contexts;
+    if (scan_info->Se > 0) {
+      num_ac_contexts += scan_info->comps_in_scan;
+    }
+    if (num_ac_contexts > 252) {
+      JPEGLI_ERROR("Too many AC scans in image");
+    }
+    ScanTokenInfo* sti = &m->scan_token_info[i];
+    if (scan_info->comps_in_scan == 1) {
+      int comp_idx = scan_info->component_index[0];
+      jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+      sti->MCUs_per_row = comp->width_in_blocks;
+      sti->MCU_rows_in_scan = comp->height_in_blocks;
+      sti->blocks_in_MCU = 1;
+    } else {
+      sti->MCUs_per_row =
+          DivCeil(cinfo->image_width, DCTSIZE * cinfo->max_h_samp_factor);
+      sti->MCU_rows_in_scan =
+          DivCeil(cinfo->image_height, DCTSIZE * cinfo->max_v_samp_factor);
+      sti->blocks_in_MCU = 0;
+      for (int j = 0; j < scan_info->comps_in_scan; ++j) {
+        int comp_idx = scan_info->component_index[j];
+        jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+        sti->blocks_in_MCU += comp->h_samp_factor * comp->v_samp_factor;
+      }
+    }
+    size_t num_MCUs = sti->MCU_rows_in_scan * sti->MCUs_per_row;
+    sti->num_blocks = num_MCUs * sti->blocks_in_MCU;
+    if (cinfo->restart_in_rows <= 0) {
+      sti->restart_interval = cinfo->restart_interval;
+    } else {
+      sti->restart_interval =
+          std::min<size_t>(sti->MCUs_per_row * cinfo->restart_in_rows, 65535u);
+    }
+    sti->num_restarts = sti->restart_interval > 0
+                            ? DivCeil(num_MCUs, sti->restart_interval)
+                            : 1;
+    sti->restarts = Allocate<size_t>(cinfo, sti->num_restarts, JPOOL_IMAGE);
+  }
+  m->num_contexts = 4 + num_ac_contexts;
+}
+
+bool IsStreamingSupported(j_compress_ptr cinfo) {
+  if (cinfo->global_state == kEncWriteCoeffs) {
+    return false;
+  }
+  // TODO(szabadka) Remove this restriction.
+  if (cinfo->restart_interval > 0 || cinfo->restart_in_rows > 0) {
+    return false;
+  }
+  if (cinfo->num_scans > 1) {
+    return false;
+  }
+  if (cinfo->master->psnr_target > 0) {
+    return false;
+  }
+  return true;
+}
+
+void AllocateBuffers(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  memset(m->last_dc_coeff, 0, sizeof(m->last_dc_coeff));
+  if (!IsStreamingSupported(cinfo) || cinfo->optimize_coding) {
+    int ysize_blocks = DivCeil(cinfo->image_height, DCTSIZE);
+    int num_arrays = cinfo->num_scans * ysize_blocks;
+    m->token_arrays = Allocate<TokenArray>(cinfo, num_arrays, JPOOL_IMAGE);
+    m->cur_token_array = 0;
+    memset(m->token_arrays, 0, num_arrays * sizeof(TokenArray));
+    m->num_tokens = 0;
+    m->total_num_tokens = 0;
+  }
+  if (cinfo->global_state == kEncWriteCoeffs) {
+    return;
+  }
+  size_t iMCU_width = DCTSIZE * cinfo->max_h_samp_factor;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  size_t total_iMCU_cols = DivCeil(cinfo->image_width, iMCU_width);
+  size_t xsize_full = total_iMCU_cols * iMCU_width;
+  size_t ysize_full = 3 * iMCU_height;
+  if (!cinfo->raw_data_in) {
+    int num_all_components =
+        std::max(cinfo->input_components, cinfo->num_components);
+    for (int c = 0; c < num_all_components; ++c) {
+      m->input_buffer[c].Allocate(cinfo, ysize_full, xsize_full);
+    }
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t xsize = total_iMCU_cols * comp->h_samp_factor * DCTSIZE;
+    size_t ysize = 3 * comp->v_samp_factor * DCTSIZE;
+    if (cinfo->raw_data_in) {
+      m->input_buffer[c].Allocate(cinfo, ysize, xsize);
+    }
+    m->smooth_input[c] = &m->input_buffer[c];
+    if (!cinfo->raw_data_in && cinfo->smoothing_factor) {
+      m->smooth_input[c] = Allocate<RowBuffer<float>>(cinfo, 1, JPOOL_IMAGE);
+      m->smooth_input[c]->Allocate(cinfo, ysize_full, xsize_full);
+    }
+    m->raw_data[c] = m->smooth_input[c];
+    if (!cinfo->raw_data_in && (m->h_factor[c] > 1 || m->v_factor[c] > 1)) {
+      m->raw_data[c] = Allocate<RowBuffer<float>>(cinfo, 1, JPOOL_IMAGE);
+      m->raw_data[c]->Allocate(cinfo, ysize, xsize);
+    }
+    m->quant_mul[c] = Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  }
+  m->dct_buffer = Allocate<float>(cinfo, 2 * DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+  m->block_tmp = Allocate<int32_t>(cinfo, DCTSIZE2 * 4, JPOOL_IMAGE_ALIGNED);
+  if (!IsStreamingSupported(cinfo)) {
+    m->coeff_buffers =
+        Allocate<jvirt_barray_ptr>(cinfo, cinfo->num_components, JPOOL_IMAGE);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      const size_t xsize_blocks = comp->width_in_blocks;
+      const size_t ysize_blocks = comp->height_in_blocks;
+      m->coeff_buffers[c] = (*cinfo->mem->request_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE,
+          /*pre_zero=*/false, xsize_blocks, ysize_blocks, comp->v_samp_factor);
+    }
+  }
+  if (m->use_adaptive_quantization) {
+    int y_channel = cinfo->jpeg_color_space == JCS_RGB ? 1 : 0;
+    jpeg_component_info* y_comp = &cinfo->comp_info[y_channel];
+    const size_t xsize_blocks = y_comp->width_in_blocks;
+    const size_t vecsize = VectorSize();
+    const size_t xsize_padded = DivCeil(2 * xsize_blocks, vecsize) * vecsize;
+    m->diff_buffer =
+        Allocate<float>(cinfo, xsize_blocks * DCTSIZE + 8, JPOOL_IMAGE_ALIGNED);
+    m->fuzzy_erosion_tmp.Allocate(cinfo, 2, xsize_padded);
+    m->pre_erosion.Allocate(cinfo, 6 * cinfo->max_v_samp_factor, xsize_padded);
+    size_t qf_height = cinfo->max_v_samp_factor;
+    if (m->psnr_target > 0) {
+      qf_height *= cinfo->total_iMCU_rows;
+    }
+    m->quant_field.Allocate(cinfo, qf_height, xsize_blocks);
+  } else {
+    m->quant_field.Allocate(cinfo, 1, m->xsize_blocks);
+    m->quant_field.FillRow(0, 0, m->xsize_blocks);
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    m->zero_bias_offset[c] =
+        Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+    m->zero_bias_mul[c] = Allocate<float>(cinfo, DCTSIZE2, JPOOL_IMAGE_ALIGNED);
+    memset(m->zero_bias_mul[c], 0, DCTSIZE2 * sizeof(float));
+    memset(m->zero_bias_offset[c], 0, DCTSIZE2 * sizeof(float));
+  }
+}
+
+void InitProgressMonitor(j_compress_ptr cinfo) {
+  if (cinfo->progress == nullptr) {
+    return;
+  }
+  if (IsStreamingSupported(cinfo)) {
+    // We have only one input pass.
+    cinfo->progress->total_passes = 1;
+  } else {
+    // We have one input pass, a histogram pass for each scan, and an encode
+    // pass for each scan.
+    cinfo->progress->total_passes = 1 + 2 * cinfo->num_scans;
+  }
+}
+
+// Common setup code between streaming and transcoding code paths. Called in
+// both jpegli_start_compress() and jpegli_write_coefficients().
+void InitCompress(j_compress_ptr cinfo, boolean write_all_tables) {
+  jpeg_comp_master* m = cinfo->master;
+  (*cinfo->err->reset_error_mgr)(reinterpret_cast<j_common_ptr>(cinfo));
+  ProcessCompressionParams(cinfo);
+  InitProgressMonitor(cinfo);
+  AllocateBuffers(cinfo);
+  if (cinfo->global_state != kEncWriteCoeffs) {
+    ChooseInputMethod(cinfo);
+    if (!cinfo->raw_data_in) {
+      ChooseColorTransform(cinfo);
+      ChooseDownsampleMethods(cinfo);
+    }
+    QuantPass pass = m->psnr_target > 0 ? QuantPass::SEARCH_FIRST_PASS
+                                        : QuantPass::NO_SEARCH;
+    InitQuantizer(cinfo, pass);
+  }
+  if (write_all_tables) {
+    jpegli_suppress_tables(cinfo, FALSE);
+  }
+  if (!cinfo->optimize_coding && !cinfo->progressive_mode) {
+    CopyHuffmanTables(cinfo);
+    InitEntropyCoder(cinfo);
+  }
+  (*cinfo->dest->init_destination)(cinfo);
+  WriteFileHeader(cinfo);
+  JpegBitWriterInit(cinfo);
+  m->next_iMCU_row = 0;
+  m->last_restart_interval = 0;
+  m->next_dht_index = 0;
+}
+
+//
+// Input streaming
+//
+
+void ProgressMonitorInputPass(j_compress_ptr cinfo) {
+  if (cinfo->progress == nullptr) {
+    return;
+  }
+  cinfo->progress->completed_passes = 0;
+  cinfo->progress->pass_counter = cinfo->next_scanline;
+  cinfo->progress->pass_limit = cinfo->image_height;
+  (*cinfo->progress->progress_monitor)(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void ReadInputRow(j_compress_ptr cinfo, const uint8_t* scanline,
+                  float* row[kMaxComponents]) {
+  jpeg_comp_master* m = cinfo->master;
+  int num_all_components =
+      std::max(cinfo->input_components, cinfo->num_components);
+  for (int c = 0; c < num_all_components; ++c) {
+    row[c] = m->input_buffer[c].Row(m->next_input_row);
+  }
+  ++m->next_input_row;
+  if (scanline == nullptr) {
+    for (int c = 0; c < cinfo->input_components; ++c) {
+      memset(row[c], 0, cinfo->image_width * sizeof(row[c][0]));
+    }
+    return;
+  }
+  (*m->input_method)(scanline, cinfo->image_width, row);
+}
+
+void PadInputBuffer(j_compress_ptr cinfo, float* row[kMaxComponents]) {
+  jpeg_comp_master* m = cinfo->master;
+  const size_t len0 = cinfo->image_width;
+  const size_t len1 = m->xsize_blocks * DCTSIZE;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    // Pad row to a multiple of the iMCU width, plus create a border of 1
+    // repeated pixel for adaptive quant field calculation.
+    float last_val = row[c][len0 - 1];
+    for (size_t x = len0; x <= len1; ++x) {
+      row[c][x] = last_val;
+    }
+    row[c][-1] = row[c][0];
+  }
+  if (m->next_input_row == cinfo->image_height) {
+    size_t num_rows = m->ysize_blocks * DCTSIZE - cinfo->image_height;
+    for (size_t i = 0; i < num_rows; ++i) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        float* dest = m->input_buffer[c].Row(m->next_input_row) - 1;
+        memcpy(dest, row[c] - 1, (len1 + 2) * sizeof(dest[0]));
+      }
+      ++m->next_input_row;
+    }
+  }
+}
+
+void ProcessiMCURow(j_compress_ptr cinfo) {
+  JXL_ASSERT(cinfo->master->next_iMCU_row < cinfo->total_iMCU_rows);
+  if (!cinfo->raw_data_in) {
+    ApplyInputSmoothing(cinfo);
+    DownsampleInputBuffer(cinfo);
+  }
+  ComputeAdaptiveQuantField(cinfo);
+  if (IsStreamingSupported(cinfo)) {
+    if (cinfo->optimize_coding) {
+      ComputeTokensForiMCURow(cinfo);
+    } else {
+      WriteiMCURow(cinfo);
+    }
+  } else {
+    ComputeCoefficientsForiMCURow(cinfo);
+  }
+  ++cinfo->master->next_iMCU_row;
+}
+
+void ProcessiMCURows(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  // To have context rows both above and below the current iMCU row, we delay
+  // processing the first iMCU row and process two iMCU rows after we receive
+  // the last input row.
+  if (m->next_input_row % iMCU_height == 0 && m->next_input_row > iMCU_height) {
+    ProcessiMCURow(cinfo);
+  }
+  if (m->next_input_row >= cinfo->image_height) {
+    ProcessiMCURow(cinfo);
+  }
+}
+
+//
+// Non-streaming part
+//
+
+void ZigZagShuffleBlocks(j_compress_ptr cinfo) {
+  JCOEF tmp[DCTSIZE2];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    for (JDIMENSION by = 0; by < comp->height_in_blocks; ++by) {
+      JBLOCKARRAY ba = GetBlockRow(cinfo, c, by);
+      for (JDIMENSION bx = 0; bx < comp->width_in_blocks; ++bx) {
+        JCOEF* block = &ba[0][bx][0];
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          tmp[k] = block[kJPEGNaturalOrder[k]];
+        }
+        memcpy(block, tmp, sizeof(tmp));
+      }
+    }
+  }
+}
+
+}  // namespace jpegli
+
+//
+// Parameter setup
+//
+
+void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
+                           size_t structsize) {
+  cinfo->mem = nullptr;
+  if (structsize != sizeof(*cinfo)) {
+    JPEGLI_ERROR("jpegli_compress_struct has wrong size.");
+  }
+  jpegli::InitMemoryManager(reinterpret_cast<j_common_ptr>(cinfo));
+  cinfo->progress = nullptr;
+  cinfo->is_decompressor = FALSE;
+  cinfo->global_state = jpegli::kEncStart;
+  cinfo->dest = nullptr;
+  cinfo->image_width = 0;
+  cinfo->image_height = 0;
+  cinfo->input_components = 0;
+  cinfo->in_color_space = JCS_UNKNOWN;
+  cinfo->input_gamma = 1.0f;
+  cinfo->num_components = 0;
+  cinfo->jpeg_color_space = JCS_UNKNOWN;
+  cinfo->comp_info = nullptr;
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    cinfo->quant_tbl_ptrs[i] = nullptr;
+  }
+  for (int i = 0; i < NUM_HUFF_TBLS; ++i) {
+    cinfo->dc_huff_tbl_ptrs[i] = nullptr;
+    cinfo->ac_huff_tbl_ptrs[i] = nullptr;
+  }
+  memset(cinfo->arith_dc_L, 0, sizeof(cinfo->arith_dc_L));
+  memset(cinfo->arith_dc_U, 0, sizeof(cinfo->arith_dc_U));
+  memset(cinfo->arith_ac_K, 0, sizeof(cinfo->arith_ac_K));
+  cinfo->write_Adobe_marker = false;
+  cinfo->master = jpegli::Allocate<jpeg_comp_master>(cinfo, 1);
+  jpegli::InitializeCompressParams(cinfo);
+  cinfo->master->force_baseline = true;
+  cinfo->master->xyb_mode = false;
+  cinfo->master->cicp_transfer_function = 2;  // unknown transfer function code
+  cinfo->master->use_std_tables = false;
+  cinfo->master->use_adaptive_quantization = true;
+  cinfo->master->progressive_level = jpegli::kDefaultProgressiveLevel;
+  cinfo->master->data_type = JPEGLI_TYPE_UINT8;
+  cinfo->master->endianness = JPEGLI_NATIVE_ENDIAN;
+  cinfo->master->coeff_buffers = nullptr;
+}
+
+void jpegli_set_xyb_mode(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->xyb_mode = true;
+}
+
+void jpegli_set_cicp_transfer_function(j_compress_ptr cinfo, int code) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->cicp_transfer_function = code;
+}
+
+void jpegli_set_defaults(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli::InitializeCompressParams(cinfo);
+  jpegli_default_colorspace(cinfo);
+  jpegli_set_quality(cinfo, 90, TRUE);
+  jpegli_set_progressive_level(cinfo, jpegli::kDefaultProgressiveLevel);
+  jpegli::AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                                   /*is_dc=*/false);
+  jpegli::AddStandardHuffmanTables(reinterpret_cast<j_common_ptr>(cinfo),
+                                   /*is_dc=*/true);
+}
+
+void jpegli_default_colorspace(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  switch (cinfo->in_color_space) {
+    case JCS_GRAYSCALE:
+      jpegli_set_colorspace(cinfo, JCS_GRAYSCALE);
+      break;
+    case JCS_RGB: {
+      if (cinfo->master->xyb_mode) {
+        jpegli_set_colorspace(cinfo, JCS_RGB);
+      } else {
+        jpegli_set_colorspace(cinfo, JCS_YCbCr);
+      }
+      break;
+    }
+    case JCS_YCbCr:
+      jpegli_set_colorspace(cinfo, JCS_YCbCr);
+      break;
+    case JCS_CMYK:
+      jpegli_set_colorspace(cinfo, JCS_CMYK);
+      break;
+    case JCS_YCCK:
+      jpegli_set_colorspace(cinfo, JCS_YCCK);
+      break;
+    case JCS_UNKNOWN:
+      jpegli_set_colorspace(cinfo, JCS_UNKNOWN);
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported input colorspace %d", cinfo->in_color_space);
+  }
+}
+
+void jpegli_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->jpeg_color_space = colorspace;
+  switch (colorspace) {
+    case JCS_GRAYSCALE:
+      cinfo->num_components = 1;
+      break;
+    case JCS_RGB:
+    case JCS_YCbCr:
+      cinfo->num_components = 3;
+      break;
+    case JCS_CMYK:
+    case JCS_YCCK:
+      cinfo->num_components = 4;
+      break;
+    case JCS_UNKNOWN:
+      cinfo->num_components =
+          std::min<int>(jpegli::kMaxComponents, cinfo->input_components);
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported jpeg colorspace %d", colorspace);
+  }
+  // Adobe marker is only needed to distinguish CMYK and YCCK JPEGs.
+  cinfo->write_Adobe_marker = (cinfo->jpeg_color_space == JCS_YCCK);
+  if (cinfo->comp_info == nullptr) {
+    cinfo->comp_info =
+        jpegli::Allocate<jpeg_component_info>(cinfo, MAX_COMPONENTS);
+  }
+  memset(cinfo->comp_info, 0,
+         jpegli::kMaxComponents * sizeof(jpeg_component_info));
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    comp->component_index = c;
+    comp->component_id = c + 1;
+    comp->h_samp_factor = 1;
+    comp->v_samp_factor = 1;
+    comp->quant_tbl_no = 0;
+    comp->dc_tbl_no = 0;
+    comp->ac_tbl_no = 0;
+  }
+  if (colorspace == JCS_RGB) {
+    cinfo->comp_info[0].component_id = 'R';
+    cinfo->comp_info[1].component_id = 'G';
+    cinfo->comp_info[2].component_id = 'B';
+    if (cinfo->master->xyb_mode) {
+      // Subsample blue channel.
+      cinfo->comp_info[0].h_samp_factor = cinfo->comp_info[0].v_samp_factor = 2;
+      cinfo->comp_info[1].h_samp_factor = cinfo->comp_info[1].v_samp_factor = 2;
+      cinfo->comp_info[2].h_samp_factor = cinfo->comp_info[2].v_samp_factor = 1;
+      // Use separate quantization tables for each component
+      cinfo->comp_info[1].quant_tbl_no = 1;
+      cinfo->comp_info[2].quant_tbl_no = 2;
+    }
+  } else if (colorspace == JCS_CMYK) {
+    cinfo->comp_info[0].component_id = 'C';
+    cinfo->comp_info[1].component_id = 'M';
+    cinfo->comp_info[2].component_id = 'Y';
+    cinfo->comp_info[3].component_id = 'K';
+  } else if (colorspace == JCS_YCbCr || colorspace == JCS_YCCK) {
+    // Use separate quantization and Huffman tables for luma and chroma
+    cinfo->comp_info[1].quant_tbl_no = 1;
+    cinfo->comp_info[2].quant_tbl_no = 1;
+    cinfo->comp_info[1].dc_tbl_no = cinfo->comp_info[1].ac_tbl_no = 1;
+    cinfo->comp_info[2].dc_tbl_no = cinfo->comp_info[2].ac_tbl_no = 1;
+  }
+}
+
+void jpegli_set_distance(j_compress_ptr cinfo, float distance,
+                         boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/true);
+}
+
+float jpegli_quality_to_distance(int quality) {
+  return (quality >= 100  ? 0.01f
+          : quality >= 30 ? 0.1f + (100 - quality) * 0.09f
+                          : 53.0f / 3000.0f * quality * quality -
+                                23.0f / 20.0f * quality + 25.0f);
+}
+
+void jpegli_set_psnr(j_compress_ptr cinfo, float psnr, float tolerance,
+                     float min_distance, float max_distance) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->psnr_target = psnr;
+  cinfo->master->psnr_tolerance = tolerance;
+  cinfo->master->min_distance = min_distance;
+  cinfo->master->max_distance = max_distance;
+}
+
+void jpegli_set_quality(j_compress_ptr cinfo, int quality,
+                        boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distance = jpegli_quality_to_distance(quality);
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+
+void jpegli_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                               boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distance = jpegli::LinearQualityToDistance(scale_factor);
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_default_qtables(j_compress_ptr cinfo, boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->force_baseline = force_baseline;
+  float distances[NUM_QUANT_TBLS];
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    distances[i] = jpegli::LinearQualityToDistance(cinfo->q_scale_factor[i]);
+  }
+  jpegli::SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/false);
+}
+#endif
+
+int jpegli_quality_scaling(int quality) {
+  quality = std::min(100, std::max(1, quality));
+  return quality < 50 ? 5000 / quality : 200 - 2 * quality;
+}
+
+void jpegli_use_standard_quant_tables(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->use_std_tables = true;
+}
+
+void jpegli_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                            const unsigned int* basic_table, int scale_factor,
+                            boolean force_baseline) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (which_tbl < 0 || which_tbl > NUM_QUANT_TBLS) {
+    JPEGLI_ERROR("Invalid quant table index %d", which_tbl);
+  }
+  if (cinfo->quant_tbl_ptrs[which_tbl] == nullptr) {
+    cinfo->quant_tbl_ptrs[which_tbl] =
+        jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+  }
+  int max_qval = force_baseline ? 255 : 32767U;
+  JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[which_tbl];
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    int qval = (basic_table[k] * scale_factor + 50) / 100;
+    qval = std::max(1, std::min(qval, max_qval));
+    quant_table->quantval[k] = qval;
+  }
+  quant_table->sent_table = FALSE;
+}
+
+void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->master->use_adaptive_quantization = value;
+}
+
+void jpegli_simple_progression(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  jpegli_set_progressive_level(cinfo, 2);
+}
+
+void jpegli_set_progressive_level(j_compress_ptr cinfo, int level) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (level < 0) {
+    JPEGLI_ERROR("Invalid progressive level %d", level);
+  }
+  cinfo->master->progressive_level = level;
+}
+
+void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type,
+                             JpegliEndianness endianness) {
+  CheckState(cinfo, jpegli::kEncStart);
+  switch (data_type) {
+    case JPEGLI_TYPE_UINT8:
+    case JPEGLI_TYPE_UINT16:
+    case JPEGLI_TYPE_FLOAT:
+      cinfo->master->data_type = data_type;
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported data type %d", data_type);
+  }
+  switch (endianness) {
+    case JPEGLI_NATIVE_ENDIAN:
+    case JPEGLI_LITTLE_ENDIAN:
+    case JPEGLI_BIG_ENDIAN:
+      cinfo->master->endianness = endianness;
+      break;
+    default:
+      JPEGLI_ERROR("Unsupported endianness %d", endianness);
+  }
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_calc_jpeg_dimensions(j_compress_ptr cinfo) {
+  // Since input scaling is not supported, we just copy the image dimensions.
+  cinfo->jpeg_width = cinfo->image_width;
+  cinfo->jpeg_height = cinfo->image_height;
+}
+#endif
+
+void jpegli_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                     j_compress_ptr dstinfo) {
+  CheckState(dstinfo, jpegli::kEncStart);
+  // Image parameters.
+  dstinfo->image_width = srcinfo->image_width;
+  dstinfo->image_height = srcinfo->image_height;
+  dstinfo->input_components = srcinfo->num_components;
+  dstinfo->in_color_space = srcinfo->jpeg_color_space;
+  dstinfo->input_gamma = srcinfo->output_gamma;
+  // Compression parameters.
+  jpegli_set_defaults(dstinfo);
+  jpegli_set_colorspace(dstinfo, srcinfo->jpeg_color_space);
+  if (dstinfo->num_components != srcinfo->num_components) {
+    const auto& cinfo = dstinfo;
+    return JPEGLI_ERROR("Mismatch between src colorspace and components");
+  }
+  dstinfo->data_precision = srcinfo->data_precision;
+  dstinfo->CCIR601_sampling = srcinfo->CCIR601_sampling;
+  dstinfo->JFIF_major_version = srcinfo->JFIF_major_version;
+  dstinfo->JFIF_minor_version = srcinfo->JFIF_minor_version;
+  dstinfo->density_unit = srcinfo->density_unit;
+  dstinfo->X_density = srcinfo->X_density;
+  dstinfo->Y_density = srcinfo->Y_density;
+  for (int c = 0; c < dstinfo->num_components; ++c) {
+    jpeg_component_info* srccomp = &srcinfo->comp_info[c];
+    jpeg_component_info* dstcomp = &dstinfo->comp_info[c];
+    dstcomp->component_id = srccomp->component_id;
+    dstcomp->h_samp_factor = srccomp->h_samp_factor;
+    dstcomp->v_samp_factor = srccomp->v_samp_factor;
+    dstcomp->quant_tbl_no = srccomp->quant_tbl_no;
+  }
+  for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+    if (!srcinfo->quant_tbl_ptrs[i]) continue;
+    if (dstinfo->quant_tbl_ptrs[i] == nullptr) {
+      dstinfo->quant_tbl_ptrs[i] = jpegli::Allocate<JQUANT_TBL>(dstinfo, 1);
+    }
+    memcpy(dstinfo->quant_tbl_ptrs[i], srcinfo->quant_tbl_ptrs[i],
+           sizeof(JQUANT_TBL));
+    dstinfo->quant_tbl_ptrs[i]->sent_table = FALSE;
+  }
+}
+
+void jpegli_suppress_tables(j_compress_ptr cinfo, boolean suppress) {
+  jpegli::SetSentTableFlag(cinfo->quant_tbl_ptrs, NUM_QUANT_TBLS, suppress);
+  jpegli::SetSentTableFlag(cinfo->dc_huff_tbl_ptrs, NUM_HUFF_TBLS, suppress);
+  jpegli::SetSentTableFlag(cinfo->ac_huff_tbl_ptrs, NUM_HUFF_TBLS, suppress);
+}
+
+//
+// Compressor initialization
+//
+
+void jpegli_start_compress(j_compress_ptr cinfo, boolean write_all_tables) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->global_state = jpegli::kEncHeader;
+  jpegli::InitCompress(cinfo, write_all_tables);
+  cinfo->next_scanline = 0;
+  cinfo->master->next_input_row = 0;
+}
+
+void jpegli_write_coefficients(j_compress_ptr cinfo,
+                               jvirt_barray_ptr* coef_arrays) {
+  CheckState(cinfo, jpegli::kEncStart);
+  cinfo->global_state = jpegli::kEncWriteCoeffs;
+  jpegli::InitCompress(cinfo, /*write_all_tables=*/true);
+  cinfo->master->coeff_buffers = coef_arrays;
+  cinfo->next_scanline = cinfo->image_height;
+  cinfo->master->next_input_row = cinfo->image_height;
+}
+
+void jpegli_write_tables(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncStart);
+  if (cinfo->dest == nullptr) {
+    JPEGLI_ERROR("Missing destination.");
+  }
+  jpeg_comp_master* m = cinfo->master;
+  (*cinfo->err->reset_error_mgr)(reinterpret_cast<j_common_ptr>(cinfo));
+  (*cinfo->dest->init_destination)(cinfo);
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD8});  // SOI
+  jpegli::EncodeDQT(cinfo, /*write_all_tables=*/true);
+  jpegli::CopyHuffmanTables(cinfo);
+  jpegli::EncodeDHT(cinfo, 0, m->num_huffman_tables);
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD9});  // EOI
+  (*cinfo->dest->term_destination)(cinfo);
+  jpegli_suppress_tables(cinfo, TRUE);
+}
+
+//
+// Marker writing
+//
+
+void jpegli_write_m_header(j_compress_ptr cinfo, int marker,
+                           unsigned int datalen) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncWriteCoeffs);
+  if (datalen > jpegli::kMaxBytesInMarker) {
+    JPEGLI_ERROR("Invalid marker length %u", datalen);
+  }
+  if (marker != 0xfe && (marker < 0xe0 || marker > 0xef)) {
+    JPEGLI_ERROR(
+        "jpegli_write_m_header: Only APP and COM markers are supported.");
+  }
+  std::vector<uint8_t> marker_data(4 + datalen);
+  marker_data[0] = 0xff;
+  marker_data[1] = marker;
+  marker_data[2] = (datalen + 2) >> 8;
+  marker_data[3] = (datalen + 2) & 0xff;
+  jpegli::WriteOutput(cinfo, &marker_data[0], 4);
+}
+
+void jpegli_write_m_byte(j_compress_ptr cinfo, int val) {
+  uint8_t data = val;
+  jpegli::WriteOutput(cinfo, &data, 1);
+}
+
+void jpegli_write_marker(j_compress_ptr cinfo, int marker,
+                         const JOCTET* dataptr, unsigned int datalen) {
+  jpegli_write_m_header(cinfo, marker, datalen);
+  jpegli::WriteOutput(cinfo, dataptr, datalen);
+}
+
+void jpegli_write_icc_profile(j_compress_ptr cinfo, const JOCTET* icc_data_ptr,
+                              unsigned int icc_data_len) {
+  constexpr size_t kMaxIccBytesInMarker =
+      jpegli::kMaxBytesInMarker - sizeof jpegli::kICCSignature - 2;
+  const int num_markers =
+      static_cast<int>(jpegli::DivCeil(icc_data_len, kMaxIccBytesInMarker));
+  size_t begin = 0;
+  for (int current_marker = 0; current_marker < num_markers; ++current_marker) {
+    const size_t length = std::min(kMaxIccBytesInMarker, icc_data_len - begin);
+    jpegli_write_m_header(
+        cinfo, jpegli::kICCMarker,
+        static_cast<unsigned int>(length + sizeof jpegli::kICCSignature + 2));
+    for (const unsigned char c : jpegli::kICCSignature) {
+      jpegli_write_m_byte(cinfo, c);
+    }
+    jpegli_write_m_byte(cinfo, current_marker + 1);
+    jpegli_write_m_byte(cinfo, num_markers);
+    for (size_t i = 0; i < length; ++i) {
+      jpegli_write_m_byte(cinfo, icc_data_ptr[begin]);
+      ++begin;
+    }
+  }
+}
+
+//
+// Input streaming
+//
+
+JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                  JDIMENSION num_lines) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncReadImage);
+  if (cinfo->raw_data_in) {
+    JPEGLI_ERROR("jpegli_write_raw_data() must be called for raw data mode.");
+  }
+  jpegli::ProgressMonitorInputPass(cinfo);
+  if (cinfo->global_state == jpegli::kEncHeader &&
+      jpegli::IsStreamingSupported(cinfo) && !cinfo->optimize_coding) {
+    jpegli::WriteFrameHeader(cinfo);
+    jpegli::WriteScanHeader(cinfo, 0);
+  }
+  cinfo->global_state = jpegli::kEncReadImage;
+  jpeg_comp_master* m = cinfo->master;
+  if (num_lines + cinfo->next_scanline > cinfo->image_height) {
+    num_lines = cinfo->image_height - cinfo->next_scanline;
+  }
+  JDIMENSION prev_scanline = cinfo->next_scanline;
+  size_t input_lag = (std::min<size_t>(cinfo->image_height, m->next_input_row) -
+                      cinfo->next_scanline);
+  if (input_lag > num_lines) {
+    JPEGLI_ERROR("Need at least %u lines to continue", input_lag);
+  }
+  if (input_lag > 0) {
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      return 0;
+    }
+    cinfo->next_scanline += input_lag;
+  }
+  float* rows[jpegli::kMaxComponents];
+  for (size_t i = input_lag; i < num_lines; ++i) {
+    jpegli::ReadInputRow(cinfo, scanlines[i], rows);
+    (*m->color_transform)(rows, cinfo->image_width);
+    jpegli::PadInputBuffer(cinfo, rows);
+    jpegli::ProcessiMCURows(cinfo);
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      break;
+    }
+    ++cinfo->next_scanline;
+  }
+  return cinfo->next_scanline - prev_scanline;
+}
+
+JDIMENSION jpegli_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                 JDIMENSION num_lines) {
+  CheckState(cinfo, jpegli::kEncHeader, jpegli::kEncReadImage);
+  if (!cinfo->raw_data_in) {
+    JPEGLI_ERROR("jpegli_write_raw_data(): raw data mode was not set");
+  }
+  jpegli::ProgressMonitorInputPass(cinfo);
+  if (cinfo->global_state == jpegli::kEncHeader &&
+      jpegli::IsStreamingSupported(cinfo) && !cinfo->optimize_coding) {
+    jpegli::WriteFrameHeader(cinfo);
+    jpegli::WriteScanHeader(cinfo, 0);
+  }
+  cinfo->global_state = jpegli::kEncReadImage;
+  jpeg_comp_master* m = cinfo->master;
+  if (cinfo->next_scanline >= cinfo->image_height) {
+    return 0;
+  }
+  size_t iMCU_height = DCTSIZE * cinfo->max_v_samp_factor;
+  if (num_lines < iMCU_height) {
+    JPEGLI_ERROR("Missing input lines, minimum is %u", iMCU_height);
+  }
+  if (cinfo->next_scanline < m->next_input_row) {
+    JXL_ASSERT(m->next_input_row - cinfo->next_scanline == iMCU_height);
+    if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+      return 0;
+    }
+    cinfo->next_scanline = m->next_input_row;
+    return iMCU_height;
+  }
+  size_t iMCU_y = m->next_input_row / iMCU_height;
+  float* rows[jpegli::kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    JSAMPARRAY plane = data[c];
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    size_t xsize = comp->width_in_blocks * DCTSIZE;
+    size_t ysize = comp->v_samp_factor * DCTSIZE;
+    size_t y0 = iMCU_y * ysize;
+    auto& buffer = m->input_buffer[c];
+    for (size_t i = 0; i < ysize; ++i) {
+      rows[0] = buffer.Row(y0 + i);
+      if (plane[i] == nullptr) {
+        memset(rows[0], 0, xsize * sizeof(rows[0][0]));
+      } else {
+        (*m->input_method)(plane[i], xsize, rows);
+      }
+      // We need a border of 1 repeated pixel for adaptive quant field.
+      buffer.PadRow(y0 + i, xsize, /*border=*/1);
+    }
+  }
+  m->next_input_row += iMCU_height;
+  jpegli::ProcessiMCURows(cinfo);
+  if (!jpegli::EmptyBitWriterBuffer(&m->bw)) {
+    return 0;
+  }
+  cinfo->next_scanline += iMCU_height;
+  return iMCU_height;
+}
+
+//
+// Non-streaming part
+//
+
+void jpegli_finish_compress(j_compress_ptr cinfo) {
+  CheckState(cinfo, jpegli::kEncReadImage, jpegli::kEncWriteCoeffs);
+  jpeg_comp_master* m = cinfo->master;
+  if (cinfo->next_scanline < cinfo->image_height) {
+    JPEGLI_ERROR("Incomplete image, expected %d rows, got %d",
+                 cinfo->image_height, cinfo->next_scanline);
+  }
+
+  if (cinfo->global_state == jpegli::kEncWriteCoeffs) {
+    // Zig-zag shuffle all the blocks. For non-transcoding case it was already
+    // done in EncodeiMCURow().
+    jpegli::ZigZagShuffleBlocks(cinfo);
+  }
+
+  if (m->psnr_target > 0) {
+    jpegli::QuantizetoPSNR(cinfo);
+  }
+
+  const bool tokens_done = jpegli::IsStreamingSupported(cinfo);
+  const bool bitstream_done = tokens_done && !cinfo->optimize_coding;
+
+  if (!tokens_done) {
+    jpegli::TokenizeJpeg(cinfo);
+  }
+
+  if (cinfo->optimize_coding || cinfo->progressive_mode) {
+    jpegli::OptimizeHuffmanCodes(cinfo);
+    jpegli::InitEntropyCoder(cinfo);
+  }
+
+  if (!bitstream_done) {
+    jpegli::WriteFrameHeader(cinfo);
+    for (int i = 0; i < cinfo->num_scans; ++i) {
+      jpegli::WriteScanHeader(cinfo, i);
+      jpegli::WriteScanData(cinfo, i);
+    }
+  } else {
+    JumpToByteBoundary(&m->bw);
+    if (!EmptyBitWriterBuffer(&m->bw)) {
+      JPEGLI_ERROR("Output suspension is not supported in finish_compress");
+    }
+  }
+
+  jpegli::WriteOutput(cinfo, {0xFF, 0xD9});  // EOI
+  (*cinfo->dest->term_destination)(cinfo);
+
+  // Release memory and reset global state.
+  jpegli_abort_compress(cinfo);
+}
+
+void jpegli_abort_compress(j_compress_ptr cinfo) {
+  jpegli_abort(reinterpret_cast<j_common_ptr>(cinfo));
+}
+
+void jpegli_destroy_compress(j_compress_ptr cinfo) {
+  jpegli_destroy(reinterpret_cast<j_common_ptr>(cinfo));
+}
diff --git a/lib/jpegli/encode.h b/lib/jpegli/encode.h
new file mode 100644 (file)
index 0000000..320dfaa
--- /dev/null
@@ -0,0 +1,158 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains the C API of the encoder part of the libjpegli library,
+// which is based on the C API of libjpeg, with the function names changed from
+// jpeg_* to jpegli_*, while compressor object definitions are included directly
+// from jpeglib.h
+//
+// Applications can use the libjpegli library in one of the following ways:
+//
+//  (1) Include jpegli/encode.h and/or jpegli/decode.h, update the function
+//      names of the API and link against libjpegli.
+//
+//  (2) Leave the application code unchanged, but replace the libjpeg.so library
+//      with the one built by this project that is API- and ABI-compatible with
+//      libjpeg-turbo's version of libjpeg.so.
+
+#ifndef LIB_JPEGLI_ENCODE_H_
+#define LIB_JPEGLI_ENCODE_H_
+
+#include "lib/jpegli/common.h"
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+#define jpegli_create_compress(cinfo)              \
+  jpegli_CreateCompress((cinfo), JPEG_LIB_VERSION, \
+                        (size_t)sizeof(struct jpeg_compress_struct))
+void jpegli_CreateCompress(j_compress_ptr cinfo, int version,
+                           size_t structsize);
+
+void jpegli_stdio_dest(j_compress_ptr cinfo, FILE* outfile);
+
+void jpegli_mem_dest(j_compress_ptr cinfo, unsigned char** outbuffer,
+                     unsigned long* outsize);
+
+void jpegli_set_defaults(j_compress_ptr cinfo);
+
+void jpegli_default_colorspace(j_compress_ptr cinfo);
+
+void jpegli_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace);
+
+void jpegli_set_quality(j_compress_ptr cinfo, int quality,
+                        boolean force_baseline);
+
+void jpegli_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                               boolean force_baseline);
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_default_qtables(j_compress_ptr cinfo, boolean force_baseline);
+#endif
+
+int jpegli_quality_scaling(int quality);
+
+void jpegli_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                            const unsigned int* basic_table, int scale_factor,
+                            boolean force_baseline);
+
+void jpegli_simple_progression(j_compress_ptr cinfo);
+
+void jpegli_suppress_tables(j_compress_ptr cinfo, boolean suppress);
+
+#if JPEG_LIB_VERSION >= 70
+void jpegli_calc_jpeg_dimensions(j_compress_ptr cinfo);
+#endif
+
+void jpegli_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                     j_compress_ptr dstinfo);
+
+void jpegli_write_m_header(j_compress_ptr cinfo, int marker,
+                           unsigned int datalen);
+
+void jpegli_write_m_byte(j_compress_ptr cinfo, int val);
+
+void jpegli_write_marker(j_compress_ptr cinfo, int marker,
+                         const JOCTET* dataptr, unsigned int datalen);
+
+void jpegli_write_icc_profile(j_compress_ptr cinfo, const JOCTET* icc_data_ptr,
+                              unsigned int icc_data_len);
+
+void jpegli_start_compress(j_compress_ptr cinfo, boolean write_all_tables);
+
+void jpegli_write_tables(j_compress_ptr cinfo);
+
+JDIMENSION jpegli_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                  JDIMENSION num_lines);
+
+JDIMENSION jpegli_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                                 JDIMENSION num_lines);
+
+void jpegli_write_coefficients(j_compress_ptr cinfo,
+                               jvirt_barray_ptr* coef_arrays);
+
+void jpegli_finish_compress(j_compress_ptr cinfo);
+
+void jpegli_abort_compress(j_compress_ptr cinfo);
+
+void jpegli_destroy_compress(j_compress_ptr cinfo);
+
+//
+// New API functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+// Sets the butteraugli target distance for the compressor. This may override
+// the default quantization table indexes based on jpeg colorspace, therefore
+// it must be called after jpegli_set_defaults() or after the last
+// jpegli_set_colorspace() or jpegli_default_colorspace() calls.
+void jpegli_set_distance(j_compress_ptr cinfo, float distance,
+                         boolean force_baseline);
+
+// Returns the butteraugli target distance for the given quality parameter.
+float jpegli_quality_to_distance(int quality);
+
+// Enables distance parameter search to meet the given psnr target.
+void jpegli_set_psnr(j_compress_ptr cinfo, float psnr, float tolerance,
+                     float min_distance, float max_distance);
+
+// Changes the default behaviour of the encoder in the selection of quantization
+// matrices and chroma subsampling. Must be called before jpegli_set_defaults()
+// because some default setting depend on the XYB mode.
+void jpegli_set_xyb_mode(j_compress_ptr cinfo);
+
+// Signals to the encoder that the pixel data that will be provided later
+// through jpegli_write_scanlines() has this transfer function. This must be
+// called before jpegli_set_defaults() because it changes the default
+// quantization tables.
+void jpegli_set_cicp_transfer_function(j_compress_ptr cinfo, int code);
+
+void jpegli_set_input_format(j_compress_ptr cinfo, JpegliDataType data_type,
+                             JpegliEndianness endianness);
+
+// Sets whether or not the encoder uses adaptive quantization for creating more
+// zero coefficients based on the local properties of the image.
+// Enabled by default.
+void jpegli_enable_adaptive_quantization(j_compress_ptr cinfo, boolean value);
+
+// Sets the default progression parameters, where level 0 is sequential, and
+// greater level value means more progression steps. Default is 2.
+void jpegli_set_progressive_level(j_compress_ptr cinfo, int level);
+
+// If this function is called before starting compression, the quality and
+// linear quality parameters will be used to scale the standard quantization
+// tables from Annex K of the JPEG standard. By default jpegli uses a different
+// set of quantization tables and used different scaling parameters for DC and
+// AC coefficients. Must be called before jpegli_set_defaults().
+void jpegli_use_standard_quant_tables(j_compress_ptr cinfo);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_ENCODE_H_
diff --git a/lib/jpegli/encode_api_test.cc b/lib/jpegli/encode_api_test.cc
new file mode 100644 (file)
index 0000000..8d53557
--- /dev/null
@@ -0,0 +1,837 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+  JpegIOMode input_mode = PIXELS;
+  double max_bpp;
+  double max_dist;
+};
+
+class EncodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+void GenerateInput(JpegIOMode input_mode, const CompressParams& jparams,
+                   TestImage* input) {
+  GeneratePixels(input);
+  if (input_mode == RAW_DATA) {
+    GenerateRawData(jparams, input);
+  } else if (input_mode == COEFFICIENTS) {
+    GenerateCoeffs(jparams, input);
+  }
+}
+
+TEST_P(EncodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  GenerateInput(config.input_mode, config.jparams, &config.input);
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithJpegli(config.input, config.jparams, &compressed));
+  if (config.jparams.icc.empty()) {
+    double bpp =
+        compressed.size() * 8.0 / (config.input.xsize * config.input.ysize);
+    printf("bpp: %f\n", bpp);
+    EXPECT_LT(bpp, config.max_bpp);
+  }
+  DecompressParams dparams;
+  dparams.output_mode =
+      config.input_mode == COEFFICIENTS ? COEFFICIENTS : PIXELS;
+  if (config.jparams.set_jpeg_colorspace &&
+      config.jparams.jpeg_color_space == JCS_GRAYSCALE) {
+    ConvertToGrayscale(&config.input);
+  } else {
+    dparams.set_out_color_space = true;
+    dparams.out_color_space = config.input.color_space;
+  }
+  TestImage output;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output);
+  VerifyOutputImage(config.input, output, config.max_dist);
+}
+
+TEST(EncodeAPITest, ReuseCinfoSameImageTwice) {
+  TestImage input;
+  input.xsize = 129;
+  input.ysize = 73;
+  CompressParams jparams;
+  GenerateInput(PIXELS, jparams, &input);
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  std::vector<uint8_t> compressed0;
+  std::vector<uint8_t> compressed1;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    compressed0.assign(buffer, buffer + buffer_size);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    compressed1.assign(buffer, buffer + buffer_size);
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+  ASSERT_EQ(compressed0.size(), compressed1.size());
+  EXPECT_EQ(0,
+            memcmp(compressed0.data(), compressed1.data(), compressed0.size()));
+}
+
+std::vector<TestConfig> GenerateBasicConfigs() {
+  std::vector<TestConfig> all_configs;
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      for (int optimize : {0, 1}) {
+        if (progr && optimize) continue;
+        TestConfig config;
+        config.input.xsize = 257 + samp * 37;
+        config.input.ysize = 265 + optimize * 17;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = progr;
+        config.jparams.optimize_coding = optimize;
+        config.max_dist = 2.4f;
+        GeneratePixels(&config.input);
+        all_configs.push_back(config);
+      }
+    }
+  }
+  return all_configs;
+}
+
+TEST(EncodeAPITest, ReuseCinfoSameMemOutput) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  size_t pos = 0;
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    TestImage output;
+    pos +=
+        DecodeWithLibjpeg(all_configs[i].jparams, DecompressParams(), nullptr,
+                          0, buffer + pos, buffer_size - pos, &output);
+    VerifyOutputImage(all_configs[i].input, output, all_configs[i].max_dist);
+  }
+  if (buffer) free(buffer);
+}
+
+TEST(EncodeAPITest, ReuseCinfoSameStdOutput) {
+  std::vector<TestConfig> all_configs = GenerateBasicConfigs();
+  FILE* tmpf = tmpfile();
+  JXL_CHECK(tmpf);
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_stdio_dest(&cinfo, tmpf);
+      for (const TestConfig& config : all_configs) {
+        EncodeWithJpegli(config.input, config.jparams, &cinfo);
+      }
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  size_t total_size = ftell(tmpf);
+  rewind(tmpf);
+  std::vector<uint8_t> compressed(total_size);
+  JXL_CHECK(total_size == fread(&compressed[0], 1, total_size, tmpf));
+  fclose(tmpf);
+  size_t pos = 0;
+  for (size_t i = 0; i < all_configs.size(); ++i) {
+    TestImage output;
+    pos += DecodeWithLibjpeg(all_configs[i].jparams, DecompressParams(),
+                             nullptr, 0, &compressed[pos],
+                             compressed.size() - pos, &output);
+    VerifyOutputImage(all_configs[i].input, output, all_configs[i].max_dist);
+  }
+}
+
+TEST(EncodeAPITest, ReuseCinfoChangeParams) {
+  TestImage input, output;
+  CompressParams jparams;
+  DecompressParams dparams;
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  std::vector<uint8_t> compressed;
+  jpeg_compress_struct cinfo;
+  const auto max_rms = [](int q, int hs, int vs) {
+    if (hs == 1 && vs == 1) return q == 90 ? 2.2 : 0.6;
+    if (hs == 2 && vs == 2) return q == 90 ? 2.8 : 1.2;
+    return q == 90 ? 2.4 : 1.0;
+  };
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    input.xsize = 129;
+    input.ysize = 73;
+    dparams.set_out_color_space = true;
+    for (JpegIOMode input_mode : {PIXELS, RAW_DATA, PIXELS, COEFFICIENTS}) {
+      for (int h_samp : {2, 1}) {
+        for (int v_samp : {2, 1}) {
+          for (int progr : {0, 2}) {
+            for (int quality : {90, 100}) {
+              input.Clear();
+              input.color_space =
+                  (input_mode == RAW_DATA ? JCS_YCbCr : JCS_RGB);
+              jparams.quality = quality;
+              jparams.h_sampling = {h_samp, 1, 1};
+              jparams.v_sampling = {v_samp, 1, 1};
+              jparams.progressive_mode = progr;
+              printf(
+                  "Generating input with quality %d chroma subsampling %dx%d "
+                  "input mode %d progressive_mode %d\n",
+                  quality, h_samp, v_samp, input_mode, progr);
+              GenerateInput(input_mode, jparams, &input);
+              jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+              if (input_mode != COEFFICIENTS) {
+                cinfo.image_width = input.xsize;
+                cinfo.image_height = input.ysize;
+                cinfo.input_components = input.components;
+                jpegli_set_defaults(&cinfo);
+                jpegli_start_compress(&cinfo, TRUE);
+                jpegli_abort_compress(&cinfo);
+                jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+              }
+              EncodeWithJpegli(input, jparams, &cinfo);
+              compressed.resize(buffer_size);
+              std::copy_n(buffer, buffer_size, compressed.data());
+              dparams.output_mode =
+                  input_mode == COEFFICIENTS ? COEFFICIENTS : PIXELS;
+              dparams.out_color_space = input.color_space;
+              output.Clear();
+              DecodeWithLibjpeg(jparams, dparams, compressed, &output);
+              VerifyOutputImage(input, output,
+                                max_rms(quality, h_samp, v_samp));
+            }
+          }
+        }
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncodeAPITest, AbbreviatedStreams) {
+  uint8_t* table_stream = nullptr;
+  unsigned long table_stream_size = 0;
+  uint8_t* data_stream = nullptr;
+  unsigned long data_stream_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &table_stream, &table_stream_size);
+      cinfo.input_components = 3;
+      cinfo.in_color_space = JCS_RGB;
+      jpegli_set_defaults(&cinfo);
+      jpegli_write_tables(&cinfo);
+      jpegli_mem_dest(&cinfo, &data_stream, &data_stream_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.optimize_coding = FALSE;
+      jpegli_set_progressive_level(&cinfo, 0);
+      jpegli_start_compress(&cinfo, FALSE);
+      JSAMPLE image[3] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    EXPECT_LT(data_stream_size, 50);
+    jpegli_destroy_compress(&cinfo);
+  }
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), table_stream,
+                    table_stream_size, data_stream, data_stream_size, &output);
+  EXPECT_EQ(1, output.xsize);
+  EXPECT_EQ(1, output.ysize);
+  EXPECT_EQ(3, output.components);
+  EXPECT_EQ(0, output.pixels[0]);
+  EXPECT_EQ(0, output.pixels[1]);
+  EXPECT_EQ(0, output.pixels[2]);
+  if (table_stream) free(table_stream);
+  if (data_stream) free(data_stream);
+}
+
+void CopyQuantTables(j_compress_ptr cinfo, uint16_t* quant_tables) {
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_idx];
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      quant_tables[c * DCTSIZE2 + k] = quant_table->quantval[k];
+    }
+  }
+}
+
+TEST(EncodeAPITest, QualitySettings) {
+  // Test that jpegli_set_quality, jpegli_set_linear_quality and
+  // jpegli_quality_scaling are consistent with each other.
+  uint16_t quant_tables0[3 * DCTSIZE2];
+  uint16_t quant_tables1[3 * DCTSIZE2];
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    for (boolean baseline : {FALSE, TRUE}) {
+      for (int q = 1; q <= 100; ++q) {
+        jpegli_set_quality(&cinfo, q, baseline);
+        CopyQuantTables(&cinfo, quant_tables0);
+        jpegli_set_linear_quality(&cinfo, jpegli_quality_scaling(q), baseline);
+        CopyQuantTables(&cinfo, quant_tables1);
+        EXPECT_EQ(0,
+                  memcmp(quant_tables0, quant_tables1, sizeof(quant_tables0)));
+#if JPEG_LIB_VERSION >= 70
+        for (int i = 0; i < NUM_QUANT_TBLS; ++i) {
+          cinfo.q_scale_factor[i] = jpegli_quality_scaling(q);
+        }
+        jpegli_default_qtables(&cinfo, baseline);
+        CopyQuantTables(&cinfo, quant_tables1);
+        EXPECT_EQ(0,
+                  memcmp(quant_tables0, quant_tables1, sizeof(quant_tables0)));
+#endif
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  // Test jpegli_quality_scaling for some specific values .
+  EXPECT_EQ(5000, jpegli_quality_scaling(-1));
+  EXPECT_EQ(5000, jpegli_quality_scaling(0));
+  EXPECT_EQ(5000, jpegli_quality_scaling(1));
+  EXPECT_EQ(100, jpegli_quality_scaling(50));
+  EXPECT_EQ(50, jpegli_quality_scaling(75));
+  EXPECT_EQ(20, jpegli_quality_scaling(90));
+  EXPECT_EQ(0, jpegli_quality_scaling(100));
+  EXPECT_EQ(0, jpegli_quality_scaling(101));
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  for (int h_samp : {1, 2}) {
+    for (int v_samp : {1, 2}) {
+      for (int progr : {0, 2}) {
+        for (int optimize : {0, 1}) {
+          if (progr && optimize) continue;
+          TestConfig config;
+          config.jparams.h_sampling = {h_samp, 1, 1};
+          config.jparams.v_sampling = {v_samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          if (!progr) {
+            config.jparams.optimize_coding = optimize;
+          }
+          const float kMaxBpp[4] = {1.55, 1.4, 1.4, 1.32};
+          const float kMaxDist[4] = {1.95, 2.2, 2.2, 2.0};
+          const int idx = v_samp * 2 + h_samp - 3;
+          config.max_bpp =
+              kMaxBpp[idx] * (optimize ? 0.97 : 1.0) * (progr ? 0.97 : 1.0);
+          config.max_dist = kMaxDist[idx];
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  {
+    TestConfig config;
+    config.jparams.quality = 100;
+    config.max_bpp = 6.6;
+    config.max_dist = 0.6;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.quality = 80;
+    config.max_bpp = 1.05;
+    config.max_dist = 2.7;
+    all_tests.push_back(config);
+  }
+  for (int samp : {1, 2}) {
+    for (int progr : {0, 2}) {
+      for (int optimize : {0, 1}) {
+        if (progr && optimize) continue;
+        TestConfig config;
+        config.input.xsize = 257;
+        config.input.ysize = 265;
+        config.jparams.h_sampling = {samp, 1, 1};
+        config.jparams.v_sampling = {samp, 1, 1};
+        config.jparams.progressive_mode = progr;
+        if (!progr) {
+          config.jparams.optimize_coding = optimize;
+        }
+        config.jparams.use_adaptive_quantization = false;
+        config.max_bpp = 2.05f;
+        config.max_dist = 2.3f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 4}) {
+    for (int v0_samp : {1, 2, 4}) {
+      for (int h2_samp : {1, 2, 4}) {
+        for (int v2_samp : {1, 2, 4}) {
+          TestConfig config;
+          config.input.xsize = 137;
+          config.input.ysize = 75;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.max_bpp = 2.5;
+          config.max_dist = 12.0;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 3}) {
+    for (int v0_samp : {1, 3}) {
+      for (int h2_samp : {1, 3}) {
+        for (int v2_samp : {1, 3}) {
+          TestConfig config;
+          config.input.xsize = 205;
+          config.input.ysize = 99;
+          config.jparams.progressive_mode = 2;
+          config.jparams.h_sampling = {h0_samp, 1, h2_samp};
+          config.jparams.v_sampling = {v0_samp, 1, v2_samp};
+          config.max_bpp = 2.5;
+          config.max_dist = 10.0;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int h0_samp : {1, 2, 3, 4}) {
+    for (int v0_samp : {1, 2, 3, 4}) {
+      TestConfig config;
+      config.input.xsize = 217;
+      config.input.ysize = 129;
+      config.jparams.progressive_mode = 2;
+      config.jparams.h_sampling = {h0_samp, 1, 1};
+      config.jparams.v_sampling = {v0_samp, 1, 1};
+      config.max_bpp = 2.0;
+      config.max_dist = 5.5;
+      all_tests.push_back(config);
+    }
+  }
+  for (int p = 0; p < 3 + NumTestScanScripts(); ++p) {
+    for (int samp : {1, 2}) {
+      for (int quality : {100, 90, 1}) {
+        for (int r : {0, 1024, 1}) {
+          for (int optimize : {0, 1}) {
+            bool progressive = p == 1 || p == 2 || p > 4;
+            if (progressive && !optimize) continue;
+            TestConfig config;
+            config.input.xsize = 273;
+            config.input.ysize = 265;
+            config.jparams.progressive_mode = p;
+            if (!progressive) {
+              config.jparams.optimize_coding = optimize;
+            }
+            config.jparams.h_sampling = {samp, 1, 1};
+            config.jparams.v_sampling = {samp, 1, 1};
+            config.jparams.quality = quality;
+            config.jparams.restart_interval = r;
+            config.max_bpp = quality == 100 ? 8.0 : 1.9;
+            if (r == 1) {
+              config.max_bpp += 10.0;
+            }
+            config.max_dist = quality == 1 ? 20.0 : 2.1;
+            all_tests.push_back(config);
+          }
+        }
+      }
+    }
+  }
+  {
+    TestConfig config;
+    config.jparams.simple_progression = true;
+    config.max_bpp = 1.48;
+    config.max_dist = 2.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.input_mode = COEFFICIENTS;
+    config.jparams.h_sampling = {2, 1, 1};
+    config.jparams.v_sampling = {2, 1, 1};
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.max_bpp = 16;
+    config.max_dist = 0.0;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.xyb_mode = true;
+    config.jparams.progressive_mode = 2;
+    config.max_bpp = 1.5;
+    config.max_dist = 3.5;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.libjpeg_mode = true;
+    config.max_bpp = 2.1;
+    config.max_dist = 1.7;
+    all_tests.push_back(config);
+  }
+
+  for (J_COLOR_SPACE in_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+    for (J_COLOR_SPACE jpeg_color_space : {JCS_RGB, JCS_YCbCr, JCS_GRAYSCALE}) {
+      if (jpeg_color_space == JCS_RGB && in_color_space == JCS_YCbCr) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = in_color_space;
+      config.jparams.set_jpeg_colorspace = true;
+      config.jparams.jpeg_color_space = jpeg_color_space;
+      config.max_bpp = jpeg_color_space == JCS_RGB ? 4.5 : 1.85;
+      config.max_dist = jpeg_color_space == JCS_RGB ? 1.4 : 2.05;
+      all_tests.push_back(config);
+    }
+  }
+  for (J_COLOR_SPACE in_color_space : {JCS_CMYK, JCS_YCCK}) {
+    for (J_COLOR_SPACE jpeg_color_space : {JCS_CMYK, JCS_YCCK}) {
+      if (jpeg_color_space == JCS_CMYK && in_color_space == JCS_YCCK) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 256;
+      config.input.color_space = in_color_space;
+      if (in_color_space != jpeg_color_space) {
+        config.jparams.set_jpeg_colorspace = true;
+        config.jparams.jpeg_color_space = jpeg_color_space;
+      }
+      config.max_bpp = jpeg_color_space == JCS_CMYK ? 4.0 : 3.6;
+      config.max_dist = jpeg_color_space == JCS_CMYK ? 1.2 : 1.5;
+      all_tests.push_back(config);
+    }
+  }
+  {
+    TestConfig config;
+    config.input.color_space = JCS_YCbCr;
+    config.max_bpp = 1.6;
+    config.max_dist = 1.35;
+    all_tests.push_back(config);
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.color_space = JCS_GRAYSCALE;
+    config.jparams.xyb_mode = xyb;
+    config.max_bpp = 1.35;
+    config.max_dist = 1.4;
+    all_tests.push_back(config);
+  }
+  for (int channels = 1; channels <= 4; ++channels) {
+    TestConfig config;
+    config.input.color_space = JCS_UNKNOWN;
+    config.input.components = channels;
+    config.max_bpp = 1.35 * channels;
+    config.max_dist = 1.4;
+    all_tests.push_back(config);
+  }
+  for (size_t r : {1, 3, 17, 1024}) {
+    for (int progr : {0, 2}) {
+      TestConfig config;
+      config.jparams.restart_interval = r;
+      config.jparams.progressive_mode = progr;
+      config.max_bpp = 1.58 + 5.5 / r;
+      config.max_dist = 2.2;
+      all_tests.push_back(config);
+    }
+  }
+  for (size_t rr : {1, 3, 8, 100}) {
+    TestConfig config;
+    config.jparams.restart_in_rows = rr;
+    config.max_bpp = 1.6;
+    config.max_dist = 2.2;
+    all_tests.push_back(config);
+  }
+  for (int type : {0, 1, 10, 100, 10000}) {
+    for (int scale : {1, 50, 100, 200, 500}) {
+      for (bool add_raw : {false, true}) {
+        for (bool baseline : {true, false}) {
+          if (!baseline && (add_raw || type * scale < 25500)) continue;
+          TestConfig config;
+          config.input.xsize = 64;
+          config.input.ysize = 64;
+          CustomQuantTable table;
+          table.table_type = type;
+          table.scale_factor = scale;
+          table.force_baseline = baseline;
+          table.add_raw = add_raw;
+          table.Generate();
+          config.jparams.optimize_coding = 1;
+          config.jparams.quant_tables.push_back(table);
+          config.jparams.quant_indexes = {0, 0, 0};
+          float q = (type == 0 ? 16 : type) * scale * 0.01f;
+          if (baseline && !add_raw) q = std::max(1.0f, std::min(255.0f, q));
+          config.max_bpp = 1.5f + 25.0f / q;
+          config.max_dist = 0.6f + 0.25f * q;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    if (qidx == 3) continue;
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                    (qidx >> 0) & 1};
+    config.max_bpp = 2.25;
+    config.max_dist = 2.8;
+    all_tests.push_back(config);
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (int slot_idx = 0; slot_idx < 2; ++slot_idx) {
+      if (qidx == 0 && slot_idx == 0) continue;
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      CustomQuantTable table;
+      table.slot_idx = slot_idx;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+      config.max_bpp = 2.3;
+      config.max_dist = 2.9;
+      all_tests.push_back(config);
+    }
+  }
+  for (int qidx = 0; qidx < 8; ++qidx) {
+    for (bool xyb : {false, true}) {
+      TestConfig config;
+      config.input.xsize = 256;
+      config.input.ysize = 256;
+      config.jparams.xyb_mode = xyb;
+      config.jparams.quant_indexes = {(qidx >> 2) & 1, (qidx >> 1) & 1,
+                                      (qidx >> 0) & 1};
+      {
+        CustomQuantTable table;
+        table.slot_idx = 0;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      {
+        CustomQuantTable table;
+        table.slot_idx = 1;
+        table.table_type = 20;
+        table.Generate();
+        config.jparams.quant_tables.push_back(table);
+      }
+      config.max_bpp = 2.0;
+      config.max_dist = 3.85;
+      all_tests.push_back(config);
+    }
+  }
+  for (bool xyb : {false, true}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.jparams.xyb_mode = xyb;
+    config.jparams.quant_indexes = {0, 1, 2};
+    {
+      CustomQuantTable table;
+      table.slot_idx = 0;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 1;
+      table.table_type = 20;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    {
+      CustomQuantTable table;
+      table.slot_idx = 2;
+      table.table_type = 30;
+      table.Generate();
+      config.jparams.quant_tables.push_back(table);
+    }
+    config.max_bpp = 1.5;
+    config.max_dist = 3.75;
+    all_tests.push_back(config);
+  }
+  {
+    TestConfig config;
+    config.jparams.comp_ids = {7, 17, 177};
+    config.input.xsize = config.input.ysize = 128;
+    config.max_bpp = 2.25;
+    config.max_dist = 2.4;
+    all_tests.push_back(config);
+  }
+  for (int override_JFIF : {-1, 0, 1}) {
+    for (int override_Adobe : {-1, 0, 1}) {
+      if (override_JFIF == -1 && override_Adobe == -1) continue;
+      TestConfig config;
+      config.input.xsize = config.input.ysize = 128;
+      config.jparams.override_JFIF = override_JFIF;
+      config.jparams.override_Adobe = override_Adobe;
+      config.max_bpp = 2.25;
+      config.max_dist = 2.4;
+      all_tests.push_back(config);
+    }
+  }
+  {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.max_bpp = 1.85;
+    config.max_dist = 2.05;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  for (size_t icc_size : {728, 70000, 1000000}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.max_dist = 2.05;
+    config.jparams.icc.resize(icc_size);
+    for (size_t i = 0; i < icc_size; ++i) {
+      config.jparams.icc[i] = (i * 17) & 0xff;
+    }
+    all_tests.push_back(config);
+  }
+  for (JpegIOMode input_mode : {PIXELS, RAW_DATA, COEFFICIENTS}) {
+    TestConfig config;
+    config.input.xsize = config.input.ysize = 256;
+    config.input_mode = input_mode;
+    if (input_mode == RAW_DATA) {
+      config.input.color_space = JCS_YCbCr;
+    }
+    config.jparams.progressive_mode = 0;
+    config.jparams.optimize_coding = 0;
+    config.max_bpp = 1.85;
+    config.max_dist = 2.05;
+    if (input_mode == COEFFICIENTS) {
+      config.max_bpp = 3.5;
+      config.max_dist = 0.0;
+    }
+    all_tests.push_back(config);
+    config.jparams.use_flat_dc_luma_code = true;
+    all_tests.push_back(config);
+  }
+  for (int xsize : {640, 641, 648, 649}) {
+    for (int ysize : {640, 641, 648, 649}) {
+      for (int h_sampling : {1, 2}) {
+        for (int v_sampling : {1, 2}) {
+          if (h_sampling == 1 && v_sampling == 1) continue;
+          for (int progr : {0, 2}) {
+            TestConfig config;
+            config.input.xsize = xsize;
+            config.input.ysize = ysize;
+            config.input.color_space = JCS_YCbCr;
+            config.jparams.h_sampling = {h_sampling, 1, 1};
+            config.jparams.v_sampling = {v_sampling, 1, 1};
+            config.jparams.progressive_mode = progr;
+            config.input_mode = RAW_DATA;
+            config.max_bpp = 1.75;
+            config.max_dist = 2.0;
+            all_tests.push_back(config);
+            config.input_mode = COEFFICIENTS;
+            if (xsize & 1) {
+              config.jparams.add_marker = true;
+            }
+            config.max_bpp = 24.0;
+            all_tests.push_back(config);
+          }
+        }
+      }
+    }
+  }
+  for (JpegliDataType data_type : {JPEGLI_TYPE_UINT16, JPEGLI_TYPE_FLOAT}) {
+    for (JpegliEndianness endianness :
+         {JPEGLI_LITTLE_ENDIAN, JPEGLI_BIG_ENDIAN, JPEGLI_NATIVE_ENDIAN}) {
+      J_COLOR_SPACE colorspace[4] = {JCS_GRAYSCALE, JCS_UNKNOWN, JCS_RGB,
+                                     JCS_CMYK};
+      float max_bpp[4] = {1.32, 2.7, 1.6, 4.0};
+      for (int channels = 1; channels <= 4; ++channels) {
+        TestConfig config;
+        config.input.data_type = data_type;
+        config.input.endianness = endianness;
+        config.input.components = channels;
+        config.input.color_space = colorspace[channels - 1];
+        config.max_bpp = max_bpp[channels - 1];
+        config.max_dist = 2.2;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  for (int smoothing : {1, 5, 50, 100}) {
+    for (int h_sampling : {1, 2}) {
+      for (int v_sampling : {1, 2}) {
+        TestConfig config;
+        config.input.xsize = 257;
+        config.input.ysize = 265;
+        config.jparams.smoothing_factor = smoothing;
+        config.jparams.h_sampling = {h_sampling, 1, 1};
+        config.jparams.v_sampling = {v_sampling, 1, 1};
+        config.max_bpp = 1.85;
+        config.max_dist = 3.05f;
+        all_tests.push_back(config);
+      }
+    }
+  }
+  return all_tests;
+};
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  if (c.input_mode == RAW_DATA) {
+    os << "RawDataIn";
+  } else if (c.input_mode == COEFFICIENTS) {
+    os << "WriteCoeffs";
+  }
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<EncodeAPITestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(EncodeAPITest, EncodeAPITestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/encode_finish.cc b/lib/jpegli/encode_finish.cc
new file mode 100644 (file)
index 0000000..955676b
--- /dev/null
@@ -0,0 +1,230 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode_finish.h"
+
+#include <cmath>
+#include <limits>
+
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jpegli/quant.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/encode_finish.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/dct-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::GetLane;
+
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+using DI16 = Rebind<int16_t, HWY_FULL(int32_t)>;
+
+void ReQuantizeBlock(int16_t* block, const float* qmc, float aq_strength,
+                     const float* zero_bias_offset,
+                     const float* zero_bias_mul) {
+  D d;
+  DI di;
+  DI16 di16;
+  const auto aq_mul = Set(d, aq_strength);
+  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
+    const auto in = Load(di16, block + k);
+    const auto val = ConvertTo(d, PromoteTo(di, in));
+    const auto q = Load(d, qmc + k);
+    const auto qval = Mul(val, q);
+    const auto zb_offset = Load(d, zero_bias_offset + k);
+    const auto zb_mul = Load(d, zero_bias_mul + k);
+    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
+    const auto nzero_mask = Ge(Abs(qval), threshold);
+    const auto iqval = IfThenElseZero(nzero_mask, Round(qval));
+    Store(DemoteTo(di16, ConvertTo(di, iqval)), di16, block + k);
+  }
+}
+
+float BlockError(const int16_t* block, const float* qmc, const float* iqmc,
+                 const float aq_strength, const float* zero_bias_offset,
+                 const float* zero_bias_mul) {
+  D d;
+  DI di;
+  DI16 di16;
+  auto err = Zero(d);
+  const auto scale = Set(d, 1.0 / 16);
+  const auto aq_mul = Set(d, aq_strength);
+  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
+    const auto in = Load(di16, block + k);
+    const auto val = ConvertTo(d, PromoteTo(di, in));
+    const auto q = Load(d, qmc + k);
+    const auto qval = Mul(val, q);
+    const auto zb_offset = Load(d, zero_bias_offset + k);
+    const auto zb_mul = Load(d, zero_bias_mul + k);
+    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
+    const auto nzero_mask = Ge(Abs(qval), threshold);
+    const auto iqval = IfThenElseZero(nzero_mask, Round(qval));
+    const auto invq = Load(d, iqmc + k);
+    const auto rval = Mul(iqval, invq);
+    const auto diff = Mul(Sub(val, rval), scale);
+    err = Add(err, Mul(diff, diff));
+  }
+  return GetLane(SumOfLanes(d, err));
+}
+
+void ComputeInverseWeights(const float* qmc, float* iqmc) {
+  for (int k = 0; k < 64; ++k) {
+    iqmc[k] = 1.0f / qmc[k];
+  }
+}
+
+float ComputePSNR(j_compress_ptr cinfo, int sampling) {
+  jpeg_comp_master* m = cinfo->master;
+  InitQuantizer(cinfo, QuantPass::SEARCH_SECOND_PASS);
+  double error = 0.0;
+  size_t num = 0;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    const float* qmc = m->quant_mul[c];
+    const int h_factor = m->h_factor[c];
+    const int v_factor = m->v_factor[c];
+    const float* zero_bias_offset = m->zero_bias_offset[c];
+    const float* zero_bias_mul = m->zero_bias_mul[c];
+    HWY_ALIGN float iqmc[64];
+    ComputeInverseWeights(qmc, iqmc);
+    for (JDIMENSION by = 0; by < comp->height_in_blocks; by += sampling) {
+      JBLOCKARRAY ba = GetBlockRow(cinfo, c, by);
+      const float* qf = m->quant_field.Row(by * v_factor);
+      for (JDIMENSION bx = 0; bx < comp->width_in_blocks; bx += sampling) {
+        error += BlockError(&ba[0][bx][0], qmc, iqmc, qf[bx * h_factor],
+                            zero_bias_offset, zero_bias_mul);
+        num += DCTSIZE2;
+      }
+    }
+  }
+  return 4.3429448f * log(num / (error / 255. / 255.));
+}
+
+void ReQuantizeCoeffs(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  InitQuantizer(cinfo, QuantPass::SEARCH_SECOND_PASS);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    const float* qmc = m->quant_mul[c];
+    const int h_factor = m->h_factor[c];
+    const int v_factor = m->v_factor[c];
+    const float* zero_bias_offset = m->zero_bias_offset[c];
+    const float* zero_bias_mul = m->zero_bias_mul[c];
+    for (JDIMENSION by = 0; by < comp->height_in_blocks; ++by) {
+      JBLOCKARRAY ba = GetBlockRow(cinfo, c, by);
+      const float* qf = m->quant_field.Row(by * v_factor);
+      for (JDIMENSION bx = 0; bx < comp->width_in_blocks; ++bx) {
+        ReQuantizeBlock(&ba[0][bx][0], qmc, qf[bx * h_factor], zero_bias_offset,
+                        zero_bias_mul);
+      }
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+namespace {
+HWY_EXPORT(ComputePSNR);
+HWY_EXPORT(ReQuantizeCoeffs);
+
+void ReQuantizeCoeffs(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(ReQuantizeCoeffs)(cinfo);
+}
+
+float ComputePSNR(j_compress_ptr cinfo, int sampling) {
+  return HWY_DYNAMIC_DISPATCH(ComputePSNR)(cinfo, sampling);
+}
+
+void UpdateDistance(j_compress_ptr cinfo, float distance) {
+  float distances[NUM_QUANT_TBLS] = {distance, distance, distance};
+  SetQuantMatrices(cinfo, distances, /*add_two_chroma_tables=*/true);
+}
+
+float Clamp(float val, float minval, float maxval) {
+  return std::max(minval, std::min(maxval, val));
+}
+
+#define PSNR_SEARCH_DBG 0
+
+float FindDistanceForPSNR(j_compress_ptr cinfo) {
+  constexpr int kMaxIters = 20;
+  const float psnr_target = cinfo->master->psnr_target;
+  const float tolerance = cinfo->master->psnr_tolerance;
+  const float min_dist = cinfo->master->min_distance;
+  const float max_dist = cinfo->master->max_distance;
+  float d = Clamp(1.0f, min_dist, max_dist);
+  for (int sampling : {4, 1}) {
+    float best_diff = std::numeric_limits<float>::max();
+    float best_distance = 0.0f;
+    float best_psnr = 0.0;
+    float dmin = min_dist;
+    float dmax = max_dist;
+    bool found_lower_bound = false;
+    bool found_upper_bound = false;
+    for (int i = 0; i < kMaxIters; ++i) {
+      UpdateDistance(cinfo, d);
+      float psnr = ComputePSNR(cinfo, sampling);
+      if (psnr > psnr_target) {
+        dmin = d;
+        found_lower_bound = true;
+      } else {
+        dmax = d;
+        found_upper_bound = true;
+      }
+#if (PSNR_SEARCH_DBG > 1)
+      printf("sampling %d iter %2d d %7.4f psnr %.2f", sampling, i, d, psnr);
+      if (found_upper_bound && found_lower_bound) {
+        printf("    d-interval: [ %7.4f .. %7.4f ]", dmin, dmax);
+      }
+      printf("\n");
+#endif
+      float diff = std::abs(psnr - psnr_target);
+      if (diff < best_diff) {
+        best_diff = diff;
+        best_distance = d;
+        best_psnr = psnr;
+      }
+      if (diff < tolerance * psnr_target || dmin == dmax) {
+        break;
+      }
+      if (!found_lower_bound || !found_upper_bound) {
+        d *= std::exp(0.15f * (psnr - psnr_target));
+      } else {
+        d = 0.5f * (dmin + dmax);
+      }
+      d = Clamp(d, min_dist, max_dist);
+    }
+    d = best_distance;
+    if (sampling == 1 && PSNR_SEARCH_DBG) {
+      printf("Final PSNR %.2f at distance %.4f\n", best_psnr, d);
+    }
+  }
+  return d;
+}
+
+}  // namespace
+
+void QuantizetoPSNR(j_compress_ptr cinfo) {
+  float distance = FindDistanceForPSNR(cinfo);
+  UpdateDistance(cinfo, distance);
+  ReQuantizeCoeffs(cinfo);
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/encode_finish.h b/lib/jpegli/encode_finish.h
new file mode 100644 (file)
index 0000000..f6862de
--- /dev/null
@@ -0,0 +1,17 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENCODE_FINISH_H_
+#define LIB_JPEGLI_ENCODE_FINISH_H_
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void QuantizetoPSNR(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ENCODE_FINISH_H_
diff --git a/lib/jpegli/encode_internal.h b/lib/jpegli/encode_internal.h
new file mode 100644 (file)
index 0000000..4dbef97
--- /dev/null
@@ -0,0 +1,141 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENCODE_INTERNAL_H_
+#define LIB_JPEGLI_ENCODE_INTERNAL_H_
+
+#include <stdint.h>
+
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/encode.h"
+
+namespace jpegli {
+
+constexpr unsigned char kICCSignature[12] = {
+    0x49, 0x43, 0x43, 0x5F, 0x50, 0x52, 0x4F, 0x46, 0x49, 0x4C, 0x45, 0x00};
+constexpr int kICCMarker = JPEG_APP0 + 2;
+
+constexpr int kDefaultProgressiveLevel = 0;
+
+typedef int16_t coeff_t;
+
+struct HuffmanCodeTable {
+  int depth[256];
+  int code[256];
+};
+
+struct Token {
+  uint8_t context;
+  uint8_t symbol;
+  uint16_t bits;
+  Token(int c, int s, int b) : context(c), symbol(s), bits(b) {}
+};
+
+struct TokenArray {
+  Token* tokens;
+  size_t num_tokens;
+};
+
+struct RefToken {
+  uint8_t symbol;
+  uint8_t refbits;
+};
+
+struct ScanTokenInfo {
+  RefToken* tokens;
+  size_t num_tokens;
+  uint8_t* refbits;
+  uint16_t* eobruns;
+  size_t* restarts;
+  size_t num_restarts;
+  size_t num_nonzeros;
+  size_t num_future_nonzeros;
+  size_t token_offset;
+  size_t restart_interval;
+  size_t MCUs_per_row;
+  size_t MCU_rows_in_scan;
+  size_t blocks_in_MCU;
+  size_t num_blocks;
+};
+
+}  // namespace jpegli
+
+struct jpeg_comp_master {
+  jpegli::RowBuffer<float> input_buffer[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float>* smooth_input[jpegli::kMaxComponents];
+  jpegli::RowBuffer<float>* raw_data[jpegli::kMaxComponents];
+  bool force_baseline;
+  bool xyb_mode;
+  uint8_t cicp_transfer_function;
+  bool use_std_tables;
+  bool use_adaptive_quantization;
+  int progressive_level;
+  size_t xsize_blocks;
+  size_t ysize_blocks;
+  size_t blocks_per_iMCU_row;
+  jpegli::ScanTokenInfo* scan_token_info;
+  JpegliDataType data_type;
+  JpegliEndianness endianness;
+  void (*input_method)(const uint8_t* row_in, size_t len,
+                       float* row_out[jpegli::kMaxComponents]);
+  void (*color_transform)(float* row[jpegli::kMaxComponents], size_t len);
+  void (*downsample_method[jpegli::kMaxComponents])(
+      float* rows_in[MAX_SAMP_FACTOR], size_t len, float* row_out);
+  float* quant_mul[jpegli::kMaxComponents];
+  float* zero_bias_offset[jpegli::kMaxComponents];
+  float* zero_bias_mul[jpegli::kMaxComponents];
+  int h_factor[jpegli::kMaxComponents];
+  int v_factor[jpegli::kMaxComponents];
+  // Array of Huffman tables that will be encoded in one or more DHT segments.
+  // In progressive mode we compute all Huffman tables that will be used in any
+  // of the scans, thus we can have more than 4 tables here.
+  JHUFF_TBL* huffman_tables;
+  size_t num_huffman_tables;
+  // Array of num_huffman_tables slot ids, where the ith element is the slot id
+  // of the ith Huffman table, as it appears in the DHT segment. The range of
+  // the slot ids is 0..3 for DC and 16..19 for AC Huffman codes.
+  uint8_t* slot_id_map;
+  // Maps context ids to an index in the huffman_tables array. Each component in
+  // each scan has a DC and AC context id, which are defined as follows:
+  //   - DC context id is the component index (relative to cinfo->comp_info) of
+  //     the scan component
+  //   - AC context ids start at 4 and are increased for each component of each
+  //     scan that have AC components (i.e. Se > 0)
+  uint8_t* context_map;
+  size_t num_contexts;
+  // Array of cinfo->num_scans context ids, where the ith element is the context
+  // id of the first AC component of the ith scan.
+  uint8_t* ac_ctx_offset;
+  // Array of num_huffman tables derived coding tables.
+  jpegli::HuffmanCodeTable* coding_tables;
+  float* diff_buffer;
+  jpegli::RowBuffer<float> fuzzy_erosion_tmp;
+  jpegli::RowBuffer<float> pre_erosion;
+  jpegli::RowBuffer<float> quant_field;
+  jvirt_barray_ptr* coeff_buffers;
+  size_t next_input_row;
+  size_t next_iMCU_row;
+  size_t next_dht_index;
+  size_t last_restart_interval;
+  JCOEF last_dc_coeff[MAX_COMPS_IN_SCAN];
+  jpegli::JpegBitWriter bw;
+  float* dct_buffer;
+  int32_t* block_tmp;
+  jpegli::TokenArray* token_arrays;
+  size_t cur_token_array;
+  jpegli::Token* next_token;
+  size_t num_tokens;
+  size_t total_num_tokens;
+  jpegli::RefToken* next_refinement_token;
+  uint8_t* next_refinement_bit;
+  float psnr_target;
+  float psnr_tolerance;
+  float min_distance;
+  float max_distance;
+};
+
+#endif  // LIB_JPEGLI_ENCODE_INTERNAL_H_
diff --git a/lib/jpegli/encode_streaming.cc b/lib/jpegli/encode_streaming.cc
new file mode 100644 (file)
index 0000000..89dbd81
--- /dev/null
@@ -0,0 +1,259 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode_streaming.h"
+
+#include <cmath>
+
+#include "lib/jpegli/bit_writer.h"
+#include "lib/jpegli/bitstream.h"
+#include "lib/jpegli/entropy_coding.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/bits.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/encode_streaming.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/dct-inl.h"
+#include "lib/jpegli/entropy_coding-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+static const int kStreamingModeCoefficients = 0;
+static const int kStreamingModeTokens = 1;
+static const int kStreamingModeBits = 2;
+
+namespace {
+void ZigZagShuffle(int32_t* JXL_RESTRICT block) {
+  // TODO(szabadka) SIMDify this.
+  int32_t tmp[DCTSIZE2];
+  tmp[0] = block[0];
+  tmp[1] = block[1];
+  tmp[2] = block[8];
+  tmp[3] = block[16];
+  tmp[4] = block[9];
+  tmp[5] = block[2];
+  tmp[6] = block[3];
+  tmp[7] = block[10];
+  tmp[8] = block[17];
+  tmp[9] = block[24];
+  tmp[10] = block[32];
+  tmp[11] = block[25];
+  tmp[12] = block[18];
+  tmp[13] = block[11];
+  tmp[14] = block[4];
+  tmp[15] = block[5];
+  tmp[16] = block[12];
+  tmp[17] = block[19];
+  tmp[18] = block[26];
+  tmp[19] = block[33];
+  tmp[20] = block[40];
+  tmp[21] = block[48];
+  tmp[22] = block[41];
+  tmp[23] = block[34];
+  tmp[24] = block[27];
+  tmp[25] = block[20];
+  tmp[26] = block[13];
+  tmp[27] = block[6];
+  tmp[28] = block[7];
+  tmp[29] = block[14];
+  tmp[30] = block[21];
+  tmp[31] = block[28];
+  tmp[32] = block[35];
+  tmp[33] = block[42];
+  tmp[34] = block[49];
+  tmp[35] = block[56];
+  tmp[36] = block[57];
+  tmp[37] = block[50];
+  tmp[38] = block[43];
+  tmp[39] = block[36];
+  tmp[40] = block[29];
+  tmp[41] = block[22];
+  tmp[42] = block[15];
+  tmp[43] = block[23];
+  tmp[44] = block[30];
+  tmp[45] = block[37];
+  tmp[46] = block[44];
+  tmp[47] = block[51];
+  tmp[48] = block[58];
+  tmp[49] = block[59];
+  tmp[50] = block[52];
+  tmp[51] = block[45];
+  tmp[52] = block[38];
+  tmp[53] = block[31];
+  tmp[54] = block[39];
+  tmp[55] = block[46];
+  tmp[56] = block[53];
+  tmp[57] = block[60];
+  tmp[58] = block[61];
+  tmp[59] = block[54];
+  tmp[60] = block[47];
+  tmp[61] = block[55];
+  tmp[62] = block[62];
+  tmp[63] = block[63];
+  memcpy(block, tmp, DCTSIZE2 * sizeof(tmp[0]));
+}
+}  // namespace
+
+template <int kMode>
+void ProcessiMCURow(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  JpegBitWriter* bw = &m->bw;
+  int xsize_mcus = DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  int ysize_mcus = DivCeil(cinfo->image_height, 8 * cinfo->max_v_samp_factor);
+  int mcu_y = m->next_iMCU_row;
+  int32_t* block = m->block_tmp;
+  int32_t* symbols = m->block_tmp + DCTSIZE2;
+  int32_t* nonzero_idx = m->block_tmp + 3 * DCTSIZE2;
+  coeff_t* JXL_RESTRICT last_dc_coeff = m->last_dc_coeff;
+  bool adaptive_quant = m->use_adaptive_quantization && m->psnr_target == 0;
+  JBLOCKARRAY ba[kMaxComponents];
+  if (kMode == kStreamingModeCoefficients) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      int by0 = mcu_y * comp->v_samp_factor;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+      ba[c] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[c], by0,
+          max_block_rows, true);
+    }
+  }
+  if (kMode == kStreamingModeTokens) {
+    TokenArray* ta = &m->token_arrays[m->cur_token_array];
+    int max_tokens_per_mcu_row = MaxNumTokensPerMCURow(cinfo);
+    if (ta->num_tokens + max_tokens_per_mcu_row > m->num_tokens) {
+      if (ta->tokens) {
+        m->total_num_tokens += ta->num_tokens;
+        ++m->cur_token_array;
+        ta = &m->token_arrays[m->cur_token_array];
+      }
+      m->num_tokens =
+          EstimateNumTokens(cinfo, mcu_y, ysize_mcus, m->total_num_tokens,
+                            max_tokens_per_mcu_row);
+      ta->tokens = Allocate<Token>(cinfo, m->num_tokens, JPOOL_IMAGE);
+      m->next_token = ta->tokens;
+    }
+  }
+  const float* imcu_start[kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    imcu_start[c] = m->raw_data[c]->Row(mcu_y * comp->v_samp_factor * DCTSIZE);
+  }
+  const float* qf = nullptr;
+  if (adaptive_quant) {
+    qf = m->quant_field.Row(0);
+  }
+  HuffmanCodeTable* dc_code = nullptr;
+  HuffmanCodeTable* ac_code = nullptr;
+  const size_t qf_stride = m->quant_field.stride();
+  for (int mcu_x = 0; mcu_x < xsize_mcus; ++mcu_x) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      if (kMode == kStreamingModeBits) {
+        dc_code = &m->coding_tables[m->context_map[c]];
+        ac_code = &m->coding_tables[m->context_map[c + 4]];
+      }
+      float* JXL_RESTRICT qmc = m->quant_mul[c];
+      const size_t stride = m->raw_data[c]->stride();
+      const int h_factor = m->h_factor[c];
+      const float* zero_bias_offset = m->zero_bias_offset[c];
+      const float* zero_bias_mul = m->zero_bias_mul[c];
+      float aq_strength = 0.0f;
+      for (int iy = 0; iy < comp->v_samp_factor; ++iy) {
+        for (int ix = 0; ix < comp->h_samp_factor; ++ix) {
+          size_t by = mcu_y * comp->v_samp_factor + iy;
+          size_t bx = mcu_x * comp->h_samp_factor + ix;
+          if (bx >= comp->width_in_blocks || by >= comp->height_in_blocks) {
+            if (kMode == kStreamingModeTokens) {
+              *m->next_token++ = Token(c, 0, 0);
+              *m->next_token++ = Token(c + 4, 0, 0);
+            } else if (kMode == kStreamingModeBits) {
+              WriteBits(bw, dc_code->depth[0], dc_code->code[0]);
+              WriteBits(bw, ac_code->depth[0], ac_code->code[0]);
+            }
+            continue;
+          }
+          if (adaptive_quant) {
+            aq_strength = qf[iy * qf_stride + bx * h_factor];
+          }
+          const float* pixels = imcu_start[c] + (iy * stride + bx) * DCTSIZE;
+          ComputeCoefficientBlock(pixels, stride, qmc, last_dc_coeff[c],
+                                  aq_strength, zero_bias_offset, zero_bias_mul,
+                                  m->dct_buffer, block);
+          if (kMode == kStreamingModeCoefficients) {
+            JCOEF* cblock = &ba[c][iy][bx][0];
+            for (int k = 0; k < DCTSIZE2; ++k) {
+              cblock[k] = block[kJPEGNaturalOrder[k]];
+            }
+          }
+          block[0] -= last_dc_coeff[c];
+          last_dc_coeff[c] += block[0];
+          if (kMode == kStreamingModeTokens) {
+            ComputeTokensForBlock<int32_t, false>(block, 0, c, c + 4,
+                                                  &m->next_token);
+          } else if (kMode == kStreamingModeBits) {
+            ZigZagShuffle(block);
+            const int num_nonzeros = CompactBlock(block, nonzero_idx);
+            const bool emit_eob = nonzero_idx[num_nonzeros - 1] < 1008;
+            ComputeSymbols(num_nonzeros, nonzero_idx, block, symbols);
+            WriteBlock(symbols, block, num_nonzeros, emit_eob, dc_code, ac_code,
+                       bw);
+          }
+        }
+      }
+    }
+  }
+  if (kMode == kStreamingModeTokens) {
+    TokenArray* ta = &m->token_arrays[m->cur_token_array];
+    ta->num_tokens = m->next_token - ta->tokens;
+    ScanTokenInfo* sti = &m->scan_token_info[0];
+    sti->num_tokens = m->total_num_tokens + ta->num_tokens;
+    sti->restarts[0] = sti->num_tokens;
+  }
+}
+
+void ComputeCoefficientsForiMCURow(j_compress_ptr cinfo) {
+  ProcessiMCURow<kStreamingModeCoefficients>(cinfo);
+}
+
+void ComputeTokensForiMCURow(j_compress_ptr cinfo) {
+  ProcessiMCURow<kStreamingModeTokens>(cinfo);
+}
+
+void WriteiMCURow(j_compress_ptr cinfo) {
+  ProcessiMCURow<kStreamingModeBits>(cinfo);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+HWY_EXPORT(ComputeCoefficientsForiMCURow);
+HWY_EXPORT(ComputeTokensForiMCURow);
+HWY_EXPORT(WriteiMCURow);
+
+void ComputeCoefficientsForiMCURow(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(ComputeCoefficientsForiMCURow)(cinfo);
+}
+
+void ComputeTokensForiMCURow(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(ComputeTokensForiMCURow)(cinfo);
+}
+
+void WriteiMCURow(j_compress_ptr cinfo) {
+  HWY_DYNAMIC_DISPATCH(WriteiMCURow)(cinfo);
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/encode_streaming.h b/lib/jpegli/encode_streaming.h
new file mode 100644 (file)
index 0000000..69acff4
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENCODE_STREAMING_H_
+#define LIB_JPEGLI_ENCODE_STREAMING_H_
+
+#include "lib/jpegli/encode_internal.h"
+
+namespace jpegli {
+
+void ComputeCoefficientsForiMCURow(j_compress_ptr cinfo);
+
+void ComputeTokensForiMCURow(j_compress_ptr cinfo);
+
+void WriteiMCURow(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ENCODE_STREAMING_H_
diff --git a/lib/jpegli/entropy_coding-inl.h b/lib/jpegli/entropy_coding-inl.h
new file mode 100644 (file)
index 0000000..bfb436d
--- /dev/null
@@ -0,0 +1,213 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JPEGLI_ENTROPY_CODING_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JPEGLI_ENTROPY_CODING_INL_H_
+#undef LIB_JPEGLI_ENTROPY_CODING_INL_H_
+#else
+#define LIB_JPEGLI_ENTROPY_CODING_INL_H_
+#endif
+
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::AndNot;
+using hwy::HWY_NAMESPACE::Compress;
+using hwy::HWY_NAMESPACE::CountTrue;
+using hwy::HWY_NAMESPACE::Eq;
+using hwy::HWY_NAMESPACE::GetLane;
+using hwy::HWY_NAMESPACE::MaskFromVec;
+using hwy::HWY_NAMESPACE::Max;
+using hwy::HWY_NAMESPACE::Not;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Shl;
+using hwy::HWY_NAMESPACE::Sub;
+
+using DI = HWY_FULL(int32_t);
+constexpr DI di;
+
+template <typename DI, class V>
+JXL_INLINE V NumBits(DI di, const V x) {
+  // TODO(szabadka) Add faster implementations for some specific architectures.
+  const auto b1 = And(x, Set(di, 1));
+  const auto b2 = And(x, Set(di, 2));
+  const auto b3 = Sub((And(x, Set(di, 4))), Set(di, 1));
+  const auto b4 = Sub((And(x, Set(di, 8))), Set(di, 4));
+  const auto b5 = Sub((And(x, Set(di, 16))), Set(di, 11));
+  const auto b6 = Sub((And(x, Set(di, 32))), Set(di, 26));
+  const auto b7 = Sub((And(x, Set(di, 64))), Set(di, 57));
+  const auto b8 = Sub((And(x, Set(di, 128))), Set(di, 120));
+  const auto b9 = Sub((And(x, Set(di, 256))), Set(di, 247));
+  const auto b10 = Sub((And(x, Set(di, 512))), Set(di, 502));
+  const auto b11 = Sub((And(x, Set(di, 1024))), Set(di, 1013));
+  const auto b12 = Sub((And(x, Set(di, 2048))), Set(di, 2036));
+  return Max(Max(Max(Max(b1, b2), Max(b3, b4)), Max(Max(b5, b6), Max(b7, b8))),
+             Max(Max(b9, b10), Max(b11, b12)));
+}
+
+// Coefficient indexes pre-multiplied by 16 for the symbol calculation.
+HWY_ALIGN constexpr int32_t kIndexes[64] = {
+    0,   16,  32,  48,  64,  80,  96,  112, 128, 144, 160, 176,  192,
+    208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384,  400,
+    416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592,  608,
+    624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800,  816,
+    832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008,
+};
+
+JXL_INLINE int CompactBlock(int32_t* JXL_RESTRICT block,
+                            int32_t* JXL_RESTRICT nonzero_idx) {
+  const auto zero = Zero(di);
+  HWY_ALIGN constexpr int32_t dc_mask_lanes[HWY_LANES(DI)] = {-1};
+  const auto dc_mask = MaskFromVec(Load(di, dc_mask_lanes));
+  int num_nonzeros = 0;
+  int k = 0;
+  {
+    const auto coef = Load(di, block);
+    const auto idx = Load(di, kIndexes);
+    const auto nonzero_mask = Or(dc_mask, Not(Eq(coef, zero)));
+    const auto nzero_coef = Compress(coef, nonzero_mask);
+    const auto nzero_idx = Compress(idx, nonzero_mask);
+    StoreU(nzero_coef, di, &block[num_nonzeros]);
+    StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
+    num_nonzeros += CountTrue(di, nonzero_mask);
+    k += Lanes(di);
+  }
+  for (; k < DCTSIZE2; k += Lanes(di)) {
+    const auto coef = Load(di, &block[k]);
+    const auto idx = Load(di, &kIndexes[k]);
+    const auto nonzero_mask = Not(Eq(coef, zero));
+    const auto nzero_coef = Compress(coef, nonzero_mask);
+    const auto nzero_idx = Compress(idx, nonzero_mask);
+    StoreU(nzero_coef, di, &block[num_nonzeros]);
+    StoreU(nzero_idx, di, &nonzero_idx[num_nonzeros]);
+    num_nonzeros += CountTrue(di, nonzero_mask);
+  }
+  return num_nonzeros;
+}
+
+JXL_INLINE void ComputeSymbols(const int num_nonzeros,
+                               int32_t* JXL_RESTRICT nonzero_idx,
+                               int32_t* JXL_RESTRICT block,
+                               int32_t* JXL_RESTRICT symbols) {
+  nonzero_idx[-1] = -16;
+  const auto one = Set(di, 1);
+  const auto offset = Set(di, 16);
+  for (int i = 0; i < num_nonzeros; i += Lanes(di)) {
+    const auto idx = Load(di, &nonzero_idx[i]);
+    const auto prev_idx = LoadU(di, &nonzero_idx[i - 1]);
+    const auto coeff = Load(di, &block[i]);
+    const auto nbits = NumBits(di, Abs(coeff));
+    const auto mask = ShiftRight<8 * sizeof(int32_t) - 1>(coeff);
+    const auto bits = And(Add(coeff, mask), Sub(Shl(one, nbits), one));
+    const auto symbol = Sub(Add(nbits, idx), Add(prev_idx, offset));
+    Store(symbol, di, symbols + i);
+    Store(bits, di, block + i);
+  }
+}
+
+template <typename T>
+int NumNonZero8x8ExceptDC(const T* block) {
+  const HWY_CAPPED(T, 8) di;
+
+  const auto zero = Zero(di);
+  // Add FFFF for every zero coefficient, negate to get #zeros.
+  auto neg_sum_zero = zero;
+  {
+    // First row has DC, so mask
+    const size_t y = 0;
+    HWY_ALIGN const T dc_mask_lanes[8] = {-1};
+
+    for (size_t x = 0; x < 8; x += Lanes(di)) {
+      const auto dc_mask = Load(di, dc_mask_lanes + x);
+
+      // DC counts as zero so we don't include it in nzeros.
+      const auto coef = AndNot(dc_mask, Load(di, &block[y * 8 + x]));
+
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+  // Remaining rows: no mask
+  for (size_t y = 1; y < 8; y++) {
+    for (size_t x = 0; x < 8; x += Lanes(di)) {
+      const auto coef = Load(di, &block[y * 8 + x]);
+      neg_sum_zero = Add(neg_sum_zero, VecFromMask(di, Eq(coef, zero)));
+    }
+  }
+
+  // We want 64 - sum_zero, add because neg_sum_zero is already negated.
+  return kDCTBlockSize + GetLane(SumOfLanes(di, neg_sum_zero));
+}
+
+template <typename T, bool zig_zag_order>
+void ComputeTokensForBlock(const T* block, int last_dc, int dc_ctx, int ac_ctx,
+                           Token** tokens_ptr) {
+  Token* next_token = *tokens_ptr;
+  coeff_t temp2;
+  coeff_t temp;
+  temp = block[0] - last_dc;
+  if (temp == 0) {
+    *next_token++ = Token(dc_ctx, 0, 0);
+  } else {
+    temp2 = temp;
+    if (temp < 0) {
+      temp = -temp;
+      temp2--;
+    }
+    int dc_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int dc_mask = (1 << dc_nbits) - 1;
+    *next_token++ = Token(dc_ctx, dc_nbits, temp2 & dc_mask);
+  }
+  int num_nonzeros = NumNonZero8x8ExceptDC(block);
+  for (int k = 1; k < 64; ++k) {
+    if (num_nonzeros == 0) {
+      *next_token++ = Token(ac_ctx, 0, 0);
+      break;
+    }
+    int r = 0;
+    if (zig_zag_order) {
+      while ((temp = block[k]) == 0) {
+        r++;
+        k++;
+      }
+    } else {
+      while ((temp = block[kJPEGNaturalOrder[k]]) == 0) {
+        r++;
+        k++;
+      }
+    }
+    --num_nonzeros;
+    if (temp < 0) {
+      temp = -temp;
+      temp2 = ~temp;
+    } else {
+      temp2 = temp;
+    }
+    while (r > 15) {
+      *next_token++ = Token(ac_ctx, 0xf0, 0);
+      r -= 16;
+    }
+    int ac_nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+    int ac_mask = (1 << ac_nbits) - 1;
+    int symbol = (r << 4u) + ac_nbits;
+    *next_token++ = Token(ac_ctx, symbol, temp2 & ac_mask);
+  }
+  *tokens_ptr = next_token;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JPEGLI_ENTROPY_CODING_INL_H_
diff --git a/lib/jpegli/entropy_coding.cc b/lib/jpegli/entropy_coding.cc
new file mode 100644 (file)
index 0000000..7e50bbc
--- /dev/null
@@ -0,0 +1,837 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/entropy_coding.h"
+
+#include <vector>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/huffman.h"
+#include "lib/jxl/base/bits.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/entropy_coding.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/entropy_coding-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+void ComputeTokensSequential(const coeff_t* block, int last_dc, int dc_ctx,
+                             int ac_ctx, Token** tokens_ptr) {
+  ComputeTokensForBlock<coeff_t, true>(block, last_dc, dc_ctx, ac_ctx,
+                                       tokens_ptr);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+size_t MaxNumTokensPerMCURow(j_compress_ptr cinfo) {
+  int MCUs_per_row = DivCeil(cinfo->image_width, 8 * cinfo->max_h_samp_factor);
+  size_t blocks_per_mcu = 0;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    blocks_per_mcu += comp->h_samp_factor * comp->v_samp_factor;
+  }
+  return kDCTBlockSize * blocks_per_mcu * MCUs_per_row;
+}
+
+size_t EstimateNumTokens(j_compress_ptr cinfo, size_t mcu_y, size_t ysize_mcus,
+                         size_t num_tokens, size_t max_per_row) {
+  size_t estimate;
+  if (mcu_y == 0) {
+    estimate = 16 * max_per_row;
+  } else {
+    estimate = (4 * ysize_mcus * num_tokens) / (3 * mcu_y);
+  }
+  size_t mcus_left = ysize_mcus - mcu_y;
+  return std::min(mcus_left * max_per_row,
+                  std::max(max_per_row, estimate - num_tokens));
+}
+
+namespace {
+HWY_EXPORT(ComputeTokensSequential);
+
+void TokenizeProgressiveDC(const coeff_t* coeffs, int context, int Al,
+                           coeff_t* last_dc_coeff, Token** next_token) {
+  coeff_t temp2;
+  coeff_t temp;
+  temp2 = coeffs[0] >> Al;
+  temp = temp2 - *last_dc_coeff;
+  *last_dc_coeff = temp2;
+  temp2 = temp;
+  if (temp < 0) {
+    temp = -temp;
+    temp2--;
+  }
+  int nbits = (temp == 0) ? 0 : (jxl::FloorLog2Nonzero<uint32_t>(temp) + 1);
+  int bits = temp2 & ((1 << nbits) - 1);
+  *(*next_token)++ = Token(context, nbits, bits);
+}
+
+void TokenizeACProgressiveScan(j_compress_ptr cinfo, int scan_index,
+                               int context, ScanTokenInfo* sti) {
+  jpeg_comp_master* m = cinfo->master;
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const int comp_idx = scan_info->component_index[0];
+  const jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+  const int Al = scan_info->Al;
+  const int Ss = scan_info->Ss;
+  const int Se = scan_info->Se;
+  const size_t restart_interval = sti->restart_interval;
+  int restarts_to_go = restart_interval;
+  size_t num_blocks = comp->height_in_blocks * comp->width_in_blocks;
+  size_t num_restarts =
+      restart_interval > 0 ? DivCeil(num_blocks, restart_interval) : 1;
+  size_t restart_idx = 0;
+  int eob_run = 0;
+  TokenArray* ta = &m->token_arrays[m->cur_token_array];
+  sti->token_offset = m->total_num_tokens + ta->num_tokens;
+  sti->restarts = Allocate<size_t>(cinfo, num_restarts, JPOOL_IMAGE);
+  for (JDIMENSION by = 0; by < comp->height_in_blocks; ++by) {
+    JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[comp_idx], by,
+        1, false);
+    // Each coefficient can appear in at most one token, but we have to reserve
+    // one extra EOBrun token that was rolled over from the previous block-row
+    // and has to be flushed at the end.
+    int max_tokens_per_row = 1 + comp->width_in_blocks * (Se - Ss + 1);
+    if (ta->num_tokens + max_tokens_per_row > m->num_tokens) {
+      if (ta->tokens) {
+        m->total_num_tokens += ta->num_tokens;
+        ++m->cur_token_array;
+        ta = &m->token_arrays[m->cur_token_array];
+      }
+      m->num_tokens =
+          EstimateNumTokens(cinfo, by, comp->height_in_blocks,
+                            m->total_num_tokens, max_tokens_per_row);
+      ta->tokens = Allocate<Token>(cinfo, m->num_tokens, JPOOL_IMAGE);
+      m->next_token = ta->tokens;
+    }
+    for (JDIMENSION bx = 0; bx < comp->width_in_blocks; ++bx) {
+      if (restart_interval > 0 && restarts_to_go == 0) {
+        if (eob_run > 0) {
+          int nbits = jxl::FloorLog2Nonzero<uint32_t>(eob_run);
+          int symbol = nbits << 4u;
+          *m->next_token++ =
+              Token(context, symbol, eob_run & ((1 << nbits) - 1));
+          eob_run = 0;
+        }
+        ta->num_tokens = m->next_token - ta->tokens;
+        sti->restarts[restart_idx++] = m->total_num_tokens + ta->num_tokens;
+        restarts_to_go = restart_interval;
+      }
+      const coeff_t* block = &ba[0][bx][0];
+      coeff_t temp2;
+      coeff_t temp;
+      int r = 0;
+      int num_nzeros = 0;
+      int num_future_nzeros = 0;
+      for (int k = Ss; k <= Se; ++k) {
+        if ((temp = block[k]) == 0) {
+          r++;
+          continue;
+        }
+        if (temp < 0) {
+          temp = -temp;
+          temp >>= Al;
+          temp2 = ~temp;
+        } else {
+          temp >>= Al;
+          temp2 = temp;
+        }
+        if (temp == 0) {
+          r++;
+          num_future_nzeros++;
+          continue;
+        }
+        if (eob_run > 0) {
+          int nbits = jxl::FloorLog2Nonzero<uint32_t>(eob_run);
+          int symbol = nbits << 4u;
+          *m->next_token++ =
+              Token(context, symbol, eob_run & ((1 << nbits) - 1));
+          eob_run = 0;
+        }
+        while (r > 15) {
+          *m->next_token++ = Token(context, 0xf0, 0);
+          r -= 16;
+        }
+        int nbits = jxl::FloorLog2Nonzero<uint32_t>(temp) + 1;
+        int symbol = (r << 4u) + nbits;
+        *m->next_token++ = Token(context, symbol, temp2 & ((1 << nbits) - 1));
+        ++num_nzeros;
+        r = 0;
+      }
+      if (r > 0) {
+        ++eob_run;
+        if (eob_run == 0x7FFF) {
+          int nbits = jxl::FloorLog2Nonzero<uint32_t>(eob_run);
+          int symbol = nbits << 4u;
+          *m->next_token++ =
+              Token(context, symbol, eob_run & ((1 << nbits) - 1));
+          eob_run = 0;
+        }
+      }
+      sti->num_nonzeros += num_nzeros;
+      sti->num_future_nonzeros += num_future_nzeros;
+      --restarts_to_go;
+    }
+    ta->num_tokens = m->next_token - ta->tokens;
+  }
+  if (eob_run > 0) {
+    int nbits = jxl::FloorLog2Nonzero<uint32_t>(eob_run);
+    int symbol = nbits << 4u;
+    *m->next_token++ = Token(context, symbol, eob_run & ((1 << nbits) - 1));
+    ++ta->num_tokens;
+    eob_run = 0;
+  }
+  sti->num_tokens = m->total_num_tokens + ta->num_tokens - sti->token_offset;
+  sti->restarts[restart_idx++] = m->total_num_tokens + ta->num_tokens;
+}
+
+void TokenizeACRefinementScan(j_compress_ptr cinfo, int scan_index,
+                              ScanTokenInfo* sti) {
+  jpeg_comp_master* m = cinfo->master;
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  const int comp_idx = scan_info->component_index[0];
+  const jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+  const int Al = scan_info->Al;
+  const int Ss = scan_info->Ss;
+  const int Se = scan_info->Se;
+  const size_t restart_interval = sti->restart_interval;
+  int restarts_to_go = restart_interval;
+  RefToken token;
+  int eob_run = 0;
+  int eob_refbits = 0;
+  size_t num_blocks = comp->height_in_blocks * comp->width_in_blocks;
+  size_t num_restarts =
+      restart_interval > 0 ? DivCeil(num_blocks, restart_interval) : 1;
+  sti->tokens = m->next_refinement_token;
+  sti->refbits = m->next_refinement_bit;
+  sti->eobruns = Allocate<uint16_t>(cinfo, num_blocks / 2, JPOOL_IMAGE);
+  sti->restarts = Allocate<size_t>(cinfo, num_restarts, JPOOL_IMAGE);
+  RefToken* next_token = sti->tokens;
+  RefToken* next_eob_token = next_token;
+  uint8_t* next_ref_bit = sti->refbits;
+  uint16_t* next_eobrun = sti->eobruns;
+  size_t restart_idx = 0;
+  for (JDIMENSION by = 0; by < comp->height_in_blocks; ++by) {
+    JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[comp_idx], by,
+        1, false);
+    for (JDIMENSION bx = 0; bx < comp->width_in_blocks; ++bx) {
+      if (restart_interval > 0 && restarts_to_go == 0) {
+        sti->restarts[restart_idx++] = next_token - sti->tokens;
+        restarts_to_go = restart_interval;
+        next_eob_token = next_token;
+        eob_run = eob_refbits = 0;
+      }
+      const coeff_t* block = &ba[0][bx][0];
+      int num_eob_refinement_bits = 0;
+      int num_refinement_bits = 0;
+      int num_nzeros = 0;
+      int r = 0;
+      for (int k = Ss; k <= Se; ++k) {
+        int absval = block[k];
+        if (absval == 0) {
+          r++;
+          continue;
+        }
+        const int mask = absval >> (8 * sizeof(int) - 1);
+        absval += mask;
+        absval ^= mask;
+        absval >>= Al;
+        if (absval == 0) {
+          r++;
+          continue;
+        }
+        while (r > 15) {
+          token.symbol = 0xf0;
+          token.refbits = num_refinement_bits;
+          *next_token++ = token;
+          r -= 16;
+          num_eob_refinement_bits += num_refinement_bits;
+          num_refinement_bits = 0;
+        }
+        if (absval > 1) {
+          *next_ref_bit++ = absval & 1u;
+          ++num_refinement_bits;
+          continue;
+        }
+        int symbol = (r << 4u) + 1 + ((mask + 1) << 1);
+        token.symbol = symbol;
+        token.refbits = num_refinement_bits;
+        *next_token++ = token;
+        ++num_nzeros;
+        num_refinement_bits = 0;
+        num_eob_refinement_bits = 0;
+        r = 0;
+        next_eob_token = next_token;
+        eob_run = eob_refbits = 0;
+      }
+      if (r > 0 || num_eob_refinement_bits + num_refinement_bits > 0) {
+        ++eob_run;
+        eob_refbits += num_eob_refinement_bits + num_refinement_bits;
+        if (eob_refbits > 255) {
+          ++next_eob_token;
+          eob_refbits = num_eob_refinement_bits + num_refinement_bits;
+          eob_run = 1;
+        }
+        next_token = next_eob_token;
+        next_token->refbits = eob_refbits;
+        if (eob_run == 1) {
+          next_token->symbol = 0;
+        } else if (eob_run == 2) {
+          next_token->symbol = 16;
+          *next_eobrun++ = 0;
+        } else if ((eob_run & (eob_run - 1)) == 0) {
+          next_token->symbol += 16;
+          next_eobrun[-1] = 0;
+        } else {
+          ++next_eobrun[-1];
+        }
+        ++next_token;
+        if (eob_run == 0x7fff) {
+          next_eob_token = next_token;
+          eob_run = eob_refbits = 0;
+        }
+      }
+      sti->num_nonzeros += num_nzeros;
+      --restarts_to_go;
+    }
+  }
+  sti->num_tokens = next_token - sti->tokens;
+  sti->restarts[restart_idx++] = sti->num_tokens;
+  m->next_refinement_token = next_token;
+  m->next_refinement_bit = next_ref_bit;
+}
+
+void TokenizeScan(j_compress_ptr cinfo, size_t scan_index, int ac_ctx_offset,
+                  ScanTokenInfo* sti) {
+  const jpeg_scan_info* scan_info = &cinfo->scan_info[scan_index];
+  if (scan_info->Ss > 0) {
+    if (scan_info->Ah == 0) {
+      TokenizeACProgressiveScan(cinfo, scan_index, ac_ctx_offset, sti);
+    } else {
+      TokenizeACRefinementScan(cinfo, scan_index, sti);
+    }
+    return;
+  }
+
+  jpeg_comp_master* m = cinfo->master;
+  size_t restart_interval = sti->restart_interval;
+  int restarts_to_go = restart_interval;
+  coeff_t last_dc_coeff[MAX_COMPS_IN_SCAN] = {0};
+
+  // "Non-interleaved" means color data comes in separate scans, in other words
+  // each scan can contain only one color component.
+  const bool is_interleaved = (scan_info->comps_in_scan > 1);
+  const bool is_progressive = cinfo->progressive_mode;
+  const int Ah = scan_info->Ah;
+  const int Al = scan_info->Al;
+  HWY_ALIGN constexpr coeff_t kSinkBlock[DCTSIZE2] = {0};
+
+  size_t restart_idx = 0;
+  TokenArray* ta = &m->token_arrays[m->cur_token_array];
+  sti->token_offset = Ah > 0 ? 0 : m->total_num_tokens + ta->num_tokens;
+
+  if (Ah > 0) {
+    sti->refbits = Allocate<uint8_t>(cinfo, sti->num_blocks, JPOOL_IMAGE);
+  } else if (cinfo->progressive_mode) {
+    if (ta->num_tokens + sti->num_blocks > m->num_tokens) {
+      if (ta->tokens) {
+        m->total_num_tokens += ta->num_tokens;
+        ++m->cur_token_array;
+        ta = &m->token_arrays[m->cur_token_array];
+      }
+      m->num_tokens = sti->num_blocks;
+      ta->tokens = Allocate<Token>(cinfo, m->num_tokens, JPOOL_IMAGE);
+      m->next_token = ta->tokens;
+    }
+  }
+
+  JBLOCKARRAY ba[MAX_COMPS_IN_SCAN];
+  size_t block_idx = 0;
+  for (size_t mcu_y = 0; mcu_y < sti->MCU_rows_in_scan; ++mcu_y) {
+    for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+      int comp_idx = scan_info->component_index[i];
+      jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+      int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+      int by0 = mcu_y * n_blocks_y;
+      int block_rows_left = comp->height_in_blocks - by0;
+      int max_block_rows = std::min(n_blocks_y, block_rows_left);
+      ba[i] = (*cinfo->mem->access_virt_barray)(
+          reinterpret_cast<j_common_ptr>(cinfo), m->coeff_buffers[comp_idx],
+          by0, max_block_rows, false);
+    }
+    if (!cinfo->progressive_mode) {
+      int max_tokens_per_mcu_row = MaxNumTokensPerMCURow(cinfo);
+      if (ta->num_tokens + max_tokens_per_mcu_row > m->num_tokens) {
+        if (ta->tokens) {
+          m->total_num_tokens += ta->num_tokens;
+          ++m->cur_token_array;
+          ta = &m->token_arrays[m->cur_token_array];
+        }
+        m->num_tokens =
+            EstimateNumTokens(cinfo, mcu_y, sti->MCU_rows_in_scan,
+                              m->total_num_tokens, max_tokens_per_mcu_row);
+        ta->tokens = Allocate<Token>(cinfo, m->num_tokens, JPOOL_IMAGE);
+        m->next_token = ta->tokens;
+      }
+    }
+    for (size_t mcu_x = 0; mcu_x < sti->MCUs_per_row; ++mcu_x) {
+      // Possibly emit a restart marker.
+      if (restart_interval > 0 && restarts_to_go == 0) {
+        restarts_to_go = restart_interval;
+        memset(last_dc_coeff, 0, sizeof(last_dc_coeff));
+        ta->num_tokens = m->next_token - ta->tokens;
+        sti->restarts[restart_idx++] =
+            Ah > 0 ? block_idx : m->total_num_tokens + ta->num_tokens;
+      }
+      // Encode one MCU
+      for (int i = 0; i < scan_info->comps_in_scan; ++i) {
+        int comp_idx = scan_info->component_index[i];
+        jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+        int n_blocks_y = is_interleaved ? comp->v_samp_factor : 1;
+        int n_blocks_x = is_interleaved ? comp->h_samp_factor : 1;
+        for (int iy = 0; iy < n_blocks_y; ++iy) {
+          for (int ix = 0; ix < n_blocks_x; ++ix) {
+            size_t block_y = mcu_y * n_blocks_y + iy;
+            size_t block_x = mcu_x * n_blocks_x + ix;
+            const coeff_t* block;
+            if (block_x >= comp->width_in_blocks ||
+                block_y >= comp->height_in_blocks) {
+              block = kSinkBlock;
+            } else {
+              block = &ba[i][iy][block_x][0];
+            }
+            if (!is_progressive) {
+              HWY_DYNAMIC_DISPATCH(ComputeTokensSequential)
+              (block, last_dc_coeff[i], comp_idx, ac_ctx_offset + i,
+               &m->next_token);
+              last_dc_coeff[i] = block[0];
+            } else {
+              if (Ah == 0) {
+                TokenizeProgressiveDC(block, comp_idx, Al, last_dc_coeff + i,
+                                      &m->next_token);
+              } else {
+                sti->refbits[block_idx] = (block[0] >> Al) & 1;
+              }
+            }
+            ++block_idx;
+          }
+        }
+      }
+      --restarts_to_go;
+    }
+    ta->num_tokens = m->next_token - ta->tokens;
+  }
+  JXL_DASSERT(block_idx == sti->num_blocks);
+  sti->num_tokens =
+      Ah > 0 ? sti->num_blocks
+             : m->total_num_tokens + ta->num_tokens - sti->token_offset;
+  sti->restarts[restart_idx++] =
+      Ah > 0 ? sti->num_blocks : m->total_num_tokens + ta->num_tokens;
+  if (Ah == 0 && cinfo->progressive_mode) {
+    JXL_DASSERT(sti->num_blocks == sti->num_tokens);
+  }
+}
+
+}  // namespace
+
+void TokenizeJpeg(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  std::vector<int> processed(cinfo->num_scans);
+  size_t max_refinement_tokens = 0;
+  size_t num_refinement_bits = 0;
+  int num_refinement_scans[DCTSIZE2] = {};
+  int max_num_refinement_scans = 0;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info* si = &cinfo->scan_info[i];
+    ScanTokenInfo* sti = &m->scan_token_info[i];
+    if (si->Ss > 0 && si->Ah == 0 && si->Al > 0) {
+      int offset = m->ac_ctx_offset[i];
+      TokenizeScan(cinfo, i, offset, sti);
+      processed[i] = 1;
+      max_refinement_tokens += sti->num_future_nonzeros;
+      for (int k = si->Ss; k <= si->Se; ++k) {
+        num_refinement_scans[k] = si->Al;
+      }
+      max_num_refinement_scans = std::max(max_num_refinement_scans, si->Al);
+      num_refinement_bits += sti->num_nonzeros;
+    }
+    if (si->Ss > 0 && si->Ah > 0) {
+      int comp_idx = si->component_index[0];
+      const jpeg_component_info* comp = &cinfo->comp_info[comp_idx];
+      size_t num_blocks = comp->width_in_blocks * comp->height_in_blocks;
+      max_refinement_tokens += (1 + (si->Se - si->Ss) / 16) * num_blocks;
+    }
+  }
+  if (max_refinement_tokens > 0) {
+    m->next_refinement_token =
+        Allocate<RefToken>(cinfo, max_refinement_tokens, JPOOL_IMAGE);
+  }
+  for (int j = 0; j < max_num_refinement_scans; ++j) {
+    uint8_t* refinement_bits =
+        Allocate<uint8_t>(cinfo, num_refinement_bits, JPOOL_IMAGE);
+    m->next_refinement_bit = refinement_bits;
+    size_t new_refinement_bits = 0;
+    for (int i = 0; i < cinfo->num_scans; ++i) {
+      const jpeg_scan_info* si = &cinfo->scan_info[i];
+      ScanTokenInfo* sti = &m->scan_token_info[i];
+      if (si->Ss > 0 && si->Ah > 0 &&
+          si->Ah == num_refinement_scans[si->Ss] - j) {
+        int offset = m->ac_ctx_offset[i];
+        TokenizeScan(cinfo, i, offset, sti);
+        processed[i] = 1;
+        new_refinement_bits += sti->num_nonzeros;
+      }
+    }
+    JXL_DASSERT(m->next_refinement_bit ==
+                refinement_bits + num_refinement_bits);
+    num_refinement_bits += new_refinement_bits;
+  }
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    if (processed[i]) {
+      continue;
+    }
+    int offset = m->ac_ctx_offset[i];
+    TokenizeScan(cinfo, i, offset, &m->scan_token_info[i]);
+    processed[i] = 1;
+  }
+}
+
+namespace {
+
+struct Histogram {
+  int count[kJpegHuffmanAlphabetSize];
+  Histogram() { memset(count, 0, sizeof(count)); }
+};
+
+void BuildHistograms(j_compress_ptr cinfo, Histogram* histograms) {
+  jpeg_comp_master* m = cinfo->master;
+  size_t num_token_arrays = m->cur_token_array + 1;
+  for (size_t i = 0; i < num_token_arrays; ++i) {
+    Token* tokens = m->token_arrays[i].tokens;
+    size_t num_tokens = m->token_arrays[i].num_tokens;
+    for (size_t j = 0; j < num_tokens; ++j) {
+      Token t = tokens[j];
+      ++histograms[t.context].count[t.symbol];
+    }
+  }
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info& si = cinfo->scan_info[i];
+    const ScanTokenInfo& sti = m->scan_token_info[i];
+    if (si.Ss > 0 && si.Ah > 0) {
+      int context = m->ac_ctx_offset[i];
+      int* ac_histo = &histograms[context].count[0];
+      for (size_t j = 0; j < sti.num_tokens; ++j) {
+        ++ac_histo[sti.tokens[j].symbol & 253];
+      }
+    }
+  }
+}
+
+struct JpegClusteredHistograms {
+  std::vector<Histogram> histograms;
+  std::vector<uint32_t> histogram_indexes;
+  std::vector<uint32_t> slot_ids;
+};
+
+float HistogramCost(const Histogram& histo) {
+  std::vector<uint32_t> counts(kJpegHuffmanAlphabetSize + 1);
+  std::vector<uint8_t> depths(kJpegHuffmanAlphabetSize + 1);
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    counts[i] = histo.count[i];
+  }
+  counts[kJpegHuffmanAlphabetSize] = 1;
+  CreateHuffmanTree(counts.data(), counts.size(), kJpegHuffmanMaxBitLength,
+                    &depths[0]);
+  size_t header_bits = (1 + kJpegHuffmanMaxBitLength) * 8;
+  size_t data_bits = 0;
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    if (depths[i] > 0) {
+      header_bits += 8;
+      data_bits += counts[i] * depths[i];
+    }
+  }
+  return header_bits + data_bits;
+}
+
+void AddHistograms(const Histogram& a, const Histogram& b, Histogram* c) {
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    c->count[i] = a.count[i] + b.count[i];
+  }
+}
+
+bool IsEmptyHistogram(const Histogram& histo) {
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    if (histo.count[i]) return false;
+  }
+  return true;
+}
+
+void ClusterJpegHistograms(const Histogram* histograms, size_t num,
+                           JpegClusteredHistograms* clusters) {
+  clusters->histogram_indexes.resize(num);
+  std::vector<uint32_t> slot_histograms;
+  std::vector<float> slot_costs;
+  for (size_t i = 0; i < num; ++i) {
+    const Histogram& cur = histograms[i];
+    if (IsEmptyHistogram(cur)) {
+      continue;
+    }
+    float best_cost = HistogramCost(cur);
+    size_t best_slot = slot_histograms.size();
+    for (size_t j = 0; j < slot_histograms.size(); ++j) {
+      size_t prev_idx = slot_histograms[j];
+      const Histogram& prev = clusters->histograms[prev_idx];
+      Histogram combined;
+      AddHistograms(prev, cur, &combined);
+      float combined_cost = HistogramCost(combined);
+      float cost = combined_cost - slot_costs[j];
+      if (cost < best_cost) {
+        best_cost = cost;
+        best_slot = j;
+      }
+    }
+    if (best_slot == slot_histograms.size()) {
+      // Create new histogram.
+      size_t histogram_index = clusters->histograms.size();
+      clusters->histograms.push_back(cur);
+      clusters->histogram_indexes[i] = histogram_index;
+      if (best_slot < 4) {
+        // We have a free slot, so we put the new histogram there.
+        slot_histograms.push_back(histogram_index);
+        slot_costs.push_back(best_cost);
+      } else {
+        // TODO(szabadka) Find the best histogram to replce.
+        best_slot = (clusters->slot_ids.back() + 1) % 4;
+      }
+      slot_histograms[best_slot] = histogram_index;
+      slot_costs[best_slot] = best_cost;
+      clusters->slot_ids.push_back(best_slot);
+    } else {
+      // Merge this histogram with a previous one.
+      size_t histogram_index = slot_histograms[best_slot];
+      const Histogram& prev = clusters->histograms[histogram_index];
+      AddHistograms(prev, cur, &clusters->histograms[histogram_index]);
+      clusters->histogram_indexes[i] = histogram_index;
+      JXL_ASSERT(clusters->slot_ids[histogram_index] == best_slot);
+      slot_costs[best_slot] += best_cost;
+    }
+  }
+}
+
+void CopyHuffmanTable(j_compress_ptr cinfo, int index, bool is_dc,
+                      int* inv_slot_map, uint8_t* slot_id_map,
+                      JHUFF_TBL* huffman_tables, size_t* num_huffman_tables) {
+  const char* type = is_dc ? "DC" : "AC";
+  if (index < 0 || index >= NUM_HUFF_TBLS) {
+    JPEGLI_ERROR("Invalid %s Huffman table index %d", type, index);
+  }
+  // Check if we have already copied this Huffman table.
+  int slot_idx = index + (is_dc ? 0 : NUM_HUFF_TBLS);
+  if (inv_slot_map[slot_idx] != -1) {
+    return;
+  }
+  inv_slot_map[slot_idx] = *num_huffman_tables;
+  // Look up and validate Huffman table.
+  JHUFF_TBL* table =
+      is_dc ? cinfo->dc_huff_tbl_ptrs[index] : cinfo->ac_huff_tbl_ptrs[index];
+  if (table == nullptr) {
+    JPEGLI_ERROR("Missing %s Huffman table %d", type, index);
+  }
+  ValidateHuffmanTable(reinterpret_cast<j_common_ptr>(cinfo), table, is_dc);
+  // Copy Huffman table to the end of the list and save slot id.
+  slot_id_map[*num_huffman_tables] = index + (is_dc ? 0 : 0x10);
+  memcpy(&huffman_tables[*num_huffman_tables], table, sizeof(JHUFF_TBL));
+  ++(*num_huffman_tables);
+}
+
+void BuildJpegHuffmanTable(const Histogram& histo, JHUFF_TBL* table) {
+  std::vector<uint32_t> counts(kJpegHuffmanAlphabetSize + 1);
+  std::vector<uint8_t> depths(kJpegHuffmanAlphabetSize + 1);
+  for (size_t j = 0; j < kJpegHuffmanAlphabetSize; ++j) {
+    counts[j] = histo.count[j];
+  }
+  counts[kJpegHuffmanAlphabetSize] = 1;
+  CreateHuffmanTree(counts.data(), counts.size(), kJpegHuffmanMaxBitLength,
+                    &depths[0]);
+  memset(table, 0, sizeof(JHUFF_TBL));
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    if (depths[i] > 0) {
+      ++table->bits[depths[i]];
+    }
+  }
+  int offset[kJpegHuffmanMaxBitLength + 1] = {0};
+  for (size_t i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+    offset[i] = offset[i - 1] + table->bits[i - 1];
+  }
+  for (size_t i = 0; i < kJpegHuffmanAlphabetSize; ++i) {
+    if (depths[i] > 0) {
+      table->huffval[offset[depths[i]]++] = i;
+    }
+  }
+}
+
+}  // namespace
+
+void CopyHuffmanTables(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  size_t max_huff_tables = 2 * cinfo->num_components;
+  // Copy Huffman tables and save slot ids.
+  m->huffman_tables = Allocate<JHUFF_TBL>(cinfo, max_huff_tables, JPOOL_IMAGE);
+  m->slot_id_map = Allocate<uint8_t>(cinfo, max_huff_tables, JPOOL_IMAGE);
+  m->num_huffman_tables = 0;
+  int inv_slot_map[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    CopyHuffmanTable(cinfo, comp->dc_tbl_no, /*is_dc=*/true, &inv_slot_map[0],
+                     m->slot_id_map, m->huffman_tables, &m->num_huffman_tables);
+    CopyHuffmanTable(cinfo, comp->ac_tbl_no, /*is_dc=*/false, &inv_slot_map[0],
+                     m->slot_id_map, m->huffman_tables, &m->num_huffman_tables);
+  }
+  // Compute context map.
+  m->context_map = Allocate<uint8_t>(cinfo, 8, JPOOL_IMAGE);
+  memset(m->context_map, 0, 8);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    m->context_map[c] = inv_slot_map[cinfo->comp_info[c].dc_tbl_no];
+  }
+  int ac_ctx = 4;
+  for (int i = 0; i < cinfo->num_scans; ++i) {
+    const jpeg_scan_info* si = &cinfo->scan_info[i];
+    if (si->Se > 0) {
+      for (int j = 0; j < si->comps_in_scan; ++j) {
+        int c = si->component_index[j];
+        jpeg_component_info* comp = &cinfo->comp_info[c];
+        m->context_map[ac_ctx++] = inv_slot_map[comp->ac_tbl_no + 4];
+      }
+    }
+  }
+}
+
+void OptimizeHuffmanCodes(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  // Build DC and AC histograms.
+  std::vector<Histogram> histograms(m->num_contexts);
+  BuildHistograms(cinfo, &histograms[0]);
+
+  // Cluster DC histograms.
+  JpegClusteredHistograms dc_clusters;
+  ClusterJpegHistograms(histograms.data(), cinfo->num_components, &dc_clusters);
+
+  // Cluster AC histograms.
+  JpegClusteredHistograms ac_clusters;
+  ClusterJpegHistograms(histograms.data() + 4, m->num_contexts - 4,
+                        &ac_clusters);
+
+  // Create Huffman tables and slot ids clusters.
+  size_t num_dc_huff = dc_clusters.histograms.size();
+  m->num_huffman_tables = num_dc_huff + ac_clusters.histograms.size();
+  m->huffman_tables =
+      Allocate<JHUFF_TBL>(cinfo, m->num_huffman_tables, JPOOL_IMAGE);
+  m->slot_id_map = Allocate<uint8_t>(cinfo, m->num_huffman_tables, JPOOL_IMAGE);
+  for (size_t i = 0; i < m->num_huffman_tables; ++i) {
+    JHUFF_TBL huff_table = {};
+    if (i < dc_clusters.histograms.size()) {
+      m->slot_id_map[i] = i;
+      BuildJpegHuffmanTable(dc_clusters.histograms[i], &huff_table);
+    } else {
+      m->slot_id_map[i] = 16 + ac_clusters.slot_ids[i - num_dc_huff];
+      BuildJpegHuffmanTable(ac_clusters.histograms[i - num_dc_huff],
+                            &huff_table);
+    }
+    memcpy(&m->huffman_tables[i], &huff_table, sizeof(huff_table));
+  }
+
+  // Create context map from clustered histogram indexes.
+  m->context_map = Allocate<uint8_t>(cinfo, m->num_contexts, JPOOL_IMAGE);
+  memset(m->context_map, 0, m->num_contexts);
+  for (size_t i = 0; i < m->num_contexts; ++i) {
+    if (i < (size_t)cinfo->num_components) {
+      m->context_map[i] = dc_clusters.histogram_indexes[i];
+    } else if (i >= 4) {
+      m->context_map[i] = num_dc_huff + ac_clusters.histogram_indexes[i - 4];
+    }
+  }
+}
+
+namespace {
+
+constexpr uint8_t kNumExtraBits[256] = {
+    0,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    1,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    2,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    3,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    4,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    5,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    6,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    7,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    8,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    9,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    11, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    13, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+    0,  1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
+};
+
+void BuildHuffmanCodeTable(const JHUFF_TBL& table, HuffmanCodeTable* code) {
+  int huff_code[kJpegHuffmanAlphabetSize];
+  // +1 for a sentinel element.
+  uint32_t huff_size[kJpegHuffmanAlphabetSize + 1];
+  int p = 0;
+  for (size_t l = 1; l <= kJpegHuffmanMaxBitLength; ++l) {
+    int i = table.bits[l];
+    while (i--) huff_size[p++] = l;
+  }
+
+  // Reuse sentinel element.
+  int last_p = p;
+  huff_size[last_p] = 0;
+
+  int next_code = 0;
+  uint32_t si = huff_size[0];
+  p = 0;
+  while (huff_size[p]) {
+    while ((huff_size[p]) == si) {
+      huff_code[p++] = next_code;
+      next_code++;
+    }
+    next_code <<= 1;
+    si++;
+  }
+  for (p = 0; p < last_p; p++) {
+    int i = table.huffval[p];
+    int nbits = kNumExtraBits[i];
+    code->depth[i] = huff_size[p] + nbits;
+    code->code[i] = huff_code[p] << nbits;
+  }
+}
+
+}  // namespace
+
+void InitEntropyCoder(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  m->coding_tables =
+      Allocate<HuffmanCodeTable>(cinfo, m->num_huffman_tables, JPOOL_IMAGE);
+  for (size_t i = 0; i < m->num_huffman_tables; ++i) {
+    BuildHuffmanCodeTable(m->huffman_tables[i], &m->coding_tables[i]);
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/entropy_coding.h b/lib/jpegli/entropy_coding.h
new file mode 100644 (file)
index 0000000..a552219
--- /dev/null
@@ -0,0 +1,28 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ENTROPY_CODING_H_
+#define LIB_JPEGLI_ENTROPY_CODING_H_
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+size_t MaxNumTokensPerMCURow(j_compress_ptr cinfo);
+
+size_t EstimateNumTokens(j_compress_ptr cinfo, size_t mcu_y, size_t ysize_mcus,
+                         size_t num_tokens, size_t max_per_row);
+
+void TokenizeJpeg(j_compress_ptr cinfo);
+
+void CopyHuffmanTables(j_compress_ptr cinfo);
+
+void OptimizeHuffmanCodes(j_compress_ptr cinfo);
+
+void InitEntropyCoder(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_ENTROPY_CODING_H_
diff --git a/lib/jpegli/error.cc b/lib/jpegli/error.cc
new file mode 100644 (file)
index 0000000..2892616
--- /dev/null
@@ -0,0 +1,102 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/error.h"
+
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+const char* const kErrorMessageTable[] = {
+    "Message codes are not supported, error message is in msg_parm.s string",
+};
+
+bool FormatString(char* buffer, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  vsnprintf(buffer, JMSG_STR_PARM_MAX, format, args);
+  va_end(args);
+  return false;
+}
+
+void ExitWithAbort(j_common_ptr cinfo) {
+  (*cinfo->err->output_message)(cinfo);
+  jpegli_destroy(cinfo);
+  exit(EXIT_FAILURE);
+}
+
+void EmitMessage(j_common_ptr cinfo, int msg_level) {
+  if (msg_level < 0) {
+    if (cinfo->err->num_warnings <= 5 || cinfo->err->trace_level >= 3) {
+      (*cinfo->err->output_message)(cinfo);
+    }
+    ++cinfo->err->num_warnings;
+  } else if (cinfo->err->trace_level >= msg_level) {
+    (*cinfo->err->output_message)(cinfo);
+  }
+}
+
+void OutputMessage(j_common_ptr cinfo) {
+  char buffer[JMSG_LENGTH_MAX];
+  (*cinfo->err->format_message)(cinfo, buffer);
+  fprintf(stderr, "%s\n", buffer);
+}
+
+void FormatMessage(j_common_ptr cinfo, char* buffer) {
+  jpeg_error_mgr* err = cinfo->err;
+  int code = err->msg_code;
+  if (code == 0) {
+    memcpy(buffer, cinfo->err->msg_parm.s, JMSG_STR_PARM_MAX);
+  } else if (err->addon_message_table != nullptr &&
+             code >= err->first_addon_message &&
+             code <= err->last_addon_message) {
+    std::string msg(err->addon_message_table[code - err->first_addon_message]);
+    if (msg.find("%s") != std::string::npos) {
+      snprintf(buffer, JMSG_LENGTH_MAX, msg.data(), err->msg_parm.s);
+    } else {
+      snprintf(buffer, JMSG_LENGTH_MAX, msg.data(), err->msg_parm.i[0],
+               err->msg_parm.i[1], err->msg_parm.i[2], err->msg_parm.i[3],
+               err->msg_parm.i[4], err->msg_parm.i[5], err->msg_parm.i[6],
+               err->msg_parm.i[7]);
+    }
+  } else {
+    snprintf(buffer, JMSG_LENGTH_MAX, "%s", kErrorMessageTable[0]);
+  }
+}
+
+void ResetErrorManager(j_common_ptr cinfo) {
+  memset(cinfo->err->msg_parm.s, 0, JMSG_STR_PARM_MAX);
+  cinfo->err->msg_code = 0;
+  cinfo->err->num_warnings = 0;
+}
+
+}  // namespace jpegli
+
+struct jpeg_error_mgr* jpegli_std_error(struct jpeg_error_mgr* err) {
+  err->error_exit = jpegli::ExitWithAbort;
+  err->emit_message = jpegli::EmitMessage;
+  err->output_message = jpegli::OutputMessage;
+  err->format_message = jpegli::FormatMessage;
+  err->reset_error_mgr = jpegli::ResetErrorManager;
+  memset(err->msg_parm.s, 0, JMSG_STR_PARM_MAX);
+  err->trace_level = 0;
+  err->num_warnings = 0;
+  // We don't support message codes and message table, but we define one here
+  // in case the application has a custom format_message and tries to access
+  // these fields there.
+  err->msg_code = 0;
+  err->jpeg_message_table = jpegli::kErrorMessageTable;
+  err->last_jpeg_message = 0;
+  err->addon_message_table = nullptr;
+  err->first_addon_message = 0;
+  err->last_addon_message = 0;
+  return err;
+}
diff --git a/lib/jpegli/error.h b/lib/jpegli/error.h
new file mode 100644 (file)
index 0000000..4451abd
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_ERROR_H_
+#define LIB_JPEGLI_ERROR_H_
+
+#include <stdarg.h>
+#include <stdint.h>
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+bool FormatString(char* buffer, const char* format, ...);
+
+}  // namespace jpegli
+
+#define JPEGLI_ERROR(format, ...)                                            \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->error_exit)(reinterpret_cast<j_common_ptr>(cinfo))
+
+#define JPEGLI_WARN(format, ...)                                             \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->emit_message)(reinterpret_cast<j_common_ptr>(cinfo), -1)
+
+#define JPEGLI_TRACE(level, format, ...)                                     \
+  if (cinfo->err->trace_level >= (level))                                    \
+  jpegli::FormatString(cinfo->err->msg_parm.s, ("%s:%d: " format), __FILE__, \
+                       __LINE__, ##__VA_ARGS__),                             \
+      (*cinfo->err->emit_message)(reinterpret_cast<j_common_ptr>(cinfo),     \
+                                  (level))
+
+#endif  // LIB_JPEGLI_ERROR_H_
diff --git a/lib/jpegli/error_handling_test.cc b/lib/jpegli/error_handling_test.cc
new file mode 100644 (file)
index 0000000..0d481c5
--- /dev/null
@@ -0,0 +1,1276 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+TEST(EncoderErrorHandlingTest, MinimalSuccess) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  {
+    jpeg_compress_struct cinfo;
+    const auto try_catch_block = [&]() -> bool {
+      ERROR_HANDLER_SETUP(jpegli);
+      jpegli_create_compress(&cinfo);
+      jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+      cinfo.image_width = 1;
+      cinfo.image_height = 1;
+      cinfo.input_components = 1;
+      jpegli_set_defaults(&cinfo);
+      jpegli_start_compress(&cinfo, TRUE);
+      JSAMPLE image[1] = {0};
+      JSAMPROW row[] = {image};
+      jpegli_write_scanlines(&cinfo, row, 1);
+      jpegli_finish_compress(&cinfo);
+      return true;
+    };
+    EXPECT_TRUE(try_catch_block());
+    jpegli_destroy_compress(&cinfo);
+  }
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), nullptr, 0, buffer,
+                    buffer_size, &output);
+  EXPECT_EQ(1, output.xsize);
+  EXPECT_EQ(1, output.ysize);
+  EXPECT_EQ(1, output.components);
+  EXPECT_EQ(0, output.pixels[0]);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoDestination) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, NoImageDimensions) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, ImageTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 100000;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoInputComponents) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, TooManyInputComponents) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1000;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoSetDefaults) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoStartCompress) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoWriteScanlines) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NoWriteAllScanlines) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 2;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidQuantValue) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.quant_tbl_ptrs[0] = jpegli_alloc_quant_table((j_common_ptr)&cinfo);
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      cinfo.quant_tbl_ptrs[0]->quantval[k] = 0;
+    }
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidQuantTableIndex) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].quant_tbl_no = 3;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch1) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 100;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch2) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch3) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;
+    cinfo.comp_info[1].h_samp_factor = cinfo.comp_info[1].v_samp_factor = 1;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch4) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[1] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch5) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_GRAYSCALE;
+    jpegli_set_defaults(&cinfo);
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NumberOfComponentsMismatch6) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    cinfo.num_components = 2;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidColorTransform) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    cinfo.in_color_space = JCS_YCbCr;
+    jpegli_set_defaults(&cinfo);
+    cinfo.jpeg_color_space = JCS_RGB;
+    jpegli_start_compress(&cinfo, TRUE);
+    JSAMPLE image[3] = {0};
+    JSAMPROW row[] = {image};
+    jpegli_write_scanlines(&cinfo, row, 1);
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, DuplicateComponentIds) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].component_id = 0;
+    cinfo.comp_info[1].component_id = 0;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidComponentIndex) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].component_index = 17;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, ArithmeticCoding) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.arith_code = TRUE;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, CCIR601Sampling) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.CCIR601_sampling = TRUE;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript1) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = 0;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript2) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {0, 1}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript3) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{5, {0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript4) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {0, 0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript5) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{2, {1, 0}, 0, 63, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript6) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 0, 64, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript7) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {{1, {0}, 2, 1, 0, 0}};  //
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript8) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 63, 0, 0}, {1, {1}, 0, 0, 0, 0}, {1, {1}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript9) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 1, 0, 0}, {1, {0}, 2, 63, 0, 0},  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript10) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 2;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {2, {0, 1}, 0, 0, 0, 0}, {2, {0, 1}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript11) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 1, 63, 0, 0}, {1, {0}, 0, 0, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript12) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 0, 10, 1}, {1, {0}, 0, 0, 1, 0}, {1, {0}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, InvalidScanScript13) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    static constexpr jpeg_scan_info kScript[] = {
+        {1, {0}, 0, 0, 0, 2},
+        {1, {0}, 0, 0, 1, 0},
+        {1, {0}, 0, 0, 2, 1},  //
+        {1, {0}, 1, 63, 0, 0}  //
+    };
+    cinfo.scan_info = kScript;
+    cinfo.num_scans = ARRAY_SIZE(kScript);
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, MCUSizeTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.comp_info[0].h_samp_factor = 3;
+    cinfo.comp_info[0].v_samp_factor = 3;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, RestartIntervalTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 1;
+    jpegli_set_defaults(&cinfo);
+    cinfo.restart_interval = 1000000;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, SamplingFactorTooBig) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].h_samp_factor = 5;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+TEST(EncoderErrorHandlingTest, NonIntegralSamplingRatio) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = 1;
+    cinfo.image_height = 1;
+    cinfo.input_components = 3;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].h_samp_factor = 3;
+    cinfo.comp_info[1].h_samp_factor = 2;
+    jpegli_start_compress(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  if (buffer) free(buffer);
+}
+
+constexpr const char* kAddOnTable[] = {"First message",
+                                       "Second message with int param %d",
+                                       "Third message with string param %s"};
+
+TEST(EncoderErrorHandlingTest, AddOnTableNoParam) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;
+    cinfo.err->msg_code = 10000;
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, AddOnTableIntParam) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;
+    cinfo.err->msg_code = 10001;
+    cinfo.err->msg_parm.i[0] = 17;
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+TEST(EncoderErrorHandlingTest, AddOnTableNoStringParam) {
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.err->addon_message_table = kAddOnTable;
+    cinfo.err->first_addon_message = 10000;
+    cinfo.err->last_addon_message = 10002;
+    cinfo.err->msg_code = 10002;
+    memcpy(cinfo.err->msg_parm.s, "MESSAGE PARAM", 14);
+    (*cinfo.err->error_exit)(reinterpret_cast<j_common_ptr>(&cinfo));
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+}
+
+static const uint8_t kCompressed0[] = {
+    // SOI
+    0xff, 0xd8,  //
+    // DQT
+    0xff, 0xdb, 0x00, 0x43, 0x00, 0x03, 0x02, 0x02, 0x03, 0x02,  //
+    0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x03, 0x03, 0x04, 0x05,  //
+    0x08, 0x05, 0x05, 0x04, 0x04, 0x05, 0x0a, 0x07, 0x07, 0x06,  //
+    0x08, 0x0c, 0x0a, 0x0c, 0x0c, 0x0b, 0x0a, 0x0b, 0x0b, 0x0d,  //
+    0x0e, 0x12, 0x10, 0x0d, 0x0e, 0x11, 0x0e, 0x0b, 0x0b, 0x10,  //
+    0x16, 0x10, 0x11, 0x13, 0x14, 0x15, 0x15, 0x15, 0x0c, 0x0f,  //
+    0x17, 0x18, 0x16, 0x14, 0x18, 0x12, 0x14, 0x15, 0x14,        //
+    // SOF
+    0xff, 0xc0, 0x00, 0x0b, 0x08, 0x00, 0x01, 0x00, 0x01, 0x01,  //
+    0x01, 0x11, 0x00,                                            //
+    // DHT
+    0xff, 0xc4, 0x00, 0xd2, 0x00, 0x00, 0x01, 0x05, 0x01, 0x01,  //
+    0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  //
+    0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,  //
+    0x09, 0x0a, 0x0b, 0x10, 0x00, 0x02, 0x01, 0x03, 0x03, 0x02,  //
+    0x04, 0x03, 0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7d,  //
+    0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31,  //
+    0x41, 0x06, 0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32,  //
+    0x81, 0x91, 0xa1, 0x08, 0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52,  //
+    0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,  //
+    0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a,  //
+    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,  //
+    0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57,  //
+    0x58, 0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,  //
+    0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x83,  //
+    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x92, 0x93, 0x94,  //
+    0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,  //
+    0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,  //
+    0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,  //
+    0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8,  //
+    0xd9, 0xda, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8,  //
+    0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,  //
+    0xf9, 0xfa,                                                  //
+    // SOS
+    0xff, 0xda, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3f, 0x00,  //
+    // entropy coded data
+    0xfc, 0xaa, 0xaf,  //
+    // EOI
+    0xff, 0xd9,  //
+};
+static const size_t kLen0 = sizeof(kCompressed0);
+
+static const size_t kDQTOffset = 2;
+static const size_t kSOFOffset = 71;
+static const size_t kDHTOffset = 84;
+static const size_t kSOSOffset = 296;
+
+TEST(DecoderErrorHandlingTest, MinimalSuccess) {
+  JXL_CHECK(kCompressed0[kDQTOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kSOFOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kDHTOffset] == 0xff);
+  JXL_CHECK(kCompressed0[kSOSOffset] == 0xff);
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    jpegli_start_decompress(&cinfo);
+    JSAMPLE image[1];
+    JSAMPROW row[] = {image};
+    jpegli_read_scanlines(&cinfo, row, 1);
+    EXPECT_EQ(0, image[0]);
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoSource) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_read_header(&cinfo, TRUE);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoReadHeader) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_start_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoStartDecompress) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    JSAMPLE image[1];
+    JSAMPROW row[] = {image};
+    jpegli_read_scanlines(&cinfo, row, 1);
+    EXPECT_EQ(0, image[0]);
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+TEST(DecoderErrorHandlingTest, NoReadScanlines) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, kCompressed0, kLen0);
+    jpegli_read_header(&cinfo, TRUE);
+    EXPECT_EQ(1, cinfo.image_width);
+    EXPECT_EQ(1, cinfo.image_height);
+    jpegli_start_decompress(&cinfo);
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  EXPECT_FALSE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+}
+
+static const size_t kMaxImageWidth = 0xffff;
+JSAMPLE kOutputBuffer[MAX_COMPONENTS * kMaxImageWidth];
+
+bool ParseCompressed(const std::vector<uint8_t>& compressed) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, compressed.data(), compressed.size());
+    jpegli_read_header(&cinfo, TRUE);
+    jpegli_start_decompress(&cinfo);
+    for (JDIMENSION i = 0; i < cinfo.output_height; ++i) {
+      JSAMPROW row[] = {kOutputBuffer};
+      jpegli_read_scanlines(&cinfo, row, 1);
+    }
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  bool retval = try_catch_block();
+  jpegli_destroy_decompress(&cinfo);
+  return retval;
+}
+
+TEST(DecoderErrorHandlingTest, NoSOI) {
+  for (int pos : {0, 1}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[pos] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidDQT) {
+  // Bad marker length
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // inavlid table index / precision
+  for (int val : {0x20, 0x05}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // zero quant value
+  for (int k : {0, 1, 17, 63}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDQTOffset + 5 + k] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidSOF) {
+  // Bad marker length
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // zero width, height or num_components
+  for (int pos : {6, 8, 9}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + pos] = 0;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid data precision
+  for (int val : {0, 1, 127}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // too many num_components
+  for (int val : {5, 255}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 9] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid sampling factors
+  for (int val : {0x00, 0x01, 0x10, 0x15, 0x51}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 11] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid quant table index
+  for (int val : {5, 17}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOFOffset + 12] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidDHT) {
+  // Bad marker length
+  for (int diff : {-2, -1, 1, 2}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 3] += diff;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 2] += 17;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // inavlid table slot_id
+  for (int val : {0x05, 0x15, 0x20}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kDHTOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, InvalidSOS) {
+  // Invalid comps_in_scan
+  for (int val : {2, 5, 17}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + 4] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid Huffman table indexes
+  for (int val : {0x05, 0x50, 0x15, 0x51}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + 6] = val;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+  // invalid Ss/Se
+  for (int pos : {7, 8}) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    compressed[kSOSOffset + pos] = 64;
+    EXPECT_FALSE(ParseCompressed(compressed));
+  }
+}
+
+TEST(DecoderErrorHandlingTest, MutateSingleBytes) {
+  for (size_t pos = 0; pos < kLen0; ++pos) {
+    std::vector<uint8_t> compressed(kCompressed0, kCompressed0 + kLen0);
+    for (int val : {0x00, 0x0f, 0xf0, 0xff}) {
+      compressed[pos] = val;
+      ParseCompressed(compressed);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/huffman.cc b/lib/jpegli/huffman.cc
new file mode 100644 (file)
index 0000000..1cf88a5
--- /dev/null
@@ -0,0 +1,321 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/huffman.h"
+
+#include <limits>
+#include <vector>
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/error.h"
+
+namespace jpegli {
+
+// Returns the table width of the next 2nd level table, count is the histogram
+// of bit lengths for the remaining symbols, len is the code length of the next
+// processed symbol.
+static inline int NextTableBitSize(const int* count, int len) {
+  int left = 1 << (len - kJpegHuffmanRootTableBits);
+  while (len < static_cast<int>(kJpegHuffmanMaxBitLength)) {
+    left -= count[len];
+    if (left <= 0) break;
+    ++len;
+    left <<= 1;
+  }
+  return len - kJpegHuffmanRootTableBits;
+}
+
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut) {
+  HuffmanTableEntry code;    // current table entry
+  HuffmanTableEntry* table;  // next available space in table
+  int len;                   // current code length
+  int idx;                   // symbol index
+  int key;                   // prefix code
+  int reps;                  // number of replicate key values in current table
+  int low;                   // low bits for current root entry
+  int table_bits;            // key length of current table
+  int table_size;            // size of current table
+
+  // Make a local copy of the input bit length histogram.
+  int tmp_count[kJpegHuffmanMaxBitLength + 1] = {0};
+  int total_count = 0;
+  for (len = 1; len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    tmp_count[len] = count[len];
+    total_count += tmp_count[len];
+  }
+
+  table = lut;
+  table_bits = kJpegHuffmanRootTableBits;
+  table_size = 1 << table_bits;
+
+  // Special case code with only one value.
+  if (total_count == 1) {
+    code.bits = 0;
+    code.value = symbols[0];
+    for (key = 0; key < table_size; ++key) {
+      table[key] = code;
+    }
+    return;
+  }
+
+  // Fill in root table.
+  key = 0;
+  idx = 0;
+  for (len = 1; len <= kJpegHuffmanRootTableBits; ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      code.bits = len;
+      code.value = symbols[idx++];
+      reps = 1 << (kJpegHuffmanRootTableBits - len);
+      while (reps--) {
+        table[key++] = code;
+      }
+    }
+  }
+
+  // Fill in 2nd level tables and add pointers to root table.
+  table += table_size;
+  table_size = 0;
+  low = 0;
+  for (len = kJpegHuffmanRootTableBits + 1;
+       len <= static_cast<int>(kJpegHuffmanMaxBitLength); ++len) {
+    for (; tmp_count[len] > 0; --tmp_count[len]) {
+      // Start a new sub-table if the previous one is full.
+      if (low >= table_size) {
+        table += table_size;
+        table_bits = NextTableBitSize(tmp_count, len);
+        table_size = 1 << table_bits;
+        low = 0;
+        lut[key].bits = table_bits + kJpegHuffmanRootTableBits;
+        lut[key].value = (table - lut) - key;
+        ++key;
+      }
+      code.bits = len - kJpegHuffmanRootTableBits;
+      code.value = symbols[idx++];
+      reps = 1 << (table_bits - code.bits);
+      while (reps--) {
+        table[low++] = code;
+      }
+    }
+  }
+}
+
+// A node of a Huffman tree.
+struct HuffmanTree {
+  HuffmanTree(uint32_t count, int16_t left, int16_t right)
+      : total_count(count), index_left(left), index_right_or_value(right) {}
+  uint32_t total_count;
+  int16_t index_left;
+  int16_t index_right_or_value;
+};
+
+void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,
+              uint8_t level) {
+  if (p.index_left >= 0) {
+    ++level;
+    SetDepth(pool[p.index_left], pool, depth, level);
+    SetDepth(pool[p.index_right_or_value], pool, depth, level);
+  } else {
+    depth[p.index_right_or_value] = level;
+  }
+}
+
+// Sort the root nodes, least popular first.
+static JXL_INLINE bool Compare(const HuffmanTree& v0, const HuffmanTree& v1) {
+  return v0.total_count < v1.total_count;
+}
+
+// This function will create a Huffman tree.
+//
+// The catch here is that the tree cannot be arbitrarily deep.
+// Brotli specifies a maximum depth of 15 bits for "code trees"
+// and 7 bits for "code length code trees."
+//
+// count_limit is the value that is to be faked as the minimum value
+// and this minimum value is raised until the tree matches the
+// maximum length requirement.
+//
+// This algorithm is not of excellent performance for very long data blocks,
+// especially when population counts are longer than 2**tree_limit, but
+// we are not planning to use this with extremely long blocks.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, const size_t length,
+                       const int tree_limit, uint8_t* depth) {
+  // For block sizes below 64 kB, we never need to do a second iteration
+  // of this loop. Probably all of our block sizes will be smaller than
+  // that, so this loop is mostly of academic interest. If we actually
+  // would need this, we would be better off with the Katajainen algorithm.
+  for (uint32_t count_limit = 1;; count_limit *= 2) {
+    std::vector<HuffmanTree> tree;
+    tree.reserve(2 * length + 1);
+
+    for (size_t i = length; i != 0;) {
+      --i;
+      if (data[i]) {
+        const uint32_t count = std::max(data[i], count_limit - 1);
+        tree.emplace_back(count, -1, static_cast<int16_t>(i));
+      }
+    }
+
+    const size_t n = tree.size();
+    if (n == 1) {
+      // Fake value; will be fixed on upper level.
+      depth[tree[0].index_right_or_value] = 1;
+      break;
+    }
+
+    std::stable_sort(tree.begin(), tree.end(), Compare);
+
+    // The nodes are:
+    // [0, n): the sorted leaf nodes that we start with.
+    // [n]: we add a sentinel here.
+    // [n + 1, 2n): new parent nodes are added here, starting from
+    //              (n+1). These are naturally in ascending order.
+    // [2n]: we add a sentinel at the end as well.
+    // There will be (2n+1) elements at the end.
+    const HuffmanTree sentinel(std::numeric_limits<uint32_t>::max(), -1, -1);
+    tree.push_back(sentinel);
+    tree.push_back(sentinel);
+
+    size_t i = 0;      // Points to the next leaf node.
+    size_t j = n + 1;  // Points to the next non-leaf node.
+    for (size_t k = n - 1; k != 0; --k) {
+      size_t left, right;
+      if (tree[i].total_count <= tree[j].total_count) {
+        left = i;
+        ++i;
+      } else {
+        left = j;
+        ++j;
+      }
+      if (tree[i].total_count <= tree[j].total_count) {
+        right = i;
+        ++i;
+      } else {
+        right = j;
+        ++j;
+      }
+
+      // The sentinel node becomes the parent node.
+      size_t j_end = tree.size() - 1;
+      tree[j_end].total_count =
+          tree[left].total_count + tree[right].total_count;
+      tree[j_end].index_left = static_cast<int16_t>(left);
+      tree[j_end].index_right_or_value = static_cast<int16_t>(right);
+
+      // Add back the last sentinel node.
+      tree.push_back(sentinel);
+    }
+    JXL_DASSERT(tree.size() == 2 * n + 1);
+    SetDepth(tree[2 * n - 1], &tree[0], depth, 0);
+
+    // We need to pack the Huffman tree in tree_limit bits.
+    // If this was not successful, add fake entities to the lowest values
+    // and retry.
+    if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) {
+      break;
+    }
+  }
+}
+
+void ValidateHuffmanTable(j_common_ptr cinfo, const JHUFF_TBL* table,
+                          bool is_dc) {
+  size_t total_symbols = 0;
+  size_t total_p = 0;
+  size_t max_depth = 0;
+  for (size_t d = 1; d <= kJpegHuffmanMaxBitLength; ++d) {
+    uint8_t count = table->bits[d];
+    if (count) {
+      total_symbols += count;
+      total_p += (1u << (kJpegHuffmanMaxBitLength - d)) * count;
+      max_depth = d;
+    }
+  }
+  total_p += 1u << (kJpegHuffmanMaxBitLength - max_depth);  // sentinel symbol
+  if (total_symbols == 0) {
+    JPEGLI_ERROR("Empty Huffman table");
+  }
+  if (total_symbols > kJpegHuffmanAlphabetSize) {
+    JPEGLI_ERROR("Too many symbols in Huffman table");
+  }
+  if (total_p != (1u << kJpegHuffmanMaxBitLength)) {
+    JPEGLI_ERROR("Invalid bit length distribution");
+  }
+  uint8_t symbol_seen[kJpegHuffmanAlphabetSize] = {};
+  for (size_t i = 0; i < total_symbols; ++i) {
+    uint8_t symbol = table->huffval[i];
+    if (symbol_seen[symbol]) {
+      JPEGLI_ERROR("Duplicate symbol %d in Huffman table", symbol);
+    }
+    symbol_seen[symbol] = 1;
+  }
+}
+
+void AddStandardHuffmanTables(j_common_ptr cinfo, bool is_dc) {
+  // Huffman tables from the JPEG standard.
+  static constexpr JHUFF_TBL kStandardDCTables[2] = {
+      // DC luma
+      {{0, 0, 1, 5, 1, 1, 1, 1, 1, 1},
+       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+       FALSE},
+      // DC chroma
+      {{0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11},
+       FALSE}};
+  static constexpr JHUFF_TBL kStandardACTables[2] = {
+      // AC luma
+      {{0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 125},
+       {0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12, 0x21, 0x31, 0x41, 0x06,
+        0x13, 0x51, 0x61, 0x07, 0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
+        0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0, 0x24, 0x33, 0x62, 0x72,
+        0x82, 0x09, 0x0a, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
+        0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44, 0x45,
+        0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+        0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x75,
+        0x76, 0x77, 0x78, 0x79, 0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+        0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3,
+        0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
+        0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9,
+        0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
+        0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf1, 0xf2, 0xf3, 0xf4,
+        0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa},
+       FALSE},
+      // AC chroma
+      {{0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 119},
+       {0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21, 0x31, 0x06, 0x12, 0x41,
+        0x51, 0x07, 0x61, 0x71, 0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
+        0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0, 0x15, 0x62, 0x72, 0xd1,
+        0x0a, 0x16, 0x24, 0x34, 0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
+        0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x43, 0x44,
+        0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
+        0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x73, 0x74,
+        0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+        0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a,
+        0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
+        0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+        0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
+        0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xf2, 0xf3, 0xf4,
+        0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa},
+       FALSE}};
+  const JHUFF_TBL* std_tables = is_dc ? kStandardDCTables : kStandardACTables;
+  JHUFF_TBL** tables;
+  if (cinfo->is_decompressor) {
+    j_decompress_ptr cinfo_d = reinterpret_cast<j_decompress_ptr>(cinfo);
+    tables = is_dc ? cinfo_d->dc_huff_tbl_ptrs : cinfo_d->ac_huff_tbl_ptrs;
+  } else {
+    j_compress_ptr cinfo_c = reinterpret_cast<j_compress_ptr>(cinfo);
+    tables = is_dc ? cinfo_c->dc_huff_tbl_ptrs : cinfo_c->ac_huff_tbl_ptrs;
+  }
+  for (int i = 0; i < 2; ++i) {
+    if (tables[i] == nullptr) {
+      tables[i] = jpegli_alloc_huff_table(cinfo);
+      memcpy(tables[i], &std_tables[i], sizeof(JHUFF_TBL));
+      ValidateHuffmanTable(cinfo, tables[i], is_dc);
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/huffman.h b/lib/jpegli/huffman.h
new file mode 100644 (file)
index 0000000..f0e5e1d
--- /dev/null
@@ -0,0 +1,50 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_HUFFMAN_H_
+#define LIB_JPEGLI_HUFFMAN_H_
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "lib/jpegli/common_internal.h"
+
+namespace jpegli {
+
+constexpr int kJpegHuffmanRootTableBits = 8;
+// Maximum huffman lookup table size.
+// According to zlib/examples/enough.c, 758 entries are always enough for
+// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and
+// max bit length 16 if the root table has 8 bits.
+constexpr int kJpegHuffmanLutSize = 758;
+
+struct HuffmanTableEntry {
+  uint8_t bits;    // number of bits used for this symbol
+  uint16_t value;  // symbol value or table offset
+};
+
+void BuildJpegHuffmanTable(const uint32_t* count, const uint32_t* symbols,
+                           HuffmanTableEntry* lut);
+
+// This function will create a Huffman tree.
+//
+// The (data,length) contains the population counts.
+// The tree_limit is the maximum bit depth of the Huffman codes.
+//
+// The depth contains the tree, i.e., how many bits are used for
+// the symbol.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t* data, size_t length, int tree_limit,
+                       uint8_t* depth);
+
+void ValidateHuffmanTable(j_common_ptr cinfo, const JHUFF_TBL* table,
+                          bool is_dc);
+
+void AddStandardHuffmanTables(j_common_ptr cinfo, bool is_dc);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_HUFFMAN_H_
diff --git a/lib/jpegli/idct.cc b/lib/jpegli/idct.cc
new file mode 100644 (file)
index 0000000..4d10563
--- /dev/null
@@ -0,0 +1,692 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/idct.h"
+
+#include <cmath>
+
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jxl/base/status.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/idct.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/transpose-inl.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::NegMulAdd;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::Vec;
+using hwy::HWY_NAMESPACE::Xor;
+
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+constexpr D d;
+constexpr DI di;
+
+using D8 = HWY_CAPPED(float, 8);
+constexpr D8 d8;
+
+void DequantBlock(const int16_t* JXL_RESTRICT qblock,
+                  const float* JXL_RESTRICT dequant,
+                  const float* JXL_RESTRICT biases, float* JXL_RESTRICT block) {
+  for (size_t k = 0; k < 64; k += Lanes(d)) {
+    const auto mul = Load(d, dequant + k);
+    const auto bias = Load(d, biases + k);
+    const Rebind<int16_t, DI> di16;
+    const Vec<DI> quant_i = PromoteTo(di, Load(di16, qblock + k));
+    const Rebind<float, DI> df;
+    const auto quant = ConvertTo(df, quant_i);
+    const auto abs_quant = Abs(quant);
+    const auto not_0 = Gt(abs_quant, Zero(df));
+    const auto sign_quant = Xor(quant, abs_quant);
+    const auto biased_quant = Sub(quant, Xor(bias, sign_quant));
+    const auto dequant = IfThenElseZero(not_0, Mul(biased_quant, mul));
+    Store(dequant, d, block + k);
+  }
+}
+
+template <size_t N>
+void ForwardEvenOdd(const float* JXL_RESTRICT ain, size_t ain_stride,
+                    float* JXL_RESTRICT aout) {
+  for (size_t i = 0; i < N / 2; i++) {
+    auto in1 = LoadU(d8, ain + 2 * i * ain_stride);
+    Store(in1, d8, aout + i * 8);
+  }
+  for (size_t i = N / 2; i < N; i++) {
+    auto in1 = LoadU(d8, ain + (2 * (i - N / 2) + 1) * ain_stride);
+    Store(in1, d8, aout + i * 8);
+  }
+}
+
+template <size_t N>
+void BTranspose(float* JXL_RESTRICT coeff) {
+  for (size_t i = N - 1; i > 0; i--) {
+    auto in1 = Load(d8, coeff + i * 8);
+    auto in2 = Load(d8, coeff + (i - 1) * 8);
+    Store(Add(in1, in2), d8, coeff + i * 8);
+  }
+  constexpr float kSqrt2 = 1.41421356237f;
+  auto sqrt2 = Set(d8, kSqrt2);
+  auto in1 = Load(d8, coeff);
+  Store(Mul(in1, sqrt2), d8, coeff);
+}
+
+// Constants for DCT implementation. Generated by the following snippet:
+// for i in range(N // 2):
+//    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
+template <size_t N>
+struct WcMultipliers;
+
+template <>
+struct WcMultipliers<4> {
+  static constexpr float kMultipliers[] = {
+      0.541196100146197,
+      1.3065629648763764,
+  };
+};
+
+template <>
+struct WcMultipliers<8> {
+  static constexpr float kMultipliers[] = {
+      0.5097955791041592,
+      0.6013448869350453,
+      0.8999762231364156,
+      2.5629154477415055,
+  };
+};
+
+constexpr float WcMultipliers<4>::kMultipliers[];
+constexpr float WcMultipliers<8>::kMultipliers[];
+
+template <size_t N>
+void MultiplyAndAdd(const float* JXL_RESTRICT coeff, float* JXL_RESTRICT out,
+                    size_t out_stride) {
+  for (size_t i = 0; i < N / 2; i++) {
+    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
+    auto in1 = Load(d8, coeff + i * 8);
+    auto in2 = Load(d8, coeff + (N / 2 + i) * 8);
+    auto out1 = MulAdd(mul, in2, in1);
+    auto out2 = NegMulAdd(mul, in2, in1);
+    StoreU(out1, d8, out + i * out_stride);
+    StoreU(out2, d8, out + (N - i - 1) * out_stride);
+  }
+}
+
+template <size_t N>
+struct IDCT1DImpl;
+
+template <>
+struct IDCT1DImpl<1> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    StoreU(LoadU(d8, from), d8, to);
+  }
+};
+
+template <>
+struct IDCT1DImpl<2> {
+  JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
+                             size_t to_stride) {
+    JXL_DASSERT(from_stride >= 8);
+    JXL_DASSERT(to_stride >= 8);
+    auto in1 = LoadU(d8, from);
+    auto in2 = LoadU(d8, from + from_stride);
+    StoreU(Add(in1, in2), d8, to);
+    StoreU(Sub(in1, in2), d8, to + to_stride);
+  }
+};
+
+template <size_t N>
+struct IDCT1DImpl {
+  void operator()(const float* from, size_t from_stride, float* to,
+                  size_t to_stride) {
+    JXL_DASSERT(from_stride >= 8);
+    JXL_DASSERT(to_stride >= 8);
+    HWY_ALIGN float tmp[64];
+    ForwardEvenOdd<N>(from, from_stride, tmp);
+    IDCT1DImpl<N / 2>()(tmp, 8, tmp, 8);
+    BTranspose<N / 2>(tmp + N * 4);
+    IDCT1DImpl<N / 2>()(tmp + N * 4, 8, tmp + N * 4, 8);
+    MultiplyAndAdd<N>(tmp, to, to_stride);
+  }
+};
+
+template <size_t N>
+void IDCT1D(float* JXL_RESTRICT from, float* JXL_RESTRICT output,
+            size_t output_stride) {
+  for (size_t i = 0; i < 8; i += Lanes(d8)) {
+    IDCT1DImpl<N>()(from + i, 8, output + i, output_stride);
+  }
+}
+
+void ComputeScaledIDCT(float* JXL_RESTRICT block0, float* JXL_RESTRICT block1,
+                       float* JXL_RESTRICT output, size_t output_stride) {
+  Transpose8x8Block(block0, block1);
+  IDCT1D<8>(block1, block0, 8);
+  Transpose8x8Block(block0, block1);
+  IDCT1D<8>(block1, output, output_stride);
+}
+
+void InverseTransformBlock8x8(const int16_t* JXL_RESTRICT qblock,
+                              const float* JXL_RESTRICT dequant,
+                              const float* JXL_RESTRICT biases,
+                              float* JXL_RESTRICT scratch_space,
+                              float* JXL_RESTRICT output, size_t output_stride,
+                              size_t dctsize) {
+  float* JXL_RESTRICT block0 = scratch_space;
+  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
+  DequantBlock(qblock, dequant, biases, block0);
+  ComputeScaledIDCT(block0, block1, output, output_stride);
+}
+
+// Computes the N-point IDCT of in[], and stores the result in out[]. The in[]
+// array is at most 8 values long, values in[8:N-1] are assumed to be 0.
+void Compute1dIDCT(float* in, float* out, size_t N) {
+  switch (N) {
+    case 3: {
+      static constexpr float kC3[3] = {
+          1.414213562373,
+          1.224744871392,
+          0.707106781187,
+      };
+      float even0 = in[0] + kC3[2] * in[2];
+      float even1 = in[0] - kC3[0] * in[2];
+      float odd0 = kC3[1] * in[1];
+      out[0] = even0 + odd0;
+      out[2] = even0 - odd0;
+      out[1] = even1;
+      break;
+    }
+    case 5: {
+      static constexpr float kC5[5] = {
+          1.414213562373, 1.344997023928, 1.144122805635,
+          0.831253875555, 0.437016024449,
+      };
+      float even0 = in[0] + kC5[2] * in[2] + kC5[4] * in[4];
+      float even1 = in[0] - kC5[4] * in[2] - kC5[2] * in[4];
+      float even2 = in[0] - kC5[0] * in[2] + kC5[0] * in[4];
+      float odd0 = kC5[1] * in[1] + kC5[3] * in[3];
+      float odd1 = kC5[3] * in[1] - kC5[1] * in[3];
+      out[0] = even0 + odd0;
+      out[4] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[3] = even1 - odd1;
+      out[2] = even2;
+      break;
+    }
+    case 6: {
+      static constexpr float kC6[6] = {
+          1.414213562373, 1.366025403784, 1.224744871392,
+          1.000000000000, 0.707106781187, 0.366025403784,
+      };
+      float even0 = in[0] + kC6[2] * in[2] + kC6[4] * in[4];
+      float even1 = in[0] - kC6[0] * in[4];
+      float even2 = in[0] - kC6[2] * in[2] + kC6[4] * in[4];
+      float odd0 = kC6[1] * in[1] + kC6[3] * in[3] + kC6[5] * in[5];
+      float odd1 = kC6[3] * in[1] - kC6[3] * in[3] - kC6[3] * in[5];
+      float odd2 = kC6[5] * in[1] - kC6[3] * in[3] + kC6[1] * in[5];
+      out[0] = even0 + odd0;
+      out[5] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[4] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[3] = even2 - odd2;
+      break;
+    }
+    case 7: {
+      static constexpr float kC7[7] = {
+          1.414213562373, 1.378756275744, 1.274162392264, 1.105676685997,
+          0.881747733790, 0.613604268353, 0.314692122713,
+      };
+      float even0 = in[0] + kC7[2] * in[2] + kC7[4] * in[4] + kC7[6] * in[6];
+      float even1 = in[0] + kC7[6] * in[2] - kC7[2] * in[4] - kC7[4] * in[6];
+      float even2 = in[0] - kC7[4] * in[2] - kC7[6] * in[4] + kC7[2] * in[6];
+      float even3 = in[0] - kC7[0] * in[2] + kC7[0] * in[4] - kC7[0] * in[6];
+      float odd0 = kC7[1] * in[1] + kC7[3] * in[3] + kC7[5] * in[5];
+      float odd1 = kC7[3] * in[1] - kC7[5] * in[3] - kC7[1] * in[5];
+      float odd2 = kC7[5] * in[1] - kC7[1] * in[3] + kC7[3] * in[5];
+      out[0] = even0 + odd0;
+      out[6] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[5] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[4] = even2 - odd2;
+      out[3] = even3;
+      break;
+    }
+    case 9: {
+      static constexpr float kC9[9] = {
+          1.414213562373, 1.392728480640, 1.328926048777,
+          1.224744871392, 1.083350440839, 0.909038955344,
+          0.707106781187, 0.483689525296, 0.245575607938,
+      };
+      float even0 = in[0] + kC9[2] * in[2] + kC9[4] * in[4] + kC9[6] * in[6];
+      float even1 = in[0] + kC9[6] * in[2] - kC9[6] * in[4] - kC9[0] * in[6];
+      float even2 = in[0] - kC9[8] * in[2] - kC9[2] * in[4] + kC9[6] * in[6];
+      float even3 = in[0] - kC9[4] * in[2] + kC9[8] * in[4] + kC9[6] * in[6];
+      float even4 = in[0] - kC9[0] * in[2] + kC9[0] * in[4] - kC9[0] * in[6];
+      float odd0 =
+          kC9[1] * in[1] + kC9[3] * in[3] + kC9[5] * in[5] + kC9[7] * in[7];
+      float odd1 = kC9[3] * in[1] - kC9[3] * in[5] - kC9[3] * in[7];
+      float odd2 =
+          kC9[5] * in[1] - kC9[3] * in[3] - kC9[7] * in[5] + kC9[1] * in[7];
+      float odd3 =
+          kC9[7] * in[1] - kC9[3] * in[3] + kC9[1] * in[5] - kC9[5] * in[7];
+      out[0] = even0 + odd0;
+      out[8] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[7] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[6] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[5] = even3 - odd3;
+      out[4] = even4;
+      break;
+    }
+    case 10: {
+      static constexpr float kC10[10] = {
+          1.414213562373, 1.396802246667, 1.344997023928, 1.260073510670,
+          1.144122805635, 1.000000000000, 0.831253875555, 0.642039521920,
+          0.437016024449, 0.221231742082,
+      };
+      float even0 = in[0] + kC10[2] * in[2] + kC10[4] * in[4] + kC10[6] * in[6];
+      float even1 = in[0] + kC10[6] * in[2] - kC10[8] * in[4] - kC10[2] * in[6];
+      float even2 = in[0] - kC10[0] * in[4];
+      float even3 = in[0] - kC10[6] * in[2] - kC10[8] * in[4] + kC10[2] * in[6];
+      float even4 = in[0] - kC10[2] * in[2] + kC10[4] * in[4] - kC10[6] * in[6];
+      float odd0 =
+          kC10[1] * in[1] + kC10[3] * in[3] + kC10[5] * in[5] + kC10[7] * in[7];
+      float odd1 =
+          kC10[3] * in[1] + kC10[9] * in[3] - kC10[5] * in[5] - kC10[1] * in[7];
+      float odd2 =
+          kC10[5] * in[1] - kC10[5] * in[3] - kC10[5] * in[5] + kC10[5] * in[7];
+      float odd3 =
+          kC10[7] * in[1] - kC10[1] * in[3] + kC10[5] * in[5] + kC10[9] * in[7];
+      float odd4 =
+          kC10[9] * in[1] - kC10[7] * in[3] + kC10[5] * in[5] - kC10[3] * in[7];
+      out[0] = even0 + odd0;
+      out[9] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[8] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[7] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[6] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[5] = even4 - odd4;
+      break;
+    }
+    case 11: {
+      static constexpr float kC11[11] = {
+          1.414213562373, 1.399818907436, 1.356927976287, 1.286413904599,
+          1.189712155524, 1.068791297809, 0.926112931411, 0.764581576418,
+          0.587485545401, 0.398430002847, 0.201263574413,
+      };
+      float even0 = in[0] + kC11[2] * in[2] + kC11[4] * in[4] + kC11[6] * in[6];
+      float even1 =
+          in[0] + kC11[6] * in[2] - kC11[10] * in[4] - kC11[4] * in[6];
+      float even2 =
+          in[0] + kC11[10] * in[2] - kC11[2] * in[4] - kC11[8] * in[6];
+      float even3 = in[0] - kC11[8] * in[2] - kC11[6] * in[4] + kC11[2] * in[6];
+      float even4 =
+          in[0] - kC11[4] * in[2] + kC11[8] * in[4] + kC11[10] * in[6];
+      float even5 = in[0] - kC11[0] * in[2] + kC11[0] * in[4] - kC11[0] * in[6];
+      float odd0 =
+          kC11[1] * in[1] + kC11[3] * in[3] + kC11[5] * in[5] + kC11[7] * in[7];
+      float odd1 =
+          kC11[3] * in[1] + kC11[9] * in[3] - kC11[7] * in[5] - kC11[1] * in[7];
+      float odd2 =
+          kC11[5] * in[1] - kC11[7] * in[3] - kC11[3] * in[5] + kC11[9] * in[7];
+      float odd3 =
+          kC11[7] * in[1] - kC11[1] * in[3] + kC11[9] * in[5] + kC11[5] * in[7];
+      float odd4 =
+          kC11[9] * in[1] - kC11[5] * in[3] + kC11[1] * in[5] - kC11[3] * in[7];
+      out[0] = even0 + odd0;
+      out[10] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[9] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[8] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[7] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[6] = even4 - odd4;
+      out[5] = even5;
+      break;
+    }
+    case 12: {
+      static constexpr float kC12[12] = {
+          1.414213562373, 1.402114769300, 1.366025403784, 1.306562964876,
+          1.224744871392, 1.121971053594, 1.000000000000, 0.860918669154,
+          0.707106781187, 0.541196100146, 0.366025403784, 0.184591911283,
+      };
+      float even0 = in[0] + kC12[2] * in[2] + kC12[4] * in[4] + kC12[6] * in[6];
+      float even1 = in[0] + kC12[6] * in[2] - kC12[6] * in[6];
+      float even2 =
+          in[0] + kC12[10] * in[2] - kC12[4] * in[4] - kC12[6] * in[6];
+      float even3 =
+          in[0] - kC12[10] * in[2] - kC12[4] * in[4] + kC12[6] * in[6];
+      float even4 = in[0] - kC12[6] * in[2] + kC12[6] * in[6];
+      float even5 = in[0] - kC12[2] * in[2] + kC12[4] * in[4] - kC12[6] * in[6];
+      float odd0 =
+          kC12[1] * in[1] + kC12[3] * in[3] + kC12[5] * in[5] + kC12[7] * in[7];
+      float odd1 =
+          kC12[3] * in[1] + kC12[9] * in[3] - kC12[9] * in[5] - kC12[3] * in[7];
+      float odd2 = kC12[5] * in[1] - kC12[9] * in[3] - kC12[1] * in[5] -
+                   kC12[11] * in[7];
+      float odd3 = kC12[7] * in[1] - kC12[3] * in[3] - kC12[11] * in[5] +
+                   kC12[1] * in[7];
+      float odd4 =
+          kC12[9] * in[1] - kC12[3] * in[3] + kC12[3] * in[5] - kC12[9] * in[7];
+      float odd5 = kC12[11] * in[1] - kC12[9] * in[3] + kC12[7] * in[5] -
+                   kC12[5] * in[7];
+      out[0] = even0 + odd0;
+      out[11] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[10] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[9] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[8] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[7] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[6] = even5 - odd5;
+      break;
+    }
+    case 13: {
+      static constexpr float kC13[13] = {
+          1.414213562373, 1.403902353238, 1.373119086479, 1.322312651445,
+          1.252223920364, 1.163874944761, 1.058554051646, 0.937797056801,
+          0.803364869133, 0.657217812653, 0.501487040539, 0.338443458124,
+          0.170464607981,
+      };
+      float even0 = in[0] + kC13[2] * in[2] + kC13[4] * in[4] + kC13[6] * in[6];
+      float even1 =
+          in[0] + kC13[6] * in[2] + kC13[12] * in[4] - kC13[8] * in[6];
+      float even2 =
+          in[0] + kC13[10] * in[2] - kC13[6] * in[4] - kC13[4] * in[6];
+      float even3 =
+          in[0] - kC13[12] * in[2] - kC13[2] * in[4] + kC13[10] * in[6];
+      float even4 =
+          in[0] - kC13[8] * in[2] - kC13[10] * in[4] + kC13[2] * in[6];
+      float even5 =
+          in[0] - kC13[4] * in[2] + kC13[8] * in[4] - kC13[12] * in[6];
+      float even6 = in[0] - kC13[0] * in[2] + kC13[0] * in[4] - kC13[0] * in[6];
+      float odd0 =
+          kC13[1] * in[1] + kC13[3] * in[3] + kC13[5] * in[5] + kC13[7] * in[7];
+      float odd1 = kC13[3] * in[1] + kC13[9] * in[3] - kC13[11] * in[5] -
+                   kC13[5] * in[7];
+      float odd2 = kC13[5] * in[1] - kC13[11] * in[3] - kC13[1] * in[5] -
+                   kC13[9] * in[7];
+      float odd3 =
+          kC13[7] * in[1] - kC13[5] * in[3] - kC13[9] * in[5] + kC13[3] * in[7];
+      float odd4 = kC13[9] * in[1] - kC13[1] * in[3] + kC13[7] * in[5] +
+                   kC13[11] * in[7];
+      float odd5 = kC13[11] * in[1] - kC13[7] * in[3] + kC13[3] * in[5] -
+                   kC13[1] * in[7];
+      out[0] = even0 + odd0;
+      out[12] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[11] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[10] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[9] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[8] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[7] = even5 - odd5;
+      out[6] = even6;
+      break;
+    }
+    case 14: {
+      static constexpr float kC14[14] = {
+          1.414213562373, 1.405321284327, 1.378756275744, 1.334852607020,
+          1.274162392264, 1.197448846138, 1.105676685997, 1.000000000000,
+          0.881747733790, 0.752406978226, 0.613604268353, 0.467085128785,
+          0.314692122713, 0.158341680609,
+      };
+      float even0 = in[0] + kC14[2] * in[2] + kC14[4] * in[4] + kC14[6] * in[6];
+      float even1 =
+          in[0] + kC14[6] * in[2] + kC14[12] * in[4] - kC14[10] * in[6];
+      float even2 =
+          in[0] + kC14[10] * in[2] - kC14[8] * in[4] - kC14[2] * in[6];
+      float even3 = in[0] - kC14[0] * in[4];
+      float even4 =
+          in[0] - kC14[10] * in[2] - kC14[8] * in[4] + kC14[2] * in[6];
+      float even5 =
+          in[0] - kC14[6] * in[2] + kC14[12] * in[4] + kC14[10] * in[6];
+      float even6 = in[0] - kC14[2] * in[2] + kC14[4] * in[4] - kC14[6] * in[6];
+      float odd0 =
+          kC14[1] * in[1] + kC14[3] * in[3] + kC14[5] * in[5] + kC14[7] * in[7];
+      float odd1 = kC14[3] * in[1] + kC14[9] * in[3] - kC14[13] * in[5] -
+                   kC14[7] * in[7];
+      float odd2 = kC14[5] * in[1] - kC14[13] * in[3] - kC14[3] * in[5] -
+                   kC14[7] * in[7];
+      float odd3 =
+          kC14[7] * in[1] - kC14[7] * in[3] - kC14[7] * in[5] + kC14[7] * in[7];
+      float odd4 = kC14[9] * in[1] - kC14[1] * in[3] + kC14[11] * in[5] +
+                   kC14[7] * in[7];
+      float odd5 = kC14[11] * in[1] - kC14[5] * in[3] + kC14[1] * in[5] -
+                   kC14[7] * in[7];
+      float odd6 = kC14[13] * in[1] - kC14[11] * in[3] + kC14[9] * in[5] -
+                   kC14[7] * in[7];
+      out[0] = even0 + odd0;
+      out[13] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[12] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[11] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[10] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[9] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[8] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[7] = even6 - odd6;
+      break;
+    }
+    case 15: {
+      static constexpr float kC15[15] = {
+          1.414213562373, 1.406466352507, 1.383309602960, 1.344997023928,
+          1.291948376043, 1.224744871392, 1.144122805635, 1.050965490998,
+          0.946293578512, 0.831253875555, 0.707106781187, 0.575212476952,
+          0.437016024449, 0.294031532930, 0.147825570407,
+      };
+      float even0 = in[0] + kC15[2] * in[2] + kC15[4] * in[4] + kC15[6] * in[6];
+      float even1 =
+          in[0] + kC15[6] * in[2] + kC15[12] * in[4] - kC15[12] * in[6];
+      float even2 =
+          in[0] + kC15[10] * in[2] - kC15[10] * in[4] - kC15[0] * in[6];
+      float even3 =
+          in[0] + kC15[14] * in[2] - kC15[2] * in[4] - kC15[12] * in[6];
+      float even4 =
+          in[0] - kC15[12] * in[2] - kC15[6] * in[4] + kC15[6] * in[6];
+      float even5 =
+          in[0] - kC15[8] * in[2] - kC15[14] * in[4] + kC15[6] * in[6];
+      float even6 =
+          in[0] - kC15[4] * in[2] + kC15[8] * in[4] - kC15[12] * in[6];
+      float even7 = in[0] - kC15[0] * in[2] + kC15[0] * in[4] - kC15[0] * in[6];
+      float odd0 =
+          kC15[1] * in[1] + kC15[3] * in[3] + kC15[5] * in[5] + kC15[7] * in[7];
+      float odd1 = kC15[3] * in[1] + kC15[9] * in[3] - kC15[9] * in[7];
+      float odd2 = kC15[5] * in[1] - kC15[5] * in[5] - kC15[5] * in[7];
+      float odd3 = kC15[7] * in[1] - kC15[9] * in[3] - kC15[5] * in[5] +
+                   kC15[11] * in[7];
+      float odd4 = kC15[9] * in[1] - kC15[3] * in[3] + kC15[3] * in[7];
+      float odd5 = kC15[11] * in[1] - kC15[3] * in[3] + kC15[5] * in[5] -
+                   kC15[13] * in[7];
+      float odd6 = kC15[13] * in[1] - kC15[9] * in[3] + kC15[5] * in[5] -
+                   kC15[1] * in[7];
+      out[0] = even0 + odd0;
+      out[14] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[13] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[12] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[11] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[10] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[9] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[8] = even6 - odd6;
+      out[7] = even7;
+      break;
+    }
+    case 16: {
+      static constexpr float kC16[16] = {
+          1.414213562373, 1.407403737526, 1.387039845322, 1.353318001174,
+          1.306562964876, 1.247225012987, 1.175875602419, 1.093201867002,
+          1.000000000000, 0.897167586343, 0.785694958387, 0.666655658478,
+          0.541196100146, 0.410524527522, 0.275899379283, 0.138617169199,
+      };
+      float even0 = in[0] + kC16[2] * in[2] + kC16[4] * in[4] + kC16[6] * in[6];
+      float even1 =
+          in[0] + kC16[6] * in[2] + kC16[12] * in[4] - kC16[14] * in[6];
+      float even2 =
+          in[0] + kC16[10] * in[2] - kC16[12] * in[4] - kC16[2] * in[6];
+      float even3 =
+          in[0] + kC16[14] * in[2] - kC16[4] * in[4] - kC16[10] * in[6];
+      float even4 =
+          in[0] - kC16[14] * in[2] - kC16[4] * in[4] + kC16[10] * in[6];
+      float even5 =
+          in[0] - kC16[10] * in[2] - kC16[12] * in[4] + kC16[2] * in[6];
+      float even6 =
+          in[0] - kC16[6] * in[2] + kC16[12] * in[4] + kC16[14] * in[6];
+      float even7 = in[0] - kC16[2] * in[2] + kC16[4] * in[4] - kC16[6] * in[6];
+      float odd0 = (kC16[1] * in[1] + kC16[3] * in[3] + kC16[5] * in[5] +
+                    kC16[7] * in[7]);
+      float odd1 = (kC16[3] * in[1] + kC16[9] * in[3] + kC16[15] * in[5] -
+                    kC16[11] * in[7]);
+      float odd2 = (kC16[5] * in[1] + kC16[15] * in[3] - kC16[7] * in[5] -
+                    kC16[3] * in[7]);
+      float odd3 = (kC16[7] * in[1] - kC16[11] * in[3] - kC16[3] * in[5] +
+                    kC16[15] * in[7]);
+      float odd4 = (kC16[9] * in[1] - kC16[5] * in[3] - kC16[13] * in[5] +
+                    kC16[1] * in[7]);
+      float odd5 = (kC16[11] * in[1] - kC16[1] * in[3] + kC16[9] * in[5] +
+                    kC16[13] * in[7]);
+      float odd6 = (kC16[13] * in[1] - kC16[7] * in[3] + kC16[1] * in[5] -
+                    kC16[5] * in[7]);
+      float odd7 = (kC16[15] * in[1] - kC16[13] * in[3] + kC16[11] * in[5] -
+                    kC16[9] * in[7]);
+      out[0] = even0 + odd0;
+      out[15] = even0 - odd0;
+      out[1] = even1 + odd1;
+      out[14] = even1 - odd1;
+      out[2] = even2 + odd2;
+      out[13] = even2 - odd2;
+      out[3] = even3 + odd3;
+      out[12] = even3 - odd3;
+      out[4] = even4 + odd4;
+      out[11] = even4 - odd4;
+      out[5] = even5 + odd5;
+      out[10] = even5 - odd5;
+      out[6] = even6 + odd6;
+      out[9] = even6 - odd6;
+      out[7] = even7 + odd7;
+      out[8] = even7 - odd7;
+      break;
+    }
+  }
+}
+
+void InverseTransformBlockGeneric(const int16_t* JXL_RESTRICT qblock,
+                                  const float* JXL_RESTRICT dequant,
+                                  const float* JXL_RESTRICT biases,
+                                  float* JXL_RESTRICT scratch_space,
+                                  float* JXL_RESTRICT output,
+                                  size_t output_stride, size_t dctsize) {
+  float* JXL_RESTRICT block0 = scratch_space;
+  float* JXL_RESTRICT block1 = scratch_space + DCTSIZE2;
+  DequantBlock(qblock, dequant, biases, block0);
+  if (dctsize == 1) {
+    *output = *block0;
+  } else if (dctsize == 2 || dctsize == 4) {
+    float* JXL_RESTRICT block2 = scratch_space + 2 * DCTSIZE2;
+    ComputeScaledIDCT(block0, block1, block2, 8);
+    if (dctsize == 4) {
+      for (size_t iy = 0; iy < 4; ++iy) {
+        for (size_t ix = 0; ix < 4; ++ix) {
+          float* block = &block2[16 * iy + 2 * ix];
+          output[iy * output_stride + ix] =
+              0.25f * (block[0] + block[1] + block[8] + block[9]);
+        }
+      }
+    } else {
+      for (size_t iy = 0; iy < 2; ++iy) {
+        for (size_t ix = 0; ix < 2; ++ix) {
+          float* block = &block2[32 * iy + 4 * ix];
+          output[iy * output_stride + ix] =
+              0.0625f *
+              (block[0] + block[1] + block[2] + block[3] + block[8] + block[9] +
+               block[10] + block[11] + block[16] + block[17] + block[18] +
+               block[19] + block[24] + block[25] + block[26] + block[27]);
+        }
+      }
+    }
+  } else {
+    float dctin[DCTSIZE];
+    float dctout[DCTSIZE * 2];
+    size_t insize = std::min<size_t>(dctsize, DCTSIZE);
+    for (size_t ix = 0; ix < insize; ++ix) {
+      for (size_t iy = 0; iy < insize; ++iy) {
+        dctin[iy] = block0[iy * DCTSIZE + ix];
+      }
+      Compute1dIDCT(dctin, dctout, dctsize);
+      for (size_t iy = 0; iy < dctsize; ++iy) {
+        block1[iy * dctsize + ix] = dctout[iy];
+      }
+    }
+    for (size_t iy = 0; iy < dctsize; ++iy) {
+      Compute1dIDCT(block1 + iy * dctsize, output + iy * output_stride,
+                    dctsize);
+    }
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(InverseTransformBlock8x8);
+HWY_EXPORT(InverseTransformBlockGeneric);
+
+void ChooseInverseTransform(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    if (m->scaled_dct_size[c] == DCTSIZE) {
+      m->inverse_transform[c] = HWY_DYNAMIC_DISPATCH(InverseTransformBlock8x8);
+    } else {
+      m->inverse_transform[c] =
+          HWY_DYNAMIC_DISPATCH(InverseTransformBlockGeneric);
+    }
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/idct.h b/lib/jpegli/idct.h
new file mode 100644 (file)
index 0000000..c2ec6d1
--- /dev/null
@@ -0,0 +1,18 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_IDCT_H_
+#define LIB_JPEGLI_IDCT_H_
+
+#include "lib/jpegli/common.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void ChooseInverseTransform(j_decompress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_IDCT_H_
diff --git a/lib/jpegli/input.cc b/lib/jpegli/input.cc
new file mode 100644 (file)
index 0000000..765bf98
--- /dev/null
@@ -0,0 +1,414 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/input.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/input.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::Vec;
+
+using D = HWY_FULL(float);
+using DU = HWY_FULL(uint32_t);
+using DU8 = Rebind<uint8_t, D>;
+using DU16 = Rebind<uint16_t, D>;
+
+constexpr D d;
+constexpr DU du;
+constexpr DU8 du8;
+constexpr DU16 du16;
+
+static constexpr double kMul16 = 1.0 / 257.0;
+static constexpr double kMulFloat = 255.0;
+
+template <size_t C>
+void ReadUint8Row(const uint8_t* row_in, size_t x0, size_t len,
+                  float* row_out[kMaxComponents]) {
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c) {
+      row_out[c][x] = row_in[C * x + c];
+    }
+  }
+}
+
+template <size_t C, bool swap_endianness = false>
+void ReadUint16Row(const uint8_t* row_in, size_t x0, size_t len,
+                   float* row_out[kMaxComponents]) {
+  const uint16_t* row16 = reinterpret_cast<const uint16_t*>(row_in);
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c) {
+      uint16_t val = row16[C * x + c];
+      if (swap_endianness) val = JXL_BSWAP16(val);
+      row_out[c][x] = val * kMul16;
+    }
+  }
+}
+
+template <size_t C, bool swap_endianness = false>
+void ReadFloatRow(const uint8_t* row_in, size_t x0, size_t len,
+                  float* row_out[kMaxComponents]) {
+  const float* rowf = reinterpret_cast<const float*>(row_in);
+  for (size_t x = x0; x < len; ++x) {
+    for (size_t c = 0; c < C; ++c) {
+      float val = rowf[C * x + c];
+      if (swap_endianness) val = BSwapFloat(val);
+      row_out[c][x] = val * kMulFloat;
+    }
+  }
+}
+
+void ReadUint8RowSingle(const uint8_t* row_in, size_t len,
+                        float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  for (size_t x = 0; x < simd_len; x += N) {
+    Store(ConvertTo(d, PromoteTo(du, LoadU(du8, row_in + x))), d, row0 + x);
+  }
+  ReadUint8Row<1>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint8RowInterleaved2(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  Vec<DU8> out0, out1;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved2(du8, row_in + 2 * x, out0, out1);
+    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
+    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
+  }
+  ReadUint8Row<2>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint8RowInterleaved3(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  Vec<DU8> out0, out1, out2;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved3(du8, row_in + 3 * x, out0, out1, out2);
+    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
+    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
+    Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
+  }
+  ReadUint8Row<3>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint8RowInterleaved4(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  float* JXL_RESTRICT const row3 = row_out[3];
+  Vec<DU8> out0, out1, out2, out3;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved4(du8, row_in + 4 * x, out0, out1, out2, out3);
+    Store(ConvertTo(d, PromoteTo(du, out0)), d, row0 + x);
+    Store(ConvertTo(d, PromoteTo(du, out1)), d, row1 + x);
+    Store(ConvertTo(d, PromoteTo(du, out2)), d, row2 + x);
+    Store(ConvertTo(d, PromoteTo(du, out3)), d, row3 + x);
+  }
+  ReadUint8Row<4>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowSingle(const uint8_t* row_in, size_t len,
+                         float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  for (size_t x = 0; x < simd_len; x += N) {
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, LoadU(du16, row + x)))), d,
+          row0 + x);
+  }
+  ReadUint16Row<1>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowInterleaved2(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  Vec<DU16> out0, out1;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved2(du16, row + 2 * x, out0, out1);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
+  }
+  ReadUint16Row<2>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowInterleaved3(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  Vec<DU16> out0, out1, out2;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved3(du16, row + 3 * x, out0, out1, out2);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
+  }
+  ReadUint16Row<3>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowInterleaved4(const uint8_t* row_in, size_t len,
+                               float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMul16);
+  const uint16_t* JXL_RESTRICT const row =
+      reinterpret_cast<const uint16_t*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  float* JXL_RESTRICT const row3 = row_out[3];
+  Vec<DU16> out0, out1, out2, out3;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved4(du16, row + 4 * x, out0, out1, out2, out3);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out0))), d, row0 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out1))), d, row1 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out2))), d, row2 + x);
+    Store(Mul(mul, ConvertTo(d, PromoteTo(du, out3))), d, row3 + x);
+  }
+  ReadUint16Row<4>(row_in, simd_len, len, row_out);
+}
+
+void ReadUint16RowSingleSwap(const uint8_t* row_in, size_t len,
+                             float* row_out[kMaxComponents]) {
+  ReadUint16Row<1, true>(row_in, 0, len, row_out);
+}
+
+void ReadUint16RowInterleaved2Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<2, true>(row_in, 0, len, row_out);
+}
+
+void ReadUint16RowInterleaved3Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<3, true>(row_in, 0, len, row_out);
+}
+
+void ReadUint16RowInterleaved4Swap(const uint8_t* row_in, size_t len,
+                                   float* row_out[kMaxComponents]) {
+  ReadUint16Row<4, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowSingle(const uint8_t* row_in, size_t len,
+                        float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  for (size_t x = 0; x < simd_len; x += N) {
+    Store(Mul(mul, LoadU(d, row + x)), d, row0 + x);
+  }
+  ReadFloatRow<1>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowInterleaved2(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  Vec<D> out0, out1;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved2(d, row + 2 * x, out0, out1);
+    Store(Mul(mul, out0), d, row0 + x);
+    Store(Mul(mul, out1), d, row1 + x);
+  }
+  ReadFloatRow<2>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowInterleaved3(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  Vec<D> out0, out1, out2;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved3(d, row + 3 * x, out0, out1, out2);
+    Store(Mul(mul, out0), d, row0 + x);
+    Store(Mul(mul, out1), d, row1 + x);
+    Store(Mul(mul, out2), d, row2 + x);
+  }
+  ReadFloatRow<3>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowInterleaved4(const uint8_t* row_in, size_t len,
+                              float* row_out[kMaxComponents]) {
+  const size_t N = Lanes(d);
+  const size_t simd_len = len & (~(N - 1));
+  const auto mul = Set(d, kMulFloat);
+  const float* JXL_RESTRICT const row = reinterpret_cast<const float*>(row_in);
+  float* JXL_RESTRICT const row0 = row_out[0];
+  float* JXL_RESTRICT const row1 = row_out[1];
+  float* JXL_RESTRICT const row2 = row_out[2];
+  float* JXL_RESTRICT const row3 = row_out[3];
+  Vec<D> out0, out1, out2, out3;
+  for (size_t x = 0; x < simd_len; x += N) {
+    LoadInterleaved4(d, row + 4 * x, out0, out1, out2, out3);
+    Store(Mul(mul, out0), d, row0 + x);
+    Store(Mul(mul, out1), d, row1 + x);
+    Store(Mul(mul, out2), d, row2 + x);
+    Store(Mul(mul, out3), d, row3 + x);
+  }
+  ReadFloatRow<4>(row_in, simd_len, len, row_out);
+}
+
+void ReadFloatRowSingleSwap(const uint8_t* row_in, size_t len,
+                            float* row_out[kMaxComponents]) {
+  ReadFloatRow<1, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowInterleaved2Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<2, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowInterleaved3Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<3, true>(row_in, 0, len, row_out);
+}
+
+void ReadFloatRowInterleaved4Swap(const uint8_t* row_in, size_t len,
+                                  float* row_out[kMaxComponents]) {
+  ReadFloatRow<4, true>(row_in, 0, len, row_out);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(ReadUint8RowSingle);
+HWY_EXPORT(ReadUint8RowInterleaved2);
+HWY_EXPORT(ReadUint8RowInterleaved3);
+HWY_EXPORT(ReadUint8RowInterleaved4);
+HWY_EXPORT(ReadUint16RowSingle);
+HWY_EXPORT(ReadUint16RowInterleaved2);
+HWY_EXPORT(ReadUint16RowInterleaved3);
+HWY_EXPORT(ReadUint16RowInterleaved4);
+HWY_EXPORT(ReadUint16RowSingleSwap);
+HWY_EXPORT(ReadUint16RowInterleaved2Swap);
+HWY_EXPORT(ReadUint16RowInterleaved3Swap);
+HWY_EXPORT(ReadUint16RowInterleaved4Swap);
+HWY_EXPORT(ReadFloatRowSingle);
+HWY_EXPORT(ReadFloatRowInterleaved2);
+HWY_EXPORT(ReadFloatRowInterleaved3);
+HWY_EXPORT(ReadFloatRowInterleaved4);
+HWY_EXPORT(ReadFloatRowSingleSwap);
+HWY_EXPORT(ReadFloatRowInterleaved2Swap);
+HWY_EXPORT(ReadFloatRowInterleaved3Swap);
+HWY_EXPORT(ReadFloatRowInterleaved4Swap);
+
+void ChooseInputMethod(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  bool swap_endianness =
+      (m->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
+      (m->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
+  m->input_method = nullptr;
+  if (m->data_type == JPEGLI_TYPE_UINT8) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowSingle);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint8RowInterleaved4);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_UINT16 && !swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingle);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_UINT16 && swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowSingleSwap);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved2Swap);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved3Swap);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadUint16RowInterleaved4Swap);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_FLOAT && !swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingle);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4);
+    }
+  } else if (m->data_type == JPEGLI_TYPE_FLOAT && swap_endianness) {
+    if (cinfo->raw_data_in || cinfo->input_components == 1) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowSingleSwap);
+    } else if (cinfo->input_components == 2) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved2Swap);
+    } else if (cinfo->input_components == 3) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved3Swap);
+    } else if (cinfo->input_components == 4) {
+      m->input_method = HWY_DYNAMIC_DISPATCH(ReadFloatRowInterleaved4Swap);
+    }
+  }
+  if (m->input_method == nullptr) {
+    JPEGLI_ERROR("Could not find input method.");
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/input.h b/lib/jpegli/input.h
new file mode 100644 (file)
index 0000000..f54d0be
--- /dev/null
@@ -0,0 +1,17 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_INPUT_H_
+#define LIB_JPEGLI_INPUT_H_
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+void ChooseInputMethod(j_compress_ptr cinfo);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_INPUT_H_
diff --git a/lib/jpegli/input_suspension_test.cc b/lib/jpegli/input_suspension_test.cc
new file mode 100644 (file)
index 0000000..09bafd9
--- /dev/null
@@ -0,0 +1,612 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+namespace {
+
+static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+
+struct SourceManager {
+  SourceManager(const uint8_t* data, size_t len, size_t max_chunk_size,
+                bool is_partial_file)
+      : data_(data),
+        len_(len),
+        pos_(0),
+        max_chunk_size_(max_chunk_size),
+        is_partial_file_(is_partial_file) {
+    pub_.init_source = init_source;
+    pub_.fill_input_buffer = fill_input_buffer;
+    pub_.next_input_byte = nullptr;
+    pub_.bytes_in_buffer = 0;
+    pub_.skip_input_data = skip_input_data;
+    pub_.resync_to_restart = jpegli_resync_to_restart;
+    pub_.term_source = term_source;
+    if (max_chunk_size_ == 0) max_chunk_size_ = len;
+  }
+
+  ~SourceManager() {
+    EXPECT_EQ(0, pub_.bytes_in_buffer);
+    if (!is_partial_file_) {
+      EXPECT_EQ(len_, pos_);
+    }
+  }
+
+  bool LoadNextChunk() {
+    if (pos_ >= len_ && !is_partial_file_) {
+      return false;
+    }
+    if (pub_.bytes_in_buffer > 0) {
+      EXPECT_LE(pub_.bytes_in_buffer, buffer_.size());
+      memmove(&buffer_[0], pub_.next_input_byte, pub_.bytes_in_buffer);
+    }
+    size_t chunk_size =
+        pos_ < len_ ? std::min(len_ - pos_, max_chunk_size_) : 2;
+    buffer_.resize(pub_.bytes_in_buffer + chunk_size);
+    memcpy(&buffer_[pub_.bytes_in_buffer],
+           pos_ < len_ ? data_ + pos_ : kFakeEoiMarker, chunk_size);
+    pub_.next_input_byte = &buffer_[0];
+    pub_.bytes_in_buffer += chunk_size;
+    pos_ += chunk_size;
+    return true;
+  }
+
+ private:
+  jpeg_source_mgr pub_;
+  std::vector<uint8_t> buffer_;
+  const uint8_t* data_;
+  size_t len_;
+  size_t pos_;
+  size_t max_chunk_size_;
+  bool is_partial_file_;
+
+  static void init_source(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    src->pub_.next_input_byte = nullptr;
+    src->pub_.bytes_in_buffer = 0;
+  }
+
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) { return FALSE; }
+
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (num_bytes <= 0) {
+      return;
+    }
+    if (src->pub_.bytes_in_buffer >= static_cast<size_t>(num_bytes)) {
+      src->pub_.bytes_in_buffer -= num_bytes;
+      src->pub_.next_input_byte += num_bytes;
+    } else {
+      src->pos_ += num_bytes - src->pub_.bytes_in_buffer;
+      src->pub_.bytes_in_buffer = 0;
+    }
+  }
+
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+uint8_t markers_seen[kMarkerSequenceLen];
+size_t num_markers_seen = 0;
+
+uint8_t get_next_byte(j_decompress_ptr cinfo) {
+  cinfo->src->bytes_in_buffer--;
+  return *cinfo->src->next_input_byte++;
+}
+
+boolean test_marker_processor(j_decompress_ptr cinfo) {
+  markers_seen[num_markers_seen] = cinfo->unread_marker;
+  if (cinfo->src->bytes_in_buffer < 2) {
+    return FALSE;
+  }
+  size_t marker_len = (get_next_byte(cinfo) << 8) + get_next_byte(cinfo);
+  EXPECT_EQ(2 + ((num_markers_seen + 2) % sizeof(kMarkerData)), marker_len);
+  if (marker_len > 2) {
+    (*cinfo->src->skip_input_data)(cinfo, marker_len - 2);
+  }
+  ++num_markers_seen;
+  return TRUE;
+}
+
+void ReadOutputImage(const DecompressParams& dparams, j_decompress_ptr cinfo,
+                     SourceManager* src, TestImage* output) {
+  output->ysize = cinfo->output_height;
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->num_components;
+  if (cinfo->raw_data_out) {
+    output->color_space = cinfo->jpeg_color_space;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+  } else {
+    output->color_space = cinfo->out_color_space;
+    output->AllocatePixels();
+  }
+  size_t total_output_lines = 0;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    size_t max_lines;
+    size_t num_output_lines;
+    if (cinfo->raw_data_out) {
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      EXPECT_EQ(cinfo->output_scanline, cinfo->output_iMCU_row * iMCU_height);
+      max_lines = iMCU_height;
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      while ((num_output_lines =
+                  jpegli_read_raw_data(cinfo, &data[0], max_lines)) == 0) {
+        JXL_CHECK(src && src->LoadNextChunk());
+      }
+    } else {
+      size_t max_output_lines = dparams.max_output_lines;
+      if (max_output_lines == 0) max_output_lines = cinfo->output_height;
+      size_t lines_left = cinfo->output_height - cinfo->output_scanline;
+      max_lines = std::min<size_t>(max_output_lines, lines_left);
+      size_t stride = cinfo->output_width * cinfo->num_components;
+      std::vector<JSAMPROW> scanlines(max_lines);
+      for (size_t i = 0; i < max_lines; ++i) {
+        size_t yidx = cinfo->output_scanline + i;
+        scanlines[i] = &output->pixels[yidx * stride];
+      }
+      while ((num_output_lines = jpegli_read_scanlines(cinfo, &scanlines[0],
+                                                       max_lines)) == 0) {
+        JXL_CHECK(src && src->LoadNextChunk());
+      }
+    }
+    total_output_lines += num_output_lines;
+    EXPECT_EQ(total_output_lines, cinfo->output_scanline);
+    if (num_output_lines < max_lines) {
+      JXL_CHECK(src && src->LoadNextChunk());
+    }
+  }
+}
+
+struct TestConfig {
+  std::string fn;
+  std::string fn_desc;
+  TestImage input;
+  CompressParams jparams;
+  DecompressParams dparams;
+  float max_rms_dist = 1.0f;
+};
+
+std::vector<uint8_t> GetTestJpegData(TestConfig& config) {
+  if (!config.fn.empty()) {
+    return ReadTestData(config.fn.c_str());
+  }
+  GeneratePixels(&config.input);
+  std::vector<uint8_t> compressed;
+  JXL_CHECK(EncodeWithJpegli(config.input, config.jparams, &compressed));
+  return compressed;
+}
+
+bool IsSequential(const TestConfig& config) {
+  if (!config.fn.empty()) {
+    return config.fn_desc.find("PROGR") == std::string::npos;
+  }
+  return config.jparams.progressive_mode <= 0;
+}
+
+class InputSuspensionTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(InputSuspensionTestParam, InputOutputLockStepNonBuffered) {
+  TestConfig config = GetParam();
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    if (config.jparams.add_marker) {
+      jpegli_save_markers(&cinfo, kSpecialMarker0, 0xffff);
+      jpegli_save_markers(&cinfo, kSpecialMarker1, 0xffff);
+      num_markers_seen = 0;
+      jpegli_set_marker_processor(&cinfo, 0xe6, test_marker_processor);
+      jpegli_set_marker_processor(&cinfo, 0xe7, test_marker_processor);
+      jpegli_set_marker_processor(&cinfo, 0xe8, test_marker_processor);
+    }
+    while (jpegli_read_header(&cinfo, TRUE) == JPEG_SUSPENDED) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    SetDecompressParams(dparams, &cinfo);
+    jpegli_set_output_format(&cinfo, dparams.data_type, dparams.endianness);
+    if (config.jparams.add_marker) {
+      EXPECT_EQ(num_markers_seen, kMarkerSequenceLen);
+      EXPECT_EQ(0, memcmp(markers_seen, kMarkerSequence, num_markers_seen));
+    }
+    VerifyHeader(config.jparams, &cinfo);
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays;
+      while ((coef_arrays = jpegli_read_coefficients(&cinfo)) == nullptr) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    } else {
+      while (!jpegli_start_decompress(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      ReadOutputImage(dparams, &cinfo, &src, &output0);
+    }
+
+    while (!jpegli_finish_decompress(&cinfo)) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+  VerifyOutputImage(output1, output0, config.max_rms_dist);
+}
+
+TEST_P(InputSuspensionTestParam, InputOutputLockStepBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  std::vector<TestImage> output_progression0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    while (jpegli_read_header(&cinfo, TRUE) == JPEG_SUSPENDED) {
+      JXL_CHECK(src.LoadNextChunk());
+    }
+    SetDecompressParams(dparams, &cinfo);
+    jpegli_set_output_format(&cinfo, dparams.data_type, dparams.endianness);
+
+    cinfo.buffered_image = TRUE;
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+
+    EXPECT_TRUE(jpegli_start_decompress(&cinfo));
+    EXPECT_FALSE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    int sos_marker_cnt = 1;  // read_header reads the first SOS marker
+    while (!jpegli_input_complete(&cinfo)) {
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      EXPECT_TRUE(jpegli_start_output(&cinfo, cinfo.input_scan_number));
+      // start output sets output_scan_number, but does not change
+      // input_scan_number
+      EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      TestImage output;
+      ReadOutputImage(dparams, &cinfo, &src, &output);
+      output_progression0.emplace_back(std::move(output));
+      // read scanlines/read raw data does not change input/output scan number
+      EXPECT_EQ(cinfo.input_scan_number, sos_marker_cnt);
+      EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+      while (!jpegli_finish_output(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+      ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+      if (dparams.output_mode == COEFFICIENTS) {
+        jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+        JXL_CHECK(coef_arrays != nullptr);
+        CopyCoefficients(&cinfo, coef_arrays, &output_progression0.back());
+      }
+    }
+
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+  ASSERT_EQ(output_progression0.size(), output_progression1.size());
+  for (size_t i = 0; i < output_progression0.size(); ++i) {
+    const TestImage& output = output_progression0[i];
+    const TestImage& expected = output_progression1[i];
+    VerifyOutputImage(expected, output, config.max_rms_dist);
+  }
+}
+
+TEST_P(InputSuspensionTestParam, PreConsumeInputBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  std::vector<TestImage> output_progression1;
+  DecodeAllScansWithLibjpeg(config.jparams, dparams, compressed,
+                            &output_progression1);
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    int status;
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_SOS) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+    EXPECT_EQ(JPEG_REACHED_SOS, jpegli_consume_input(&cinfo));
+    cinfo.buffered_image = TRUE;
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+    cinfo.do_block_smoothing = dparams.do_block_smoothing;
+
+    EXPECT_TRUE(jpegli_start_decompress(&cinfo));
+    EXPECT_FALSE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(1, cinfo.input_scan_number);
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_EOI) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    EXPECT_TRUE(jpegli_input_complete(&cinfo));
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(0, cinfo.output_scan_number);
+
+    EXPECT_TRUE(jpegli_start_output(&cinfo, cinfo.input_scan_number));
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+
+    ReadOutputImage(dparams, &cinfo, nullptr, &output0);
+    EXPECT_EQ(output_progression1.size(), cinfo.input_scan_number);
+    EXPECT_EQ(cinfo.output_scan_number, cinfo.input_scan_number);
+
+    EXPECT_TRUE(jpegli_finish_output(&cinfo));
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    }
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  VerifyOutputImage(output_progression1.back(), output0, config.max_rms_dist);
+}
+
+TEST_P(InputSuspensionTestParam, PreConsumeInputNonBuffered) {
+  TestConfig config = GetParam();
+  if (config.jparams.add_marker || IsSequential(config)) return;
+  const DecompressParams& dparams = config.dparams;
+  std::vector<uint8_t> compressed = GetTestJpegData(config);
+  bool is_partial = config.dparams.size_factor < 1.0f;
+  if (is_partial) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  SourceManager src(compressed.data(), compressed.size(), dparams.chunk_size,
+                    is_partial);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+
+    int status;
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_SOS) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+    EXPECT_EQ(JPEG_REACHED_SOS, jpegli_consume_input(&cinfo));
+    cinfo.raw_data_out = dparams.output_mode == RAW_DATA;
+    cinfo.do_block_smoothing = dparams.do_block_smoothing;
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jpegli_read_coefficients(&cinfo);
+    } else {
+      while (!jpegli_start_decompress(&cinfo)) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    while ((status = jpegli_consume_input(&cinfo)) != JPEG_REACHED_EOI) {
+      if (status == JPEG_SUSPENDED) {
+        JXL_CHECK(src.LoadNextChunk());
+      }
+    }
+
+    if (dparams.output_mode == COEFFICIENTS) {
+      jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&cinfo);
+      JXL_CHECK(coef_arrays != nullptr);
+      CopyCoefficients(&cinfo, coef_arrays, &output0);
+    } else {
+      ReadOutputImage(dparams, &cinfo, nullptr, &output0);
+    }
+
+    EXPECT_TRUE(jpegli_finish_decompress(&cinfo));
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(config.jparams, dparams, compressed, &output1);
+  VerifyOutputImage(output1, output0, config.max_rms_dist);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  std::vector<std::pair<std::string, std::string>> testfiles({
+      {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+      {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+      {"jxl/flower/flower.png.im_q85_420_progr.jpg", "Q85YUV420PROGR"},
+  });
+  for (const auto& it : testfiles) {
+    for (size_t chunk_size : {1, 64, 65536}) {
+      for (size_t max_output_lines : {0, 1, 8, 16}) {
+        TestConfig config;
+        config.fn = it.first;
+        config.fn_desc = it.second;
+        config.dparams.chunk_size = chunk_size;
+        config.dparams.max_output_lines = max_output_lines;
+        all_tests.push_back(config);
+        if (max_output_lines == 16) {
+          config.dparams.output_mode = RAW_DATA;
+          all_tests.push_back(config);
+          config.dparams.output_mode = COEFFICIENTS;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  for (size_t r : {1, 17, 1024}) {
+    for (size_t chunk_size : {1, 65536}) {
+      TestConfig config;
+      config.dparams.chunk_size = chunk_size;
+      config.jparams.progressive_mode = 2;
+      config.jparams.restart_interval = r;
+      all_tests.push_back(config);
+    }
+  }
+  for (size_t chunk_size : {1, 4, 1024}) {
+    TestConfig config;
+    config.input.xsize = 256;
+    config.input.ysize = 256;
+    config.dparams.chunk_size = chunk_size;
+    config.jparams.add_marker = true;
+    all_tests.push_back(config);
+  }
+  // Tests for partial input.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+    for (int progr : {0, 1, 3}) {
+      for (int samp : {1, 2}) {
+        for (JpegIOMode output_mode : {PIXELS, RAW_DATA}) {
+          TestConfig config;
+          config.input.xsize = 517;
+          config.input.ysize = 523;
+          config.jparams.h_sampling = {samp, 1, 1};
+          config.jparams.v_sampling = {samp, 1, 1};
+          config.jparams.progressive_mode = progr;
+          config.dparams.size_factor = size_factor;
+          config.dparams.output_mode = output_mode;
+          // The last partially available block can behave differently.
+          // TODO(szabadka) Figure out if we can make the behaviour more
+          // similar.
+          config.max_rms_dist = samp == 1 ? 1.75f : 3.0f;
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  // Tests for block smoothing.
+  for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f, 1.0f}) {
+    for (int samp : {1, 2}) {
+      TestConfig config;
+      config.input.xsize = 517;
+      config.input.ysize = 523;
+      config.jparams.h_sampling = {samp, 1, 1};
+      config.jparams.v_sampling = {samp, 1, 1};
+      config.jparams.progressive_mode = 2;
+      config.dparams.size_factor = size_factor;
+      config.dparams.do_block_smoothing = true;
+      // libjpeg does smoothing for incomplete scans differently at
+      // the border between current and previous scans.
+      config.max_rms_dist = 8.0f;
+      all_tests.push_back(config);
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  if (!c.fn.empty()) {
+    os << c.fn_desc;
+  } else {
+    os << c.input;
+  }
+  os << c.jparams;
+  if (c.dparams.chunk_size == 0) {
+    os << "CompleteInput";
+  } else {
+    os << "InputChunks" << c.dparams.chunk_size;
+  }
+  if (c.dparams.size_factor < 1.0f) {
+    os << "Partial" << static_cast<int>(c.dparams.size_factor * 100) << "p";
+  }
+  if (c.dparams.max_output_lines == 0) {
+    os << "CompleteOutput";
+  } else {
+    os << "OutputLines" << c.dparams.max_output_lines;
+  }
+  if (c.dparams.output_mode == RAW_DATA) {
+    os << "RawDataOut";
+  } else if (c.dparams.output_mode == COEFFICIENTS) {
+    os << "CoeffsOut";
+  }
+  if (c.dparams.do_block_smoothing) {
+    os << "BlockSmoothing";
+  }
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<InputSuspensionTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(InputSuspensionTest, InputSuspensionTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/jpeg.version.62 b/lib/jpegli/jpeg.version.62
new file mode 100644 (file)
index 0000000..3a8d1f5
--- /dev/null
@@ -0,0 +1,11 @@
+LIBJPEG_6.2 {
+  global:
+    jpeg*;
+};
+
+LIBJPEGTURBO_6.2 {
+  global:
+    jpeg_mem_src*;
+    jpeg_mem_dest*;
+    tj*;
+};
\ No newline at end of file
diff --git a/lib/jpegli/jpeg.version.8 b/lib/jpegli/jpeg.version.8
new file mode 100644 (file)
index 0000000..aa891f8
--- /dev/null
@@ -0,0 +1,9 @@
+LIBJPEG_8.0 {
+  global:
+    jpeg*;
+};
+
+LIBJPEGTURBO_8.0 {
+  global:
+    tj*;
+};
diff --git a/lib/jpegli/libjpeg_test_util.cc b/lib/jpegli/libjpeg_test_util.cc
new file mode 100644 (file)
index 0000000..de23037
--- /dev/null
@@ -0,0 +1,261 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/libjpeg_test_util.h"
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <setjmp.h>
+/* clang-format on */
+
+#include "lib/jxl/sanitizers.h"
+
+namespace jpegli {
+
+namespace {
+
+#define JPEG_API_FN(name) jpeg_##name
+#include "lib/jpegli/test_utils-inl.h"
+#undef JPEG_API_FN
+
+void ReadOutputPass(j_decompress_ptr cinfo, const DecompressParams& dparams,
+                    TestImage* output) {
+  JDIMENSION xoffset = 0;
+  JDIMENSION yoffset = 0;
+  JDIMENSION xsize_cropped = cinfo->output_width;
+  JDIMENSION ysize_cropped = cinfo->output_height;
+  if (dparams.crop_output) {
+    xoffset = xsize_cropped = cinfo->output_width / 3;
+    yoffset = ysize_cropped = cinfo->output_height / 3;
+    jpeg_crop_scanline(cinfo, &xoffset, &xsize_cropped);
+    JXL_CHECK(xsize_cropped == cinfo->output_width);
+  }
+  output->xsize = xsize_cropped;
+  output->ysize = ysize_cropped;
+  output->components = cinfo->out_color_components;
+  if (cinfo->quantize_colors) {
+    jxl::msan::UnpoisonMemory(cinfo->colormap, cinfo->out_color_components *
+                                                   sizeof(cinfo->colormap[0]));
+    for (int c = 0; c < cinfo->out_color_components; ++c) {
+      jxl::msan::UnpoisonMemory(
+          cinfo->colormap[c],
+          cinfo->actual_number_of_colors * sizeof(cinfo->colormap[c][0]));
+    }
+  }
+  if (!cinfo->raw_data_out) {
+    size_t stride = output->xsize * output->components;
+    output->pixels.resize(output->ysize * stride);
+    output->color_space = cinfo->out_color_space;
+    if (yoffset > 0) {
+      jpeg_skip_scanlines(cinfo, yoffset);
+    }
+    for (size_t y = 0; y < output->ysize; ++y) {
+      JSAMPROW rows[] = {
+          reinterpret_cast<JSAMPLE*>(&output->pixels[y * stride])};
+      JXL_CHECK(1 == jpeg_read_scanlines(cinfo, rows, 1));
+      jxl::msan::UnpoisonMemory(
+          rows[0], sizeof(JSAMPLE) * cinfo->output_components * output->xsize);
+      if (cinfo->quantize_colors) {
+        UnmapColors(rows[0], cinfo->output_width, cinfo->out_color_components,
+                    cinfo->colormap, cinfo->actual_number_of_colors);
+      }
+    }
+    if (cinfo->output_scanline < cinfo->output_height) {
+      jpeg_skip_scanlines(cinfo, cinfo->output_height - cinfo->output_scanline);
+    }
+  } else {
+    output->color_space = cinfo->jpeg_color_space;
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+      size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+      std::vector<uint8_t> plane(ysize * xsize);
+      output->raw_data.emplace_back(std::move(plane));
+    }
+    while (cinfo->output_scanline < cinfo->output_height) {
+      size_t iMCU_height = cinfo->max_v_samp_factor * DCTSIZE;
+      JXL_CHECK(cinfo->output_scanline == cinfo->output_iMCU_row * iMCU_height);
+      std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+      std::vector<JSAMPARRAY> data(cinfo->num_components);
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t xsize = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t ysize = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = cinfo->comp_info[c].v_samp_factor * DCTSIZE;
+        rowdata[c].resize(num_lines);
+        size_t y0 = cinfo->output_iMCU_row * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              y0 + i < ysize ? &output->raw_data[c][(y0 + i) * xsize] : nullptr;
+        }
+        data[c] = &rowdata[c][0];
+      }
+      JXL_CHECK(iMCU_height ==
+                jpeg_read_raw_data(cinfo, &data[0], iMCU_height));
+    }
+  }
+  JXL_CHECK(cinfo->total_iMCU_rows ==
+            DivCeil(cinfo->image_height, cinfo->max_v_samp_factor * DCTSIZE));
+}
+
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams, j_decompress_ptr cinfo,
+                       TestImage* output) {
+  if (jparams.add_marker) {
+    jpeg_save_markers(cinfo, kSpecialMarker0, 0xffff);
+    jpeg_save_markers(cinfo, kSpecialMarker1, 0xffff);
+  }
+  if (!jparams.icc.empty()) {
+    jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xffff);
+  }
+  JXL_CHECK(JPEG_REACHED_SOS ==
+            jpeg_read_header(cinfo, /*require_image=*/TRUE));
+  if (!jparams.icc.empty()) {
+    uint8_t* icc_data = nullptr;
+    unsigned int icc_len;
+    JXL_CHECK(jpeg_read_icc_profile(cinfo, &icc_data, &icc_len));
+    JXL_CHECK(icc_data);
+    jxl::msan::UnpoisonMemory(icc_data, icc_len);
+    JXL_CHECK(0 == memcmp(jparams.icc.data(), icc_data, icc_len));
+    free(icc_data);
+  }
+  SetDecompressParams(dparams, cinfo);
+  VerifyHeader(jparams, cinfo);
+  if (dparams.output_mode == COEFFICIENTS) {
+    jvirt_barray_ptr* coef_arrays = jpeg_read_coefficients(cinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    CopyCoefficients(cinfo, coef_arrays, output);
+  } else {
+    JXL_CHECK(jpeg_start_decompress(cinfo));
+    VerifyScanHeader(jparams, cinfo);
+    ReadOutputPass(cinfo, dparams, output);
+  }
+  JXL_CHECK(jpeg_finish_decompress(cinfo));
+}
+
+}  // namespace
+
+// Verifies that an image encoded with libjpegli can be decoded with libjpeg,
+// and checks that the jpeg coding metadata matches jparams.
+void DecodeAllScansWithLibjpeg(const CompressParams& jparams,
+                               const DecompressParams& dparams,
+                               const std::vector<uint8_t>& compressed,
+                               std::vector<TestImage>* output_progression) {
+  jpeg_decompress_struct cinfo = {};
+  const auto try_catch_block = [&]() {
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpeg_std_error(&jerr);
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = reinterpret_cast<void*>(&env);
+    cinfo.err->error_exit = [](j_common_ptr cinfo) {
+      (*cinfo->err->output_message)(cinfo);
+      jmp_buf* env = reinterpret_cast<jmp_buf*>(cinfo->client_data);
+      jpeg_destroy(cinfo);
+      longjmp(*env, 1);
+    };
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, compressed.data(), compressed.size());
+    if (jparams.add_marker) {
+      jpeg_save_markers(&cinfo, kSpecialMarker0, 0xffff);
+      jpeg_save_markers(&cinfo, kSpecialMarker1, 0xffff);
+    }
+    JXL_CHECK(JPEG_REACHED_SOS ==
+              jpeg_read_header(&cinfo, /*require_image=*/TRUE));
+    cinfo.buffered_image = TRUE;
+    SetDecompressParams(dparams, &cinfo);
+    VerifyHeader(jparams, &cinfo);
+    JXL_CHECK(jpeg_start_decompress(&cinfo));
+    // start decompress should not read the whole input in buffered image mode
+    JXL_CHECK(!jpeg_input_complete(&cinfo));
+    JXL_CHECK(cinfo.output_scan_number == 0);
+    int sos_marker_cnt = 1;  // read header reads the first SOS marker
+    while (!jpeg_input_complete(&cinfo)) {
+      JXL_CHECK(cinfo.input_scan_number == sos_marker_cnt);
+      if (dparams.skip_scans && (cinfo.input_scan_number % 2) != 1) {
+        int result = JPEG_SUSPENDED;
+        while (result != JPEG_REACHED_SOS && result != JPEG_REACHED_EOI) {
+          result = jpeg_consume_input(&cinfo);
+        }
+        if (result == JPEG_REACHED_SOS) ++sos_marker_cnt;
+        continue;
+      }
+      SetScanDecompressParams(dparams, &cinfo, cinfo.input_scan_number);
+      JXL_CHECK(jpeg_start_output(&cinfo, cinfo.input_scan_number));
+      // start output sets output_scan_number, but does not change
+      // input_scan_number
+      JXL_CHECK(cinfo.output_scan_number == cinfo.input_scan_number);
+      JXL_CHECK(cinfo.input_scan_number == sos_marker_cnt);
+      VerifyScanHeader(jparams, &cinfo);
+      TestImage output;
+      ReadOutputPass(&cinfo, dparams, &output);
+      output_progression->emplace_back(std::move(output));
+      // read scanlines/read raw data does not change input/output scan number
+      if (!cinfo.progressive_mode) {
+        JXL_CHECK(cinfo.input_scan_number == sos_marker_cnt);
+        JXL_CHECK(cinfo.output_scan_number == cinfo.input_scan_number);
+      }
+      JXL_CHECK(jpeg_finish_output(&cinfo));
+      ++sos_marker_cnt;  // finish output reads the next SOS marker or EOI
+      if (dparams.output_mode == COEFFICIENTS) {
+        jvirt_barray_ptr* coef_arrays = jpeg_read_coefficients(&cinfo);
+        JXL_CHECK(coef_arrays != nullptr);
+        CopyCoefficients(&cinfo, coef_arrays, &output_progression->back());
+      }
+    }
+    JXL_CHECK(jpeg_finish_decompress(&cinfo));
+    return true;
+  };
+  JXL_CHECK(try_catch_block());
+  jpeg_destroy_decompress(&cinfo);
+}
+
+// Returns the number of bytes read from compressed.
+size_t DecodeWithLibjpeg(const CompressParams& jparams,
+                         const DecompressParams& dparams,
+                         const uint8_t* table_stream, size_t table_stream_size,
+                         const uint8_t* compressed, size_t len,
+                         TestImage* output) {
+  jpeg_decompress_struct cinfo = {};
+  size_t bytes_read;
+  const auto try_catch_block = [&]() {
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpeg_std_error(&jerr);
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = reinterpret_cast<void*>(&env);
+    cinfo.err->error_exit = [](j_common_ptr cinfo) {
+      (*cinfo->err->output_message)(cinfo);
+      jmp_buf* env = reinterpret_cast<jmp_buf*>(cinfo->client_data);
+      jpeg_destroy(cinfo);
+      longjmp(*env, 1);
+    };
+    jpeg_create_decompress(&cinfo);
+    if (table_stream != nullptr) {
+      jpeg_mem_src(&cinfo, table_stream, table_stream_size);
+      jpeg_read_header(&cinfo, FALSE);
+    }
+    jpeg_mem_src(&cinfo, compressed, len);
+    DecodeWithLibjpeg(jparams, dparams, &cinfo, output);
+    bytes_read = len - cinfo.src->bytes_in_buffer;
+    return true;
+  };
+  JXL_CHECK(try_catch_block());
+  jpeg_destroy_decompress(&cinfo);
+  return bytes_read;
+}
+
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams,
+                       const std::vector<uint8_t>& compressed,
+                       TestImage* output) {
+  DecodeWithLibjpeg(jparams, dparams, nullptr, 0, compressed.data(),
+                    compressed.size(), output);
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/libjpeg_test_util.h b/lib/jpegli/libjpeg_test_util.h
new file mode 100644 (file)
index 0000000..18cc1e5
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_LIBJPEG_TEST_UTIL_H_
+#define LIB_JPEGLI_LIBJPEG_TEST_UTIL_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "lib/jpegli/test_params.h"
+
+namespace jpegli {
+
+// Verifies that an image encoded with libjpegli can be decoded with libjpeg,
+// and checks that the jpeg coding metadata matches jparams.
+void DecodeAllScansWithLibjpeg(const CompressParams& jparams,
+                               const DecompressParams& dparams,
+                               const std::vector<uint8_t>& compressed,
+                               std::vector<TestImage>* output_progression);
+// Returns the number of bytes read from compressed.
+size_t DecodeWithLibjpeg(const CompressParams& jparams,
+                         const DecompressParams& dparams,
+                         const uint8_t* table_stream, size_t table_stream_size,
+                         const uint8_t* compressed, size_t len,
+                         TestImage* output);
+void DecodeWithLibjpeg(const CompressParams& jparams,
+                       const DecompressParams& dparams,
+                       const std::vector<uint8_t>& compressed,
+                       TestImage* output);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_LIBJPEG_TEST_UTIL_H_
diff --git a/lib/jpegli/libjpeg_wrapper.cc b/lib/jpegli/libjpeg_wrapper.cc
new file mode 100644 (file)
index 0000000..b38d16f
--- /dev/null
@@ -0,0 +1,255 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// This file contains wrapper-functions that are used to build the libjpeg.so
+// shared library that is API- and ABI-compatible with libjpeg-turbo's version
+// of libjpeg.so.
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/error.h"
+
+struct jpeg_error_mgr *jpeg_std_error(struct jpeg_error_mgr *err) {
+  return jpegli_std_error(err);
+}
+
+void jpeg_abort(j_common_ptr cinfo) { jpegli_abort(cinfo); }
+
+void jpeg_destroy(j_common_ptr cinfo) { jpegli_destroy(cinfo); }
+
+JQUANT_TBL *jpeg_alloc_quant_table(j_common_ptr cinfo) {
+  return jpegli_alloc_quant_table(cinfo);
+}
+
+JHUFF_TBL *jpeg_alloc_huff_table(j_common_ptr cinfo) {
+  return jpegli_alloc_huff_table(cinfo);
+}
+
+void jpeg_CreateDecompress(j_decompress_ptr cinfo, int version,
+                           size_t structsize) {
+  jpegli_CreateDecompress(cinfo, version, structsize);
+}
+
+void jpeg_stdio_src(j_decompress_ptr cinfo, FILE *infile) {
+  jpegli_stdio_src(cinfo, infile);
+}
+
+void jpeg_mem_src(j_decompress_ptr cinfo, const unsigned char *inbuffer,
+                  unsigned long insize) {
+  jpegli_mem_src(cinfo, inbuffer, insize);
+}
+
+int jpeg_read_header(j_decompress_ptr cinfo, boolean require_image) {
+  return jpegli_read_header(cinfo, require_image);
+}
+
+boolean jpeg_start_decompress(j_decompress_ptr cinfo) {
+  return jpegli_start_decompress(cinfo);
+}
+
+JDIMENSION jpeg_read_scanlines(j_decompress_ptr cinfo, JSAMPARRAY scanlines,
+                               JDIMENSION max_lines) {
+  return jpegli_read_scanlines(cinfo, scanlines, max_lines);
+}
+
+JDIMENSION jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines) {
+  return jpegli_skip_scanlines(cinfo, num_lines);
+}
+
+void jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                        JDIMENSION *width) {
+  jpegli_crop_scanline(cinfo, xoffset, width);
+}
+
+boolean jpeg_finish_decompress(j_decompress_ptr cinfo) {
+  return jpegli_finish_decompress(cinfo);
+}
+
+JDIMENSION jpeg_read_raw_data(j_decompress_ptr cinfo, JSAMPIMAGE data,
+                              JDIMENSION max_lines) {
+  return jpegli_read_raw_data(cinfo, data, max_lines);
+}
+
+jvirt_barray_ptr *jpeg_read_coefficients(j_decompress_ptr cinfo) {
+  return jpegli_read_coefficients(cinfo);
+}
+
+boolean jpeg_has_multiple_scans(j_decompress_ptr cinfo) {
+  return jpegli_has_multiple_scans(cinfo);
+}
+
+boolean jpeg_start_output(j_decompress_ptr cinfo, int scan_number) {
+  return jpegli_start_output(cinfo, scan_number);
+}
+
+boolean jpeg_finish_output(j_decompress_ptr cinfo) {
+  return jpegli_finish_output(cinfo);
+}
+
+boolean jpeg_input_complete(j_decompress_ptr cinfo) {
+  return jpegli_input_complete(cinfo);
+}
+
+int jpeg_consume_input(j_decompress_ptr cinfo) {
+  return jpegli_consume_input(cinfo);
+}
+
+#if JPEG_LIB_VERSION >= 80
+void jpeg_core_output_dimensions(j_decompress_ptr cinfo) {
+  jpegli_core_output_dimensions(cinfo);
+}
+#endif
+void jpeg_calc_output_dimensions(j_decompress_ptr cinfo) {
+  jpegli_calc_output_dimensions(cinfo);
+}
+
+void jpeg_save_markers(j_decompress_ptr cinfo, int marker_code,
+                       unsigned int length_limit) {
+  jpegli_save_markers(cinfo, marker_code, length_limit);
+}
+
+void jpeg_set_marker_processor(j_decompress_ptr cinfo, int marker_code,
+                               jpeg_marker_parser_method routine) {
+  jpegli_set_marker_processor(cinfo, marker_code, routine);
+}
+
+boolean jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
+                              unsigned int *icc_data_len) {
+  return jpegli_read_icc_profile(cinfo, icc_data_ptr, icc_data_len);
+}
+
+void jpeg_abort_decompress(j_decompress_ptr cinfo) {
+  return jpegli_abort_decompress(cinfo);
+}
+
+void jpeg_destroy_decompress(j_decompress_ptr cinfo) {
+  return jpegli_destroy_decompress(cinfo);
+}
+
+void jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize) {
+  jpegli_CreateCompress(cinfo, version, structsize);
+}
+
+void jpeg_stdio_dest(j_compress_ptr cinfo, FILE *outfile) {
+  jpegli_stdio_dest(cinfo, outfile);
+}
+
+void jpeg_mem_dest(j_compress_ptr cinfo, unsigned char **outbuffer,
+                   unsigned long *outsize) {
+  jpegli_mem_dest(cinfo, outbuffer, outsize);
+}
+
+void jpeg_set_defaults(j_compress_ptr cinfo) { jpegli_set_defaults(cinfo); }
+
+void jpeg_default_colorspace(j_compress_ptr cinfo) {
+  jpegli_default_colorspace(cinfo);
+}
+
+void jpeg_set_colorspace(j_compress_ptr cinfo, J_COLOR_SPACE colorspace) {
+  jpegli_set_colorspace(cinfo, colorspace);
+}
+
+void jpeg_set_quality(j_compress_ptr cinfo, int quality,
+                      boolean force_baseline) {
+  jpegli_set_quality(cinfo, quality, force_baseline);
+}
+
+void jpeg_set_linear_quality(j_compress_ptr cinfo, int scale_factor,
+                             boolean force_baseline) {
+  jpegli_set_linear_quality(cinfo, scale_factor, force_baseline);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpeg_default_qtables(j_compress_ptr cinfo, boolean force_baseline) {
+  jpegli_default_qtables(cinfo, force_baseline);
+}
+#endif
+
+int jpeg_quality_scaling(int quality) {
+  return jpegli_quality_scaling(quality);
+}
+
+void jpeg_add_quant_table(j_compress_ptr cinfo, int which_tbl,
+                          const unsigned int *basic_table, int scale_factor,
+                          boolean force_baseline) {
+  jpegli_add_quant_table(cinfo, which_tbl, basic_table, scale_factor,
+                         force_baseline);
+}
+
+void jpeg_simple_progression(j_compress_ptr cinfo) {
+  jpegli_simple_progression(cinfo);
+}
+
+void jpeg_suppress_tables(j_compress_ptr cinfo, boolean suppress) {
+  jpegli_suppress_tables(cinfo, suppress);
+}
+
+#if JPEG_LIB_VERSION >= 70
+void jpeg_calc_jpeg_dimensions(j_compress_ptr cinfo) {
+  jpegli_calc_jpeg_dimensions(cinfo);
+}
+#endif
+
+void jpeg_copy_critical_parameters(j_decompress_ptr srcinfo,
+                                   j_compress_ptr dstinfo) {
+  jpegli_copy_critical_parameters(srcinfo, dstinfo);
+}
+
+void jpeg_write_m_header(j_compress_ptr cinfo, int marker,
+                         unsigned int datalen) {
+  jpegli_write_m_header(cinfo, marker, datalen);
+}
+
+void jpeg_write_m_byte(j_compress_ptr cinfo, int val) {
+  jpegli_write_m_byte(cinfo, val);
+}
+
+void jpeg_write_marker(j_compress_ptr cinfo, int marker, const JOCTET *dataptr,
+                       unsigned int datalen) {
+  jpegli_write_marker(cinfo, marker, dataptr, datalen);
+}
+
+void jpeg_write_icc_profile(j_compress_ptr cinfo, const JOCTET *icc_data_ptr,
+                            unsigned int icc_data_len) {
+  jpegli_write_icc_profile(cinfo, icc_data_ptr, icc_data_len);
+}
+
+void jpeg_start_compress(j_compress_ptr cinfo, boolean write_all_tables) {
+  jpegli_start_compress(cinfo, write_all_tables);
+}
+
+void jpeg_write_tables(j_compress_ptr cinfo) { jpegli_write_tables(cinfo); }
+
+JDIMENSION jpeg_write_scanlines(j_compress_ptr cinfo, JSAMPARRAY scanlines,
+                                JDIMENSION num_lines) {
+  return jpegli_write_scanlines(cinfo, scanlines, num_lines);
+}
+
+JDIMENSION jpeg_write_raw_data(j_compress_ptr cinfo, JSAMPIMAGE data,
+                               JDIMENSION num_lines) {
+  return jpegli_write_raw_data(cinfo, data, num_lines);
+}
+
+void jpeg_write_coefficients(j_compress_ptr cinfo,
+                             jvirt_barray_ptr *coef_arrays) {
+  jpegli_write_coefficients(cinfo, coef_arrays);
+}
+
+void jpeg_finish_compress(j_compress_ptr cinfo) {
+  jpegli_finish_compress(cinfo);
+}
+
+void jpeg_abort_compress(j_compress_ptr cinfo) { jpegli_abort_compress(cinfo); }
+
+void jpeg_destroy_compress(j_compress_ptr cinfo) {
+  jpegli_destroy_compress(cinfo);
+}
+
+boolean jpeg_resync_to_restart(j_decompress_ptr cinfo, int desired) {
+  return jpegli_resync_to_restart(cinfo, desired);
+}
+
+void jpeg_new_colormap(j_decompress_ptr cinfo) { jpegli_new_colormap(cinfo); }
diff --git a/lib/jpegli/memory_manager.cc b/lib/jpegli/memory_manager.cc
new file mode 100644 (file)
index 0000000..3a8f230
--- /dev/null
@@ -0,0 +1,186 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/memory_manager.h"
+
+#include <string.h>
+
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jpegli/common_internal.h"
+#include "lib/jpegli/error.h"
+
+struct jvirt_sarray_control {
+  JSAMPARRAY full_buffer;
+  size_t numrows;
+  JDIMENSION maxaccess;
+};
+
+struct jvirt_barray_control {
+  JBLOCKARRAY full_buffer;
+  size_t numrows;
+  JDIMENSION maxaccess;
+};
+
+namespace jpegli {
+
+namespace {
+
+struct MemoryManager {
+  struct jpeg_memory_mgr pub;
+  std::vector<void*> owned_ptrs[2 * JPOOL_NUMPOOLS];
+  uint64_t pool_memory_usage[2 * JPOOL_NUMPOOLS];
+  uint64_t total_memory_usage;
+  uint64_t peak_memory_usage;
+};
+
+void* Alloc(j_common_ptr cinfo, int pool_id, size_t sizeofobject) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  if (pool_id < 0 || pool_id >= 2 * JPOOL_NUMPOOLS) {
+    JPEGLI_ERROR("Invalid pool id %d", pool_id);
+  }
+  if (mem->pub.max_memory_to_use > 0 &&
+      mem->total_memory_usage + static_cast<uint64_t>(sizeofobject) >
+          static_cast<uint64_t>(mem->pub.max_memory_to_use)) {
+    JPEGLI_ERROR("Total memory usage exceeding %ld",
+                 mem->pub.max_memory_to_use);
+  }
+  void* p;
+  if (pool_id < JPOOL_NUMPOOLS) {
+    p = malloc(sizeofobject);
+  } else {
+    p = hwy::AllocateAlignedBytes(sizeofobject, nullptr, nullptr);
+  }
+  if (p == nullptr) {
+    JPEGLI_ERROR("Out of memory");
+  }
+  mem->owned_ptrs[pool_id].push_back(p);
+  mem->pool_memory_usage[pool_id] += sizeofobject;
+  mem->total_memory_usage += sizeofobject;
+  mem->peak_memory_usage =
+      std::max(mem->peak_memory_usage, mem->total_memory_usage);
+  return p;
+}
+
+constexpr size_t gcd(size_t a, size_t b) { return b == 0 ? a : gcd(b, a % b); }
+constexpr size_t lcm(size_t a, size_t b) { return (a * b) / gcd(a, b); }
+
+template <typename T>
+T** Alloc2dArray(j_common_ptr cinfo, int pool_id, JDIMENSION samplesperrow,
+                 JDIMENSION numrows) {
+  T** array = Allocate<T*>(cinfo, numrows, pool_id);
+  // Always use aligned allocator for large 2d arrays.
+  if (pool_id < JPOOL_NUMPOOLS) {
+    pool_id += JPOOL_NUMPOOLS;
+  }
+  size_t alignment = lcm(sizeof(T), HWY_ALIGNMENT);
+  size_t memstride = RoundUpTo(samplesperrow * sizeof(T), alignment);
+  size_t stride = memstride / sizeof(T);
+  T* buffer = Allocate<T>(cinfo, numrows * stride, pool_id);
+  for (size_t i = 0; i < numrows; ++i) {
+    array[i] = &buffer[i * stride];
+  }
+  return array;
+}
+
+template <typename Control, typename T>
+Control* RequestVirtualArray(j_common_ptr cinfo, int pool_id, boolean pre_zero,
+                             JDIMENSION samplesperrow, JDIMENSION numrows,
+                             JDIMENSION maxaccess) {
+  if (pool_id != JPOOL_IMAGE) {
+    JPEGLI_ERROR("Only image lifetime virtual arrays are supported.");
+  }
+  Control* p = Allocate<Control>(cinfo, 1, pool_id);
+  p->full_buffer = Alloc2dArray<T>(cinfo, pool_id, samplesperrow, numrows);
+  p->numrows = numrows;
+  p->maxaccess = maxaccess;
+  if (pre_zero) {
+    for (size_t i = 0; i < numrows; ++i) {
+      memset(p->full_buffer[i], 0, samplesperrow * sizeof(T));
+    }
+  }
+  return p;
+}
+
+void RealizeVirtualArrays(j_common_ptr cinfo) {
+  // Nothing to do, the full arrays were realized at request time already.
+}
+
+template <typename Control, typename T>
+T** AccessVirtualArray(j_common_ptr cinfo, Control* ptr, JDIMENSION start_row,
+                       JDIMENSION num_rows, boolean writable) {
+  if (num_rows > ptr->maxaccess) {
+    JPEGLI_ERROR("Invalid virtual array access, num rows %u vs max rows %u",
+                 num_rows, ptr->maxaccess);
+  }
+  if (start_row + num_rows > ptr->numrows) {
+    JPEGLI_ERROR("Invalid virtual array access, %u vs %u total rows",
+                 start_row + num_rows, ptr->numrows);
+  }
+  if (ptr->full_buffer == nullptr) {
+    JPEGLI_ERROR("Invalid virtual array access, array not realized.");
+  }
+  return ptr->full_buffer + start_row;
+}
+
+void ClearPool(j_common_ptr cinfo, int pool_id) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  mem->owned_ptrs[pool_id].clear();
+  mem->total_memory_usage -= mem->pool_memory_usage[pool_id];
+  mem->pool_memory_usage[pool_id] = 0;
+}
+
+void FreePool(j_common_ptr cinfo, int pool_id) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  if (pool_id < 0 || pool_id >= JPOOL_NUMPOOLS) {
+    JPEGLI_ERROR("Invalid pool id %d", pool_id);
+  }
+  for (void* ptr : mem->owned_ptrs[pool_id]) {
+    free(ptr);
+  }
+  ClearPool(cinfo, pool_id);
+  for (void* ptr : mem->owned_ptrs[JPOOL_NUMPOOLS + pool_id]) {
+    hwy::FreeAlignedBytes(ptr, nullptr, nullptr);
+  }
+  ClearPool(cinfo, JPOOL_NUMPOOLS + pool_id);
+}
+
+void SelfDestruct(j_common_ptr cinfo) {
+  MemoryManager* mem = reinterpret_cast<MemoryManager*>(cinfo->mem);
+  for (int pool_id = 0; pool_id < JPOOL_NUMPOOLS; ++pool_id) {
+    FreePool(cinfo, pool_id);
+  }
+  delete mem;
+  cinfo->mem = nullptr;
+}
+
+}  // namespace
+
+void InitMemoryManager(j_common_ptr cinfo) {
+  MemoryManager* mem = new MemoryManager;
+  mem->pub.alloc_small = jpegli::Alloc;
+  mem->pub.alloc_large = jpegli::Alloc;
+  mem->pub.alloc_sarray = jpegli::Alloc2dArray<JSAMPLE>;
+  mem->pub.alloc_barray = jpegli::Alloc2dArray<JBLOCK>;
+  mem->pub.request_virt_sarray =
+      jpegli::RequestVirtualArray<jvirt_sarray_control, JSAMPLE>;
+  mem->pub.request_virt_barray =
+      jpegli::RequestVirtualArray<jvirt_barray_control, JBLOCK>;
+  mem->pub.realize_virt_arrays = jpegli::RealizeVirtualArrays;
+  mem->pub.access_virt_sarray =
+      jpegli::AccessVirtualArray<jvirt_sarray_control, JSAMPLE>;
+  mem->pub.access_virt_barray =
+      jpegli::AccessVirtualArray<jvirt_barray_control, JBLOCK>;
+  mem->pub.free_pool = jpegli::FreePool;
+  mem->pub.self_destruct = jpegli::SelfDestruct;
+  mem->pub.max_memory_to_use = 0;
+  mem->total_memory_usage = 0;
+  mem->peak_memory_usage = 0;
+  memset(mem->pool_memory_usage, 0, sizeof(mem->pool_memory_usage));
+  cinfo->mem = reinterpret_cast<struct jpeg_memory_mgr*>(mem);
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/memory_manager.h b/lib/jpegli/memory_manager.h
new file mode 100644 (file)
index 0000000..3e2bdab
--- /dev/null
@@ -0,0 +1,45 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_MEMORY_MANAGER_H_
+#define LIB_JPEGLI_MEMORY_MANAGER_H_
+
+#include <stdlib.h>
+
+#include "lib/jpegli/common.h"
+
+#define JPOOL_PERMANENT_ALIGNED (JPOOL_NUMPOOLS + JPOOL_PERMANENT)
+#define JPOOL_IMAGE_ALIGNED (JPOOL_NUMPOOLS + JPOOL_IMAGE)
+
+namespace jpegli {
+
+void InitMemoryManager(j_common_ptr cinfo);
+
+template <typename T>
+T* Allocate(j_common_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  void* p = (*cinfo->mem->alloc_small)(cinfo, pool_id, len * sizeof(T));
+  return reinterpret_cast<T*>(p);
+}
+
+template <typename T>
+T* Allocate(j_decompress_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  return Allocate<T>(reinterpret_cast<j_common_ptr>(cinfo), len, pool_id);
+}
+
+template <typename T>
+T* Allocate(j_compress_ptr cinfo, size_t len, int pool_id = JPOOL_PERMANENT) {
+  return Allocate<T>(reinterpret_cast<j_common_ptr>(cinfo), len, pool_id);
+}
+
+template <typename T>
+JBLOCKARRAY GetBlockRow(T cinfo, int c, JDIMENSION by) {
+  return (*cinfo->mem->access_virt_barray)(
+      reinterpret_cast<j_common_ptr>(cinfo), cinfo->master->coeff_buffers[c],
+      by, 1, true);
+}
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_MEMORY_MANAGER_H_
diff --git a/lib/jpegli/output_suspension_test.cc b/lib/jpegli/output_suspension_test.cc
new file mode 100644 (file)
index 0000000..73db791
--- /dev/null
@@ -0,0 +1,219 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+
+namespace jpegli {
+namespace {
+
+static constexpr size_t kInitialBufferSize = 1024;
+static constexpr size_t kFinalBufferSize = 18;
+
+struct DestinationManager {
+  jpeg_destination_mgr pub;
+  std::vector<uint8_t> buffer;
+
+  DestinationManager() {
+    pub.init_destination = init_destination;
+    pub.empty_output_buffer = empty_output_buffer;
+    pub.term_destination = term_destination;
+  }
+
+  void Rewind() {
+    pub.next_output_byte = buffer.data();
+    pub.free_in_buffer = buffer.size();
+  }
+
+  void EmptyTo(std::vector<uint8_t>* output, size_t new_size = 0) {
+    output->insert(output->end(), buffer.data(), pub.next_output_byte);
+    if (new_size > 0) {
+      buffer.resize(new_size);
+    }
+    Rewind();
+  }
+
+  static void init_destination(j_compress_ptr cinfo) {
+    auto us = reinterpret_cast<DestinationManager*>(cinfo->dest);
+    us->buffer.resize(kInitialBufferSize);
+    us->Rewind();
+  }
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) { return FALSE; }
+
+  static void term_destination(j_compress_ptr cinfo) {}
+};
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+  size_t buffer_size;
+  size_t lines_batch_size;
+};
+
+class OutputSuspensionTestParam : public ::testing::TestWithParam<TestConfig> {
+};
+
+TEST_P(OutputSuspensionTestParam, PixelData) {
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  TestImage& input = config.input;
+  GeneratePixels(&input);
+  DestinationManager dest;
+  std::vector<uint8_t> compressed;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    size_t stride = cinfo.image_width * cinfo.input_components;
+    std::vector<uint8_t> row_bytes(config.lines_batch_size * stride);
+    while (cinfo.next_scanline < cinfo.image_height) {
+      size_t lines_left = cinfo.image_height - cinfo.next_scanline;
+      size_t num_lines = std::min(config.lines_batch_size, lines_left);
+      memcpy(&row_bytes[0], &input.pixels[cinfo.next_scanline * stride],
+             num_lines * stride);
+      std::vector<JSAMPROW> rows(num_lines);
+      for (size_t i = 0; i < num_lines; ++i) {
+        rows[i] = &row_bytes[i * stride];
+      }
+      size_t lines_done = 0;
+      while (lines_done < num_lines) {
+        lines_done += jpegli_write_scanlines(&cinfo, &rows[lines_done],
+                                             num_lines - lines_done);
+        if (lines_done < num_lines) {
+          dest.EmptyTo(&compressed, config.buffer_size);
+        }
+      }
+    }
+    dest.EmptyTo(&compressed, kFinalBufferSize);
+    jpegli_finish_compress(&cinfo);
+    dest.EmptyTo(&compressed);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_compress(&cinfo);
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output);
+  VerifyOutputImage(input, output, 2.5);
+}
+
+TEST_P(OutputSuspensionTestParam, RawData) {
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  if (config.lines_batch_size != 1) return;
+  TestImage& input = config.input;
+  input.color_space = JCS_YCbCr;
+  GeneratePixels(&input);
+  GenerateRawData(config.jparams, &input);
+  DestinationManager dest;
+  std::vector<uint8_t> compressed;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = JCS_YCbCr;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    cinfo.raw_data_in = TRUE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    std::vector<std::vector<uint8_t>> raw_data = input.raw_data;
+    size_t max_lines = config.jparams.max_v_sample() * DCTSIZE;
+    std::vector<std::vector<JSAMPROW>> rowdata(cinfo.num_components);
+    std::vector<JSAMPARRAY> data(cinfo.num_components);
+    for (int c = 0; c < cinfo.num_components; ++c) {
+      rowdata[c].resize(config.jparams.v_samp(c) * DCTSIZE);
+      data[c] = &rowdata[c][0];
+    }
+    while (cinfo.next_scanline < cinfo.image_height) {
+      for (int c = 0; c < cinfo.num_components; ++c) {
+        size_t cwidth = cinfo.comp_info[c].width_in_blocks * DCTSIZE;
+        size_t cheight = cinfo.comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = config.jparams.v_samp(c) * DCTSIZE;
+        size_t y0 = (cinfo.next_scanline / max_lines) * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              (y0 + i < cheight ? &raw_data[c][(y0 + i) * cwidth] : nullptr);
+        }
+      }
+      while (jpegli_write_raw_data(&cinfo, &data[0], max_lines) == 0) {
+        dest.EmptyTo(&compressed, config.buffer_size);
+      }
+    }
+    dest.EmptyTo(&compressed, kFinalBufferSize);
+    jpegli_finish_compress(&cinfo);
+    dest.EmptyTo(&compressed);
+    return true;
+  };
+  try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  DecompressParams dparams;
+  dparams.output_mode = RAW_DATA;
+  TestImage output;
+  DecodeWithLibjpeg(CompressParams(), dparams, compressed, &output);
+  VerifyOutputImage(input, output, 3.5);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  const size_t xsize0 = 1920;
+  const size_t ysize0 = 1080;
+  for (int dysize : {0, 1, 8, 9}) {
+    for (int v_sampling : {1, 2}) {
+      for (int nlines : {1, 8, 117}) {
+        for (int bufsize : {1, 16, 16 << 10}) {
+          TestConfig config;
+          config.lines_batch_size = nlines;
+          config.buffer_size = bufsize;
+          config.input.xsize = xsize0;
+          config.input.ysize = ysize0 + dysize;
+          config.jparams.h_sampling = {1, 1, 1};
+          config.jparams.v_sampling = {v_sampling, 1, 1};
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  os << "Lines" << c.lines_batch_size;
+  os << "BufSize" << c.buffer_size;
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<OutputSuspensionTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(OutputSuspensionTest, OutputSuspensionTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/quant.cc b/lib/jpegli/quant.cc
new file mode 100644 (file)
index 0000000..36f1df4
--- /dev/null
@@ -0,0 +1,768 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/quant.h"
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "lib/jpegli/adaptive_quantization.h"
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/encode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+
+namespace {
+
+// Global scale is chosen in a way that butteraugli 3-norm matches libjpeg
+// with the same quality setting. Fitted for quality 90 on jyrki31 corpus.
+constexpr float kGlobalScaleXYB = 1.43951668f;
+constexpr float kGlobalScaleYCbCr = 1.73966010f;
+
+static constexpr float kBaseQuantMatrixXYB[] = {
+    // c = 0
+    7.5629935265f,
+    19.8247814178f,
+    22.5724945068f,
+    20.6706695557f,
+    22.6864585876f,
+    23.5696277618f,
+    25.8129081726f,
+    36.3307571411f,
+    19.8247814178f,
+    21.5503177643f,
+    19.9372234344f,
+    20.5424213409f,
+    21.8645496368f,
+    23.9041385651f,
+    28.2844066620f,
+    32.6609764099f,
+    22.5724945068f,
+    19.9372234344f,
+    21.9017257690f,
+    19.1223449707f,
+    21.7515811920f,
+    24.6724700928f,
+    25.4249649048f,
+    32.6653823853f,
+    20.6706695557f,
+    20.5424213409f,
+    19.1223449707f,
+    20.1610221863f,
+    25.3719692230f,
+    25.9668903351f,
+    30.9804954529f,
+    31.3406009674f,
+    22.6864585876f,
+    21.8645496368f,
+    21.7515811920f,
+    25.3719692230f,
+    26.2431850433f,
+    40.5992202759f,
+    43.2624626160f,
+    63.3010940552f,
+    23.5696277618f,
+    23.9041385651f,
+    24.6724700928f,
+    25.9668903351f,
+    40.5992202759f,
+    48.3026771545f,
+    34.0964355469f,
+    61.9852142334f,
+    25.8129081726f,
+    28.2844066620f,
+    25.4249649048f,
+    30.9804954529f,
+    43.2624626160f,
+    34.0964355469f,
+    34.4937438965f,
+    66.9702758789f,
+    36.3307571411f,
+    32.6609764099f,
+    32.6653823853f,
+    31.3406009674f,
+    63.3010940552f,
+    61.9852142334f,
+    66.9702758789f,
+    39.9652709961f,
+    // c = 1
+    1.6262000799f,
+    3.2199242115f,
+    3.4903779030f,
+    3.9148359299f,
+    4.8337211609f,
+    4.9108843803f,
+    5.3137121201f,
+    6.1676793098f,
+    3.2199242115f,
+    3.4547898769f,
+    3.6036829948f,
+    4.2652835846f,
+    4.8368387222f,
+    4.8226222992f,
+    5.6120514870f,
+    6.3431472778f,
+    3.4903779030f,
+    3.6036829948f,
+    3.9044559002f,
+    4.3374395370f,
+    4.8435096741f,
+    5.4057979584f,
+    5.6066360474f,
+    6.1075134277f,
+    3.9148359299f,
+    4.2652835846f,
+    4.3374395370f,
+    4.6064834595f,
+    5.1751475334f,
+    5.4013924599f,
+    6.0399808884f,
+    6.7825231552f,
+    4.8337211609f,
+    4.8368387222f,
+    4.8435096741f,
+    5.1751475334f,
+    5.3748049736f,
+    6.1410837173f,
+    7.6529307365f,
+    7.5235214233f,
+    4.9108843803f,
+    4.8226222992f,
+    5.4057979584f,
+    5.4013924599f,
+    6.1410837173f,
+    6.3431472778f,
+    7.1083049774f,
+    7.6008300781f,
+    5.3137121201f,
+    5.6120514870f,
+    5.6066360474f,
+    6.0399808884f,
+    7.6529307365f,
+    7.1083049774f,
+    7.0943155289f,
+    7.0478363037f,
+    6.1676793098f,
+    6.3431472778f,
+    6.1075134277f,
+    6.7825231552f,
+    7.5235214233f,
+    7.6008300781f,
+    7.0478363037f,
+    6.9186143875f,
+    // c = 2
+    3.3038473129f,
+    10.0689258575f,
+    12.2785224915f,
+    14.6041173935f,
+    16.2107315063f,
+    19.2314529419f,
+    28.0129547119f,
+    55.6682891846f,
+    10.0689258575f,
+    11.4085016251f,
+    11.3871345520f,
+    15.4934167862f,
+    16.5364933014f,
+    14.9153423309f,
+    26.3748722076f,
+    40.8614425659f,
+    12.2785224915f,
+    11.3871345520f,
+    17.0886878967f,
+    13.9500350952f,
+    16.0003223419f,
+    28.5660629272f,
+    26.2124195099f,
+    30.1260128021f,
+    14.6041173935f,
+    15.4934167862f,
+    13.9500350952f,
+    21.1235027313f,
+    26.1579780579f,
+    25.5579223633f,
+    40.6859359741f,
+    33.8056335449f,
+    16.2107315063f,
+    16.5364933014f,
+    16.0003223419f,
+    26.1579780579f,
+    26.8042831421f,
+    26.1587715149f,
+    35.7343978882f,
+    43.6857032776f,
+    19.2314529419f,
+    14.9153423309f,
+    28.5660629272f,
+    25.5579223633f,
+    26.1587715149f,
+    34.5418128967f,
+    41.3197937012f,
+    48.7867660522f,
+    28.0129547119f,
+    26.3748722076f,
+    26.2124195099f,
+    40.6859359741f,
+    35.7343978882f,
+    41.3197937012f,
+    47.6329460144f,
+    55.3498458862f,
+    55.6682891846f,
+    40.8614425659f,
+    30.1260128021f,
+    33.8056335449f,
+    43.6857032776f,
+    48.7867660522f,
+    55.3498458862f,
+    63.6065597534f,
+};
+
+static const float kBaseQuantMatrixYCbCr[] = {
+    // c = 0
+    1.2397409345866273f,  //
+    1.7227115097630963f,  //
+    2.9212167156636855f,  //
+    2.812737435286529f,   //
+    3.339819711906184f,   //
+    3.463603762596166f,   //
+    3.840915217993518f,   //
+    3.86956f,             //
+    1.7227115097630963f,  //
+    2.0928894413636874f,  //
+    2.8456760904429297f,  //
+    2.704506820909662f,   //
+    3.4407673520905337f,  //
+    3.166232352090534f,   //
+    4.025208741558432f,   //
+    4.035324490952577f,   //
+    2.9212167156636855f,  //
+    2.8456760904429297f,  //
+    2.9587403520905338f,  //
+    3.3862948970669273f,  //
+    3.619523781336757f,   //
+    3.9046279999999998f,  //
+    3.757835838431854f,   //
+    4.237447515714274f,   //
+    2.812737435286529f,   //
+    2.704506820909662f,   //
+    3.3862948970669273f,  //
+    3.380058821812233f,   //
+    4.1679867415584315f,  //
+    4.805510627261856f,   //
+    4.784259f,            //
+    4.605934f,            //
+    3.339819711906184f,   //
+    3.4407673520905337f,  //
+    3.619523781336757f,   //
+    4.1679867415584315f,  //
+    4.579851258441568f,   //
+    4.923237f,            //
+    5.574107f,            //
+    5.48533336146308f,    //
+    3.463603762596166f,   //
+    3.166232352090534f,   //
+    3.9046279999999998f,  //
+    4.805510627261856f,   //
+    4.923237f,            //
+    5.43936f,             //
+    5.093895741558431f,   //
+    6.0872254423617225f,  //
+    3.840915217993518f,   //
+    4.025208741558432f,   //
+    3.757835838431854f,   //
+    4.784259f,            //
+    5.574107f,            //
+    5.093895741558431f,   //
+    5.438461f,            //
+    5.4037359493250845f,  //
+    3.86956f,             //
+    4.035324490952577f,   //
+    4.237447515714274f,   //
+    4.605934f,            //
+    5.48533336146308f,    //
+    6.0872254423617225f,  //
+    5.4037359493250845f,  //
+    4.37787101190424f,
+    // c = 1
+    2.8236197786377537f,  //
+    6.495639358561486f,   //
+    9.310489207538302f,   //
+    10.64747864717083f,   //
+    11.07419143098738f,   //
+    17.146390223910462f,  //
+    18.463982229408998f,  //
+    29.087001644203088f,  //
+    6.495639358561486f,   //
+    8.890103846667353f,   //
+    8.976895794294748f,   //
+    13.666270550318826f,  //
+    16.547071905624193f,  //
+    16.63871382827686f,   //
+    26.778396930893695f,  //
+    21.33034294694781f,   //
+    9.310489207538302f,   //
+    8.976895794294748f,   //
+    11.08737706005991f,   //
+    18.20548239870446f,   //
+    19.752481654011646f,  //
+    23.985660533114896f,  //
+    102.6457378402362f,   //
+    24.450989f,           //
+    10.64747864717083f,   //
+    13.666270550318826f,  //
+    18.20548239870446f,   //
+    18.628012327860365f,  //
+    16.042509519487183f,  //
+    25.04918273242625f,   //
+    25.017140189353015f,  //
+    35.79788782635831f,   //
+    11.07419143098738f,   //
+    16.547071905624193f,  //
+    19.752481654011646f,  //
+    16.042509519487183f,  //
+    19.373482748612577f,  //
+    14.677529999999999f,  //
+    19.94695960400931f,   //
+    51.094112f,           //
+    17.146390223910462f,  //
+    16.63871382827686f,   //
+    23.985660533114896f,  //
+    25.04918273242625f,   //
+    14.677529999999999f,  //
+    31.320412426835304f,  //
+    46.357234000000005f,  //
+    67.48111451705412f,   //
+    18.463982229408998f,  //
+    26.778396930893695f,  //
+    102.6457378402362f,   //
+    25.017140189353015f,  //
+    19.94695960400931f,   //
+    46.357234000000005f,  //
+    61.315764694388044f,  //
+    88.34665293823721f,   //
+    29.087001644203088f,  //
+    21.33034294694781f,   //
+    24.450989f,           //
+    35.79788782635831f,   //
+    51.094112f,           //
+    67.48111451705412f,   //
+    88.34665293823721f,   //
+    112.16099098350989f,
+    // c = 2
+    2.9217254961255255f,  //
+    4.497681013199305f,   //
+    7.356344520940414f,   //
+    6.583891506504051f,   //
+    8.535608740100237f,   //
+    8.799434353234647f,   //
+    9.188341534163023f,   //
+    9.482700481227672f,   //
+    4.497681013199305f,   //
+    6.309548851989123f,   //
+    7.024608962670982f,   //
+    7.156445324163424f,   //
+    8.049059218663244f,   //
+    7.0124290657218555f,  //
+    6.711923184393611f,   //
+    8.380307846134853f,   //
+    7.356344520940414f,   //
+    7.024608962670982f,   //
+    6.892101177327445f,   //
+    6.882819916277163f,   //
+    8.782226090078568f,   //
+    6.8774750000000004f,  //
+    7.8858175969577955f,  //
+    8.67909f,             //
+    6.583891506504051f,   //
+    7.156445324163424f,   //
+    6.882819916277163f,   //
+    7.003072944847055f,   //
+    7.7223464701024875f,  //
+    7.955425720217421f,   //
+    7.4734110000000005f,  //
+    8.362933242943903f,   //
+    8.535608740100237f,   //
+    8.049059218663244f,   //
+    8.782226090078568f,   //
+    7.7223464701024875f,  //
+    6.778005927001542f,   //
+    9.484922741558432f,   //
+    9.043702663686046f,   //
+    8.053178199770173f,   //
+    8.799434353234647f,   //
+    7.0124290657218555f,  //
+    6.8774750000000004f,  //
+    7.955425720217421f,   //
+    9.484922741558432f,   //
+    8.607606527385098f,   //
+    9.922697394370815f,   //
+    64.25135180237939f,   //
+    9.188341534163023f,   //
+    6.711923184393611f,   //
+    7.8858175969577955f,  //
+    7.4734110000000005f,  //
+    9.043702663686046f,   //
+    9.922697394370815f,   //
+    63.184936549738225f,  //
+    83.35294340273799f,   //
+    9.482700481227672f,   //
+    8.380307846134853f,   //
+    8.67909f,             //
+    8.362933242943903f,   //
+    8.053178199770173f,   //
+    64.25135180237939f,   //
+    83.35294340273799f,   //
+    114.89202448569779f,  //
+};
+
+static const float k420GlobalScale = 1.22;
+static const float k420Rescale[64] = {
+    0.4093, 0.3209, 0.3477, 0.3333, 0.3144, 0.2823, 0.3214, 0.3354,  //
+    0.3209, 0.3111, 0.3489, 0.2801, 0.3059, 0.3119, 0.4135, 0.3445,  //
+    0.3477, 0.3489, 0.3586, 0.3257, 0.2727, 0.3754, 0.3369, 0.3484,  //
+    0.3333, 0.2801, 0.3257, 0.3020, 0.3515, 0.3410, 0.3971, 0.3839,  //
+    0.3144, 0.3059, 0.2727, 0.3515, 0.3105, 0.3397, 0.2716, 0.3836,  //
+    0.2823, 0.3119, 0.3754, 0.3410, 0.3397, 0.3212, 0.3203, 0.0726,  //
+    0.3214, 0.4135, 0.3369, 0.3971, 0.2716, 0.3203, 0.0798, 0.0553,  //
+    0.3354, 0.3445, 0.3484, 0.3839, 0.3836, 0.0726, 0.0553, 0.3368,  //
+};
+
+static const float kBaseQuantMatrixStd[] = {
+    // c = 0
+    16.0f, 11.0f, 10.0f, 16.0f, 24.0f, 40.0f, 51.0f, 61.0f,      //
+    12.0f, 12.0f, 14.0f, 19.0f, 26.0f, 58.0f, 60.0f, 55.0f,      //
+    14.0f, 13.0f, 16.0f, 24.0f, 40.0f, 57.0f, 69.0f, 56.0f,      //
+    14.0f, 17.0f, 22.0f, 29.0f, 51.0f, 87.0f, 80.0f, 62.0f,      //
+    18.0f, 22.0f, 37.0f, 56.0f, 68.0f, 109.0f, 103.0f, 77.0f,    //
+    24.0f, 35.0f, 55.0f, 64.0f, 81.0f, 104.0f, 113.0f, 92.0f,    //
+    49.0f, 64.0f, 78.0f, 87.0f, 103.0f, 121.0f, 120.0f, 101.0f,  //
+    72.0f, 92.0f, 95.0f, 98.0f, 112.0f, 100.0f, 103.0f, 99.0f,   //
+    // c = 1
+    17.0f, 18.0f, 24.0f, 47.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    18.0f, 21.0f, 26.0f, 66.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    24.0f, 26.0f, 56.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    47.0f, 66.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+    99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f, 99.0f,  //
+};
+
+static const float kZeroBiasMulYCbCrLQ[] = {
+    // c = 0
+    0.0000f, 0.0568f, 0.3880f, 0.6190f, 0.6190f, 0.4490f, 0.4490f, 0.6187f,  //
+    0.0568f, 0.5829f, 0.6189f, 0.6190f, 0.6190f, 0.7190f, 0.6190f, 0.6189f,  //
+    0.3880f, 0.6189f, 0.6190f, 0.6190f, 0.6190f, 0.6190f, 0.6187f, 0.6100f,  //
+    0.6190f, 0.6190f, 0.6190f, 0.6190f, 0.5890f, 0.3839f, 0.7160f, 0.6190f,  //
+    0.6190f, 0.6190f, 0.6190f, 0.5890f, 0.6190f, 0.3880f, 0.5860f, 0.4790f,  //
+    0.4490f, 0.7190f, 0.6190f, 0.3839f, 0.3880f, 0.6190f, 0.6190f, 0.6190f,  //
+    0.4490f, 0.6190f, 0.6187f, 0.7160f, 0.5860f, 0.6190f, 0.6204f, 0.6190f,  //
+    0.6187f, 0.6189f, 0.6100f, 0.6190f, 0.4790f, 0.6190f, 0.6190f, 0.3480f,  //
+    // c = 1
+    0.0000f, 1.1640f, 0.9373f, 1.1319f, 0.8016f, 0.9136f, 1.1530f, 0.9430f,  //
+    1.1640f, 0.9188f, 0.9160f, 1.1980f, 1.1830f, 0.9758f, 0.9430f, 0.9430f,  //
+    0.9373f, 0.9160f, 0.8430f, 1.1720f, 0.7083f, 0.9430f, 0.9430f, 0.9430f,  //
+    1.1319f, 1.1980f, 1.1720f, 1.1490f, 0.8547f, 0.9430f, 0.9430f, 0.9430f,  //
+    0.8016f, 1.1830f, 0.7083f, 0.8547f, 0.9430f, 0.9430f, 0.9430f, 0.9430f,  //
+    0.9136f, 0.9758f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f,  //
+    1.1530f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9480f,  //
+    0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9430f, 0.9480f, 0.9430f,  //
+    // c = 2
+    0.0000f, 1.3190f, 0.4308f, 0.4460f, 0.0661f, 0.0660f, 0.2660f, 0.2960f,  //
+    1.3190f, 0.3280f, 0.3093f, 0.0750f, 0.0505f, 0.1594f, 0.3060f, 0.2113f,  //
+    0.4308f, 0.3093f, 0.3060f, 0.1182f, 0.0500f, 0.3060f, 0.3915f, 0.2426f,  //
+    0.4460f, 0.0750f, 0.1182f, 0.0512f, 0.0500f, 0.2130f, 0.3930f, 0.1590f,  //
+    0.0661f, 0.0505f, 0.0500f, 0.0500f, 0.3055f, 0.3360f, 0.5148f, 0.5403f,  //
+    0.0660f, 0.1594f, 0.3060f, 0.2130f, 0.3360f, 0.5060f, 0.5874f, 0.3060f,  //
+    0.2660f, 0.3060f, 0.3915f, 0.3930f, 0.5148f, 0.5874f, 0.3060f, 0.3060f,  //
+    0.2960f, 0.2113f, 0.2426f, 0.1590f, 0.5403f, 0.3060f, 0.3060f, 0.3060f,  //
+};
+
+static const float kZeroBiasMulYCbCrHQ[] = {
+    // c = 0
+    0.0000f, 0.0044f, 0.2521f, 0.6547f, 0.8161f, 0.6130f, 0.8841f, 0.8155f,  //
+    0.0044f, 0.6831f, 0.6553f, 0.6295f, 0.7848f, 0.7843f, 0.8474f, 0.7836f,  //
+    0.2521f, 0.6553f, 0.7834f, 0.7829f, 0.8161f, 0.8072f, 0.7743f, 0.9242f,  //
+    0.6547f, 0.6295f, 0.7829f, 0.8654f, 0.7829f, 0.6986f, 0.7818f, 0.7726f,  //
+    0.8161f, 0.7848f, 0.8161f, 0.7829f, 0.7471f, 0.7827f, 0.7843f, 0.7653f,  //
+    0.6130f, 0.7843f, 0.8072f, 0.6986f, 0.7827f, 0.7848f, 0.9508f, 0.7653f,  //
+    0.8841f, 0.8474f, 0.7743f, 0.7818f, 0.7843f, 0.9508f, 0.7839f, 0.8437f,  //
+    0.8155f, 0.7836f, 0.9242f, 0.7726f, 0.7653f, 0.7653f, 0.8437f, 0.7819f,  //
+    // c = 1
+    0.0000f, 1.0816f, 1.0556f, 1.2876f, 1.1554f, 1.1567f, 1.8851f, 0.5488f,  //
+    1.0816f, 1.1537f, 1.1850f, 1.0712f, 1.1671f, 2.0719f, 1.0544f, 1.4764f,  //
+    1.0556f, 1.1850f, 1.2870f, 1.1981f, 1.8181f, 1.2618f, 1.0564f, 1.1191f,  //
+    1.2876f, 1.0712f, 1.1981f, 1.4753f, 2.0609f, 1.0564f, 1.2645f, 1.0564f,  //
+    1.1554f, 1.1671f, 1.8181f, 2.0609f, 0.7324f, 1.1163f, 0.8464f, 1.0564f,  //
+    1.1567f, 2.0719f, 1.2618f, 1.0564f, 1.1163f, 1.0040f, 1.0564f, 1.0564f,  //
+    1.8851f, 1.0544f, 1.0564f, 1.2645f, 0.8464f, 1.0564f, 1.0564f, 1.0564f,  //
+    0.5488f, 1.4764f, 1.1191f, 1.0564f, 1.0564f, 1.0564f, 1.0564f, 1.0564f,  //
+    // c = 2
+    0.0000f, 0.5392f, 0.6659f, 0.8968f, 0.6829f, 0.6328f, 0.5802f, 0.4836f,  //
+    0.5392f, 0.6746f, 0.6760f, 0.6102f, 0.6015f, 0.6958f, 0.7327f, 0.4897f,  //
+    0.6659f, 0.6760f, 0.6957f, 0.6543f, 0.4396f, 0.6330f, 0.7081f, 0.2583f,  //
+    0.8968f, 0.6102f, 0.6543f, 0.5913f, 0.6457f, 0.5828f, 0.5139f, 0.3565f,  //
+    0.6829f, 0.6015f, 0.4396f, 0.6457f, 0.5633f, 0.4263f, 0.6371f, 0.5949f,  //
+    0.6328f, 0.6958f, 0.6330f, 0.5828f, 0.4263f, 0.2847f, 0.2909f, 0.6629f,  //
+    0.5802f, 0.7327f, 0.7081f, 0.5139f, 0.6371f, 0.2909f, 0.6644f, 0.6644f,  //
+    0.4836f, 0.4897f, 0.2583f, 0.3565f, 0.5949f, 0.6629f, 0.6644f, 0.6644f,  //
+};
+
+static const float kZeroBiasOffsetYCbCrDC[] = {0.0f, 0.0f, 0.0f};
+
+static const float kZeroBiasOffsetYCbCrAC[] = {
+    0.59082f,
+    0.58146f,
+    0.57988f,
+};
+
+constexpr uint8_t kTransferFunctionPQ = 16;
+constexpr uint8_t kTransferFunctionHLG = 18;
+
+float DistanceToLinearQuality(float distance) {
+  if (distance <= 0.1f) {
+    return 1.0f;
+  } else if (distance <= 4.6f) {
+    return (200.0f / 9.0f) * (distance - 0.1f);
+  } else if (distance <= 6.4f) {
+    return 5000.0f / (100.0f - (distance - 0.1f) / 0.09f);
+  } else if (distance < 25.0f) {
+    return 530000.0f /
+           (3450.0f -
+            300.0f * std::sqrt((848.0f * distance - 5330.0f) / 120.0f));
+  } else {
+    return 5000.0f;
+  }
+}
+
+constexpr float kExponent[DCTSIZE2] = {
+    1.00f, 0.51f, 0.67f, 0.74f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    0.51f, 0.66f, 0.69f, 0.87f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    0.67f, 0.69f, 0.84f, 0.83f, 0.96f, 1.00f, 1.00f, 1.00f,  //
+    0.74f, 0.87f, 0.83f, 1.00f, 1.00f, 0.91f, 0.91f, 1.00f,  //
+    1.00f, 1.00f, 0.96f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 0.91f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 0.91f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+    1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,  //
+};
+constexpr float kDist0 = 1.5f;  // distance where non-linearity kicks in.
+
+float DistanceToScale(float distance, int k) {
+  if (distance < kDist0) {
+    return distance;
+  }
+  const float exp = kExponent[k];
+  const float mul = std::pow(kDist0, 1.0 - exp);
+  return std::max<float>(0.5f * distance, mul * std::pow(distance, exp));
+}
+
+float ScaleToDistance(float scale, int k) {
+  if (scale < kDist0) {
+    return scale;
+  }
+  const float exp = 1.0 / kExponent[k];
+  const float mul = std::pow(kDist0, 1.0 - exp);
+  return std::min<float>(2.0f * scale, mul * std::pow(scale, exp));
+}
+
+float QuantValsToDistance(j_compress_ptr cinfo) {
+  jpeg_comp_master* m = cinfo->master;
+  float global_scale = kGlobalScaleYCbCr;
+  if (m->cicp_transfer_function == kTransferFunctionPQ) {
+    global_scale *= .4f;
+  } else if (m->cicp_transfer_function == kTransferFunctionHLG) {
+    global_scale *= .5f;
+  }
+  int quant_max = m->force_baseline ? 255 : 32767U;
+  static const float kDistMax = 10000.0f;
+  float dist_min = 0.0f;
+  float dist_max = kDistMax;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    uint16_t* quantval = cinfo->quant_tbl_ptrs[quant_idx]->quantval;
+    const float* base_qm = &kBaseQuantMatrixYCbCr[quant_idx * DCTSIZE2];
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      float dmin = 0.0;
+      float dmax = kDistMax;
+      float invq = 1.0f / base_qm[k] / global_scale;
+      int qval = quantval[k];
+      if (qval > 1) {
+        float scale_min = (qval - 0.5f) * invq;
+        dmin = ScaleToDistance(scale_min, k);
+      }
+      if (qval < quant_max) {
+        float scale_max = (qval + 0.5f) * invq;
+        dmax = ScaleToDistance(scale_max, k);
+      }
+      if (dmin <= dist_max) {
+        dist_min = std::max(dmin, dist_min);
+      }
+      if (dmax >= dist_min) {
+        dist_max = std::min(dist_max, dmax);
+      }
+    }
+  }
+  float distance;
+  if (dist_min == 0) {
+    distance = dist_max;
+  } else if (dist_max == kDistMax) {
+    distance = dist_min;
+  } else {
+    distance = 0.5f * (dist_min + dist_max);
+  }
+  return distance;
+}
+
+bool IsYUV420(j_compress_ptr cinfo) {
+  return (cinfo->jpeg_color_space == JCS_YCbCr &&
+          cinfo->comp_info[0].h_samp_factor == 2 &&
+          cinfo->comp_info[0].v_samp_factor == 2 &&
+          cinfo->comp_info[1].h_samp_factor == 1 &&
+          cinfo->comp_info[1].v_samp_factor == 1 &&
+          cinfo->comp_info[2].h_samp_factor == 1 &&
+          cinfo->comp_info[2].v_samp_factor == 1);
+}
+
+}  // namespace
+
+void SetQuantMatrices(j_compress_ptr cinfo, float distances[NUM_QUANT_TBLS],
+                      bool add_two_chroma_tables) {
+  jpeg_comp_master* m = cinfo->master;
+  const bool xyb = m->xyb_mode && cinfo->jpeg_color_space == JCS_RGB;
+  const bool is_yuv420 = IsYUV420(cinfo);
+
+  float global_scale;
+  bool non_linear_scaling = true;
+  const float* base_quant_matrix[NUM_QUANT_TBLS];
+  int num_base_tables;
+
+  if (xyb) {
+    global_scale = kGlobalScaleXYB;
+    num_base_tables = 3;
+    base_quant_matrix[0] = kBaseQuantMatrixXYB;
+    base_quant_matrix[1] = kBaseQuantMatrixXYB + DCTSIZE2;
+    base_quant_matrix[2] = kBaseQuantMatrixXYB + 2 * DCTSIZE2;
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr && !m->use_std_tables) {
+    global_scale = kGlobalScaleYCbCr;
+    if (m->cicp_transfer_function == kTransferFunctionPQ) {
+      global_scale *= .4f;
+    } else if (m->cicp_transfer_function == kTransferFunctionHLG) {
+      global_scale *= .5f;
+    }
+    if (is_yuv420) {
+      global_scale *= k420GlobalScale;
+    }
+    if (add_two_chroma_tables) {
+      cinfo->comp_info[2].quant_tbl_no = 2;
+      num_base_tables = 3;
+      base_quant_matrix[0] = kBaseQuantMatrixYCbCr;
+      base_quant_matrix[1] = kBaseQuantMatrixYCbCr + DCTSIZE2;
+      base_quant_matrix[2] = kBaseQuantMatrixYCbCr + 2 * DCTSIZE2;
+    } else {
+      num_base_tables = 2;
+      base_quant_matrix[0] = kBaseQuantMatrixYCbCr;
+      // Use the Cr table for both Cb and Cr.
+      base_quant_matrix[1] = kBaseQuantMatrixYCbCr + 2 * DCTSIZE2;
+    }
+  } else {
+    global_scale = 0.01f;
+    non_linear_scaling = false;
+    num_base_tables = 2;
+    base_quant_matrix[0] = kBaseQuantMatrixStd;
+    base_quant_matrix[1] = kBaseQuantMatrixStd + DCTSIZE2;
+  }
+
+  int quant_max = m->force_baseline ? 255 : 32767U;
+  for (int quant_idx = 0; quant_idx < num_base_tables; ++quant_idx) {
+    const float* base_qm = base_quant_matrix[quant_idx];
+    JQUANT_TBL** qtable = &cinfo->quant_tbl_ptrs[quant_idx];
+    if (*qtable == nullptr) {
+      *qtable = jpegli_alloc_quant_table(reinterpret_cast<j_common_ptr>(cinfo));
+    }
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      float scale = global_scale;
+      if (non_linear_scaling) {
+        scale *= DistanceToScale(distances[quant_idx], k);
+        if (is_yuv420 && quant_idx > 0) {
+          scale *= k420Rescale[k];
+        }
+      } else {
+        scale *= DistanceToLinearQuality(distances[quant_idx]);
+      }
+      int qval = std::round(scale * base_qm[k]);
+      (*qtable)->quantval[k] = std::max(1, std::min(qval, quant_max));
+    }
+    (*qtable)->sent_table = FALSE;
+  }
+}
+
+void InitQuantizer(j_compress_ptr cinfo, QuantPass pass) {
+  jpeg_comp_master* m = cinfo->master;
+  // Compute quantization multupliers from the quant table values.
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    int quant_idx = cinfo->comp_info[c].quant_tbl_no;
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[quant_idx];
+    if (!quant_table) {
+      JPEGLI_ERROR("Missing quantization table %d for component %d", quant_idx,
+                   c);
+    }
+    for (size_t k = 0; k < DCTSIZE2; k++) {
+      int val = quant_table->quantval[k];
+      if (val == 0) {
+        JPEGLI_ERROR("Invalid quantval 0.");
+      }
+      switch (pass) {
+        case QuantPass::NO_SEARCH:
+          m->quant_mul[c][k] = 8.0f / val;
+          break;
+        case QuantPass::SEARCH_FIRST_PASS:
+          m->quant_mul[c][k] = 128.0f;
+          break;
+        case QuantPass::SEARCH_SECOND_PASS:
+          m->quant_mul[c][kJPEGZigZagOrder[k]] = 1.0f / (16 * val);
+          break;
+      }
+    }
+  }
+  if (m->use_adaptive_quantization) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      for (int k = 0; k < DCTSIZE2; ++k) {
+        m->zero_bias_mul[c][k] = k == 0 ? 0.0f : 0.5f;
+        m->zero_bias_offset[c][k] = k == 0 ? 0.0f : 0.5f;
+      }
+    }
+    if (cinfo->jpeg_color_space == JCS_YCbCr) {
+      float distance = QuantValsToDistance(cinfo);
+      static const float kDistHQ = 1.0f;
+      static const float kDistLQ = 3.0f;
+      float mix0 = (distance - kDistHQ) / (kDistLQ - kDistHQ);
+      mix0 = std::max(0.0f, std::min(1.0f, mix0));
+      float mix1 = 1.0f - mix0;
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          float mul0 = kZeroBiasMulYCbCrLQ[c * DCTSIZE2 + k];
+          float mul1 = kZeroBiasMulYCbCrHQ[c * DCTSIZE2 + k];
+          m->zero_bias_mul[c][k] = mix0 * mul0 + mix1 * mul1;
+          m->zero_bias_offset[c][k] =
+              k == 0 ? kZeroBiasOffsetYCbCrDC[c] : kZeroBiasOffsetYCbCrAC[c];
+        }
+      }
+    }
+  } else if (cinfo->jpeg_color_space == JCS_YCbCr) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      for (int k = 0; k < DCTSIZE2; ++k) {
+        m->zero_bias_offset[c][k] =
+            k == 0 ? kZeroBiasOffsetYCbCrDC[c] : kZeroBiasOffsetYCbCrAC[c];
+      }
+    }
+  }
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/quant.h b/lib/jpegli/quant.h
new file mode 100644 (file)
index 0000000..cb37757
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_QUANT_H_
+#define LIB_JPEGLI_QUANT_H_
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+void SetQuantMatrices(j_compress_ptr cinfo, float distances[NUM_QUANT_TBLS],
+                      bool add_two_chroma_tables);
+
+enum QuantPass {
+  NO_SEARCH,
+  SEARCH_FIRST_PASS,
+  SEARCH_SECOND_PASS,
+};
+
+void InitQuantizer(j_compress_ptr cinfo, QuantPass pass);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_QUANT_H_
diff --git a/lib/jpegli/render.cc b/lib/jpegli/render.cc
new file mode 100644 (file)
index 0000000..24e7e99
--- /dev/null
@@ -0,0 +1,763 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/render.h"
+
+#include <string.h>
+
+#include <array>
+#include <atomic>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <hwy/aligned_allocator.h>
+#include <vector>
+
+#include "lib/jpegli/color_quantize.h"
+#include "lib/jpegli/color_transform.h"
+#include "lib/jpegli/decode_internal.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/idct.h"
+#include "lib/jpegli/upsample.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+#ifdef MEMORY_SANITIZER
+#define JXL_MEMORY_SANITIZER 1
+#elif defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define JXL_MEMORY_SANITIZER 1
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+#else
+#define JXL_MEMORY_SANITIZER 0
+#endif
+
+#if JXL_MEMORY_SANITIZER
+#include "sanitizer/msan_interface.h"
+#endif
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/render.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Abs;
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Gt;
+using hwy::HWY_NAMESPACE::IfThenElseZero;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::NearestInt;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
+using hwy::HWY_NAMESPACE::Vec;
+using D = HWY_FULL(float);
+using DI = HWY_FULL(int32_t);
+constexpr D d;
+constexpr DI di;
+
+void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
+                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
+                      int32_t* JXL_RESTRICT sumabs) {
+  for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
+    size_t k = i % DCTSIZE2;
+    const Rebind<int16_t, DI> di16;
+    const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
+    const auto abs_coeff = Abs(coeff);
+    const auto not_0 = Gt(abs_coeff, Zero(di));
+    const auto nzero = IfThenElseZero(not_0, Set(di, 1));
+    Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
+    Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
+  }
+}
+
+void DecenterRow(float* row, size_t xsize) {
+  const HWY_CAPPED(float, 8) df;
+  const auto c128 = Set(df, 128.0f / 255);
+  for (size_t x = 0; x < xsize; x += Lanes(df)) {
+    Store(Add(Load(df, row + x), c128), df, row + x);
+  }
+}
+
+void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y,
+               size_t xsize) {
+  jpeg_decomp_master* m = cinfo->master;
+  if (!m->dither_[c]) return;
+  const float* dither_row =
+      &m->dither_[c][(y & m->dither_mask_) * m->dither_size_];
+  for (size_t x = 0; x < xsize; ++x) {
+    row[x] += dither_row[x & m->dither_mask_];
+  }
+}
+
+template <typename T>
+void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len,
+                      size_t num_channels, float multiplier, T* output) {
+  const HWY_CAPPED(float, 8) d;
+  auto zero = Zero(d);
+  auto mul = Set(d, multiplier);
+  const Rebind<T, decltype(d)> du;
+#if JXL_MEMORY_SANITIZER
+  const size_t padding = hwy::RoundUpTo(len, Lanes(d)) - len;
+  for (size_t c = 0; c < num_channels; ++c) {
+    __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
+  }
+#endif
+  if (num_channels == 1) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+    }
+  } else if (num_channels == 2) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+    }
+  } else if (num_channels == 3) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
+      StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)),
+                        DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+    }
+  } else if (num_channels == 4) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
+      auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
+      auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
+      auto v3 = Clamp(zero, Mul(LoadU(d, &input[3][x0 + i]), mul), mul);
+      StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
+                        DemoteTo(du, NearestInt(v1)),
+                        DemoteTo(du, NearestInt(v2)),
+                        DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+    }
+  }
+#if JXL_MEMORY_SANITIZER
+  __msan_poison(output + num_channels * len,
+                sizeof(output[0]) * num_channels * padding);
+#endif
+}
+
+void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
+                   size_t num_channels, float* output) {
+  const HWY_CAPPED(float, 8) d;
+  if (num_channels == 1) {
+    memcpy(output, input[0] + x0, len * sizeof(output[0]));
+  } else if (num_channels == 2) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      StoreInterleaved2(LoadU(d, &input[0][x0 + i]),
+                        LoadU(d, &input[1][x0 + i]), d, &output[2 * i]);
+    }
+  } else if (num_channels == 3) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      StoreInterleaved3(LoadU(d, &input[0][x0 + i]),
+                        LoadU(d, &input[1][x0 + i]),
+                        LoadU(d, &input[2][x0 + i]), d, &output[3 * i]);
+    }
+  } else if (num_channels == 4) {
+    for (size_t i = 0; i < len; i += Lanes(d)) {
+      StoreInterleaved4(LoadU(d, &input[0][x0 + i]),
+                        LoadU(d, &input[1][x0 + i]),
+                        LoadU(d, &input[2][x0 + i]),
+                        LoadU(d, &input[3][x0 + i]), d, &output[4 * i]);
+    }
+  }
+}
+
+static constexpr float kFSWeightMR = 7.0f / 16.0f;
+static constexpr float kFSWeightBL = 3.0f / 16.0f;
+static constexpr float kFSWeightBM = 5.0f / 16.0f;
+static constexpr float kFSWeightBR = 1.0f / 16.0f;
+
+float LimitError(float error) {
+  float abserror = std::abs(error);
+  if (abserror > 48.0f) {
+    abserror = 32.0f;
+  } else if (abserror > 16.0f) {
+    abserror = 0.5f * abserror + 8.0f;
+  }
+  return error > 0.0f ? abserror : -abserror;
+}
+
+void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
+                   size_t xoffset, size_t len, size_t num_channels,
+                   uint8_t* JXL_RESTRICT output) {
+  jpeg_decomp_master* m = cinfo->master;
+  uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
+  if (cinfo->quantize_colors && m->quant_pass_ == 1) {
+    float* error_row[kMaxComponents];
+    float* next_error_row[kMaxComponents];
+    if (cinfo->dither_mode == JDITHER_ORDERED) {
+      for (size_t c = 0; c < num_channels; ++c) {
+        DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
+                  cinfo->output_width);
+      }
+    } else if (cinfo->dither_mode == JDITHER_FS) {
+      for (size_t c = 0; c < num_channels; ++c) {
+        if (cinfo->output_scanline % 2 == 0) {
+          error_row[c] = m->error_row_[c];
+          next_error_row[c] = m->error_row_[c + kMaxComponents];
+        } else {
+          error_row[c] = m->error_row_[c + kMaxComponents];
+          next_error_row[c] = m->error_row_[c];
+        }
+        memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
+      }
+    }
+    const float mul = 255.0f;
+    if (cinfo->dither_mode != JDITHER_FS) {
+      StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
+    }
+    for (size_t i = 0; i < len; ++i) {
+      uint8_t* pixel = &scratch_space[num_channels * i];
+      if (cinfo->dither_mode == JDITHER_FS) {
+        for (size_t c = 0; c < num_channels; ++c) {
+          float val = rows[c][i] * mul + LimitError(error_row[c][i]);
+          pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
+        }
+      }
+      int index = LookupColorIndex(cinfo, pixel);
+      output[i] = index;
+      if (cinfo->dither_mode == JDITHER_FS) {
+        size_t prev_i = i > 0 ? i - 1 : 0;
+        size_t next_i = i + 1 < len ? i + 1 : len - 1;
+        for (size_t c = 0; c < num_channels; ++c) {
+          float error = pixel[c] - cinfo->colormap[c][index];
+          error_row[c][next_i] += kFSWeightMR * error;
+          next_error_row[c][prev_i] += kFSWeightBL * error;
+          next_error_row[c][i] += kFSWeightBM * error;
+          next_error_row[c][next_i] += kFSWeightBR * error;
+        }
+      }
+    }
+  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
+    const float mul = 255.0;
+    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
+    memcpy(output, scratch_space, len * num_channels);
+  } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
+    const float mul = 65535.0;
+    uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
+    StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
+    if (m->swap_endianness_) {
+      const HWY_CAPPED(uint16_t, 8) du;
+      size_t output_len = len * num_channels;
+      for (size_t j = 0; j < output_len; j += Lanes(du)) {
+        auto v = LoadU(du, tmp + j);
+        auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+        StoreU(vswap, du, tmp + j);
+      }
+    }
+    memcpy(output, tmp, len * num_channels * 2);
+  } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
+    float* tmp = reinterpret_cast<float*>(scratch_space);
+    StoreFloatRow(rows, xoffset, len, num_channels, tmp);
+    if (m->swap_endianness_) {
+      size_t output_len = len * num_channels;
+      for (size_t j = 0; j < output_len; ++j) {
+        tmp[j] = BSwapFloat(tmp[j]);
+      }
+    }
+    memcpy(output, tmp, len * num_channels * 4);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+
+namespace jpegli {
+
+HWY_EXPORT(GatherBlockStats);
+HWY_EXPORT(WriteToOutput);
+HWY_EXPORT(DecenterRow);
+
+void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
+                      const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
+                      int32_t* JXL_RESTRICT sumabs) {
+  return HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros,
+                                                sumabs);
+}
+
+void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
+                   size_t xoffset, size_t len, size_t num_channels,
+                   uint8_t* JXL_RESTRICT output) {
+  return HWY_DYNAMIC_DISPATCH(WriteToOutput)(cinfo, rows, xoffset, len,
+                                             num_channels, output);
+}
+
+void DecenterRow(float* row, size_t xsize) {
+  return HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
+}
+
+bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) {
+  const auto& compinfo = cinfo->comp_info[ci];
+  return (compinfo.h_samp_factor == cinfo->max_h_samp_factor &&
+          compinfo.v_samp_factor == cinfo->max_v_samp_factor);
+}
+
+// See the following article for the details:
+// J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression"
+// Proceedings International Conference on Information Technology: Coding and
+// Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179.
+void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros,
+                                   const int* sumabs, float* biases) {
+  for (size_t k = 1; k < DCTSIZE2; ++k) {
+    if (nonzeros[k] == 0) {
+      biases[k] = 0.5f;
+      continue;
+    }
+    // Notation adapted from the article
+    float N = num_blocks;
+    float N1 = nonzeros[k];
+    float N0 = num_blocks - N1;
+    float S = sumabs[k];
+    // Compute gamma from N0, N1, N, S (eq. 11), with A and B being just
+    // temporary grouping of terms.
+    float A = 4.0 * S + 2.0 * N;
+    float B = 4.0 * S - 2.0 * N1;
+    float gamma = (-1.0 * N0 + std::sqrt(N0 * N0 * 1.0 + A * B)) / A;
+    float gamma2 = gamma * gamma;
+    // The bias is computed from gamma with (eq. 5), where the quantization
+    // multiplier Q can be factored out and thus the bias can be applied
+    // directly on the quantized coefficient.
+    biases[k] =
+        0.5 * (((1.0 + gamma2) / (1.0 - gamma2)) + 1.0 / std::log(gamma));
+  }
+}
+
+constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8,  16, 9,
+                                                2, 3, 10, 17, 24};
+
+bool is_nonzero_quantizers(const JQUANT_TBL* qtable) {
+  return std::all_of(Q_POS.begin(), Q_POS.end(),
+                     [&](int pos) { return qtable->quantval[pos] != 0; });
+}
+
+// Determine whether smoothing should be applied during decompression
+bool do_smoothing(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  bool smoothing_useful = false;
+
+  if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) {
+    return false;
+  }
+  auto coef_bits_latch = m->coef_bits_latch;
+  auto prev_coef_bits_latch = m->prev_coef_bits_latch;
+
+  for (int ci = 0; ci < cinfo->num_components; ci++) {
+    jpeg_component_info* compptr = &cinfo->comp_info[ci];
+    JQUANT_TBL* qtable = compptr->quant_table;
+    int* coef_bits = cinfo->coef_bits[ci];
+    int* prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
+
+    // Return early if conditions for smoothing are not met
+    if (qtable == nullptr || !is_nonzero_quantizers(qtable) ||
+        coef_bits[0] < 0) {
+      return false;
+    }
+
+    coef_bits_latch[ci][0] = coef_bits[0];
+
+    for (int coefi = 1; coefi < SAVED_COEFS; coefi++) {
+      prev_coef_bits_latch[ci][coefi] =
+          cinfo->input_scan_number > 1 ? prev_coef_bits[coefi] : -1;
+      if (coef_bits[coefi] != 0) {
+        smoothing_useful = true;
+      }
+      coef_bits_latch[ci][coefi] = coef_bits[coefi];
+    }
+  }
+
+  return smoothing_useful;
+}
+
+void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component,
+                   size_t bx, int iy) {
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  int16_t* scratch = cinfo->master->smoothing_scratch_;
+  std::vector<int> Q_VAL(SAVED_COEFS);
+  int* coef_bits;
+
+  std::array<std::array<int, 5>, 5> dc_values;
+  auto& compinfo = cinfo->comp_info[component];
+  const size_t by0 = imcu_row * compinfo.v_samp_factor;
+  const size_t by = by0 + iy;
+
+  int prev_iy = by > 0 ? iy - 1 : 0;
+  int prev_prev_iy = by > 1 ? iy - 2 : prev_iy;
+  int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy;
+  int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy;
+
+  const int16_t* cur_row = blocks[iy][bx];
+  const int16_t* prev_row = blocks[prev_iy][bx];
+  const int16_t* prev_prev_row = blocks[prev_prev_iy][bx];
+  const int16_t* next_row = blocks[next_iy][bx];
+  const int16_t* next_next_row = blocks[next_next_iy][bx];
+
+  int prev_block_ind = bx ? -DCTSIZE2 : 0;
+  int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind;
+  int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0;
+  int next_next_block_ind =
+      bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind;
+
+  std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row,
+                                            next_row, next_next_row};
+  std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0,
+                                   next_block_ind, next_next_block_ind};
+
+  memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0]));
+
+  for (int r = 0; r < 5; ++r) {
+    for (int c = 0; c < 5; ++c) {
+      dc_values[r][c] = row_ptrs[r][block_inds[c]];
+    }
+  }
+  // Get the correct coef_bits: In case of an incomplete scan, we use the
+  // prev coeficients.
+  if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) {
+    coef_bits = cinfo->master->prev_coef_bits_latch[component];
+  } else {
+    coef_bits = cinfo->master->coef_bits_latch[component];
+  }
+
+  bool change_dc = true;
+  for (int i = 1; i < SAVED_COEFS; i++) {
+    if (coef_bits[i] != -1) {
+      change_dc = false;
+      break;
+    }
+  }
+
+  JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no];
+  for (size_t i = 0; i < 6; ++i) {
+    Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
+  }
+  if (change_dc) {
+    for (size_t i = 6; i < SAVED_COEFS; ++i) {
+      Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
+    }
+  }
+  auto calculate_dct_value = [&](int coef_index) {
+    int64_t num = 0;
+    int pred;
+    int Al;
+    // we use the symmetry of the smoothing matrices by transposing the 5x5 dc
+    // matrix in that case.
+    bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 ||
+                        coef_index == 9;
+    auto dc = [&](int i, int j) {
+      return swap_indices ? dc_values[j][i] : dc_values[i][j];
+    };
+    Al = coef_bits[coef_index];
+    switch (coef_index) {
+      case 0:
+        // set the DC
+        num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) -
+               2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) +
+               6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) +
+               152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) +
+               6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) -
+               2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) -
+               2 * dc(4, 4));
+        // special case: for the DC the dequantization is different
+        Al = 0;
+        break;
+      case 1:
+      case 2:
+        // set Q01 or Q10
+        num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) -
+                            3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) +
+                            3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) -
+                            38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) +
+                            13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) -
+                            dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4))
+                         : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) +
+                            7 * dc(2, 4)));
+        break;
+      case 3:
+      case 5:
+        // set Q02 or Q20
+        num = (change_dc
+                   ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) -
+                         5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) +
+                         2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2)
+                   : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) +
+                      13 * dc(3, 2) - dc(4, 2)));
+        break;
+      case 4:
+        // set Q11
+        num =
+            (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) -
+                             9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4)
+                       : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) -
+                          dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) -
+                          dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3)));
+        break;
+      case 6:
+      case 9:
+        // set Q03 or Q30
+        num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) -
+               dc(3, 3));
+        break;
+      case 7:
+      case 8:
+        // set Q12 and Q21
+        num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) -
+               dc(3, 3));
+        break;
+    }
+    num = Q_VAL[0] * num;
+    if (num >= 0) {
+      pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8);
+      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
+    } else {
+      pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8);
+      if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
+      pred = -pred;
+    }
+    return static_cast<int16_t>(pred);
+  };
+
+  int loop_end = change_dc ? SAVED_COEFS : 6;
+  for (int i = 1; i < loop_end; ++i) {
+    if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) {
+      scratch[Q_POS[i]] = calculate_dct_value(i);
+    }
+  }
+  if (change_dc) {
+    scratch[0] = calculate_dct_value(0);
+  }
+}
+
+void PrepareForOutput(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  bool smoothing = do_smoothing(cinfo);
+  m->apply_smoothing = smoothing && cinfo->do_block_smoothing;
+  size_t coeffs_per_block = cinfo->num_components * DCTSIZE2;
+  memset(m->nonzeros_, 0, coeffs_per_block * sizeof(m->nonzeros_[0]));
+  memset(m->sumabs_, 0, coeffs_per_block * sizeof(m->sumabs_[0]));
+  memset(m->num_processed_blocks_, 0, sizeof(m->num_processed_blocks_));
+  memset(m->biases_, 0, coeffs_per_block * sizeof(m->biases_[0]));
+  cinfo->output_iMCU_row = 0;
+  cinfo->output_scanline = 0;
+  const float kDequantScale = 1.0f / (8 * 255);
+  for (int c = 0; c < cinfo->num_components; c++) {
+    const auto& comp = cinfo->comp_info[c];
+    JQUANT_TBL* table = comp.quant_table;
+    if (table == nullptr) continue;
+    for (size_t k = 0; k < DCTSIZE2; ++k) {
+      m->dequant_[c * DCTSIZE2 + k] = table->quantval[k] * kDequantScale;
+    }
+  }
+  ChooseInverseTransform(cinfo);
+  ChooseColorTransform(cinfo);
+}
+
+void DecodeCurrentiMCURow(j_decompress_ptr cinfo) {
+  jpeg_decomp_master* m = cinfo->master;
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  JBLOCKARRAY ba[kMaxComponents];
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const jpeg_component_info* comp = &cinfo->comp_info[c];
+    int by0 = imcu_row * comp->v_samp_factor;
+    int block_rows_left = comp->height_in_blocks - by0;
+    int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
+    int offset = m->streaming_mode_ ? 0 : by0;
+    ba[c] = (*cinfo->mem->access_virt_barray)(
+        reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset,
+        max_block_rows, false);
+  }
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    size_t k0 = c * DCTSIZE2;
+    auto& compinfo = cinfo->comp_info[c];
+    size_t block_row = imcu_row * compinfo.v_samp_factor;
+    if (ShouldApplyDequantBiases(cinfo, c)) {
+      // Update statistics for this iMCU row.
+      for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
+        size_t by = block_row + iy;
+        if (by >= compinfo.height_in_blocks) {
+          continue;
+        }
+        int16_t* JXL_RESTRICT coeffs = &ba[c][iy][0][0];
+        size_t num = compinfo.width_in_blocks * DCTSIZE2;
+        GatherBlockStats(coeffs, num, &m->nonzeros_[k0], &m->sumabs_[k0]);
+        m->num_processed_blocks_[c] += compinfo.width_in_blocks;
+      }
+      if (imcu_row % 4 == 3) {
+        // Re-compute optimal biases every few iMCU-rows.
+        ComputeOptimalLaplacianBiases(m->num_processed_blocks_[c],
+                                      &m->nonzeros_[k0], &m->sumabs_[k0],
+                                      &m->biases_[k0]);
+      }
+    }
+    RowBuffer<float>* raw_out = &m->raw_output_[c];
+    for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
+      size_t by = block_row + iy;
+      if (by >= compinfo.height_in_blocks) {
+        continue;
+      }
+      size_t dctsize = m->scaled_dct_size[c];
+      int16_t* JXL_RESTRICT row_in = &ba[c][iy][0][0];
+      float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize);
+      for (size_t bx = 0; bx < compinfo.width_in_blocks; ++bx) {
+        if (m->apply_smoothing) {
+          PredictSmooth(cinfo, ba[c], c, bx, iy);
+          (*m->inverse_transform[c])(m->smoothing_scratch_, &m->dequant_[k0],
+                                     &m->biases_[k0], m->idct_scratch_,
+                                     &row_out[bx * dctsize], raw_out->stride(),
+                                     dctsize);
+        } else {
+          (*m->inverse_transform[c])(&row_in[bx * DCTSIZE2], &m->dequant_[k0],
+                                     &m->biases_[k0], m->idct_scratch_,
+                                     &row_out[bx * dctsize], raw_out->stride(),
+                                     dctsize);
+        }
+      }
+      if (m->streaming_mode_) {
+        memset(row_in, 0, compinfo.width_in_blocks * sizeof(JBLOCK));
+      }
+    }
+  }
+}
+
+void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) {
+  jpegli::DecodeCurrentiMCURow(cinfo);
+  jpeg_decomp_master* m = cinfo->master;
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    const auto& compinfo = cinfo->comp_info[c];
+    size_t comp_width = compinfo.width_in_blocks * DCTSIZE;
+    size_t comp_height = compinfo.height_in_blocks * DCTSIZE;
+    size_t comp_nrows = compinfo.v_samp_factor * DCTSIZE;
+    size_t y0 = cinfo->output_iMCU_row * compinfo.v_samp_factor * DCTSIZE;
+    size_t y1 = std::min(y0 + comp_nrows, comp_height);
+    for (size_t y = y0; y < y1; ++y) {
+      float* rows[1] = {m->raw_output_[c].Row(y)};
+      uint8_t* output = data[c][y - y0];
+      DecenterRow(rows[0], comp_width);
+      WriteToOutput(cinfo, rows, 0, comp_width, 1, output);
+    }
+  }
+  ++cinfo->output_iMCU_row;
+  cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE;
+  if (cinfo->output_scanline >= cinfo->output_height) {
+    ++m->output_passes_done_;
+  }
+}
+
+void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
+                   JSAMPARRAY scanlines, size_t max_output_rows) {
+  jpeg_decomp_master* m = cinfo->master;
+  const int vfactor = cinfo->max_v_samp_factor;
+  const int hfactor = cinfo->max_h_samp_factor;
+  const size_t context = m->need_context_rows_ ? 1 : 0;
+  const size_t imcu_row = cinfo->output_iMCU_row;
+  const size_t imcu_height = vfactor * m->min_scaled_dct_size;
+  const size_t imcu_width = hfactor * m->min_scaled_dct_size;
+  const size_t output_width = m->iMCU_cols_ * imcu_width;
+  if (imcu_row == cinfo->total_iMCU_rows ||
+      (imcu_row > context &&
+       cinfo->output_scanline < (imcu_row - context) * imcu_height)) {
+    // We are ready to output some scanlines.
+    size_t ybegin = cinfo->output_scanline;
+    size_t yend = (imcu_row == cinfo->total_iMCU_rows
+                       ? cinfo->output_height
+                       : (imcu_row - context) * imcu_height);
+    yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows);
+    size_t yb = (ybegin / vfactor) * vfactor;
+    size_t ye = DivCeil(yend, vfactor) * vfactor;
+    for (size_t y = yb; y < ye; y += vfactor) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        RowBuffer<float>* raw_out = &m->raw_output_[c];
+        RowBuffer<float>* render_out = &m->render_output_[c];
+        int line_groups = vfactor / m->v_factor[c];
+        int downsampled_width = output_width / m->h_factor[c];
+        size_t yc = y / m->v_factor[c];
+        for (int dy = 0; dy < line_groups; ++dy) {
+          size_t ymid = yc + dy;
+          const float* JXL_RESTRICT row_mid = raw_out->Row(ymid);
+          if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) {
+            const float* JXL_RESTRICT row_top =
+                ymid == 0 ? row_mid : raw_out->Row(ymid - 1);
+            const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c]
+                                                    ? row_mid
+                                                    : raw_out->Row(ymid + 1);
+            Upsample2Vertical(row_top, row_mid, row_bot,
+                              render_out->Row(2 * dy),
+                              render_out->Row(2 * dy + 1), downsampled_width);
+          } else {
+            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
+              memcpy(render_out->Row(m->v_factor[c] * dy + yix), row_mid,
+                     downsampled_width * sizeof(float));
+            }
+          }
+          if (m->h_factor[c] > 1) {
+            for (int yix = 0; yix < m->v_factor[c]; ++yix) {
+              int row_ix = m->v_factor[c] * dy + yix;
+              float* JXL_RESTRICT row = render_out->Row(row_ix);
+              float* JXL_RESTRICT tmp = m->upsample_scratch_;
+              if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) {
+                Upsample2Horizontal(row, tmp, output_width);
+              } else {
+                // TODO(szabadka) SIMDify this.
+                for (size_t x = 0; x < output_width; ++x) {
+                  tmp[x] = row[x / m->h_factor[c]];
+                }
+                memcpy(row, tmp, output_width * sizeof(tmp[0]));
+              }
+            }
+          }
+        }
+      }
+      for (int yix = 0; yix < vfactor; ++yix) {
+        if (y + yix < ybegin || y + yix >= yend) continue;
+        float* rows[kMaxComponents];
+        int num_all_components =
+            std::max(cinfo->out_color_components, cinfo->num_components);
+        for (int c = 0; c < num_all_components; ++c) {
+          rows[c] = m->render_output_[c].Row(yix);
+        }
+        (*m->color_transform)(rows, output_width);
+        for (int c = 0; c < cinfo->out_color_components; ++c) {
+          // Undo the centering of the sample values around zero.
+          DecenterRow(rows[c], output_width);
+        }
+        if (scanlines) {
+          uint8_t* output = scanlines[*num_output_rows];
+          WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width,
+                        cinfo->out_color_components, output);
+        }
+        JXL_ASSERT(cinfo->output_scanline == y + yix);
+        ++cinfo->output_scanline;
+        ++(*num_output_rows);
+        if (cinfo->output_scanline == cinfo->output_height) {
+          ++m->output_passes_done_;
+        }
+      }
+    }
+  } else {
+    DecodeCurrentiMCURow(cinfo);
+    ++cinfo->output_iMCU_row;
+  }
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/render.h b/lib/jpegli/render.h
new file mode 100644 (file)
index 0000000..ad69335
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_RENDER_H_
+#define LIB_JPEGLI_RENDER_H_
+
+#include <stdint.h>
+
+#include "lib/jpegli/common.h"
+
+namespace jpegli {
+
+void PrepareForOutput(j_decompress_ptr cinfo);
+
+void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
+                   JSAMPARRAY scanlines, size_t max_output_rows);
+
+void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_RENDER_H_
diff --git a/lib/jpegli/simd.cc b/lib/jpegli/simd.cc
new file mode 100644 (file)
index 0000000..5e84939
--- /dev/null
@@ -0,0 +1,38 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/simd.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/simd.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+size_t GetVectorSize() { return HWY_LANES(uint8_t); }
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+namespace {
+
+HWY_EXPORT(GetVectorSize);  // Local function.
+
+}  // namespace
+
+size_t VectorSize() {
+  static size_t bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+  return bytes;
+}
+
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/simd.h b/lib/jpegli/simd.h
new file mode 100644 (file)
index 0000000..aec772e
--- /dev/null
@@ -0,0 +1,18 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_SIMD_H_
+#define LIB_JPEGLI_SIMD_H_
+
+#include <stddef.h>
+
+namespace jpegli {
+
+// Returns SIMD vector size in bytes.
+size_t VectorSize();
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_SIMD_H_
diff --git a/lib/jpegli/source_manager.cc b/lib/jpegli/source_manager.cc
new file mode 100644 (file)
index 0000000..0b8e0a5
--- /dev/null
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/error.h"
+#include "lib/jpegli/memory_manager.h"
+
+namespace jpegli {
+
+void init_mem_source(j_decompress_ptr cinfo) {}
+void init_stdio_source(j_decompress_ptr cinfo) {}
+
+void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+  if (num_bytes <= 0) return;
+  while (num_bytes > static_cast<long>(cinfo->src->bytes_in_buffer)) {
+    num_bytes -= cinfo->src->bytes_in_buffer;
+    (*cinfo->src->fill_input_buffer)(cinfo);
+  }
+  cinfo->src->next_input_byte += num_bytes;
+  cinfo->src->bytes_in_buffer -= num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {}
+
+boolean EmitFakeEoiMarker(j_decompress_ptr cinfo) {
+  static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+  cinfo->src->next_input_byte = kFakeEoiMarker;
+  cinfo->src->bytes_in_buffer = 2;
+  return TRUE;
+}
+
+constexpr size_t kStdioBufferSize = 64 << 10;
+
+struct StdioSourceManager {
+  jpeg_source_mgr pub;
+  FILE* f;
+  uint8_t* buffer;
+
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<StdioSourceManager*>(cinfo->src);
+    size_t num_bytes_read = fread(src->buffer, 1, kStdioBufferSize, src->f);
+    if (num_bytes_read == 0) {
+      return EmitFakeEoiMarker(cinfo);
+    }
+    src->pub.next_input_byte = src->buffer;
+    src->pub.bytes_in_buffer = num_bytes_read;
+    return TRUE;
+  }
+};
+
+}  // namespace jpegli
+
+void jpegli_mem_src(j_decompress_ptr cinfo, const unsigned char* inbuffer,
+                    unsigned long insize) {
+  if (cinfo->src && cinfo->src->init_source != jpegli::init_mem_source) {
+    JPEGLI_ERROR("jpegli_mem_src: a different source manager was already set");
+  }
+  if (!cinfo->src) {
+    cinfo->src = jpegli::Allocate<jpeg_source_mgr>(cinfo, 1);
+  }
+  cinfo->src->next_input_byte = inbuffer;
+  cinfo->src->bytes_in_buffer = insize;
+  cinfo->src->init_source = jpegli::init_mem_source;
+  cinfo->src->fill_input_buffer = jpegli::EmitFakeEoiMarker;
+  cinfo->src->skip_input_data = jpegli::skip_input_data;
+  cinfo->src->resync_to_restart = jpegli_resync_to_restart;
+  cinfo->src->term_source = jpegli::term_source;
+}
+
+void jpegli_stdio_src(j_decompress_ptr cinfo, FILE* infile) {
+  if (cinfo->src && cinfo->src->init_source != jpegli::init_stdio_source) {
+    JPEGLI_ERROR("jpeg_stdio_src: a different source manager was already set");
+  }
+  if (!cinfo->src) {
+    cinfo->src = reinterpret_cast<jpeg_source_mgr*>(
+        jpegli::Allocate<jpegli::StdioSourceManager>(cinfo, 1));
+  }
+  auto src = reinterpret_cast<jpegli::StdioSourceManager*>(cinfo->src);
+  src->f = infile;
+  src->buffer = jpegli::Allocate<uint8_t>(cinfo, jpegli::kStdioBufferSize);
+  src->pub.next_input_byte = src->buffer;
+  src->pub.bytes_in_buffer = 0;
+  src->pub.init_source = jpegli::init_stdio_source;
+  src->pub.fill_input_buffer = jpegli::StdioSourceManager::fill_input_buffer;
+  src->pub.skip_input_data = jpegli::skip_input_data;
+  src->pub.resync_to_restart = jpegli_resync_to_restart;
+  src->pub.term_source = jpegli::term_source;
+}
diff --git a/lib/jpegli/source_manager_test.cc b/lib/jpegli/source_manager_test.cc
new file mode 100644 (file)
index 0000000..4e13787
--- /dev/null
@@ -0,0 +1,142 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+void ReadOutputImage(j_decompress_ptr cinfo, TestImage* output) {
+  jpegli_read_header(cinfo, /*require_image=*/TRUE);
+  jpegli_start_decompress(cinfo);
+  output->ysize = cinfo->output_height;
+  output->xsize = cinfo->output_width;
+  output->components = cinfo->num_components;
+  output->AllocatePixels();
+  size_t stride = cinfo->output_width * cinfo->num_components;
+  while (cinfo->output_scanline < cinfo->output_height) {
+    JSAMPROW scanline = &output->pixels[cinfo->output_scanline * stride];
+    jpegli_read_scanlines(cinfo, &scanline, 1);
+  }
+  jpegli_finish_decompress(cinfo);
+}
+
+struct TestConfig {
+  std::string fn;
+  std::string fn_desc;
+  DecompressParams dparams;
+};
+
+class SourceManagerTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+namespace {
+FILE* MemOpen(const std::vector<uint8_t>& data) {
+  FILE* src = tmpfile();
+  if (!src) return nullptr;
+  fwrite(data.data(), 1, data.size(), src);
+  rewind(src);
+  return src;
+}
+}  // namespace
+
+TEST_P(SourceManagerTestParam, TestStdioSourceManager) {
+  TestConfig config = GetParam();
+  std::vector<uint8_t> compressed = ReadTestData(config.fn.c_str());
+  if (config.dparams.size_factor < 1.0) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  FILE* src = MemOpen(compressed);
+  ASSERT_TRUE(src);
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_stdio_src(&cinfo, src);
+    ReadOutputImage(&cinfo, &output0);
+    return true;
+  };
+  bool ok = try_catch_block();
+  fclose(src);
+  ASSERT_TRUE(ok);
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output1);
+  VerifyOutputImage(output1, output0, 1.0f);
+}
+
+TEST_P(SourceManagerTestParam, TestMemSourceManager) {
+  TestConfig config = GetParam();
+  std::vector<uint8_t> compressed = ReadTestData(config.fn.c_str());
+  if (config.dparams.size_factor < 1.0f) {
+    compressed.resize(compressed.size() * config.dparams.size_factor);
+  }
+  TestImage output0;
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_decompress(&cinfo);
+    jpegli_mem_src(&cinfo, compressed.data(), compressed.size());
+    ReadOutputImage(&cinfo, &output0);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&cinfo);
+
+  TestImage output1;
+  DecodeWithLibjpeg(CompressParams(), DecompressParams(), compressed, &output1);
+  VerifyOutputImage(output1, output0, 1.0f);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  {
+    std::vector<std::pair<std::string, std::string>> testfiles({
+        {"jxl/flower/flower.png.im_q85_444.jpg", "Q85YUV444"},
+        {"jxl/flower/flower.png.im_q85_420.jpg", "Q85YUV420"},
+        {"jxl/flower/flower.png.im_q85_420_R13B.jpg", "Q85YUV420R13B"},
+    });
+    for (const auto& it : testfiles) {
+      for (float size_factor : {0.1f, 0.33f, 0.5f, 0.75f}) {
+        TestConfig config;
+        config.fn = it.first;
+        config.fn_desc = it.second;
+        config.dparams.size_factor = size_factor;
+        all_tests.push_back(config);
+      }
+    }
+    return all_tests;
+  }
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.fn_desc;
+  if (c.dparams.size_factor < 1.0f) {
+    os << "Partial" << static_cast<int>(c.dparams.size_factor * 100) << "p";
+  }
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<SourceManagerTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(SourceManagerTest, SourceManagerTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/streaming_test.cc b/lib/jpegli/streaming_test.cc
new file mode 100644 (file)
index 0000000..8d2e357
--- /dev/null
@@ -0,0 +1,233 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+
+namespace jpegli {
+namespace {
+
+// A simple suspending source manager with an input buffer.
+struct SourceManager {
+  jpeg_source_mgr pub;
+  std::vector<uint8_t> buffer;
+
+  SourceManager() {
+    pub.next_input_byte = nullptr;
+    pub.bytes_in_buffer = 0;
+    pub.init_source = init_source;
+    pub.fill_input_buffer = fill_input_buffer;
+    pub.skip_input_data = skip_input_data;
+    pub.resync_to_restart = jpegli_resync_to_restart;
+    pub.term_source = term_source;
+  }
+
+  static void init_source(j_decompress_ptr cinfo) {}
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) { return FALSE; }
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {}
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+// A destination manager that empties its output buffer into a SourceManager's
+// input buffer. The buffer size is kept short because empty_output_buffer() is
+// called only when the output buffer is full, and we want to update the decoder
+// input frequently to demonstrate that streaming works.
+static constexpr size_t kOutputBufferSize = 1024;
+struct DestinationManager {
+  jpeg_destination_mgr pub;
+  std::vector<uint8_t> buffer;
+  SourceManager* dest;
+
+  DestinationManager(SourceManager* src)
+      : buffer(kOutputBufferSize), dest(src) {
+    pub.next_output_byte = buffer.data();
+    pub.free_in_buffer = buffer.size();
+    pub.init_destination = init_destination;
+    pub.empty_output_buffer = empty_output_buffer;
+    pub.term_destination = term_destination;
+  }
+
+  static void init_destination(j_compress_ptr cinfo) {}
+
+  static boolean empty_output_buffer(j_compress_ptr cinfo) {
+    auto us = reinterpret_cast<DestinationManager*>(cinfo->dest);
+    jpeg_destination_mgr* src = &us->pub;
+    jpeg_source_mgr* dst = &us->dest->pub;
+    std::vector<uint8_t>& src_buf = us->buffer;
+    std::vector<uint8_t>& dst_buf = us->dest->buffer;
+    if (dst->bytes_in_buffer > 0 && dst->bytes_in_buffer < dst_buf.size()) {
+      memmove(dst_buf.data(), dst->next_input_byte, dst->bytes_in_buffer);
+    }
+    size_t src_len = src_buf.size() - src->free_in_buffer;
+    dst_buf.resize(dst->bytes_in_buffer + src_len);
+    memcpy(&dst_buf[dst->bytes_in_buffer], src_buf.data(), src_len);
+    dst->next_input_byte = dst_buf.data();
+    dst->bytes_in_buffer = dst_buf.size();
+    src->next_output_byte = src_buf.data();
+    src->free_in_buffer = src_buf.size();
+    return true;
+  }
+
+  static void term_destination(j_compress_ptr cinfo) {
+    empty_output_buffer(cinfo);
+  }
+};
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+};
+
+class StreamingTestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(StreamingTestParam, TestStreaming) {
+  jpeg_decompress_struct dinfo = {};
+  jpeg_compress_struct cinfo = {};
+  TestConfig config = GetParam();
+  TestImage& input = config.input;
+  TestImage output;
+  GeneratePixels(&input);
+  const auto try_catch_block = [&]() {
+    ERROR_HANDLER_SETUP(jpegli);
+    dinfo.err = cinfo.err;
+    dinfo.client_data = cinfo.client_data;
+    // Create a pair of compressor and decompressor objects, where the
+    // compressor's output is connected to the decompressor's input.
+    jpegli_create_decompress(&dinfo);
+    jpegli_create_compress(&cinfo);
+    SourceManager src;
+    dinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    DestinationManager dest(&src);
+    cinfo.dest = reinterpret_cast<jpeg_destination_mgr*>(&dest);
+
+    cinfo.image_width = input.xsize;
+    cinfo.image_height = input.ysize;
+    cinfo.input_components = input.components;
+    cinfo.in_color_space = (J_COLOR_SPACE)input.color_space;
+    jpegli_set_defaults(&cinfo);
+    cinfo.comp_info[0].v_samp_factor = config.jparams.v_sampling[0];
+    jpegli_set_progressive_level(&cinfo, 0);
+    cinfo.optimize_coding = FALSE;
+    jpegli_start_compress(&cinfo, TRUE);
+
+    size_t stride = cinfo.image_width * cinfo.input_components;
+    size_t iMCU_height = 8 * cinfo.max_v_samp_factor;
+    std::vector<uint8_t> row_bytes(iMCU_height * stride);
+    size_t yin = 0;
+    size_t yout = 0;
+    while (yin < cinfo.image_height) {
+      // Feed one iMCU row at a time to the compressor.
+      size_t lines_in = std::min(iMCU_height, cinfo.image_height - yin);
+      memcpy(&row_bytes[0], &input.pixels[yin * stride], lines_in * stride);
+      std::vector<JSAMPROW> rows_in(lines_in);
+      for (size_t i = 0; i < lines_in; ++i) {
+        rows_in[i] = &row_bytes[i * stride];
+      }
+      EXPECT_EQ(lines_in,
+                jpegli_write_scanlines(&cinfo, &rows_in[0], lines_in));
+      yin += lines_in;
+      if (yin == cinfo.image_height) {
+        jpegli_finish_compress(&cinfo);
+      }
+
+      // Atfer the first iMCU row, we don't yet expect any output because the
+      // compressor delays processing to have context rows after the iMCU row.
+      if (yin < std::min<size_t>(2 * iMCU_height, cinfo.image_height)) {
+        continue;
+      }
+
+      // After two iMCU rows, the compressor has started emitting compressed
+      // data. We check here that at least the scan header was output, because
+      // we expect that the compressor's output buffer was filled at least once
+      // while emitting the first compressed iMCU row.
+      if (yin == std::min<size_t>(2 * iMCU_height, cinfo.image_height)) {
+        EXPECT_EQ(JPEG_REACHED_SOS,
+                  jpegli_read_header(&dinfo, /*require_image=*/TRUE));
+        output.xsize = dinfo.image_width;
+        output.ysize = dinfo.image_height;
+        output.components = dinfo.num_components;
+        EXPECT_EQ(output.xsize, input.xsize);
+        EXPECT_EQ(output.ysize, input.ysize);
+        EXPECT_EQ(output.components, input.components);
+        EXPECT_TRUE(jpegli_start_decompress(&dinfo));
+        output.pixels.resize(output.ysize * stride);
+        if (yin < cinfo.image_height) {
+          continue;
+        }
+      }
+
+      // After six iMCU rows, the compressor has emitted five iMCU rows of
+      // compressed data, of which we expect four full iMCU row of compressed
+      // data to be in the decoder's input buffer, but since the decoder also
+      // needs context rows for upsampling and smoothing, we don't expect any
+      // output to be ready yet.
+      if (yin < 7 * iMCU_height && yin < cinfo.image_height) {
+        continue;
+      }
+
+      // After five iMCU rows, we expect the decoder to have rendered the output
+      // with four iMCU rows of delay.
+      // TODO(szabadka) Reduce the processing delay in the decoder if possible.
+      size_t lines_out =
+          (yin == cinfo.image_height ? cinfo.image_height - yout : iMCU_height);
+      std::vector<JSAMPROW> rows_out(lines_out);
+      for (size_t i = 0; i < lines_out; ++i) {
+        rows_out[i] =
+            reinterpret_cast<JSAMPLE*>(&output.pixels[(yout + i) * stride]);
+      }
+      EXPECT_EQ(lines_out,
+                jpegli_read_scanlines(&dinfo, &rows_out[0], lines_out));
+      VerifyOutputImage(input, output, yout, lines_out, 3.8f);
+      yout += lines_out;
+
+      if (yout == cinfo.image_height) {
+        EXPECT_TRUE(jpegli_finish_decompress(&dinfo));
+      }
+    }
+    return true;
+  };
+  EXPECT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&dinfo);
+  jpegli_destroy_compress(&cinfo);
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  const size_t xsize0 = 1920;
+  const size_t ysize0 = 1080;
+  for (int dysize : {0, 1, 8, 9}) {
+    for (int v_sampling : {1, 2}) {
+      TestConfig config;
+      config.input.xsize = xsize0;
+      config.input.ysize = ysize0 + dysize;
+      config.jparams.h_sampling = {1, 1, 1};
+      config.jparams.v_sampling = {v_sampling, 1, 1};
+      all_tests.push_back(config);
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<StreamingTestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(StreamingTest, StreamingTestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/test_params.h b/lib/jpegli/test_params.h
new file mode 100644 (file)
index 0000000..6ab9fa5
--- /dev/null
@@ -0,0 +1,163 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TEST_PARAMS_H_
+#define LIB_JPEGLI_TEST_PARAMS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <vector>
+
+#include "lib/jpegli/types.h"
+
+namespace jpegli {
+
+// We define this here as well to make sure that the *_api_test.cc tests only
+// use the public API and therefore we don't include any *_internal.h headers.
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+#define ARRAY_SIZE(X) (sizeof(X) / sizeof((X)[0]))
+
+static constexpr int kLastScan = 0xffff;
+
+static uint32_t kTestColorMap[] = {
+    0x000000, 0xff0000, 0x00ff00, 0x0000ff, 0xffff00, 0x00ffff,
+    0xff00ff, 0xffffff, 0x6251fc, 0x45d9c7, 0xa7f059, 0xd9a945,
+    0xfa4e44, 0xceaffc, 0xbad7db, 0xc1f0b1, 0xdbca9a, 0xfacac5,
+    0xf201ff, 0x0063db, 0x00f01c, 0xdbb204, 0xf12f0c, 0x7ba1dc};
+static constexpr int kTestColorMapNumColors = ARRAY_SIZE(kTestColorMap);
+
+static constexpr int kSpecialMarker0 = 0xe5;
+static constexpr int kSpecialMarker1 = 0xe9;
+static constexpr uint8_t kMarkerData[] = {0, 1, 255, 0, 17};
+static constexpr uint8_t kMarkerSequence[] = {0xe6, 0xe8, 0xe7,
+                                              0xe6, 0xe7, 0xe8};
+static constexpr size_t kMarkerSequenceLen = ARRAY_SIZE(kMarkerSequence);
+
+enum JpegIOMode {
+  PIXELS,
+  RAW_DATA,
+  COEFFICIENTS,
+};
+
+struct CustomQuantTable {
+  int slot_idx = 0;
+  uint16_t table_type = 0;
+  int scale_factor = 100;
+  bool add_raw = false;
+  bool force_baseline = true;
+  std::vector<unsigned int> basic_table;
+  std::vector<unsigned int> quantval;
+  void Generate();
+};
+
+struct TestImage {
+  size_t xsize = 2268;
+  size_t ysize = 1512;
+  int color_space = 2;  // JCS_RGB
+  size_t components = 3;
+  JpegliDataType data_type = JPEGLI_TYPE_UINT8;
+  JpegliEndianness endianness = JPEGLI_NATIVE_ENDIAN;
+  std::vector<uint8_t> pixels;
+  std::vector<std::vector<uint8_t>> raw_data;
+  std::vector<std::vector<int16_t>> coeffs;
+  void AllocatePixels() {
+    pixels.resize(ysize * xsize * components *
+                  jpegli_bytes_per_sample(data_type));
+  }
+  void Clear() {
+    pixels.clear();
+    raw_data.clear();
+    coeffs.clear();
+  }
+};
+
+struct CompressParams {
+  int quality = 90;
+  bool set_jpeg_colorspace = false;
+  int jpeg_color_space = 0;  // JCS_UNKNOWN
+  std::vector<int> quant_indexes;
+  std::vector<CustomQuantTable> quant_tables;
+  std::vector<int> h_sampling;
+  std::vector<int> v_sampling;
+  std::vector<int> comp_ids;
+  int override_JFIF = -1;
+  int override_Adobe = -1;
+  bool add_marker = false;
+  bool simple_progression = false;
+  // -1 is library default
+  // 0, 1, 2 is set through jpegli_set_progressive_level()
+  // 2 + N is kScriptN
+  int progressive_mode = -1;
+  unsigned int restart_interval = 0;
+  int restart_in_rows = 0;
+  int smoothing_factor = 0;
+  int optimize_coding = -1;
+  bool use_flat_dc_luma_code = false;
+  bool omit_standard_tables = false;
+  bool xyb_mode = false;
+  bool libjpeg_mode = false;
+  bool use_adaptive_quantization = true;
+  std::vector<uint8_t> icc;
+
+  int h_samp(int c) const { return h_sampling.empty() ? 1 : h_sampling[c]; }
+  int v_samp(int c) const { return v_sampling.empty() ? 1 : v_sampling[c]; }
+  int max_h_sample() const {
+    auto it = std::max_element(h_sampling.begin(), h_sampling.end());
+    return it == h_sampling.end() ? 1 : *it;
+  }
+  int max_v_sample() const {
+    auto it = std::max_element(v_sampling.begin(), v_sampling.end());
+    return it == v_sampling.end() ? 1 : *it;
+  }
+  int comp_width(const TestImage& input, int c) const {
+    return DivCeil(input.xsize * h_samp(c), max_h_sample() * 8) * 8;
+  }
+  int comp_height(const TestImage& input, int c) const {
+    return DivCeil(input.ysize * v_samp(c), max_v_sample() * 8) * 8;
+  }
+};
+
+enum ColorQuantMode {
+  CQUANT_1PASS,
+  CQUANT_2PASS,
+  CQUANT_EXTERNAL,
+  CQUANT_REUSE,
+};
+
+struct ScanDecompressParams {
+  int max_scan_number;
+  int dither_mode;
+  ColorQuantMode color_quant_mode;
+};
+
+struct DecompressParams {
+  float size_factor = 1.0f;
+  size_t chunk_size = 65536;
+  size_t max_output_lines = 16;
+  JpegIOMode output_mode = PIXELS;
+  JpegliDataType data_type = JPEGLI_TYPE_UINT8;
+  JpegliEndianness endianness = JPEGLI_NATIVE_ENDIAN;
+  bool set_out_color_space = false;
+  int out_color_space = 0;  // JCS_UNKNOWN
+  bool crop_output = false;
+  bool do_block_smoothing = false;
+  bool do_fancy_upsampling = true;
+  bool skip_scans = false;
+  int scale_num = 1;
+  int scale_denom = 1;
+  bool quantize_colors = false;
+  int desired_number_of_colors = 256;
+  std::vector<ScanDecompressParams> scan_params;
+};
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_TEST_PARAMS_H_
diff --git a/lib/jpegli/test_utils-inl.h b/lib/jpegli/test_utils-inl.h
new file mode 100644 (file)
index 0000000..a454917
--- /dev/null
@@ -0,0 +1,430 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This template file is included in both the libjpeg_test_util.cc and the
+// test_utils.cc files with different JPEG_API_FN macros and possibly different
+// include paths for the jpeg headers.
+
+// Sequential non-interleaved.
+static constexpr jpeg_scan_info kScript1[] = {
+    {1, {0}, 0, 63, 0, 0},
+    {1, {1}, 0, 63, 0, 0},
+    {1, {2}, 0, 63, 0, 0},
+};
+// Sequential partially interleaved, chroma first.
+static constexpr jpeg_scan_info kScript2[] = {
+    {2, {1, 2}, 0, 63, 0, 0},
+    {1, {0}, 0, 63, 0, 0},
+};
+
+// Rest of the scan scripts are progressive.
+
+static constexpr jpeg_scan_info kScript3[] = {
+    // Interleaved full DC.
+    {3, {0, 1, 2}, 0, 0, 0, 0},
+    // Full AC scans.
+    {1, {0}, 1, 63, 0, 0},
+    {1, {1}, 1, 63, 0, 0},
+    {1, {2}, 1, 63, 0, 0},
+};
+static constexpr jpeg_scan_info kScript4[] = {
+    // Non-interleaved full DC.
+    {1, {0}, 0, 0, 0, 0},
+    {1, {1}, 0, 0, 0, 0},
+    {1, {2}, 0, 0, 0, 0},
+    // Full AC scans.
+    {1, {0}, 1, 63, 0, 0},
+    {1, {1}, 1, 63, 0, 0},
+    {1, {2}, 1, 63, 0, 0},
+};
+static constexpr jpeg_scan_info kScript5[] = {
+    // Partially interleaved full DC, chroma first.
+    {2, {1, 2}, 0, 0, 0, 0},
+    {1, {0}, 0, 0, 0, 0},
+    // AC shifted by 1 bit.
+    {1, {0}, 1, 63, 0, 1},
+    {1, {1}, 1, 63, 0, 1},
+    {1, {2}, 1, 63, 0, 1},
+    // AC refinement scan.
+    {1, {0}, 1, 63, 1, 0},
+    {1, {1}, 1, 63, 1, 0},
+    {1, {2}, 1, 63, 1, 0},
+};
+static constexpr jpeg_scan_info kScript6[] = {
+    // Interleaved DC shifted by 2 bits.
+    {3, {0, 1, 2}, 0, 0, 0, 2},
+    // Interleaved DC refinement scans.
+    {3, {0, 1, 2}, 0, 0, 2, 1},
+    {3, {0, 1, 2}, 0, 0, 1, 0},
+    // Full AC scans.
+    {1, {0}, 1, 63, 0, 0},
+    {1, {1}, 1, 63, 0, 0},
+    {1, {2}, 1, 63, 0, 0},
+};
+
+static constexpr jpeg_scan_info kScript7[] = {
+    // Non-interleaved DC shifted by 2 bits.
+    {1, {0}, 0, 0, 0, 2},
+    {1, {1}, 0, 0, 0, 2},
+    {1, {2}, 0, 0, 0, 2},
+    // Non-interleaved DC first refinement scans.
+    {1, {0}, 0, 0, 2, 1},
+    {1, {1}, 0, 0, 2, 1},
+    {1, {2}, 0, 0, 2, 1},
+    // Non-interleaved DC second refinement scans.
+    {1, {0}, 0, 0, 1, 0},
+    {1, {1}, 0, 0, 1, 0},
+    {1, {2}, 0, 0, 1, 0},
+    // Full AC scans.
+    {1, {0}, 1, 63, 0, 0},
+    {1, {1}, 1, 63, 0, 0},
+    {1, {2}, 1, 63, 0, 0},
+};
+
+static constexpr jpeg_scan_info kScript8[] = {
+    // Partially interleaved DC shifted by 2 bits, chroma first
+    {2, {1, 2}, 0, 0, 0, 2},
+    {1, {0}, 0, 0, 0, 2},
+    // Partially interleaved DC first refinement scans.
+    {2, {0, 2}, 0, 0, 2, 1},
+    {1, {1}, 0, 0, 2, 1},
+    // Partially interleaved DC first refinement scans, chroma first.
+    {2, {1, 2}, 0, 0, 1, 0},
+    {1, {0}, 0, 0, 1, 0},
+    // Full AC scans.
+    {1, {0}, 1, 63, 0, 0},
+    {1, {1}, 1, 63, 0, 0},
+    {1, {2}, 1, 63, 0, 0},
+};
+
+static constexpr jpeg_scan_info kScript9[] = {
+    // Interleaved full DC.
+    {3, {0, 1, 2}, 0, 0, 0, 0},
+    // AC scans for component 0
+    // shifted by 1 bit, two spectral ranges
+    {1, {0}, 1, 6, 0, 1},
+    {1, {0}, 7, 63, 0, 1},
+    // refinement scan, full
+    {1, {0}, 1, 63, 1, 0},
+    // AC scans for component 1
+    // shifted by 1 bit, full
+    {1, {1}, 1, 63, 0, 1},
+    // refinement scan, two spectral ranges
+    {1, {1}, 1, 6, 1, 0},
+    {1, {1}, 7, 63, 1, 0},
+    // AC scans for component 2
+    // shifted by 1 bit, two spectral ranges
+    {1, {2}, 1, 6, 0, 1},
+    {1, {2}, 7, 63, 0, 1},
+    // refinement scan, two spectral ranges (but different from above)
+    {1, {2}, 1, 16, 1, 0},
+    {1, {2}, 17, 63, 1, 0},
+};
+
+static constexpr jpeg_scan_info kScript10[] = {
+    // Interleaved full DC.
+    {3, {0, 1, 2}, 0, 0, 0, 0},
+    // AC scans for spectral range 1..16
+    // shifted by 1
+    {1, {0}, 1, 16, 0, 1},
+    {1, {1}, 1, 16, 0, 1},
+    {1, {2}, 1, 16, 0, 1},
+    // refinement scans, two sub-ranges
+    {1, {0}, 1, 8, 1, 0},
+    {1, {0}, 9, 16, 1, 0},
+    {1, {1}, 1, 8, 1, 0},
+    {1, {1}, 9, 16, 1, 0},
+    {1, {2}, 1, 8, 1, 0},
+    {1, {2}, 9, 16, 1, 0},
+    // AC scans for spectral range 17..63
+    {1, {0}, 17, 63, 0, 1},
+    {1, {1}, 17, 63, 0, 1},
+    {1, {2}, 17, 63, 0, 1},
+    // refinement scans, two sub-ranges
+    {1, {0}, 17, 28, 1, 0},
+    {1, {0}, 29, 63, 1, 0},
+    {1, {1}, 17, 28, 1, 0},
+    {1, {1}, 29, 63, 1, 0},
+    {1, {2}, 17, 28, 1, 0},
+    {1, {2}, 29, 63, 1, 0},
+};
+
+struct ScanScript {
+  int num_scans;
+  const jpeg_scan_info* scans;
+};
+
+static constexpr ScanScript kTestScript[] = {
+    {ARRAY_SIZE(kScript1), kScript1}, {ARRAY_SIZE(kScript2), kScript2},
+    {ARRAY_SIZE(kScript3), kScript3}, {ARRAY_SIZE(kScript4), kScript4},
+    {ARRAY_SIZE(kScript5), kScript5}, {ARRAY_SIZE(kScript6), kScript6},
+    {ARRAY_SIZE(kScript7), kScript7}, {ARRAY_SIZE(kScript8), kScript8},
+    {ARRAY_SIZE(kScript9), kScript9}, {ARRAY_SIZE(kScript10), kScript10},
+};
+static constexpr int kNumTestScripts = ARRAY_SIZE(kTestScript);
+
+void SetScanDecompressParams(const DecompressParams& dparams,
+                             j_decompress_ptr cinfo, int scan_number) {
+  const ScanDecompressParams* sparams = nullptr;
+  for (const auto& sp : dparams.scan_params) {
+    if (scan_number <= sp.max_scan_number) {
+      sparams = &sp;
+      break;
+    }
+  }
+  if (sparams == nullptr) {
+    return;
+  }
+  if (dparams.quantize_colors) {
+    cinfo->dither_mode = (J_DITHER_MODE)sparams->dither_mode;
+    if (sparams->color_quant_mode == CQUANT_1PASS) {
+      cinfo->two_pass_quantize = FALSE;
+      cinfo->colormap = nullptr;
+    } else if (sparams->color_quant_mode == CQUANT_2PASS) {
+      JXL_CHECK(cinfo->out_color_space == JCS_RGB);
+      cinfo->two_pass_quantize = TRUE;
+      cinfo->colormap = nullptr;
+    } else if (sparams->color_quant_mode == CQUANT_EXTERNAL) {
+      JXL_CHECK(cinfo->out_color_space == JCS_RGB);
+      cinfo->two_pass_quantize = FALSE;
+      bool have_colormap = cinfo->colormap != nullptr;
+      cinfo->actual_number_of_colors = kTestColorMapNumColors;
+      cinfo->colormap = (*cinfo->mem->alloc_sarray)(
+          reinterpret_cast<j_common_ptr>(cinfo), JPOOL_IMAGE,
+          cinfo->actual_number_of_colors, 3);
+      jxl::msan::UnpoisonMemory(cinfo->colormap, 3 * sizeof(JSAMPROW));
+      for (int i = 0; i < kTestColorMapNumColors; ++i) {
+        cinfo->colormap[0][i] = (kTestColorMap[i] >> 16) & 0xff;
+        cinfo->colormap[1][i] = (kTestColorMap[i] >> 8) & 0xff;
+        cinfo->colormap[2][i] = (kTestColorMap[i] >> 0) & 0xff;
+      }
+      if (have_colormap) {
+        JPEG_API_FN(new_colormap)(cinfo);
+      }
+    } else if (sparams->color_quant_mode == CQUANT_REUSE) {
+      JXL_CHECK(cinfo->out_color_space == JCS_RGB);
+      JXL_CHECK(cinfo->colormap);
+    }
+  }
+}
+
+void SetDecompressParams(const DecompressParams& dparams,
+                         j_decompress_ptr cinfo) {
+  cinfo->do_block_smoothing = dparams.do_block_smoothing;
+  cinfo->do_fancy_upsampling = dparams.do_fancy_upsampling;
+  if (dparams.output_mode == RAW_DATA) {
+    cinfo->raw_data_out = TRUE;
+  }
+  if (dparams.set_out_color_space) {
+    cinfo->out_color_space = (J_COLOR_SPACE)dparams.out_color_space;
+    if (dparams.out_color_space == JCS_UNKNOWN) {
+      cinfo->jpeg_color_space = JCS_UNKNOWN;
+    }
+  }
+  cinfo->scale_num = dparams.scale_num;
+  cinfo->scale_denom = dparams.scale_denom;
+  cinfo->quantize_colors = dparams.quantize_colors;
+  cinfo->desired_number_of_colors = dparams.desired_number_of_colors;
+  if (!dparams.scan_params.empty()) {
+    if (cinfo->buffered_image) {
+      for (const auto& sparams : dparams.scan_params) {
+        if (sparams.color_quant_mode == CQUANT_1PASS) {
+          cinfo->enable_1pass_quant = TRUE;
+        } else if (sparams.color_quant_mode == CQUANT_2PASS) {
+          cinfo->enable_2pass_quant = TRUE;
+        } else if (sparams.color_quant_mode == CQUANT_EXTERNAL) {
+          cinfo->enable_external_quant = TRUE;
+        }
+      }
+      SetScanDecompressParams(dparams, cinfo, 1);
+    } else {
+      SetScanDecompressParams(dparams, cinfo, kLastScan);
+    }
+  }
+}
+
+void CheckMarkerPresent(j_decompress_ptr cinfo, uint8_t marker_type) {
+  bool marker_found = false;
+  for (jpeg_saved_marker_ptr marker = cinfo->marker_list; marker != nullptr;
+       marker = marker->next) {
+    jxl::msan::UnpoisonMemory(marker, sizeof(*marker));
+    jxl::msan::UnpoisonMemory(marker->data, marker->data_length);
+    if (marker->marker == marker_type &&
+        marker->data_length == sizeof(kMarkerData) &&
+        memcmp(marker->data, kMarkerData, sizeof(kMarkerData)) == 0) {
+      marker_found = true;
+    }
+  }
+  JXL_CHECK(marker_found);
+}
+
+void VerifyHeader(const CompressParams& jparams, j_decompress_ptr cinfo) {
+  if (jparams.set_jpeg_colorspace) {
+    JXL_CHECK(cinfo->jpeg_color_space == jparams.jpeg_color_space);
+  }
+  if (jparams.override_JFIF >= 0) {
+    JXL_CHECK(cinfo->saw_JFIF_marker == jparams.override_JFIF);
+  }
+  if (jparams.override_Adobe >= 0) {
+    JXL_CHECK(cinfo->saw_Adobe_marker == jparams.override_Adobe);
+  }
+  if (jparams.add_marker) {
+    CheckMarkerPresent(cinfo, kSpecialMarker0);
+    CheckMarkerPresent(cinfo, kSpecialMarker1);
+  }
+  jxl::msan::UnpoisonMemory(
+      cinfo->comp_info, cinfo->num_components * sizeof(cinfo->comp_info[0]));
+  int max_h_samp_factor = 1;
+  int max_v_samp_factor = 1;
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    if (!jparams.comp_ids.empty()) {
+      JXL_CHECK(comp->component_id == jparams.comp_ids[i]);
+    }
+    if (!jparams.h_sampling.empty()) {
+      JXL_CHECK(comp->h_samp_factor == jparams.h_sampling[i]);
+    }
+    if (!jparams.v_sampling.empty()) {
+      JXL_CHECK(comp->v_samp_factor == jparams.v_sampling[i]);
+    }
+    if (!jparams.quant_indexes.empty()) {
+      JXL_CHECK(comp->quant_tbl_no == jparams.quant_indexes[i]);
+    }
+    max_h_samp_factor = std::max(max_h_samp_factor, comp->h_samp_factor);
+    max_v_samp_factor = std::max(max_v_samp_factor, comp->v_samp_factor);
+  }
+  JXL_CHECK(max_h_samp_factor == cinfo->max_h_samp_factor);
+  JXL_CHECK(max_v_samp_factor == cinfo->max_v_samp_factor);
+  int referenced_tables[NUM_QUANT_TBLS] = {};
+  for (int i = 0; i < cinfo->num_components; ++i) {
+    jpeg_component_info* comp = &cinfo->comp_info[i];
+    JXL_CHECK(comp->width_in_blocks ==
+              DivCeil(cinfo->image_width * comp->h_samp_factor,
+                      max_h_samp_factor * DCTSIZE));
+    JXL_CHECK(comp->height_in_blocks ==
+              DivCeil(cinfo->image_height * comp->v_samp_factor,
+                      max_v_samp_factor * DCTSIZE));
+    referenced_tables[comp->quant_tbl_no] = 1;
+  }
+  for (const auto& table : jparams.quant_tables) {
+    JQUANT_TBL* quant_table = cinfo->quant_tbl_ptrs[table.slot_idx];
+    if (!referenced_tables[table.slot_idx]) {
+      JXL_CHECK(quant_table == nullptr);
+      continue;
+    }
+    JXL_CHECK(quant_table != nullptr);
+    jxl::msan::UnpoisonMemory(quant_table, sizeof(*quant_table));
+    for (int k = 0; k < DCTSIZE2; ++k) {
+      JXL_CHECK(quant_table->quantval[k] == table.quantval[k]);
+    }
+  }
+}
+
+void VerifyScanHeader(const CompressParams& jparams, j_decompress_ptr cinfo) {
+  JXL_CHECK(cinfo->input_scan_number > 0);
+  if (cinfo->progressive_mode) {
+    JXL_CHECK(cinfo->Ss != 0 || cinfo->Se != 63);
+  } else {
+    JXL_CHECK(cinfo->Ss == 0 && cinfo->Se == 63);
+  }
+  if (jparams.progressive_mode > 2) {
+    JXL_CHECK(jparams.progressive_mode < 3 + kNumTestScripts);
+    const ScanScript& script = kTestScript[jparams.progressive_mode - 3];
+    JXL_CHECK(cinfo->input_scan_number <= script.num_scans);
+    const jpeg_scan_info& scan = script.scans[cinfo->input_scan_number - 1];
+    JXL_CHECK(cinfo->comps_in_scan == scan.comps_in_scan);
+    for (int i = 0; i < cinfo->comps_in_scan; ++i) {
+      JXL_CHECK(cinfo->cur_comp_info[i]->component_index ==
+                scan.component_index[i]);
+    }
+    JXL_CHECK(cinfo->Ss == scan.Ss);
+    JXL_CHECK(cinfo->Se == scan.Se);
+    JXL_CHECK(cinfo->Ah == scan.Ah);
+    JXL_CHECK(cinfo->Al == scan.Al);
+  }
+  if (jparams.restart_interval > 0) {
+    JXL_CHECK(cinfo->restart_interval == jparams.restart_interval);
+  } else if (jparams.restart_in_rows > 0) {
+    JXL_CHECK(cinfo->restart_interval ==
+              jparams.restart_in_rows * cinfo->MCUs_per_row);
+  }
+  if (jparams.progressive_mode == 0 && jparams.optimize_coding == 0) {
+    if (cinfo->jpeg_color_space == JCS_RGB) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 0);
+    } else if (cinfo->jpeg_color_space == JCS_YCbCr) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 1);
+    } else if (cinfo->jpeg_color_space == JCS_CMYK) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[3].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[3].ac_tbl_no == 0);
+    } else if (cinfo->jpeg_color_space == JCS_YCCK) {
+      JXL_CHECK(cinfo->comp_info[0].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].dc_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[3].dc_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[0].ac_tbl_no == 0);
+      JXL_CHECK(cinfo->comp_info[1].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[2].ac_tbl_no == 1);
+      JXL_CHECK(cinfo->comp_info[3].ac_tbl_no == 0);
+    }
+    if (jparams.use_flat_dc_luma_code) {
+      JHUFF_TBL* tbl = cinfo->dc_huff_tbl_ptrs[0];
+      jxl::msan::UnpoisonMemory(tbl, sizeof(*tbl));
+      for (int i = 0; i < 15; ++i) {
+        JXL_CHECK(tbl->huffval[i] == i);
+      }
+    }
+  }
+}
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors) {
+  JXL_CHECK(colormap != nullptr);
+  std::vector<uint8_t> tmp(xsize * components);
+  for (size_t x = 0; x < xsize; ++x) {
+    JXL_CHECK(row[x] < num_colors);
+    for (int c = 0; c < components; ++c) {
+      tmp[x * components + c] = colormap[c][row[x]];
+    }
+  }
+  memcpy(row, tmp.data(), tmp.size());
+}
+
+void CopyCoefficients(j_decompress_ptr cinfo, jvirt_barray_ptr* coef_arrays,
+                      TestImage* output) {
+  output->xsize = cinfo->image_width;
+  output->ysize = cinfo->image_height;
+  output->components = cinfo->num_components;
+  output->color_space = cinfo->out_color_space;
+  j_common_ptr comptr = reinterpret_cast<j_common_ptr>(cinfo);
+  for (int c = 0; c < cinfo->num_components; ++c) {
+    jpeg_component_info* comp = &cinfo->comp_info[c];
+    std::vector<JCOEF> coeffs(comp->width_in_blocks * comp->height_in_blocks *
+                              DCTSIZE2);
+    for (size_t by = 0; by < comp->height_in_blocks; ++by) {
+      JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(comptr, coef_arrays[c],
+                                                         by, 1, true);
+      size_t stride = comp->width_in_blocks * sizeof(JBLOCK);
+      size_t offset = by * comp->width_in_blocks * DCTSIZE2;
+      memcpy(&coeffs[offset], ba[0], stride);
+    }
+    output->coeffs.emplace_back(std::move(coeffs));
+  }
+}
diff --git a/lib/jpegli/test_utils.cc b/lib/jpegli/test_utils.cc
new file mode 100644 (file)
index 0000000..232b937
--- /dev/null
@@ -0,0 +1,787 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/test_utils.h"
+
+#include <cmath>
+#include <cstdint>
+#include <fstream>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/sanitizers.h"
+
+#if !defined(TEST_DATA_PATH)
+#include "tools/cpp/runfiles/runfiles.h"
+#endif
+
+namespace jpegli {
+
+#define JPEG_API_FN(name) jpegli_##name
+#include "lib/jpegli/test_utils-inl.h"
+#undef JPEG_API_FN
+
+#if defined(TEST_DATA_PATH)
+std::string GetTestDataPath(const std::string& filename) {
+  return std::string(TEST_DATA_PATH "/") + filename;
+}
+#else
+using bazel::tools::cpp::runfiles::Runfiles;
+const std::unique_ptr<Runfiles> kRunfiles(Runfiles::Create(""));
+std::string GetTestDataPath(const std::string& filename) {
+  std::string root(JPEGXL_ROOT_PACKAGE "/testdata/");
+  return kRunfiles->Rlocation(root + filename);
+}
+#endif
+
+std::vector<uint8_t> ReadTestData(const std::string& filename) {
+  std::string full_path = GetTestDataPath(filename);
+  fprintf(stderr, "ReadTestData %s\n", full_path.c_str());
+  std::ifstream file(full_path, std::ios::binary);
+  std::vector<char> str((std::istreambuf_iterator<char>(file)),
+                        std::istreambuf_iterator<char>());
+  JXL_CHECK(file.good());
+  const uint8_t* raw = reinterpret_cast<const uint8_t*>(str.data());
+  std::vector<uint8_t> data(raw, raw + str.size());
+  printf("Test data %s is %d bytes long.\n", filename.c_str(),
+         static_cast<int>(data.size()));
+  return data;
+}
+
+void CustomQuantTable::Generate() {
+  basic_table.resize(DCTSIZE2);
+  quantval.resize(DCTSIZE2);
+  switch (table_type) {
+    case 0: {
+      for (int k = 0; k < DCTSIZE2; ++k) {
+        basic_table[k] = k + 1;
+      }
+      break;
+    }
+    default:
+      for (int k = 0; k < DCTSIZE2; ++k) {
+        basic_table[k] = table_type;
+      }
+  }
+  for (int k = 0; k < DCTSIZE2; ++k) {
+    quantval[k] = (basic_table[k] * scale_factor + 50U) / 100U;
+    quantval[k] = std::max(quantval[k], 1U);
+    quantval[k] = std::min(quantval[k], 65535U);
+    if (!add_raw) {
+      quantval[k] = std::min(quantval[k], force_baseline ? 255U : 32767U);
+    }
+  }
+}
+
+bool PNMParser::ParseHeader(const uint8_t** pos, size_t* xsize, size_t* ysize,
+                            size_t* num_channels, size_t* bitdepth) {
+  if (pos_[0] != 'P' || (pos_[1] != '5' && pos_[1] != '6')) {
+    fprintf(stderr, "Invalid PNM header.");
+    return false;
+  }
+  *num_channels = (pos_[1] == '5' ? 1 : 3);
+  pos_ += 2;
+
+  size_t maxval;
+  if (!SkipWhitespace() || !ParseUnsigned(xsize) || !SkipWhitespace() ||
+      !ParseUnsigned(ysize) || !SkipWhitespace() || !ParseUnsigned(&maxval) ||
+      !SkipWhitespace()) {
+    return false;
+  }
+  if (maxval == 0 || maxval >= 65536) {
+    fprintf(stderr, "Invalid maxval value.\n");
+    return false;
+  }
+  bool found_bitdepth = false;
+  for (int bits = 1; bits <= 16; ++bits) {
+    if (maxval == (1u << bits) - 1) {
+      *bitdepth = bits;
+      found_bitdepth = true;
+      break;
+    }
+  }
+  if (!found_bitdepth) {
+    fprintf(stderr, "Invalid maxval value.\n");
+    return false;
+  }
+
+  *pos = pos_;
+  return true;
+}
+
+bool PNMParser::ParseUnsigned(size_t* number) {
+  if (pos_ == end_ || *pos_ < '0' || *pos_ > '9') {
+    fprintf(stderr, "Expected unsigned number.\n");
+    return false;
+  }
+  *number = 0;
+  while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
+    *number *= 10;
+    *number += *pos_ - '0';
+    ++pos_;
+  }
+
+  return true;
+}
+
+bool PNMParser::SkipWhitespace() {
+  if (pos_ == end_ || !IsWhitespace(*pos_)) {
+    fprintf(stderr, "Expected whitespace.\n");
+    return false;
+  }
+  while (pos_ < end_ && IsWhitespace(*pos_)) {
+    ++pos_;
+  }
+  return true;
+}
+
+bool ReadPNM(const std::vector<uint8_t>& data, size_t* xsize, size_t* ysize,
+             size_t* num_channels, size_t* bitdepth,
+             std::vector<uint8_t>* pixels) {
+  if (data.size() < 2) {
+    fprintf(stderr, "PNM file too small.\n");
+    return false;
+  }
+  PNMParser parser(data.data(), data.size());
+  const uint8_t* pos = nullptr;
+  if (!parser.ParseHeader(&pos, xsize, ysize, num_channels, bitdepth)) {
+    return false;
+  }
+  pixels->resize(data.data() + data.size() - pos);
+  memcpy(&(*pixels)[0], pos, pixels->size());
+  return true;
+}
+
+std::string ColorSpaceName(J_COLOR_SPACE colorspace) {
+  switch (colorspace) {
+    case JCS_UNKNOWN:
+      return "UNKNOWN";
+    case JCS_GRAYSCALE:
+      return "GRAYSCALE";
+    case JCS_RGB:
+      return "RGB";
+    case JCS_YCbCr:
+      return "YCbCr";
+    case JCS_CMYK:
+      return "CMYK";
+    case JCS_YCCK:
+      return "YCCK";
+    default:
+      return "";
+  }
+}
+
+std::string IOMethodName(JpegliDataType data_type,
+                         JpegliEndianness endianness) {
+  std::string retval;
+  if (data_type == JPEGLI_TYPE_UINT8) {
+    return "";
+  } else if (data_type == JPEGLI_TYPE_UINT16) {
+    retval = "UINT16";
+  } else if (data_type == JPEGLI_TYPE_FLOAT) {
+    retval = "FLOAT";
+  }
+  if (endianness == JPEGLI_LITTLE_ENDIAN) {
+    retval += "LE";
+  } else if (endianness == JPEGLI_BIG_ENDIAN) {
+    retval += "BE";
+  }
+  return retval;
+}
+
+std::string SamplingId(const CompressParams& jparams) {
+  std::stringstream os;
+  JXL_CHECK(jparams.h_sampling.size() == jparams.v_sampling.size());
+  if (!jparams.h_sampling.empty()) {
+    size_t len = jparams.h_sampling.size();
+    while (len > 1 && jparams.h_sampling[len - 1] == 1 &&
+           jparams.v_sampling[len - 1] == 1) {
+      --len;
+    }
+    os << "SAMP";
+    for (size_t i = 0; i < len; ++i) {
+      if (i > 0) os << "_";
+      os << jparams.h_sampling[i] << "x" << jparams.v_sampling[i];
+    }
+  }
+  return os.str();
+}
+
+std::ostream& operator<<(std::ostream& os, const TestImage& input) {
+  os << input.xsize << "x" << input.ysize;
+  os << IOMethodName(input.data_type, input.endianness);
+  if (input.color_space != JCS_RGB) {
+    os << "InputColor" << ColorSpaceName((J_COLOR_SPACE)input.color_space);
+  }
+  if (input.color_space == JCS_UNKNOWN) {
+    os << input.components;
+  }
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, const CompressParams& jparams) {
+  os << "Q" << jparams.quality;
+  os << SamplingId(jparams);
+  if (jparams.set_jpeg_colorspace) {
+    os << "JpegColor"
+       << ColorSpaceName((J_COLOR_SPACE)jparams.jpeg_color_space);
+  }
+  if (!jparams.comp_ids.empty()) {
+    os << "CID";
+    for (size_t i = 0; i < jparams.comp_ids.size(); ++i) {
+      os << jparams.comp_ids[i];
+    }
+  }
+  if (!jparams.quant_indexes.empty()) {
+    os << "QIDX";
+    for (size_t i = 0; i < jparams.quant_indexes.size(); ++i) {
+      os << jparams.quant_indexes[i];
+    }
+    for (const auto& table : jparams.quant_tables) {
+      os << "TABLE" << table.slot_idx << "T" << table.table_type << "F"
+         << table.scale_factor
+         << (table.add_raw          ? "R"
+             : table.force_baseline ? "B"
+                                    : "");
+    }
+  }
+  if (jparams.progressive_mode >= 0) {
+    os << "P" << jparams.progressive_mode;
+  } else if (jparams.simple_progression) {
+    os << "Psimple";
+  }
+  if (jparams.optimize_coding == 1) {
+    os << "OptimizedCode";
+  } else if (jparams.optimize_coding == 0) {
+    os << "FixedCode";
+    if (jparams.use_flat_dc_luma_code) {
+      os << "FlatDCLuma";
+    } else if (jparams.omit_standard_tables) {
+      os << "OmitDHT";
+    }
+  }
+  if (!jparams.use_adaptive_quantization) {
+    os << "NoAQ";
+  }
+  if (jparams.restart_interval > 0) {
+    os << "R" << jparams.restart_interval;
+  }
+  if (jparams.restart_in_rows > 0) {
+    os << "RR" << jparams.restart_in_rows;
+  }
+  if (jparams.xyb_mode) {
+    os << "XYB";
+  } else if (jparams.libjpeg_mode) {
+    os << "Libjpeg";
+  }
+  if (jparams.override_JFIF >= 0) {
+    os << (jparams.override_JFIF ? "AddJFIF" : "NoJFIF");
+  }
+  if (jparams.override_Adobe >= 0) {
+    os << (jparams.override_Adobe ? "AddAdobe" : "NoAdobe");
+  }
+  if (jparams.add_marker) {
+    os << "AddMarker";
+  }
+  if (!jparams.icc.empty()) {
+    os << "ICCSize" << jparams.icc.size();
+  }
+  if (jparams.smoothing_factor != 0) {
+    os << "SF" << jparams.smoothing_factor;
+  }
+  return os;
+}
+
+void SetNumChannels(J_COLOR_SPACE colorspace, size_t* channels) {
+  if (colorspace == JCS_GRAYSCALE) {
+    *channels = 1;
+  } else if (colorspace == JCS_RGB || colorspace == JCS_YCbCr) {
+    *channels = 3;
+  } else if (colorspace == JCS_CMYK || colorspace == JCS_YCCK) {
+    *channels = 4;
+  } else if (colorspace == JCS_UNKNOWN) {
+    JXL_CHECK(*channels <= 4);
+  } else {
+    JXL_ABORT();
+  }
+}
+
+void RGBToYCbCr(float r, float g, float b, float* y, float* cb, float* cr) {
+  *y = 0.299f * r + 0.587f * g + 0.114f * b;
+  *cb = -0.168736f * r - 0.331264f * g + 0.5f * b + 0.5f;
+  *cr = 0.5f * r - 0.418688f * g - 0.081312f * b + 0.5f;
+}
+
+void ConvertPixel(const uint8_t* input_rgb, uint8_t* out,
+                  J_COLOR_SPACE colorspace, size_t num_channels,
+                  JpegliDataType data_type = JPEGLI_TYPE_UINT8,
+                  bool swap_endianness = JPEGLI_NATIVE_ENDIAN) {
+  const float kMul = 255.0f;
+  float r = input_rgb[0] / kMul;
+  float g = input_rgb[1] / kMul;
+  float b = input_rgb[2] / kMul;
+  uint8_t out8[MAX_COMPONENTS];
+  if (colorspace == JCS_GRAYSCALE) {
+    const float Y = 0.299f * r + 0.587f * g + 0.114f * b;
+    out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+  } else if (colorspace == JCS_RGB || colorspace == JCS_UNKNOWN) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      out8[c] = input_rgb[std::min<size_t>(2, c)];
+    }
+  } else if (colorspace == JCS_YCbCr) {
+    float Y, Cb, Cr;
+    RGBToYCbCr(r, g, b, &Y, &Cb, &Cr);
+    out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+    out8[1] = static_cast<uint8_t>(std::round(Cb * kMul));
+    out8[2] = static_cast<uint8_t>(std::round(Cr * kMul));
+  } else if (colorspace == JCS_CMYK || colorspace == JCS_YCCK) {
+    float K = 1.0f - std::max(r, std::max(g, b));
+    float scaleK = 1.0f / (1.0f - K);
+    r *= scaleK;
+    g *= scaleK;
+    b *= scaleK;
+    if (colorspace == JCS_CMYK) {
+      out8[0] = static_cast<uint8_t>(std::round((1.0f - r) * kMul));
+      out8[1] = static_cast<uint8_t>(std::round((1.0f - g) * kMul));
+      out8[2] = static_cast<uint8_t>(std::round((1.0f - b) * kMul));
+    } else if (colorspace == JCS_YCCK) {
+      float Y, Cb, Cr;
+      RGBToYCbCr(r, g, b, &Y, &Cb, &Cr);
+      out8[0] = static_cast<uint8_t>(std::round(Y * kMul));
+      out8[1] = static_cast<uint8_t>(std::round(Cb * kMul));
+      out8[2] = static_cast<uint8_t>(std::round(Cr * kMul));
+    }
+    out8[3] = static_cast<uint8_t>(std::round(K * kMul));
+  } else {
+    JXL_ABORT("Colorspace %d not supported", colorspace);
+  }
+  if (data_type == JPEGLI_TYPE_UINT8) {
+    memcpy(out, out8, num_channels);
+  } else if (data_type == JPEGLI_TYPE_UINT16) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      uint16_t val = (out8[c] << 8) + out8[c];
+      val |= 0x40;  // Make little-endian and big-endian asymmetric
+      if (swap_endianness) {
+        val = JXL_BSWAP16(val);
+      }
+      memcpy(&out[sizeof(val) * c], &val, sizeof(val));
+    }
+  } else if (data_type == JPEGLI_TYPE_FLOAT) {
+    for (size_t c = 0; c < num_channels; ++c) {
+      float val = out8[c] / 255.0f;
+      if (swap_endianness) {
+        val = BSwapFloat(val);
+      }
+      memcpy(&out[sizeof(val) * c], &val, sizeof(val));
+    }
+  }
+}
+
+void ConvertToGrayscale(TestImage* img) {
+  if (img->color_space == JCS_GRAYSCALE) return;
+  JXL_CHECK(img->data_type == JPEGLI_TYPE_UINT8);
+  for (size_t i = 0; i < img->pixels.size(); i += 3) {
+    if (img->color_space == JCS_RGB) {
+      ConvertPixel(&img->pixels[i], &img->pixels[i / 3], JCS_GRAYSCALE, 1);
+    } else if (img->color_space == JCS_YCbCr) {
+      img->pixels[i / 3] = img->pixels[i];
+    }
+  }
+  img->pixels.resize(img->pixels.size() / 3);
+  img->color_space = JCS_GRAYSCALE;
+  img->components = 1;
+}
+
+void GeneratePixels(TestImage* img) {
+  const std::vector<uint8_t> imgdata = ReadTestData("jxl/flower/flower.pnm");
+  size_t xsize, ysize, channels, bitdepth;
+  std::vector<uint8_t> pixels;
+  JXL_CHECK(ReadPNM(imgdata, &xsize, &ysize, &channels, &bitdepth, &pixels));
+  if (img->xsize == 0) img->xsize = xsize;
+  if (img->ysize == 0) img->ysize = ysize;
+  JXL_CHECK(img->xsize <= xsize);
+  JXL_CHECK(img->ysize <= ysize);
+  JXL_CHECK(3 == channels);
+  JXL_CHECK(8 == bitdepth);
+  size_t in_bytes_per_pixel = channels;
+  size_t in_stride = xsize * in_bytes_per_pixel;
+  size_t x0 = (xsize - img->xsize) / 2;
+  size_t y0 = (ysize - img->ysize) / 2;
+  SetNumChannels((J_COLOR_SPACE)img->color_space, &img->components);
+  size_t out_bytes_per_pixel =
+      jpegli_bytes_per_sample(img->data_type) * img->components;
+  size_t out_stride = img->xsize * out_bytes_per_pixel;
+  bool swap_endianness =
+      (img->endianness == JPEGLI_LITTLE_ENDIAN && !IsLittleEndian()) ||
+      (img->endianness == JPEGLI_BIG_ENDIAN && IsLittleEndian());
+  img->pixels.resize(img->ysize * out_stride);
+  for (size_t iy = 0; iy < img->ysize; ++iy) {
+    size_t y = y0 + iy;
+    for (size_t ix = 0; ix < img->xsize; ++ix) {
+      size_t x = x0 + ix;
+      size_t idx_in = y * in_stride + x * in_bytes_per_pixel;
+      size_t idx_out = iy * out_stride + ix * out_bytes_per_pixel;
+      ConvertPixel(&pixels[idx_in], &img->pixels[idx_out],
+                   (J_COLOR_SPACE)img->color_space, img->components,
+                   img->data_type, swap_endianness);
+    }
+  }
+}
+
+void GenerateRawData(const CompressParams& jparams, TestImage* img) {
+  for (size_t c = 0; c < img->components; ++c) {
+    size_t xsize = jparams.comp_width(*img, c);
+    size_t ysize = jparams.comp_height(*img, c);
+    size_t factor_y = jparams.max_v_sample() / jparams.v_samp(c);
+    size_t factor_x = jparams.max_h_sample() / jparams.h_samp(c);
+    size_t factor = factor_x * factor_y;
+    std::vector<uint8_t> plane(ysize * xsize);
+    size_t bytes_per_pixel = img->components;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        int result = 0;
+        for (size_t iy = 0; iy < factor_y; ++iy) {
+          size_t yy = std::min(y * factor_y + iy, img->ysize - 1);
+          for (size_t ix = 0; ix < factor_x; ++ix) {
+            size_t xx = std::min(x * factor_x + ix, img->xsize - 1);
+            size_t pixel_ix = (yy * img->xsize + xx) * bytes_per_pixel + c;
+            result += img->pixels[pixel_ix];
+          }
+        }
+        result = static_cast<uint8_t>((result + factor / 2) / factor);
+        plane[y * xsize + x] = result;
+      }
+    }
+    img->raw_data.emplace_back(std::move(plane));
+  }
+}
+
+void GenerateCoeffs(const CompressParams& jparams, TestImage* img) {
+  for (size_t c = 0; c < img->components; ++c) {
+    int xsize_blocks = jparams.comp_width(*img, c) / DCTSIZE;
+    int ysize_blocks = jparams.comp_height(*img, c) / DCTSIZE;
+    std::vector<JCOEF> plane(ysize_blocks * xsize_blocks * DCTSIZE2);
+    for (int by = 0; by < ysize_blocks; ++by) {
+      for (int bx = 0; bx < xsize_blocks; ++bx) {
+        JCOEF* block = &plane[(by * xsize_blocks + bx) * DCTSIZE2];
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          block[k] = (bx - by) / (k + 1);
+        }
+      }
+    }
+    img->coeffs.emplace_back(std::move(plane));
+  }
+}
+
+void EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      j_compress_ptr cinfo) {
+  cinfo->image_width = input.xsize;
+  cinfo->image_height = input.ysize;
+  cinfo->input_components = input.components;
+  if (jparams.xyb_mode) {
+    jpegli_set_xyb_mode(cinfo);
+  }
+  if (jparams.libjpeg_mode) {
+    jpegli_enable_adaptive_quantization(cinfo, FALSE);
+    jpegli_use_standard_quant_tables(cinfo);
+    jpegli_set_progressive_level(cinfo, 0);
+  }
+  jpegli_set_defaults(cinfo);
+  cinfo->in_color_space = (J_COLOR_SPACE)input.color_space;
+  jpegli_default_colorspace(cinfo);
+  if (jparams.override_JFIF >= 0) {
+    cinfo->write_JFIF_header = jparams.override_JFIF;
+  }
+  if (jparams.override_Adobe >= 0) {
+    cinfo->write_Adobe_marker = jparams.override_Adobe;
+  }
+  if (jparams.set_jpeg_colorspace) {
+    jpegli_set_colorspace(cinfo, (J_COLOR_SPACE)jparams.jpeg_color_space);
+  }
+  if (!jparams.comp_ids.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].component_id = jparams.comp_ids[c];
+    }
+  }
+  if (!jparams.h_sampling.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].h_samp_factor = jparams.h_sampling[c];
+      cinfo->comp_info[c].v_samp_factor = jparams.v_sampling[c];
+    }
+  }
+  jpegli_set_quality(cinfo, jparams.quality, TRUE);
+  if (!jparams.quant_indexes.empty()) {
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      cinfo->comp_info[c].quant_tbl_no = jparams.quant_indexes[c];
+    }
+    for (const auto& table : jparams.quant_tables) {
+      if (table.add_raw) {
+        cinfo->quant_tbl_ptrs[table.slot_idx] =
+            jpegli_alloc_quant_table((j_common_ptr)cinfo);
+        for (int k = 0; k < DCTSIZE2; ++k) {
+          cinfo->quant_tbl_ptrs[table.slot_idx]->quantval[k] =
+              table.quantval[k];
+        }
+        cinfo->quant_tbl_ptrs[table.slot_idx]->sent_table = FALSE;
+      } else {
+        jpegli_add_quant_table(cinfo, table.slot_idx, &table.basic_table[0],
+                               table.scale_factor, table.force_baseline);
+      }
+    }
+  }
+  if (jparams.simple_progression) {
+    jpegli_simple_progression(cinfo);
+    JXL_CHECK(jparams.progressive_mode == -1);
+  }
+  if (jparams.progressive_mode > 2) {
+    const ScanScript& script = kTestScript[jparams.progressive_mode - 3];
+    cinfo->scan_info = script.scans;
+    cinfo->num_scans = script.num_scans;
+  } else if (jparams.progressive_mode >= 0) {
+    jpegli_set_progressive_level(cinfo, jparams.progressive_mode);
+  }
+  jpegli_set_input_format(cinfo, input.data_type, input.endianness);
+  jpegli_enable_adaptive_quantization(cinfo, jparams.use_adaptive_quantization);
+  cinfo->restart_interval = jparams.restart_interval;
+  cinfo->restart_in_rows = jparams.restart_in_rows;
+  cinfo->smoothing_factor = jparams.smoothing_factor;
+  if (jparams.optimize_coding == 1) {
+    cinfo->optimize_coding = TRUE;
+  } else if (jparams.optimize_coding == 0) {
+    cinfo->optimize_coding = FALSE;
+  }
+  cinfo->raw_data_in = !input.raw_data.empty();
+  if (jparams.optimize_coding == 0 && jparams.use_flat_dc_luma_code) {
+    JHUFF_TBL* tbl = cinfo->dc_huff_tbl_ptrs[0];
+    memset(tbl, 0, sizeof(*tbl));
+    tbl->bits[4] = 15;
+    for (int i = 0; i < 15; ++i) tbl->huffval[i] = i;
+  }
+  if (input.coeffs.empty()) {
+    bool write_all_tables = TRUE;
+    if (jparams.optimize_coding == 0 && !jparams.use_flat_dc_luma_code &&
+        jparams.omit_standard_tables) {
+      write_all_tables = FALSE;
+      cinfo->dc_huff_tbl_ptrs[0]->sent_table = TRUE;
+      cinfo->dc_huff_tbl_ptrs[1]->sent_table = TRUE;
+      cinfo->ac_huff_tbl_ptrs[0]->sent_table = TRUE;
+      cinfo->ac_huff_tbl_ptrs[1]->sent_table = TRUE;
+    }
+    jpegli_start_compress(cinfo, write_all_tables);
+    if (jparams.add_marker) {
+      jpegli_write_marker(cinfo, kSpecialMarker0, kMarkerData,
+                          sizeof(kMarkerData));
+      jpegli_write_m_header(cinfo, kSpecialMarker1, sizeof(kMarkerData));
+      for (size_t p = 0; p < sizeof(kMarkerData); ++p) {
+        jpegli_write_m_byte(cinfo, kMarkerData[p]);
+      }
+      for (size_t i = 0; i < kMarkerSequenceLen; ++i) {
+        jpegli_write_marker(cinfo, kMarkerSequence[i], kMarkerData,
+                            ((i + 2) % sizeof(kMarkerData)));
+      }
+    }
+    if (!jparams.icc.empty()) {
+      jpegli_write_icc_profile(cinfo, jparams.icc.data(), jparams.icc.size());
+    }
+  }
+  if (cinfo->raw_data_in) {
+    // Need to copy because jpeg API requires non-const pointers.
+    std::vector<std::vector<uint8_t>> raw_data = input.raw_data;
+    size_t max_lines = jparams.max_v_sample() * DCTSIZE;
+    std::vector<std::vector<JSAMPROW>> rowdata(cinfo->num_components);
+    std::vector<JSAMPARRAY> data(cinfo->num_components);
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      rowdata[c].resize(jparams.v_samp(c) * DCTSIZE);
+      data[c] = &rowdata[c][0];
+    }
+    while (cinfo->next_scanline < cinfo->image_height) {
+      for (int c = 0; c < cinfo->num_components; ++c) {
+        size_t cwidth = cinfo->comp_info[c].width_in_blocks * DCTSIZE;
+        size_t cheight = cinfo->comp_info[c].height_in_blocks * DCTSIZE;
+        size_t num_lines = jparams.v_samp(c) * DCTSIZE;
+        size_t y0 = (cinfo->next_scanline / max_lines) * num_lines;
+        for (size_t i = 0; i < num_lines; ++i) {
+          rowdata[c][i] =
+              (y0 + i < cheight ? &raw_data[c][(y0 + i) * cwidth] : nullptr);
+        }
+      }
+      size_t num_lines = jpegli_write_raw_data(cinfo, &data[0], max_lines);
+      JXL_CHECK(num_lines == max_lines);
+    }
+  } else if (!input.coeffs.empty()) {
+    j_common_ptr comptr = reinterpret_cast<j_common_ptr>(cinfo);
+    jvirt_barray_ptr* coef_arrays = reinterpret_cast<jvirt_barray_ptr*>((
+        *cinfo->mem->alloc_small)(
+        comptr, JPOOL_IMAGE, cinfo->num_components * sizeof(jvirt_barray_ptr)));
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      size_t xsize_blocks = jparams.comp_width(input, c) / DCTSIZE;
+      size_t ysize_blocks = jparams.comp_height(input, c) / DCTSIZE;
+      coef_arrays[c] = (*cinfo->mem->request_virt_barray)(
+          comptr, JPOOL_IMAGE, FALSE, xsize_blocks, ysize_blocks,
+          cinfo->comp_info[c].v_samp_factor);
+    }
+    jpegli_write_coefficients(cinfo, coef_arrays);
+    if (jparams.add_marker) {
+      jpegli_write_marker(cinfo, kSpecialMarker0, kMarkerData,
+                          sizeof(kMarkerData));
+      jpegli_write_m_header(cinfo, kSpecialMarker1, sizeof(kMarkerData));
+      for (size_t p = 0; p < sizeof(kMarkerData); ++p) {
+        jpegli_write_m_byte(cinfo, kMarkerData[p]);
+      }
+    }
+    for (int c = 0; c < cinfo->num_components; ++c) {
+      jpeg_component_info* comp = &cinfo->comp_info[c];
+      for (size_t by = 0; by < comp->height_in_blocks; ++by) {
+        JBLOCKARRAY ba = (*cinfo->mem->access_virt_barray)(
+            comptr, coef_arrays[c], by, 1, true);
+        size_t stride = comp->width_in_blocks * sizeof(JBLOCK);
+        size_t offset = by * comp->width_in_blocks * DCTSIZE2;
+        memcpy(ba[0], &input.coeffs[c][offset], stride);
+      }
+    }
+  } else {
+    size_t stride = cinfo->image_width * cinfo->input_components *
+                    jpegli_bytes_per_sample(input.data_type);
+    std::vector<uint8_t> row_bytes(stride);
+    for (size_t y = 0; y < cinfo->image_height; ++y) {
+      memcpy(&row_bytes[0], &input.pixels[y * stride], stride);
+      JSAMPROW row[] = {row_bytes.data()};
+      jpegli_write_scanlines(cinfo, row, 1);
+    }
+  }
+  jpegli_finish_compress(cinfo);
+}
+
+bool EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      std::vector<uint8_t>* compressed) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    EncodeWithJpegli(input, jparams, &cinfo);
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  if (success) {
+    compressed->resize(buffer_size);
+    std::copy_n(buffer, buffer_size, compressed->data());
+  }
+  if (buffer) std::free(buffer);
+  return success;
+}
+
+int NumTestScanScripts() { return kNumTestScripts; }
+
+void DumpImage(const TestImage& image, const std::string fn) {
+  JXL_CHECK(image.components == 1 || image.components == 3);
+  size_t bytes_per_sample = jpegli_bytes_per_sample(image.data_type);
+  uint32_t maxval = (1u << (8 * bytes_per_sample)) - 1;
+  char type = image.components == 1 ? '5' : '6';
+  std::ofstream out(fn.c_str(), std::ofstream::binary);
+  out << "P" << type << std::endl
+      << image.xsize << " " << image.ysize << std::endl
+      << maxval << std::endl;
+  out.write(reinterpret_cast<const char*>(image.pixels.data()),
+            image.pixels.size());
+  out.close();
+}
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   size_t start_line, size_t num_lines, double* max_diff) {
+  size_t stride = input.xsize * input.components;
+  size_t start_offset = start_line * stride;
+  auto get_sample = [&](const TestImage& im, const std::vector<uint8_t>& data,
+                        size_t idx) -> double {
+    size_t bytes_per_sample = jpegli_bytes_per_sample(im.data_type);
+    bool is_little_endian =
+        (im.endianness == JPEGLI_LITTLE_ENDIAN ||
+         (im.endianness == JPEGLI_NATIVE_ENDIAN && IsLittleEndian()));
+    size_t offset = start_offset + idx * bytes_per_sample;
+    JXL_CHECK(offset < data.size());
+    const uint8_t* p = &data[offset];
+    if (im.data_type == JPEGLI_TYPE_UINT8) {
+      static const double mul8 = 1.0 / 255.0;
+      return p[0] * mul8;
+    } else if (im.data_type == JPEGLI_TYPE_UINT16) {
+      static const double mul16 = 1.0 / 65535.0;
+      return (is_little_endian ? LoadLE16(p) : LoadBE16(p)) * mul16;
+    } else if (im.data_type == JPEGLI_TYPE_FLOAT) {
+      return (is_little_endian ? LoadLEFloat(p) : LoadBEFloat(p));
+    }
+    return 0.0;
+  };
+  double diff2 = 0.0;
+  size_t num_samples = 0;
+  if (max_diff) *max_diff = 0.0;
+  if (!input.pixels.empty() && !output.pixels.empty()) {
+    num_samples = num_lines * stride;
+    for (size_t i = 0; i < num_samples; ++i) {
+      double sample_orig = get_sample(input, input.pixels, i);
+      double sample_output = get_sample(output, output.pixels, i);
+      double diff = sample_orig - sample_output;
+      if (max_diff) *max_diff = std::max(*max_diff, 255.0 * std::abs(diff));
+      diff2 += diff * diff;
+    }
+  } else {
+    JXL_CHECK(!input.raw_data.empty());
+    JXL_CHECK(!output.raw_data.empty());
+    for (size_t c = 0; c < input.raw_data.size(); ++c) {
+      JXL_CHECK(c < output.raw_data.size());
+      num_samples += input.raw_data[c].size();
+      for (size_t i = 0; i < input.raw_data[c].size(); ++i) {
+        double sample_orig = get_sample(input, input.raw_data[c], i);
+        double sample_output = get_sample(output, output.raw_data[c], i);
+        double diff = sample_orig - sample_output;
+        if (max_diff) *max_diff = std::max(*max_diff, 255.0 * std::abs(diff));
+        diff2 += diff * diff;
+      }
+    }
+  }
+  return std::sqrt(diff2 / num_samples) * 255.0;
+}
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   double* max_diff) {
+  return DistanceRms(input, output, 0, output.ysize, max_diff);
+}
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       size_t start_line, size_t num_lines, double max_rms,
+                       double max_diff) {
+  double max_d;
+  double rms = DistanceRms(input, output, start_line, num_lines, &max_d);
+  printf("rms: %f, max_rms: %f, max_d: %f,  max_diff: %f\n", rms, max_rms,
+         max_d, max_diff);
+  JXL_CHECK(rms <= max_rms);
+  JXL_CHECK(max_d <= max_diff);
+}
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       double max_rms, double max_diff) {
+  JXL_CHECK(output.xsize == input.xsize);
+  JXL_CHECK(output.ysize == input.ysize);
+  JXL_CHECK(output.components == input.components);
+  JXL_CHECK(output.color_space == input.color_space);
+  if (!input.coeffs.empty()) {
+    JXL_CHECK(input.coeffs.size() == input.components);
+    JXL_CHECK(output.coeffs.size() == input.components);
+    for (size_t c = 0; c < input.components; ++c) {
+      JXL_CHECK(output.coeffs[c].size() == input.coeffs[c].size());
+      JXL_CHECK(0 == memcmp(input.coeffs[c].data(), output.coeffs[c].data(),
+                            input.coeffs[c].size()));
+    }
+  } else {
+    VerifyOutputImage(input, output, 0, output.ysize, max_rms, max_diff);
+  }
+}
+
+}  // namespace jpegli
diff --git a/lib/jpegli/test_utils.h b/lib/jpegli/test_utils.h
new file mode 100644 (file)
index 0000000..132cfd0
--- /dev/null
@@ -0,0 +1,130 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TEST_UTILS_H_
+#define LIB_JPEGLI_TEST_UTILS_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+/* clang-format off */
+#include <stdio.h>
+#include <jpeglib.h>
+#include <setjmp.h>
+/* clang-format on */
+
+#include "lib/jpegli/common.h"
+#include "lib/jpegli/libjpeg_test_util.h"
+#include "lib/jpegli/test_params.h"
+
+namespace jpegli {
+
+#define ERROR_HANDLER_SETUP(flavor)                                \
+  jpeg_error_mgr jerr;                                             \
+  jmp_buf env;                                                     \
+  cinfo.err = flavor##_std_error(&jerr);                           \
+  if (setjmp(env)) {                                               \
+    return false;                                                  \
+  }                                                                \
+  cinfo.client_data = reinterpret_cast<void*>(&env);               \
+  cinfo.err->error_exit = [](j_common_ptr cinfo) {                 \
+    (*cinfo->err->output_message)(cinfo);                          \
+    jmp_buf* env = reinterpret_cast<jmp_buf*>(cinfo->client_data); \
+    flavor##_destroy(cinfo);                                       \
+    longjmp(*env, 1);                                              \
+  };
+
+std::string IOMethodName(JpegliDataType data_type, JpegliEndianness endianness);
+
+std::string ColorSpaceName(J_COLOR_SPACE colorspace);
+
+std::ostream& operator<<(std::ostream& os, const TestImage& input);
+
+std::ostream& operator<<(std::ostream& os, const CompressParams& jparams);
+
+int NumTestScanScripts();
+
+void VerifyHeader(const CompressParams& jparams, j_decompress_ptr cinfo);
+void VerifyScanHeader(const CompressParams& jparams, j_decompress_ptr cinfo);
+
+void SetDecompressParams(const DecompressParams& dparams,
+                         j_decompress_ptr cinfo);
+
+void SetScanDecompressParams(const DecompressParams& dparams,
+                             j_decompress_ptr cinfo, int scan_number);
+
+void CopyCoefficients(j_decompress_ptr cinfo, jvirt_barray_ptr* coef_arrays,
+                      TestImage* output);
+
+void UnmapColors(uint8_t* row, size_t xsize, int components,
+                 JSAMPARRAY colormap, size_t num_colors);
+
+std::string GetTestDataPath(const std::string& filename);
+std::vector<uint8_t> ReadTestData(const std::string& filename);
+
+class PNMParser {
+ public:
+  explicit PNMParser(const uint8_t* data, const size_t len)
+      : pos_(data), end_(data + len) {}
+
+  // Sets "pos" to the first non-header byte/pixel on success.
+  bool ParseHeader(const uint8_t** pos, size_t* xsize, size_t* ysize,
+                   size_t* num_channels, size_t* bitdepth);
+
+ private:
+  static bool IsLineBreak(const uint8_t c) { return c == '\r' || c == '\n'; }
+  static bool IsWhitespace(const uint8_t c) {
+    return IsLineBreak(c) || c == '\t' || c == ' ';
+  }
+
+  bool ParseUnsigned(size_t* number);
+
+  bool SkipWhitespace();
+
+  const uint8_t* pos_;
+  const uint8_t* const end_;
+};
+
+bool ReadPNM(const std::vector<uint8_t>& data, size_t* xsize, size_t* ysize,
+             size_t* num_channels, size_t* bitdepth,
+             std::vector<uint8_t>* pixels);
+
+void SetNumChannels(J_COLOR_SPACE colorspace, size_t* channels);
+
+void ConvertToGrayscale(TestImage* img);
+
+void GeneratePixels(TestImage* img);
+
+void GenerateRawData(const CompressParams& jparams, TestImage* img);
+
+void GenerateCoeffs(const CompressParams& jparams, TestImage* img);
+
+void EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      j_compress_ptr cinfo);
+
+bool EncodeWithJpegli(const TestImage& input, const CompressParams& jparams,
+                      std::vector<uint8_t>* compressed);
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   size_t start_line, size_t num_lines,
+                   double* max_diff = nullptr);
+
+double DistanceRms(const TestImage& input, const TestImage& output,
+                   double* max_diff = nullptr);
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       size_t start_line, size_t num_lines, double max_rms,
+                       double max_diff = 255.0);
+
+void VerifyOutputImage(const TestImage& input, const TestImage& output,
+                       double max_rms, double max_diff = 255.0);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_TEST_UTILS_H_
diff --git a/lib/jpegli/testing.h b/lib/jpegli/testing.h
new file mode 100644 (file)
index 0000000..873a017
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TESTING_H_
+#define LIB_JPEGLI_TESTING_H_
+
+// GTest/GMock specific macros / wrappers.
+
+// gmock unconditionally redefines those macros (to wrong values).
+// Lets include it only here and mitigate the problem.
+#pragma push_macro("PRIdS")
+#pragma push_macro("PRIuS")
+#include "gmock/gmock.h"
+#pragma pop_macro("PRIuS")
+#pragma pop_macro("PRIdS")
+
+#include "gtest/gtest.h"
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define JPEGLI_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define JPEGLI_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Ensures that we don't make our test bounds too lax, effectively disabling the
+// tests.
+MATCHER_P(IsSlightlyBelow, max, "") {
+  return max * 0.75 <= arg && arg <= max * 1.0;
+}
+
+#endif  // LIB_JPEGLI_TESTING_H_
diff --git a/lib/jpegli/transcode_api_test.cc b/lib/jpegli/transcode_api_test.cc
new file mode 100644 (file)
index 0000000..1d99ce3
--- /dev/null
@@ -0,0 +1,133 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+#include "lib/jpegli/encode.h"
+#include "lib/jpegli/test_utils.h"
+#include "lib/jpegli/testing.h"
+#include "lib/jxl/base/status.h"
+
+namespace jpegli {
+namespace {
+
+void TranscodeWithJpegli(const std::vector<uint8_t>& jpeg_input,
+                         const CompressParams& jparams,
+                         std::vector<uint8_t>* jpeg_output) {
+  jpeg_decompress_struct dinfo = {};
+  jpeg_compress_struct cinfo = {};
+  uint8_t* transcoded_data = nullptr;
+  unsigned long transcoded_size;
+  const auto try_catch_block = [&]() -> bool {
+    ERROR_HANDLER_SETUP(jpegli);
+    dinfo.err = cinfo.err;
+    dinfo.client_data = cinfo.client_data;
+    jpegli_create_decompress(&dinfo);
+    jpegli_mem_src(&dinfo, jpeg_input.data(), jpeg_input.size());
+    EXPECT_EQ(JPEG_REACHED_SOS,
+              jpegli_read_header(&dinfo, /*require_image=*/TRUE));
+    jvirt_barray_ptr* coef_arrays = jpegli_read_coefficients(&dinfo);
+    JXL_CHECK(coef_arrays != nullptr);
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &transcoded_data, &transcoded_size);
+    jpegli_copy_critical_parameters(&dinfo, &cinfo);
+    jpegli_set_progressive_level(&cinfo, jparams.progressive_mode);
+    cinfo.optimize_coding = jparams.optimize_coding;
+    jpegli_write_coefficients(&cinfo, coef_arrays);
+    jpegli_finish_compress(&cinfo);
+    jpegli_finish_decompress(&dinfo);
+    return true;
+  };
+  ASSERT_TRUE(try_catch_block());
+  jpegli_destroy_decompress(&dinfo);
+  jpegli_destroy_compress(&cinfo);
+  if (transcoded_data) {
+    jpeg_output->assign(transcoded_data, transcoded_data + transcoded_size);
+    free(transcoded_data);
+  }
+}
+
+struct TestConfig {
+  TestImage input;
+  CompressParams jparams;
+};
+
+class TranscodeAPITestParam : public ::testing::TestWithParam<TestConfig> {};
+
+TEST_P(TranscodeAPITestParam, TestAPI) {
+  TestConfig config = GetParam();
+  CompressParams& jparams = config.jparams;
+  GeneratePixels(&config.input);
+
+  // Start with sequential non-optimized jpeg.
+  jparams.progressive_mode = 0;
+  jparams.optimize_coding = 0;
+  std::vector<uint8_t> compressed;
+  ASSERT_TRUE(EncodeWithJpegli(config.input, jparams, &compressed));
+  TestImage output0;
+  DecodeWithLibjpeg(jparams, DecompressParams(), compressed, &output0);
+
+  // Transcode first to a sequential optimized jpeg, and then further to
+  // a progressive jpeg.
+  for (int progr : {0, 2}) {
+    std::vector<uint8_t> transcoded;
+    jparams.progressive_mode = progr;
+    jparams.optimize_coding = 1;
+    TranscodeWithJpegli(compressed, jparams, &transcoded);
+
+    // We expect a size reduction of at least 2%.
+    EXPECT_LT(transcoded.size(), compressed.size() * 0.98f);
+
+    // Verify that transcoding is lossless.
+    TestImage output1;
+    DecodeWithLibjpeg(jparams, DecompressParams(), transcoded, &output1);
+    ASSERT_EQ(output0.pixels.size(), output1.pixels.size());
+    EXPECT_EQ(0, memcmp(output0.pixels.data(), output1.pixels.data(),
+                        output0.pixels.size()));
+    compressed = transcoded;
+  }
+}
+
+std::vector<TestConfig> GenerateTests() {
+  std::vector<TestConfig> all_tests;
+  const size_t xsize0 = 1024;
+  const size_t ysize0 = 768;
+  for (int dxsize : {0, 1, 8, 9}) {
+    for (int dysize : {0, 1, 8, 9}) {
+      for (int h_sampling : {1, 2}) {
+        for (int v_sampling : {1, 2}) {
+          TestConfig config;
+          config.input.xsize = xsize0 + dxsize;
+          config.input.ysize = ysize0 + dysize;
+          config.jparams.h_sampling = {h_sampling, 1, 1};
+          config.jparams.v_sampling = {v_sampling, 1, 1};
+          all_tests.push_back(config);
+        }
+      }
+    }
+  }
+  return all_tests;
+}
+
+std::ostream& operator<<(std::ostream& os, const TestConfig& c) {
+  os << c.input;
+  os << c.jparams;
+  return os;
+}
+
+std::string TestDescription(
+    const testing::TestParamInfo<TranscodeAPITestParam::ParamType>& info) {
+  std::stringstream name;
+  name << info.param;
+  return name.str();
+}
+
+JPEGLI_INSTANTIATE_TEST_SUITE_P(TranscodeAPITest, TranscodeAPITestParam,
+                                testing::ValuesIn(GenerateTests()),
+                                TestDescription);
+
+}  // namespace
+}  // namespace jpegli
diff --git a/lib/jpegli/transpose-inl.h b/lib/jpegli/transpose-inl.h
new file mode 100644 (file)
index 0000000..9fdd222
--- /dev/null
@@ -0,0 +1,111 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
+#undef LIB_JPEGLI_TRANSPOSE_INL_H_
+#else
+#define LIB_JPEGLI_TRANSPOSE_INL_H_
+#endif
+
+#include "lib/jxl/base/compiler_specific.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+namespace {
+
+#if HWY_CAP_GE256
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  const HWY_CAPPED(float, 8) d;
+  auto i0 = Load(d, from);
+  auto i1 = Load(d, from + 1 * 8);
+  auto i2 = Load(d, from + 2 * 8);
+  auto i3 = Load(d, from + 3 * 8);
+  auto i4 = Load(d, from + 4 * 8);
+  auto i5 = Load(d, from + 5 * 8);
+  auto i6 = Load(d, from + 6 * 8);
+  auto i7 = Load(d, from + 7 * 8);
+
+  const auto q0 = InterleaveLower(d, i0, i2);
+  const auto q1 = InterleaveLower(d, i1, i3);
+  const auto q2 = InterleaveUpper(d, i0, i2);
+  const auto q3 = InterleaveUpper(d, i1, i3);
+  const auto q4 = InterleaveLower(d, i4, i6);
+  const auto q5 = InterleaveLower(d, i5, i7);
+  const auto q6 = InterleaveUpper(d, i4, i6);
+  const auto q7 = InterleaveUpper(d, i5, i7);
+
+  const auto r0 = InterleaveLower(d, q0, q1);
+  const auto r1 = InterleaveUpper(d, q0, q1);
+  const auto r2 = InterleaveLower(d, q2, q3);
+  const auto r3 = InterleaveUpper(d, q2, q3);
+  const auto r4 = InterleaveLower(d, q4, q5);
+  const auto r5 = InterleaveUpper(d, q4, q5);
+  const auto r6 = InterleaveLower(d, q6, q7);
+  const auto r7 = InterleaveUpper(d, q6, q7);
+
+  i0 = ConcatLowerLower(d, r4, r0);
+  i1 = ConcatLowerLower(d, r5, r1);
+  i2 = ConcatLowerLower(d, r6, r2);
+  i3 = ConcatLowerLower(d, r7, r3);
+  i4 = ConcatUpperUpper(d, r4, r0);
+  i5 = ConcatUpperUpper(d, r5, r1);
+  i6 = ConcatUpperUpper(d, r6, r2);
+  i7 = ConcatUpperUpper(d, r7, r3);
+
+  Store(i0, d, to);
+  Store(i1, d, to + 1 * 8);
+  Store(i2, d, to + 2 * 8);
+  Store(i3, d, to + 3 * 8);
+  Store(i4, d, to + 4 * 8);
+  Store(i5, d, to + 5 * 8);
+  Store(i6, d, to + 6 * 8);
+  Store(i7, d, to + 7 * 8);
+}
+#elif HWY_TARGET != HWY_SCALAR
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  const HWY_CAPPED(float, 4) d;
+  for (size_t n = 0; n < 8; n += 4) {
+    for (size_t m = 0; m < 8; m += 4) {
+      auto p0 = Load(d, from + n * 8 + m);
+      auto p1 = Load(d, from + (n + 1) * 8 + m);
+      auto p2 = Load(d, from + (n + 2) * 8 + m);
+      auto p3 = Load(d, from + (n + 3) * 8 + m);
+      const auto q0 = InterleaveLower(d, p0, p2);
+      const auto q1 = InterleaveLower(d, p1, p3);
+      const auto q2 = InterleaveUpper(d, p0, p2);
+      const auto q3 = InterleaveUpper(d, p1, p3);
+
+      const auto r0 = InterleaveLower(d, q0, q1);
+      const auto r1 = InterleaveUpper(d, q0, q1);
+      const auto r2 = InterleaveLower(d, q2, q3);
+      const auto r3 = InterleaveUpper(d, q2, q3);
+      Store(r0, d, to + m * 8 + n);
+      Store(r1, d, to + (1 + m) * 8 + n);
+      Store(r2, d, to + (2 + m) * 8 + n);
+      Store(r3, d, to + (3 + m) * 8 + n);
+    }
+  }
+}
+#else
+static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
+                                         float* JXL_RESTRICT to) {
+  for (size_t n = 0; n < 8; ++n) {
+    for (size_t m = 0; m < 8; ++m) {
+      to[8 * n + m] = from[8 * m + n];
+    }
+  }
+}
+#endif
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+#endif  // LIB_JPEGLI_TRANSPOSE_INL_H_
diff --git a/lib/jpegli/types.h b/lib/jpegli/types.h
new file mode 100644 (file)
index 0000000..2f446b7
--- /dev/null
@@ -0,0 +1,38 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_TYPES_H_
+#define LIB_JPEGLI_TYPES_H_
+
+#if defined(__cplusplus) || defined(c_plusplus)
+extern "C" {
+#endif
+
+//
+// New API structs and functions that are not available in libjpeg
+//
+// NOTE: This part of the API is still experimental and will probably change in
+// the future.
+//
+
+typedef enum {
+  JPEGLI_TYPE_FLOAT = 0,
+  JPEGLI_TYPE_UINT8 = 2,
+  JPEGLI_TYPE_UINT16 = 3,
+} JpegliDataType;
+
+typedef enum {
+  JPEGLI_NATIVE_ENDIAN = 0,
+  JPEGLI_LITTLE_ENDIAN = 1,
+  JPEGLI_BIG_ENDIAN = 2,
+} JpegliEndianness;
+
+int jpegli_bytes_per_sample(JpegliDataType data_type);
+
+#if defined(__cplusplus) || defined(c_plusplus)
+}  // extern "C"
+#endif
+
+#endif  // LIB_JPEGLI_TYPES_H_
diff --git a/lib/jpegli/upsample.cc b/lib/jpegli/upsample.cc
new file mode 100644 (file)
index 0000000..5559aa7
--- /dev/null
@@ -0,0 +1,137 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jpegli/upsample.h"
+
+#include <string.h>
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jpegli/upsample.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jpegli {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Vec;
+
+#if HWY_CAP_GE512
+using hwy::HWY_NAMESPACE::Half;
+using hwy::HWY_NAMESPACE::Vec;
+template <size_t i, class DF, class V>
+HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
+  using HF = Half<DF>;
+  using HHF = Half<HF>;
+  auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
+  return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
+}
+
+template <class DF, class V>
+HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
+  using HF = Half<DF>;
+  return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
+}
+
+#endif
+
+// Stores v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
+// aligned.
+template <class DF, class V, typename T>
+void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
+  static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
+#if HWY_TARGET == HWY_SCALAR
+  Store(v0, df, mem);
+  Store(v1, df, mem + 1);
+#elif !HWY_CAP_GE256
+  Store(InterleaveLower(df, v0, v1), df, mem);
+  Store(InterleaveUpper(df, v0, v1), df, mem + Lanes(df));
+#else
+  if (!HWY_CAP_GE512 || Lanes(df) == 8) {
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(ConcatLowerLower(df, t1, t0), df, mem);
+    Store(ConcatUpperUpper(df, t1, t0), df, mem + Lanes(df));
+  } else {
+#if HWY_CAP_GE512
+    auto t0 = InterleaveLower(df, v0, v1);
+    auto t1 = InterleaveUpper(df, v0, v1);
+    Store(Concat4(df, Quarter<0>(df, t0), Quarter<0>(df, t1),
+                  Quarter<1>(df, t0), Quarter<1>(df, t1)),
+          df, mem);
+    Store(Concat4(df, Quarter<2>(df, t0), Quarter<2>(df, t1),
+                  Quarter<3>(df, t0), Quarter<3>(df, t1)),
+          df, mem + Lanes(df));
+#endif
+  }
+#endif
+}
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,
+                         float* JXL_RESTRICT scratch_space, size_t len_out) {
+  HWY_FULL(float) df;
+  auto threefour = Set(df, 0.75f);
+  auto onefour = Set(df, 0.25f);
+  const size_t len_in = (len_out + 1) >> 1;
+  memcpy(scratch_space, row, len_in * sizeof(row[0]));
+  scratch_space[-1] = scratch_space[0];
+  scratch_space[len_in] = scratch_space[len_in - 1];
+  for (size_t x = 0; x < len_in; x += Lanes(df)) {
+    auto current = Mul(Load(df, scratch_space + x), threefour);
+    auto prev = LoadU(df, scratch_space + x - 1);
+    auto next = LoadU(df, scratch_space + x + 1);
+    auto left = MulAdd(onefour, prev, current);
+    auto right = MulAdd(onefour, next, current);
+    StoreInterleaved(df, left, right, row + x * 2);
+  }
+}
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len) {
+  HWY_FULL(float) df;
+  auto threefour = Set(df, 0.75f);
+  auto onefour = Set(df, 0.25f);
+  for (size_t x = 0; x < len; x += Lanes(df)) {
+    auto it = Load(df, row_top + x);
+    auto im = Load(df, row_mid + x);
+    auto ib = Load(df, row_bot + x);
+    auto im_scaled = Mul(im, threefour);
+    Store(MulAdd(it, onefour, im_scaled), df, row_out0 + x);
+    Store(MulAdd(ib, onefour, im_scaled), df, row_out1 + x);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jpegli
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jpegli {
+
+HWY_EXPORT(Upsample2Horizontal);
+HWY_EXPORT(Upsample2Vertical);
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,
+                         float* JXL_RESTRICT scratch_space, size_t len_out) {
+  return HWY_DYNAMIC_DISPATCH(Upsample2Horizontal)(row, scratch_space, len_out);
+}
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len) {
+  return HWY_DYNAMIC_DISPATCH(Upsample2Vertical)(row_top, row_mid, row_bot,
+                                                 row_out0, row_out1, len);
+}
+}  // namespace jpegli
+#endif  // HWY_ONCE
diff --git a/lib/jpegli/upsample.h b/lib/jpegli/upsample.h
new file mode 100644 (file)
index 0000000..1a05720
--- /dev/null
@@ -0,0 +1,26 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JPEGLI_UPSAMPLE_H_
+#define LIB_JPEGLI_UPSAMPLE_H_
+
+#include <stddef.h>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jpegli {
+
+void Upsample2Horizontal(float* JXL_RESTRICT row,
+                         float* JXL_RESTRICT scratch_space, size_t len_out);
+
+void Upsample2Vertical(const float* JXL_RESTRICT row_top,
+                       const float* JXL_RESTRICT row_mid,
+                       const float* JXL_RESTRICT row_bot,
+                       float* JXL_RESTRICT row_out0,
+                       float* JXL_RESTRICT row_out1, size_t len);
+
+}  // namespace jpegli
+
+#endif  // LIB_JPEGLI_UPSAMPLE_H_
index 72c07f4..5f46c90 100644 (file)
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-# Lists all source files for the JPEG XL decoder library. These are also used
-# by the encoder: the encoder uses both dec and enc ourse files, while the
-# decoder uses only dec source files.
-# TODO(lode): further prune these files and move to JPEGXL_INTERNAL_SOURCES_ENC:
-#             only those files that the decoder absolutely needs, and or not
-#             only for encoding, should be listed here.
-set(JPEGXL_INTERNAL_SOURCES_DEC
-  jxl/ac_context.h
-  jxl/ac_strategy.cc
-  jxl/ac_strategy.h
-  jxl/alpha.cc
-  jxl/alpha.h
-  jxl/ans_common.cc
-  jxl/ans_common.h
-  jxl/ans_params.h
-  jxl/aux_out.cc
-  jxl/aux_out.h
-  jxl/aux_out_fwd.h
-  jxl/base/arch_macros.h
-  jxl/base/bits.h
-  jxl/base/byte_order.h
-  jxl/base/cache_aligned.cc
-  jxl/base/cache_aligned.h
-  jxl/base/compiler_specific.h
-  jxl/base/data_parallel.cc
-  jxl/base/data_parallel.h
-  jxl/base/file_io.h
-  jxl/base/iaca.h
-  jxl/base/os_macros.h
-  jxl/base/override.h
-  jxl/base/padded_bytes.cc
-  jxl/base/padded_bytes.h
-  jxl/base/printf_macros.h
-  jxl/base/profiler.h
-  jxl/base/random.cc
-  jxl/base/random.h
-  jxl/base/sanitizer_definitions.h
-  jxl/base/scope_guard.h
-  jxl/base/span.h
-  jxl/base/status.h
-  jxl/base/thread_pool_internal.h
-  jxl/blending.cc
-  jxl/blending.h
-  jxl/box_content_decoder.cc
-  jxl/box_content_decoder.h
-  jxl/chroma_from_luma.cc
-  jxl/chroma_from_luma.h
-  jxl/codec_in_out.h
-  jxl/coeff_order.cc
-  jxl/coeff_order.h
-  jxl/coeff_order_fwd.h
-  jxl/color_encoding_internal.cc
-  jxl/color_encoding_internal.h
-  jxl/color_management.cc
-  jxl/color_management.h
-  jxl/common.h
-  jxl/compressed_dc.cc
-  jxl/compressed_dc.h
-  jxl/convolve-inl.h
-  jxl/convolve.h
-  jxl/convolve_separable5.cc
-  jxl/convolve_separable7.cc
-  jxl/convolve_slow.cc
-  jxl/convolve_symmetric3.cc
-  jxl/convolve_symmetric5.cc
-  jxl/dct-inl.h
-  jxl/dct_block-inl.h
-  jxl/dct_scales.cc
-  jxl/dct_scales.h
-  jxl/dct_util.h
-  jxl/dec_ans.cc
-  jxl/dec_ans.h
-  jxl/dec_bit_reader.h
-  jxl/dec_cache.cc
-  jxl/dec_cache.h
-  jxl/dec_context_map.cc
-  jxl/dec_context_map.h
-  jxl/dec_external_image.cc
-  jxl/dec_external_image.h
-  jxl/dec_frame.cc
-  jxl/dec_frame.h
-  jxl/dec_group.cc
-  jxl/dec_group.h
-  jxl/dec_group_border.cc
-  jxl/dec_group_border.h
-  jxl/dec_huffman.cc
-  jxl/dec_huffman.h
-  jxl/dec_modular.cc
-  jxl/dec_modular.h
-  jxl/dec_noise.cc
-  jxl/dec_noise.h
-  jxl/dec_patch_dictionary.cc
-  jxl/dec_patch_dictionary.h
-  jxl/dec_tone_mapping-inl.h
-  jxl/dec_transforms-inl.h
-  jxl/dec_xyb-inl.h
-  jxl/dec_xyb.cc
-  jxl/dec_xyb.h
-  jxl/decode.cc
-  jxl/decode_to_jpeg.cc
-  jxl/decode_to_jpeg.h
-  jxl/enc_bit_writer.cc
-  jxl/enc_bit_writer.h
-  jxl/entropy_coder.cc
-  jxl/entropy_coder.h
-  jxl/epf.cc
-  jxl/epf.h
-  jxl/exif.h
-  jxl/fast_dct-inl.h
-  jxl/fast_dct.cc
-  jxl/fast_dct.h
-  jxl/fast_dct128-inl.h
-  jxl/fast_dct16-inl.h
-  jxl/fast_dct256-inl.h
-  jxl/fast_dct32-inl.h
-  jxl/fast_dct64-inl.h
-  jxl/fast_dct8-inl.h
-  jxl/fast_math-inl.h
-  jxl/field_encodings.h
-  jxl/fields.cc
-  jxl/fields.h
-  jxl/frame_header.cc
-  jxl/frame_header.h
-  jxl/gauss_blur.cc
-  jxl/gauss_blur.h
-  jxl/headers.cc
-  jxl/headers.h
-  jxl/huffman_table.cc
-  jxl/huffman_table.h
-  jxl/icc_codec.cc
-  jxl/icc_codec.h
-  jxl/icc_codec_common.cc
-  jxl/icc_codec_common.h
-  jxl/image.cc
-  jxl/image.h
-  jxl/image_bundle.cc
-  jxl/image_bundle.h
-  jxl/image_metadata.cc
-  jxl/image_metadata.h
-  jxl/image_ops.h
-  jxl/jpeg/dec_jpeg_data.cc
-  jxl/jpeg/dec_jpeg_data.h
-  jxl/jpeg/dec_jpeg_data_writer.cc
-  jxl/jpeg/dec_jpeg_data_writer.h
-  jxl/jpeg/dec_jpeg_output_chunk.h
-  jxl/jpeg/dec_jpeg_serialization_state.h
-  jxl/jpeg/jpeg_data.cc
-  jxl/jpeg/jpeg_data.h
-  jxl/jxl_inspection.h
-  jxl/lehmer_code.h
-  jxl/linalg.h
-  jxl/loop_filter.cc
-  jxl/loop_filter.h
-  jxl/luminance.cc
-  jxl/luminance.h
-  jxl/memory_manager_internal.cc
-  jxl/memory_manager_internal.h
-  jxl/modular/encoding/context_predict.h
-  jxl/modular/encoding/dec_ma.cc
-  jxl/modular/encoding/dec_ma.h
-  jxl/modular/encoding/encoding.cc
-  jxl/modular/encoding/encoding.h
-  jxl/modular/encoding/ma_common.h
-  jxl/modular/modular_image.cc
-  jxl/modular/modular_image.h
-  jxl/modular/options.h
-  jxl/modular/transform/palette.h
-  jxl/modular/transform/rct.cc
-  jxl/modular/transform/rct.h
-  jxl/modular/transform/squeeze.cc
-  jxl/modular/transform/squeeze.h
-  jxl/modular/transform/transform.cc
-  jxl/modular/transform/transform.h
-  jxl/noise.h
-  jxl/opsin_params.cc
-  jxl/opsin_params.h
-  jxl/passes_state.cc
-  jxl/passes_state.h
-  jxl/patch_dictionary_internal.h
-  jxl/quant_weights.cc
-  jxl/quant_weights.h
-  jxl/quantizer-inl.h
-  jxl/quantizer.cc
-  jxl/quantizer.h
-  jxl/rational_polynomial-inl.h
-  jxl/render_pipeline/low_memory_render_pipeline.cc
-  jxl/render_pipeline/low_memory_render_pipeline.h
-  jxl/render_pipeline/render_pipeline.cc
-  jxl/render_pipeline/render_pipeline.h
-  jxl/render_pipeline/render_pipeline_stage.h
-  jxl/render_pipeline/simple_render_pipeline.cc
-  jxl/render_pipeline/simple_render_pipeline.h
-  jxl/render_pipeline/stage_blending.cc
-  jxl/render_pipeline/stage_blending.h
-  jxl/render_pipeline/stage_chroma_upsampling.cc
-  jxl/render_pipeline/stage_chroma_upsampling.h
-  jxl/render_pipeline/stage_epf.cc
-  jxl/render_pipeline/stage_epf.h
-  jxl/render_pipeline/stage_from_linear.cc
-  jxl/render_pipeline/stage_from_linear.h
-  jxl/render_pipeline/stage_gaborish.cc
-  jxl/render_pipeline/stage_gaborish.h
-  jxl/render_pipeline/stage_noise.cc
-  jxl/render_pipeline/stage_noise.h
-  jxl/render_pipeline/stage_patches.cc
-  jxl/render_pipeline/stage_patches.h
-  jxl/render_pipeline/stage_splines.cc
-  jxl/render_pipeline/stage_splines.h
-  jxl/render_pipeline/stage_spot.cc
-  jxl/render_pipeline/stage_spot.h
-  jxl/render_pipeline/stage_to_linear.cc
-  jxl/render_pipeline/stage_to_linear.h
-  jxl/render_pipeline/stage_tone_mapping.cc
-  jxl/render_pipeline/stage_tone_mapping.h
-  jxl/render_pipeline/stage_upsampling.cc
-  jxl/render_pipeline/stage_upsampling.h
-  jxl/render_pipeline/stage_write.cc
-  jxl/render_pipeline/stage_write.h
-  jxl/render_pipeline/stage_xyb.cc
-  jxl/render_pipeline/stage_xyb.h
-  jxl/render_pipeline/stage_ycbcr.cc
-  jxl/render_pipeline/stage_ycbcr.h
-  jxl/render_pipeline/test_render_pipeline_stages.h
-  jxl/sanitizers.h
-  jxl/simd_util-inl.h
-  jxl/size_constraints.h
-  jxl/splines.cc
-  jxl/splines.h
-  jxl/toc.cc
-  jxl/toc.h
-  jxl/transfer_functions-inl.h
-  jxl/transpose-inl.h
-  jxl/xorshift128plus-inl.h
-)
+include(jxl_lists.cmake)
 
-# List of source files only needed by the encoder or by tools (including
-# decoding tools), but not by the decoder library.
-set(JPEGXL_INTERNAL_SOURCES_ENC
-  jxl/butteraugli/butteraugli.cc
-  jxl/butteraugli/butteraugli.h
-  jxl/butteraugli_wrapper.cc
-  jxl/enc_ac_strategy.cc
-  jxl/enc_ac_strategy.h
-  jxl/enc_adaptive_quantization.cc
-  jxl/enc_adaptive_quantization.h
-  jxl/enc_ans.cc
-  jxl/enc_ans.h
-  jxl/enc_ans_params.h
-  jxl/enc_ar_control_field.cc
-  jxl/enc_ar_control_field.h
-  jxl/enc_butteraugli_comparator.cc
-  jxl/enc_butteraugli_comparator.h
-  jxl/enc_butteraugli_pnorm.cc
-  jxl/enc_butteraugli_pnorm.h
-  jxl/enc_cache.cc
-  jxl/enc_cache.h
-  jxl/enc_chroma_from_luma.cc
-  jxl/enc_chroma_from_luma.h
-  jxl/enc_cluster.cc
-  jxl/enc_cluster.h
-  jxl/enc_coeff_order.cc
-  jxl/enc_coeff_order.h
-  jxl/enc_color_management.cc
-  jxl/enc_color_management.h
-  jxl/enc_comparator.cc
-  jxl/enc_comparator.h
-  jxl/enc_context_map.cc
-  jxl/enc_context_map.h
-  jxl/enc_detect_dots.cc
-  jxl/enc_detect_dots.h
-  jxl/enc_dot_dictionary.cc
-  jxl/enc_dot_dictionary.h
-  jxl/enc_entropy_coder.cc
-  jxl/enc_entropy_coder.h
-  jxl/enc_external_image.cc
-  jxl/enc_external_image.h
-  jxl/enc_file.cc
-  jxl/enc_file.h
-  jxl/enc_frame.cc
-  jxl/enc_frame.h
-  jxl/enc_gamma_correct.h
-  jxl/enc_group.cc
-  jxl/enc_group.h
-  jxl/enc_heuristics.cc
-  jxl/enc_heuristics.h
-  jxl/enc_huffman.cc
-  jxl/enc_huffman.h
-  jxl/enc_icc_codec.cc
-  jxl/enc_icc_codec.h
-  jxl/enc_image_bundle.cc
-  jxl/enc_image_bundle.h
-  jxl/enc_jxl_skcms.h
-  jxl/enc_modular.cc
-  jxl/enc_modular.h
-  jxl/enc_noise.cc
-  jxl/enc_noise.h
-  jxl/enc_params.h
-  jxl/enc_patch_dictionary.cc
-  jxl/enc_patch_dictionary.h
-  jxl/enc_photon_noise.cc
-  jxl/enc_photon_noise.h
-  jxl/enc_quant_weights.cc
-  jxl/enc_quant_weights.h
-  jxl/enc_splines.cc
-  jxl/enc_splines.h
-  jxl/enc_toc.cc
-  jxl/enc_toc.h
-  jxl/enc_transforms-inl.h
-  jxl/enc_transforms.cc
-  jxl/enc_transforms.h
-  jxl/enc_xyb.cc
-  jxl/enc_xyb.h
-  jxl/encode.cc
-  jxl/encode_internal.h
-  jxl/gaborish.cc
-  jxl/gaborish.h
-  jxl/huffman_tree.cc
-  jxl/huffman_tree.h
-  jxl/jpeg/enc_jpeg_data.cc
-  jxl/jpeg/enc_jpeg_data.h
-  jxl/jpeg/enc_jpeg_data_reader.cc
-  jxl/jpeg/enc_jpeg_data_reader.h
-  jxl/jpeg/enc_jpeg_huffman_decode.cc
-  jxl/jpeg/enc_jpeg_huffman_decode.h
-  jxl/linalg.cc
-  jxl/modular/encoding/enc_debug_tree.cc
-  jxl/modular/encoding/enc_debug_tree.h
-  jxl/modular/encoding/enc_encoding.cc
-  jxl/modular/encoding/enc_encoding.h
-  jxl/modular/encoding/enc_ma.cc
-  jxl/modular/encoding/enc_ma.h
-  jxl/modular/transform/enc_palette.cc
-  jxl/modular/transform/enc_palette.h
-  jxl/modular/transform/enc_rct.cc
-  jxl/modular/transform/enc_rct.h
-  jxl/modular/transform/enc_squeeze.cc
-  jxl/modular/transform/enc_squeeze.h
-  jxl/modular/transform/enc_transform.cc
-  jxl/modular/transform/enc_transform.h
-  jxl/optimize.cc
-  jxl/optimize.h
-  jxl/progressive_split.cc
-  jxl/progressive_split.h
-)
+if (JPEGXL_ENABLE_TOOLS OR JPEGXL_ENABLE_DEVTOOLS OR JPEGXL_ENABLE_BOXES)
+list(APPEND JPEGXL_INTERNAL_DEC_SOURCES ${JPEGXL_INTERNAL_DEC_BOX_SOURCES})
+endif()
+
+if (JPEGXL_ENABLE_TRANSCODE_JPEG OR JPEGXL_ENABLE_TOOLS OR JPEGXL_ENABLE_DEVTOOLS)
+list(APPEND JPEGXL_INTERNAL_DEC_SOURCES ${JPEGXL_INTERNAL_DEC_JPEG_SOURCES})
+endif()
+
+set_source_files_properties(jxl/enc_fast_lossless.cc PROPERTIES COMPILE_FLAGS -O3)
 
 set(JPEGXL_DEC_INTERNAL_LIBS
-  brotlidec-static
-  brotlicommon-static
   hwy
   Threads::Threads
   ${ATOMICS_LIBRARIES}
 )
 
-if(JPEGXL_ENABLE_PROFILER)
-list(APPEND JPEGXL_DEC_INTERNAL_LIBS jxl_profiler)
+if (JPEGXL_ENABLE_TRANSCODE_JPEG OR JPEGXL_ENABLE_BOXES)
+list(APPEND JPEGXL_DEC_INTERNAL_LIBS brotlidec brotlicommon)
 endif()
 
 set(JPEGXL_INTERNAL_LIBS
   ${JPEGXL_DEC_INTERNAL_LIBS}
-  brotlienc-static
+  brotlienc
 )
 
-# strips the -static suffix from all the elements in LIST
-function(strip_static OUTPUT_VAR LIB_LIST)
-  foreach(lib IN LISTS ${LIB_LIST})
-    string(REGEX REPLACE "-static$" "" lib "${lib}")
-    list(APPEND out_list "${lib}")
-  endforeach()
-  set(${OUTPUT_VAR} ${out_list} PARENT_SCOPE)
-endfunction()
-
-if (JPEGXL_ENABLE_SKCMS)
-  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_SKCMS=1)
-  if (JPEGXL_BUNDLE_SKCMS)
-    list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_BUNDLE_SKCMS=1)
-    # skcms objects are later added to JPEGXL_INTERNAL_OBJECTS
-  else ()
-    list(APPEND JPEGXL_INTERNAL_LIBS skcms)
-  endif ()
-else ()
-  list(APPEND JPEGXL_INTERNAL_LIBS lcms2)
+if (JPEGXL_ENABLE_TRANSCODE_JPEG)
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_TRANSCODE_JPEG=1)
+else()
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_TRANSCODE_JPEG=0)
 endif ()
 
-if (NOT JPEGXL_ENABLE_TRANSCODE_JPEG)
-  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_TRANSCODE_JPEG=0)
+if (JPEGXL_ENABLE_BOXES)
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_BOXES=1)
+else()
+  list(APPEND JPEGXL_INTERNAL_FLAGS -DJPEGXL_ENABLE_BOXES=0)
 endif ()
 
 set(OBJ_COMPILE_DEFINITIONS
@@ -400,118 +52,123 @@ set(OBJ_COMPILE_DEFINITIONS
   JXL_INTERNAL_LIBRARY_BUILD
 )
 
+# Generate version.h
+configure_file("jxl/version.h.in" "include/jxl/version.h")
+
+# Headers for exporting/importing public headers
+include(GenerateExportHeader)
+
+# CMake does not allow generate_export_header for INTERFACE library, so we
+# add this stub library just for file generation.
+add_library(jxl_export OBJECT ${JPEGXL_INTERNAL_PUBLIC_HEADERS})
+set_target_properties(jxl_export PROPERTIES
+  CXX_VISIBILITY_PRESET hidden
+  VISIBILITY_INLINES_HIDDEN 1
+  DEFINE_SYMBOL JXL_INTERNAL_LIBRARY_BUILD
+  LINKER_LANGUAGE CXX
+)
+generate_export_header(jxl_export
+  BASE_NAME JXL
+  EXPORT_FILE_NAME include/jxl/jxl_export.h)
+# Place all public headers in a single directory.
+foreach(path ${JPEGXL_INTERNAL_PUBLIC_HEADERS})
+  configure_file(
+    ${path}
+    ${path}
+    COPYONLY
+  )
+endforeach()
+
+add_library(jxl_base INTERFACE)
+target_include_directories(jxl_base SYSTEM INTERFACE
+  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
+)
+target_include_directories(jxl_base INTERFACE
+  ${PROJECT_SOURCE_DIR}
+  ${JXL_HWY_INCLUDE_DIRS}
+)
+add_dependencies(jxl_base jxl_export)
+
 # Decoder-only object library
-add_library(jxl_dec-obj OBJECT ${JPEGXL_INTERNAL_SOURCES_DEC})
+add_library(jxl_dec-obj OBJECT ${JPEGXL_INTERNAL_DEC_SOURCES})
 target_compile_options(jxl_dec-obj PRIVATE ${JPEGXL_INTERNAL_FLAGS})
 target_compile_options(jxl_dec-obj PUBLIC ${JPEGXL_COVERAGE_FLAGS})
 set_property(TARGET jxl_dec-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(jxl_dec-obj PUBLIC
   "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-  "$<BUILD_INTERFACE:$<TARGET_PROPERTY:hwy,INTERFACE_INCLUDE_DIRECTORIES>>"
-  "$<BUILD_INTERFACE:$<TARGET_PROPERTY:brotlicommon-static,INTERFACE_INCLUDE_DIRECTORIES>>"
+  "${JXL_HWY_INCLUDE_DIRS}"
+  "$<BUILD_INTERFACE:$<TARGET_PROPERTY:brotlicommon,INTERFACE_INCLUDE_DIRECTORIES>>"
 )
 target_compile_definitions(jxl_dec-obj PUBLIC
   ${OBJ_COMPILE_DEFINITIONS}
 )
-if (JPEGXL_ENABLE_PROFILER)
-target_link_libraries(jxl_dec-obj PUBLIC jxl_profiler)
-endif()
+target_link_libraries(jxl_dec-obj PUBLIC jxl_base)
 
 # Object library. This is used to hold the set of objects and properties.
-add_library(jxl_enc-obj OBJECT ${JPEGXL_INTERNAL_SOURCES_ENC})
+add_library(jxl_enc-obj OBJECT ${JPEGXL_INTERNAL_ENC_SOURCES})
 target_compile_options(jxl_enc-obj PRIVATE ${JPEGXL_INTERNAL_FLAGS})
 target_compile_options(jxl_enc-obj PUBLIC ${JPEGXL_COVERAGE_FLAGS})
 set_property(TARGET jxl_enc-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_include_directories(jxl_enc-obj PUBLIC
   ${PROJECT_SOURCE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/include
-  $<TARGET_PROPERTY:hwy,INTERFACE_INCLUDE_DIRECTORIES>
-  $<TARGET_PROPERTY:brotlicommon-static,INTERFACE_INCLUDE_DIRECTORIES>
+  ${JXL_HWY_INCLUDE_DIRS}
+  $<TARGET_PROPERTY:brotlicommon,INTERFACE_INCLUDE_DIRECTORIES>
 )
 target_compile_definitions(jxl_enc-obj PUBLIC
   ${OBJ_COMPILE_DEFINITIONS}
 )
-if (JPEGXL_ENABLE_PROFILER)
-target_link_libraries(jxl_enc-obj PUBLIC jxl_profiler)
-endif()
-
-#TODO(lode): don't depend on CMS for the core library
-if (JPEGXL_ENABLE_SKCMS)
-  target_include_directories(jxl_enc-obj PRIVATE
-    $<TARGET_PROPERTY:skcms,INCLUDE_DIRECTORIES>
-  )
-else ()
-  target_include_directories(jxl_enc-obj PRIVATE
-    $<TARGET_PROPERTY:lcms2,INCLUDE_DIRECTORIES>
-  )
-endif ()
-
-# Generate version.h
-configure_file("jxl/version.h.in" "include/jxl/version.h")
+target_link_libraries(jxl_enc-obj PUBLIC jxl_base)
 
-# Headers for exporting/importing public headers
-include(GenerateExportHeader)
 set_target_properties(jxl_dec-obj PROPERTIES
   CXX_VISIBILITY_PRESET hidden
   VISIBILITY_INLINES_HIDDEN 1
   DEFINE_SYMBOL JXL_INTERNAL_LIBRARY_BUILD
 )
-target_include_directories(jxl_dec-obj PUBLIC
-    ${CMAKE_CURRENT_BINARY_DIR}/include)
 
 set_target_properties(jxl_enc-obj PROPERTIES
   CXX_VISIBILITY_PRESET hidden
   VISIBILITY_INLINES_HIDDEN 1
   DEFINE_SYMBOL JXL_INTERNAL_LIBRARY_BUILD
 )
-generate_export_header(jxl_enc-obj
-  BASE_NAME JXL
-  EXPORT_FILE_NAME include/jxl/jxl_export.h)
-target_include_directories(jxl_enc-obj PUBLIC
-    ${CMAKE_CURRENT_BINARY_DIR}/include)
 
 # Private static library. This exposes all the internal functions and is used
 # for tests.
-add_library(jxl_dec-static STATIC
+add_library(jxl_dec-internal STATIC
   $<TARGET_OBJECTS:jxl_dec-obj>
+  ${JXL_CMS_OBJECTS}
+)
+target_link_libraries(jxl_dec-internal PUBLIC
+  ${JPEGXL_COVERAGE_FLAGS}
+  ${JPEGXL_DEC_INTERNAL_LIBS}
+  jxl_base
 )
-target_link_libraries(jxl_dec-static
-  PUBLIC ${JPEGXL_COVERAGE_FLAGS} ${JPEGXL_DEC_INTERNAL_LIBS})
-target_include_directories(jxl_dec-static PUBLIC
-  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>")
 
 # The list of objects in the static and shared libraries.
 set(JPEGXL_INTERNAL_OBJECTS
   $<TARGET_OBJECTS:jxl_enc-obj>
   $<TARGET_OBJECTS:jxl_dec-obj>
 )
-if (JPEGXL_ENABLE_SKCMS AND JPEGXL_BUNDLE_SKCMS)
-  list(APPEND JPEGXL_INTERNAL_OBJECTS $<TARGET_OBJECTS:skcms-obj>)
-endif()
 
 # Private static library. This exposes all the internal functions and is used
 # for tests.
 # TODO(lode): once the source files are correctly split so that it is possible
-# to do, remove $<TARGET_OBJECTS:jxl_dec-obj> here and depend on jxl_dec-static
-add_library(jxl-static STATIC ${JPEGXL_INTERNAL_OBJECTS})
-target_link_libraries(jxl-static
-  PUBLIC ${JPEGXL_COVERAGE_FLAGS} ${JPEGXL_INTERNAL_LIBS})
-target_include_directories(jxl-static PUBLIC
-  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>")
-
-# JXL_EXPORT is defined to "__declspec(dllimport)" automatically by CMake
-# in Windows builds when including headers from the C API and compiling from
-# outside the jxl library. This is required when using the shared library,
-# however in windows this causes the function to not be found when linking
-# against the static library. This define JXL_EXPORT= here forces it to not
-# use dllimport in tests and other tools that require the static library.
-target_compile_definitions(jxl-static INTERFACE -DJXL_EXPORT=)
-target_compile_definitions(jxl_dec-static INTERFACE -DJXL_EXPORT=)
+# to do, remove $<TARGET_OBJECTS:jxl_dec-obj> here and depend on jxl_dec-internal
+add_library(jxl-internal STATIC
+  ${JPEGXL_INTERNAL_OBJECTS}
+)
+target_link_libraries(jxl-internal PUBLIC
+  ${JPEGXL_COVERAGE_FLAGS}
+  ${JPEGXL_INTERNAL_LIBS}
+  jxl_cms
+  jxl_base
+)
+target_include_directories(jxl-internal PUBLIC
+  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>")
+
+target_compile_definitions(jxl-internal INTERFACE -DJXL_STATIC_DEFINE)
+target_compile_definitions(jxl_dec-internal INTERFACE -DJXL_STATIC_DEFINE)
+target_compile_definitions(jxl-internal INTERFACE -DJXL_STATIC_DEFINE)
+target_compile_definitions(jxl_dec-internal INTERFACE -DJXL_STATIC_DEFINE)
 
 # TODO(deymo): Move TCMalloc linkage to the tools/ directory since the library
 # shouldn't do any allocs anyway.
@@ -530,49 +187,27 @@ if(JPEGXL_ENABLE_TCMALLOC)
         "bug for details:\n"
         "   https://github.com/gperftools/gperftools/issues/1204\n")
   endif()
-  target_link_libraries(jxl-static PUBLIC PkgConfig::TCMallocMinimal)
+  target_link_libraries(jxl-internal PUBLIC PkgConfig::TCMallocMinimal)
 endif()  # JPEGXL_ENABLE_TCMALLOC
 
-# Install the static library too, but as jxl.a file without the -static except
-# in Windows.
-if (NOT WIN32 OR MINGW)
-  set_target_properties(jxl-static PROPERTIES OUTPUT_NAME "jxl")
-  set_target_properties(jxl_dec-static PROPERTIES OUTPUT_NAME "jxl_dec")
-endif()
-install(TARGETS jxl-static DESTINATION ${CMAKE_INSTALL_LIBDIR})
-install(TARGETS jxl_dec-static DESTINATION ${CMAKE_INSTALL_LIBDIR})
-
-if (BUILD_SHARED_LIBS)
-
-# Public shared library.
-add_library(jxl SHARED ${JPEGXL_INTERNAL_OBJECTS})
-strip_static(JPEGXL_INTERNAL_SHARED_LIBS JPEGXL_INTERNAL_LIBS)
-target_link_libraries(jxl PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+# Public library.
+add_library(jxl ${JPEGXL_INTERNAL_OBJECTS})
+strip_internal(JPEGXL_INTERNAL_SHARED_LIBS JPEGXL_INTERNAL_LIBS)
+target_link_libraries(jxl PUBLIC ${JPEGXL_COVERAGE_FLAGS} jxl_base)
+target_link_libraries(jxl PUBLIC jxl_cms)
 target_link_libraries(jxl PRIVATE ${JPEGXL_INTERNAL_SHARED_LIBS})
-# Shared library include path contains only the "include/" paths.
-target_include_directories(jxl PUBLIC
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>"
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>")
 set_target_properties(jxl PROPERTIES
   VERSION ${JPEGXL_LIBRARY_VERSION}
-  SOVERSION ${JPEGXL_LIBRARY_SOVERSION}
-  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
-
-# Public shared decoder library.
-add_library(jxl_dec SHARED $<TARGET_OBJECTS:jxl_dec-obj>)
-strip_static(JPEGXL_DEC_INTERNAL_SHARED_LIBS JPEGXL_DEC_INTERNAL_LIBS)
-target_link_libraries(jxl_dec PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+  SOVERSION ${JPEGXL_LIBRARY_SOVERSION})
+
+# Public decoder library.
+add_library(jxl_dec $<TARGET_OBJECTS:jxl_dec-obj>)
+strip_internal(JPEGXL_DEC_INTERNAL_SHARED_LIBS JPEGXL_DEC_INTERNAL_LIBS)
+target_link_libraries(jxl_dec PUBLIC ${JPEGXL_COVERAGE_FLAGS} jxl_base)
 target_link_libraries(jxl_dec PRIVATE ${JPEGXL_DEC_INTERNAL_SHARED_LIBS})
-# Shared library include path contains only the "include/" paths.
-target_include_directories(jxl_dec PUBLIC
-  "${CMAKE_CURRENT_SOURCE_DIR}/include"
-  "${CMAKE_CURRENT_BINARY_DIR}/include")
 set_target_properties(jxl_dec PROPERTIES
   VERSION ${JPEGXL_LIBRARY_VERSION}
-  SOVERSION ${JPEGXL_LIBRARY_SOVERSION}
-  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+  SOVERSION ${JPEGXL_LIBRARY_SOVERSION})
 
 # Check whether the linker support excluding libs
 set(LINKER_EXCLUDE_LIBS_FLAG "-Wl,--exclude-libs=ALL")
@@ -581,6 +216,11 @@ list(APPEND CMAKE_EXE_LINKER_FLAGS ${LINKER_EXCLUDE_LIBS_FLAG})
 check_c_source_compiles("int main(){return 0;}" LINKER_SUPPORT_EXCLUDE_LIBS)
 list(REMOVE_ITEM CMAKE_EXE_LINKER_FLAGS ${LINKER_EXCLUDE_LIBS_FLAG})
 
+if(NOT BUILD_SHARED_LIBS)
+  target_compile_definitions(jxl PUBLIC -DJXL_STATIC_DEFINE)
+  target_compile_definitions(jxl_dec PUBLIC -DJXL_STATIC_DEFINE)
+endif()
+
 # Add a jxl.version file as a version script to tag symbols with the
 # appropriate version number. This script is also used to limit what's exposed
 # in the shared library from the static dependencies bundled here.
@@ -605,36 +245,22 @@ foreach(target IN ITEMS jxl jxl_dec)
   endif()
 endforeach()
 
-# Only install libjxl shared library. The libjxl_dec is not installed since it
+# Only install libjxl public library. The libjxl_dec is not installed since it
 # contains symbols also in libjxl which would conflict if programs try to use
 # both.
 install(TARGETS jxl
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
-else()
-add_library(jxl ALIAS jxl-static)
-add_library(jxl_dec ALIAS jxl_dec-static)
-endif()  # BUILD_SHARED_LIBS
 
 # Add a pkg-config file for libjxl.
 set(JPEGXL_LIBRARY_REQUIRES
-    "libhwy libbrotlicommon libbrotlienc libbrotlidec")
-if(NOT JPEGXL_ENABLE_SKCMS)
-  set(JPEGXL_LIBRARY_REQUIRES "${JPEGXL_LIBRARY_REQUIRES} lcms2")
-endif()
+    "libhwy libbrotlienc libbrotlidec libjxl_cms")
 
-# Allow adding prefix if CMAKE_INSTALL_INCLUDEDIR not absolute.
-if(IS_ABSOLUTE "${CMAKE_INSTALL_INCLUDEDIR}")
-    set(PKGCONFIG_TARGET_INCLUDES "${CMAKE_INSTALL_INCLUDEDIR}")
-else()
-    set(PKGCONFIG_TARGET_INCLUDES "\${prefix}/${CMAKE_INSTALL_INCLUDEDIR}")
-endif()
-# Allow adding prefix if CMAKE_INSTALL_LIBDIR not absolute.
-if(IS_ABSOLUTE "${CMAKE_INSTALL_LIBDIR}")
-    set(PKGCONFIG_TARGET_LIBS "${CMAKE_INSTALL_LIBDIR}")
+if (BUILD_SHARED_LIBS)
+  set(JPEGXL_REQUIRES_TYPE "Requires.private")
 else()
-    set(PKGCONFIG_TARGET_LIBS "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
+  set(JPEGXL_REQUIRES_TYPE "Requires")
 endif()
 
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/jxl/libjxl.pc.in"
index ada3bcb..3de477f 100644 (file)
@@ -13,8 +13,6 @@
 #include <utility>
 
 #include "lib/jxl/base/bits.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/image_ops.h"
 
 namespace jxl {
index 7d21167..ecdcbbb 100644 (file)
@@ -13,7 +13,7 @@
 
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/image_ops.h"
 
 // Defines the different kinds of transforms, and heuristics to choose between
index d366aa3..3745db2 100644 (file)
 #include <cmath>
 #include <hwy/aligned_allocator.h>
 #include <hwy/base.h>  // HWY_ALIGN_MAX
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 #include <utility>
 
 #include "lib/jxl/base/random.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_scales.h"
 #include "lib/jxl/dec_transforms_testonly.h"
 #include "lib/jxl/enc_transforms.h"
+#include "lib/jxl/simd_util.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -28,18 +29,22 @@ class AcStrategyRoundtrip : public ::hwy::TestWithParamTargetAndT<int> {
   void Run() {
     const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
     const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    const size_t dct_scratch_size =
+        3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
 
-    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
-    float* scratch_space = mem.get();
-    float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea;
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea +
+                                           dct_scratch_size);
+    float* coeffs = mem.get();
     float* idct = coeffs + AcStrategy::kMaxCoeffArea;
+    float* input = idct + AcStrategy::kMaxCoeffArea;
+    float* scratch_space = input + AcStrategy::kMaxCoeffArea;
+
     Rng rng(type * 65537 + 13);
 
     for (size_t j = 0; j < 64; j++) {
       size_t i = (acs.log2_covered_blocks()
                       ? rng.UniformU(0, 64u << acs.log2_covered_blocks())
                       : j);
-      float* input = idct + AcStrategy::kMaxCoeffArea;
       std::fill_n(input, AcStrategy::kMaxCoeffArea, 0);
       input[i] = 0.2f;
       TransformFromPixels(type, input, acs.covered_blocks_x() * 8, coeffs,
@@ -60,7 +65,8 @@ class AcStrategyRoundtrip : public ::hwy::TestWithParamTargetAndT<int> {
         float* dc = idct + AcStrategy::kMaxCoeffArea;
         std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
         dc[y * acs.covered_blocks_x() * 8 + x] = 0.2;
-        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs);
+        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs,
+                                scratch_space);
         DCFromLowestFrequencies(type, coeffs, idct, acs.covered_blocks_x() * 8);
         std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
         dc[y * acs.covered_blocks_x() * 8 + x] = 0.2;
@@ -86,12 +92,17 @@ class AcStrategyRoundtripDownsample
   void Run() {
     const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
     const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    const size_t dct_scratch_size =
+        3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
 
-    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
-    float* scratch_space = mem.get();
-    float* coeffs = scratch_space + AcStrategy::kMaxCoeffArea;
-    std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f);
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea +
+                                           dct_scratch_size);
+    float* coeffs = mem.get();
     float* idct = coeffs + AcStrategy::kMaxCoeffArea;
+    float* dc = idct + AcStrategy::kMaxCoeffArea;
+    float* scratch_space = dc + AcStrategy::kMaxCoeffArea;
+
+    std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f);
     Rng rng(type * 65537 + 13);
 
     for (size_t y = 0; y < acs.covered_blocks_y(); y++) {
@@ -99,10 +110,10 @@ class AcStrategyRoundtripDownsample
         if (x > 4 || y > 4) {
           if (rng.Bernoulli(0.9f)) continue;
         }
-        float* dc = idct + AcStrategy::kMaxCoeffArea;
         std::fill_n(dc, AcStrategy::kMaxCoeffArea, 0);
         dc[y * acs.covered_blocks_x() * 8 + x] = 0.2f;
-        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs);
+        LowestFrequenciesFromDC(type, dc, acs.covered_blocks_x() * 8, coeffs,
+                                scratch_space);
         TransformToPixels(type, coeffs, idct, acs.covered_blocks_x() * 8,
                           scratch_space);
         std::fill_n(coeffs, AcStrategy::kMaxCoeffArea, 0.0f);
@@ -141,14 +152,19 @@ class AcStrategyDownsample : public ::hwy::TestWithParamTargetAndT<int> {
   void Run() {
     const AcStrategy::Type type = static_cast<AcStrategy::Type>(GetParam());
     const AcStrategy acs = AcStrategy::FromRawStrategy(type);
+    const size_t dct_scratch_size =
+        3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
     size_t cx = acs.covered_blocks_y();
     size_t cy = acs.covered_blocks_x();
     CoefficientLayout(&cy, &cx);
 
-    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea);
-    float* scratch_space = mem.get();
-    float* idct = scratch_space + AcStrategy::kMaxCoeffArea;
+    auto mem = hwy::AllocateAligned<float>(4 * AcStrategy::kMaxCoeffArea +
+                                           dct_scratch_size);
+    float* idct = mem.get();
     float* idct_acs_downsampled = idct + AcStrategy::kMaxCoeffArea;
+    float* coeffs = idct + AcStrategy::kMaxCoeffArea;
+    float* scratch_space = coeffs + AcStrategy::kMaxCoeffArea;
+
     Rng rng(type * 65537 + 13);
 
     for (size_t y = 0; y < cy; y++) {
@@ -215,7 +231,10 @@ TEST_P(AcStrategyTargetTest, BenchmarkAFV) {
   const AcStrategy::Type type = AcStrategy::Type::AFV0;
   HWY_ALIGN_MAX float pixels[64] = {1};
   HWY_ALIGN_MAX float coeffs[64] = {};
-  HWY_ALIGN_MAX float scratch_space[64] = {};
+  const size_t dct_scratch_size =
+      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
+  auto mem = hwy::AllocateAligned<float>(64 + dct_scratch_size);
+  float* scratch_space = mem.get();
   for (size_t i = 0; i < 1 << 14; i++) {
     TransformToPixels(type, coeffs, pixels, 8, scratch_space);
     TransformFromPixels(type, pixels, 8, coeffs, scratch_space);
index f0ab39a..48d7e7e 100644 (file)
@@ -42,7 +42,7 @@ void PerformAlphaBlending(const float* bg, const float* bga, const float* fg,
                           bool alpha_is_premultiplied, bool clamp) {
   if (bg == bga && fg == fga) {
     for (size_t x = 0; x < num_pixels; ++x) {
-      float fa = clamp ? fga[x] : std::min(std::max(0.0f, fga[x]), 1.0f);
+      float fa = clamp ? fga[x] : Clamp(fga[x]);
       out[x] = (1.f - (1.f - fa) * (1.f - bga[x]));
     }
   } else {
@@ -66,10 +66,14 @@ void PerformAlphaWeightedAdd(const float* bg, const float* fg, const float* fga,
                              float* out, size_t num_pixels, bool clamp) {
   if (fg == fga) {
     memcpy(out, bg, num_pixels * sizeof(*out));
-  } else {
+  } else if (clamp) {
     for (size_t x = 0; x < num_pixels; ++x) {
       out[x] = bg[x] + fg[x] * Clamp(fga[x]);
     }
+  } else {
+    for (size_t x = 0; x < num_pixels; ++x) {
+      out[x] = bg[x] + fg[x] * fga[x];
+    }
   }
 }
 
@@ -108,13 +112,4 @@ void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
   }
 }
 
-void UnpremultiplyAlpha(float* JXL_RESTRICT rgba, size_t num_pixels) {
-  for (size_t x = 0, ix = 0; x < num_pixels; ++x, ix += 4) {
-    const float multiplier = 1.f / std::max(kSmallAlpha, rgba[ix + 3]);
-    rgba[ix] *= multiplier;
-    rgba[ix + 1] *= multiplier;
-    rgba[ix + 2] *= multiplier;
-  }
-}
-
 }  // namespace jxl
index f49790b..efb76c8 100644 (file)
@@ -60,7 +60,6 @@ void PremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
 void UnpremultiplyAlpha(float* JXL_RESTRICT r, float* JXL_RESTRICT g,
                         float* JXL_RESTRICT b, const float* JXL_RESTRICT a,
                         size_t num_pixels);
-void UnpremultiplyAlpha(float* JXL_RESTRICT rgba, size_t num_pixels);
 
 }  // namespace jxl
 
index c643fbd..ddafd82 100644 (file)
@@ -6,6 +6,7 @@
 #include "lib/jxl/alpha.h"
 
 #include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
index 32a658f..8e52cad 100644 (file)
@@ -24,10 +24,10 @@ std::vector<int32_t> CreateFlatHistogram(int length, int total_count) {
   return result;
 }
 
-// First, all trailing non-occuring symbols are removed from the distribution;
-// if this leaves the distribution empty, a dummy symbol with max weight is
-// added. This ensures that the resulting distribution sums to total table size.
-// Then, `entry_size` is chosen to be the largest power of two so that
+// First, all trailing non-occurring symbols are removed from the distribution;
+// if this leaves the distribution empty, a placeholder symbol with max weight
+// is  added. This ensures that the resulting distribution sums to total table
+// size. Then, `entry_size` is chosen to be the largest power of two so that
 // `table_size` = ANS_TAB_SIZE/`entry_size` is at least as big as the
 // distribution size.
 // Note that each entry will only ever contain two different symbols, and
index 2c4ea8e..487b6cf 100644 (file)
@@ -7,8 +7,8 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/ans_params.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
index ca9883d..06bc464 100644 (file)
@@ -8,15 +8,15 @@
 
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -30,7 +30,7 @@ void RoundtripTestcase(int n_histograms, int alphabet_size,
   // Space for magic bytes.
   BitWriter::Allotment allotment_magic1(&writer, 16);
   writer.Write(16, kMagic1);
-  ReclaimAndCharge(&writer, &allotment_magic1, 0, nullptr);
+  allotment_magic1.ReclaimAndCharge(&writer, 0, nullptr);
 
   std::vector<uint8_t> context_map;
   EntropyEncodingData codes;
@@ -45,7 +45,7 @@ void RoundtripTestcase(int n_histograms, int alphabet_size,
   BitWriter::Allotment allotment_magic2(&writer, 24);
   writer.Write(16, kMagic2);
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment_magic2, 0, nullptr);
+  allotment_magic2.ReclaimAndCharge(&writer, 0, nullptr);
 
   // We do not truncate the output. Reading past the end reads out zeroes
   // anyway.
@@ -171,7 +171,7 @@ TEST(ANSTest, UintConfigRoundtrip) {
     BitWriter writer;
     BitWriter::Allotment allotment(&writer, 10 * uint_config.size());
     EncodeUintConfigs(uint_config, &writer, log_alpha_size);
-    ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
     writer.ZeroPadToByte();
     BitReader br(writer.GetSpan());
     EXPECT_TRUE(DecodeUintConfigs(log_alpha_size, &uint_config_dec, &br));
diff --git a/lib/jxl/aux_out.cc b/lib/jxl/aux_out.cc
deleted file mode 100644 (file)
index d8ee946..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/aux_out.h"
-
-#include <stdint.h>
-
-#include <numeric>  // accumulate
-
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/enc_bit_writer.h"
-
-namespace jxl {
-
-void AuxOut::Print(size_t num_inputs) const {
-  if (num_inputs == 0) return;
-
-  LayerTotals all_layers;
-  for (size_t i = 0; i < layers.size(); ++i) {
-    all_layers.Assimilate(layers[i]);
-  }
-
-  printf("Average butteraugli iters: %10.2f\n",
-         num_butteraugli_iters * 1.0 / num_inputs);
-  if (min_quant_rescale != 1.0 || max_quant_rescale != 1.0) {
-    printf("quant rescale range: %f .. %f\n", min_quant_rescale,
-           max_quant_rescale);
-    printf("bitrate error range: %.3f%% .. %.3f%%\n",
-           100.0f * min_bitrate_error, 100.0f * max_bitrate_error);
-  }
-
-  for (size_t i = 0; i < layers.size(); ++i) {
-    if (layers[i].total_bits != 0) {
-      printf("Total layer bits %-10s\t", LayerName(i));
-      printf("%10f%%", 100.0 * layers[i].total_bits / all_layers.total_bits);
-      layers[i].Print(num_inputs);
-    }
-  }
-  printf("Total image size           ");
-  all_layers.Print(num_inputs);
-
-  const uint32_t dc_pred_total =
-      std::accumulate(dc_pred_usage.begin(), dc_pred_usage.end(), 0u);
-  const uint32_t dc_pred_total_xb =
-      std::accumulate(dc_pred_usage_xb.begin(), dc_pred_usage_xb.end(), 0u);
-  if (dc_pred_total + dc_pred_total_xb != 0) {
-    printf("\nDC pred     Y                XB:\n");
-    for (size_t i = 0; i < dc_pred_usage.size(); ++i) {
-      printf("  %6u (%5.2f%%)    %6u (%5.2f%%)\n", dc_pred_usage[i],
-             100.0 * dc_pred_usage[i] / dc_pred_total, dc_pred_usage_xb[i],
-             100.0 * dc_pred_usage_xb[i] / dc_pred_total_xb);
-    }
-  }
-
-  size_t total_blocks = 0;
-  size_t total_positions = 0;
-  if (total_blocks != 0 && total_positions != 0) {
-    printf("\n\t\t  Blocks\t\tPositions\t\t\tBlocks/Position\n");
-    printf(" Total:\t\t    %7" PRIuS "\t\t     %7" PRIuS " \t\t\t%10f%%\n\n",
-           total_blocks, total_positions,
-           100.0 * total_blocks / total_positions);
-  }
-}
-
-void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer,
-                      BitWriter::Allotment* JXL_RESTRICT allotment,
-                      size_t layer, AuxOut* JXL_RESTRICT aux_out) {
-  size_t used_bits, unused_bits;
-  allotment->PrivateReclaim(writer, &used_bits, &unused_bits);
-
-#if 0
-  printf("Layer %s bits: max %" PRIuS " used %" PRIuS " unused %" PRIuS "\n", LayerName(layer),
-         allotment->MaxBits(), used_bits, unused_bits);
-#endif
-
-  // This may be a nested call with aux_out == null. Whenever we know that
-  // aux_out is null, we can call ReclaimUnused directly.
-  if (aux_out != nullptr) {
-    aux_out->layers[layer].total_bits += used_bits;
-    aux_out->layers[layer].histogram_bits += allotment->HistogramBits();
-  }
-}
-
-}  // namespace jxl
diff --git a/lib/jxl/aux_out.h b/lib/jxl/aux_out.h
deleted file mode 100644 (file)
index 7076603..0000000
+++ /dev/null
@@ -1,309 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_AUX_OUT_H_
-#define LIB_JXL_AUX_OUT_H_
-
-// Optional output information for debugging and analyzing size usage.
-
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include <array>
-#include <functional>
-#include <sstream>
-#include <string>
-#include <utility>
-
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/dec_xyb.h"
-#include "lib/jxl/image.h"
-#include "lib/jxl/image_bundle.h"
-#include "lib/jxl/image_ops.h"
-#include "lib/jxl/jxl_inspection.h"
-
-namespace jxl {
-
-// For LayerName and AuxOut::layers[] index. Order does not matter.
-enum {
-  kLayerHeader = 0,
-  kLayerTOC,
-  kLayerDictionary,
-  kLayerSplines,
-  kLayerNoise,
-  kLayerQuant,
-  kLayerModularTree,
-  kLayerModularGlobal,
-  kLayerDC,
-  kLayerModularDcGroup,
-  kLayerControlFields,
-  kLayerOrder,
-  kLayerAC,
-  kLayerACTokens,
-  kLayerModularAcGroup,
-  kNumImageLayers
-};
-
-static inline const char* LayerName(size_t layer) {
-  switch (layer) {
-    case kLayerHeader:
-      return "Headers";
-    case kLayerTOC:
-      return "TOC";
-    case kLayerDictionary:
-      return "Patches";
-    case kLayerSplines:
-      return "Splines";
-    case kLayerNoise:
-      return "Noise";
-    case kLayerQuant:
-      return "Quantizer";
-    case kLayerModularTree:
-      return "ModularTree";
-    case kLayerModularGlobal:
-      return "ModularGlobal";
-    case kLayerDC:
-      return "DC";
-    case kLayerModularDcGroup:
-      return "ModularDcGroup";
-    case kLayerControlFields:
-      return "ControlFields";
-    case kLayerOrder:
-      return "CoeffOrder";
-    case kLayerAC:
-      return "ACHistograms";
-    case kLayerACTokens:
-      return "ACTokens";
-    case kLayerModularAcGroup:
-      return "ModularAcGroup";
-    default:
-      JXL_ABORT("Invalid layer %d\n", static_cast<int>(layer));
-  }
-}
-
-// Statistics gathered during compression or decompression.
-struct AuxOut {
- private:
-  struct LayerTotals {
-    void Assimilate(const LayerTotals& victim) {
-      num_clustered_histograms += victim.num_clustered_histograms;
-      histogram_bits += victim.histogram_bits;
-      extra_bits += victim.extra_bits;
-      total_bits += victim.total_bits;
-      clustered_entropy += victim.clustered_entropy;
-    }
-    void Print(size_t num_inputs) const {
-      printf("%10" PRId64, static_cast<int64_t>(total_bits));
-      if (histogram_bits != 0) {
-        printf("   [c/i:%6.2f | hst:%8" PRId64 " | ex:%8" PRId64
-               " | h+c+e:%12.3f",
-               num_clustered_histograms * 1.0 / num_inputs,
-               static_cast<int64_t>(histogram_bits >> 3),
-               static_cast<int64_t>(extra_bits >> 3),
-               (histogram_bits + clustered_entropy + extra_bits) / 8.0);
-        printf("]");
-      }
-      printf("\n");
-    }
-    size_t num_clustered_histograms = 0;
-    size_t extra_bits = 0;
-
-    // Set via BitsWritten below
-    size_t histogram_bits = 0;
-    size_t total_bits = 0;
-
-    double clustered_entropy = 0.0;
-  };
-
- public:
-  AuxOut() = default;
-  AuxOut(const AuxOut&) = default;
-
-  void Assimilate(const AuxOut& victim) {
-    for (size_t i = 0; i < layers.size(); ++i) {
-      layers[i].Assimilate(victim.layers[i]);
-    }
-    num_blocks += victim.num_blocks;
-    num_small_blocks += victim.num_small_blocks;
-    num_dct4x8_blocks += victim.num_dct4x8_blocks;
-    num_afv_blocks += victim.num_afv_blocks;
-    num_dct8_blocks += victim.num_dct8_blocks;
-    num_dct8x16_blocks += victim.num_dct8x16_blocks;
-    num_dct8x32_blocks += victim.num_dct8x32_blocks;
-    num_dct16_blocks += victim.num_dct16_blocks;
-    num_dct16x32_blocks += victim.num_dct16x32_blocks;
-    num_dct32_blocks += victim.num_dct32_blocks;
-    num_dct32x64_blocks += victim.num_dct32x64_blocks;
-    num_dct64_blocks += victim.num_dct64_blocks;
-    num_butteraugli_iters += victim.num_butteraugli_iters;
-    for (size_t i = 0; i < dc_pred_usage.size(); ++i) {
-      dc_pred_usage[i] += victim.dc_pred_usage[i];
-      dc_pred_usage_xb[i] += victim.dc_pred_usage_xb[i];
-    }
-    max_quant_rescale = std::max(max_quant_rescale, victim.max_quant_rescale);
-    min_quant_rescale = std::min(min_quant_rescale, victim.min_quant_rescale);
-    max_bitrate_error = std::max(max_bitrate_error, victim.max_bitrate_error);
-    min_bitrate_error = std::min(min_bitrate_error, victim.min_bitrate_error);
-  }
-
-  void Print(size_t num_inputs) const;
-
-  size_t TotalBits() const {
-    size_t total = 0;
-    for (const auto& layer : layers) {
-      total += layer.total_bits;
-    }
-    return total;
-  }
-
-  template <typename T>
-  void DumpImage(const char* label, const Image3<T>& image) const {
-    if (!dump_image) return;
-    if (debug_prefix.empty()) return;
-    std::ostringstream pathname;
-    pathname << debug_prefix << label << ".png";
-    CodecInOut io;
-    // Always save to 16-bit png.
-    io.metadata.m.SetUintSamples(16);
-    io.metadata.m.color_encoding = ColorEncoding::SRGB();
-    io.SetFromImage(ConvertToFloat(image), io.metadata.m.color_encoding);
-    (void)dump_image(io, pathname.str());
-  }
-  template <typename T>
-  void DumpImage(const char* label, const Plane<T>& image) {
-    DumpImage(label,
-              Image3<T>(CopyImage(image), CopyImage(image), CopyImage(image)));
-  }
-
-  template <typename T>
-  void DumpXybImage(const char* label, const Image3<T>& image) const {
-    if (!dump_image) return;
-    if (debug_prefix.empty()) return;
-    std::ostringstream pathname;
-    pathname << debug_prefix << label << ".png";
-
-    Image3F linear(image.xsize(), image.ysize());
-    OpsinParams opsin_params;
-    opsin_params.Init(kDefaultIntensityTarget);
-    OpsinToLinear(image, Rect(linear), nullptr, &linear, opsin_params);
-
-    CodecInOut io;
-    io.metadata.m.SetUintSamples(16);
-    io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
-    io.SetFromImage(std::move(linear), io.metadata.m.color_encoding);
-
-    (void)dump_image(io, pathname.str());
-  }
-
-  // Normalizes all the channels to range 0-1, creating a false-color image
-  // which allows seeing the information from non-RGB channels in an RGB debug
-  // image.
-  template <typename T>
-  void DumpImageNormalized(const char* label, const Image3<T>& image) const {
-    std::array<T, 3> min;
-    std::array<T, 3> max;
-    Image3MinMax(image, &min, &max);
-    Image3B normalized(image.xsize(), image.ysize());
-    for (size_t c = 0; c < 3; ++c) {
-      float mul = min[c] == max[c] ? 0 : (255.0f / (max[c] - min[c]));
-      for (size_t y = 0; y < image.ysize(); ++y) {
-        const T* JXL_RESTRICT row_in = image.ConstPlaneRow(c, y);
-        uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y);
-        for (size_t x = 0; x < image.xsize(); ++x) {
-          row_out[x] = static_cast<uint8_t>((row_in[x] - min[c]) * mul);
-        }
-      }
-    }
-    DumpImage(label, normalized);
-  }
-
-  template <typename T>
-  void DumpPlaneNormalized(const char* label, const Plane<T>& image) const {
-    T min;
-    T max;
-    ImageMinMax(image, &min, &max);
-    Image3B normalized(image.xsize(), image.ysize());
-    for (size_t c = 0; c < 3; ++c) {
-      float mul = min == max ? 0 : (255.0f / (max - min));
-      for (size_t y = 0; y < image.ysize(); ++y) {
-        const T* JXL_RESTRICT row_in = image.ConstRow(y);
-        uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y);
-        for (size_t x = 0; x < image.xsize(); ++x) {
-          row_out[x] = static_cast<uint8_t>((row_in[x] - min) * mul);
-        }
-      }
-    }
-    DumpImage(label, normalized);
-  }
-
-  void SetInspectorImage3F(const jxl::InspectorImage3F& inspector) {
-    inspector_image3f_ = inspector;
-  }
-
-  // Allows hooking intermediate data inspection into various places of the
-  // processing pipeline. Returns true iff processing should proceed.
-  bool InspectImage3F(const char* label, const Image3F& image) {
-    if (inspector_image3f_ != nullptr) {
-      return inspector_image3f_(label, image);
-    }
-    return true;
-  }
-
-  std::array<LayerTotals, kNumImageLayers> layers;
-  size_t num_blocks = 0;
-
-  // Number of blocks that use larger DCT (set by ac_strategy).
-  size_t num_small_blocks = 0;
-  size_t num_dct4x8_blocks = 0;
-  size_t num_afv_blocks = 0;
-  size_t num_dct8_blocks = 0;
-  size_t num_dct8x16_blocks = 0;
-  size_t num_dct8x32_blocks = 0;
-  size_t num_dct16_blocks = 0;
-  size_t num_dct16x32_blocks = 0;
-  size_t num_dct32_blocks = 0;
-  size_t num_dct32x64_blocks = 0;
-  size_t num_dct64_blocks = 0;
-
-  std::array<uint32_t, 8> dc_pred_usage = {{0}};
-  std::array<uint32_t, 8> dc_pred_usage_xb = {{0}};
-
-  int num_butteraugli_iters = 0;
-
-  float max_quant_rescale = 1.0f;
-  float min_quant_rescale = 1.0f;
-  float min_bitrate_error = 0.0f;
-  float max_bitrate_error = 0.0f;
-
-  // If not empty, additional debugging information (e.g. debug images) is
-  // saved in files with this prefix.
-  std::string debug_prefix;
-
-  // By how much the decoded image was downsampled relative to the encoded
-  // image.
-  size_t downsampling = 1;
-
-  jxl::InspectorImage3F inspector_image3f_;
-
-  std::function<Status(const CodecInOut&, const std::string&)> dump_image =
-      nullptr;
-};
-
-// Used to skip image creation if they won't be written to debug directory.
-static inline bool WantDebugOutput(const AuxOut* aux_out) {
-  // Need valid pointer and filename.
-  return aux_out != nullptr && !aux_out->debug_prefix.empty();
-}
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_AUX_OUT_H_
diff --git a/lib/jxl/aux_out_fwd.h b/lib/jxl/aux_out_fwd.h
deleted file mode 100644 (file)
index 29b31ad..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_AUX_OUT_FWD_H_
-#define LIB_JXL_AUX_OUT_FWD_H_
-
-#include <stddef.h>
-
-#include "lib/jxl/enc_bit_writer.h"
-
-namespace jxl {
-
-struct AuxOut;
-
-// Helper function that ensures the `bits_written` are charged to `layer` in
-// `aux_out`. Example usage:
-//   BitWriter::Allotment allotment(&writer, max_bits);
-//   writer.Write(..); writer.Write(..);
-//   ReclaimAndCharge(&writer, &allotment, layer, aux_out);
-void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer,
-                      BitWriter::Allotment* JXL_RESTRICT allotment,
-                      size_t layer, AuxOut* JXL_RESTRICT aux_out);
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_AUX_OUT_FWD_H_
index ccf1a5e..8966834 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef LIB_JXL_BASE_BYTE_ORDER_H_
 #define LIB_JXL_BASE_BYTE_ORDER_H_
 
+#include <jxl/types.h>
 #include <stdint.h>
 #include <string.h>  // memcpy
 
@@ -36,10 +37,17 @@ static inline bool IsLittleEndian() {
 }
 #endif
 
+static inline bool SwapEndianness(JxlEndianness endianness) {
+  return ((endianness == JXL_BIG_ENDIAN && IsLittleEndian()) ||
+          (endianness == JXL_LITTLE_ENDIAN && !IsLittleEndian()));
+}
+
 #if JXL_COMPILER_MSVC
+#define JXL_BSWAP16(x) _byteswap_ushort(x)
 #define JXL_BSWAP32(x) _byteswap_ulong(x)
 #define JXL_BSWAP64(x) _byteswap_uint64(x)
 #else
+#define JXL_BSWAP16(x) __builtin_bswap16(x)
 #define JXL_BSWAP32(x) __builtin_bswap32(x)
 #define JXL_BSWAP64(x) __builtin_bswap64(x)
 #endif
@@ -127,6 +135,22 @@ static JXL_INLINE uint64_t LoadLE64(const uint8_t* p) {
 #endif
 }
 
+// Loads a Big-Endian float
+static JXL_INLINE float LoadBEFloat(const uint8_t* p) {
+  uint32_t u = LoadBE32(p);
+  float result;
+  memcpy(&result, &u, 4);
+  return result;
+}
+
+// Loads a Little-Endian float
+static JXL_INLINE float LoadLEFloat(const uint8_t* p) {
+  uint32_t u = LoadLE32(p);
+  float result;
+  memcpy(&result, &u, 4);
+  return result;
+}
+
 static JXL_INLINE void StoreBE16(const uint32_t native, uint8_t* p) {
   p[0] = (native >> 8) & 0xFF;
   p[1] = native & 0xFF;
@@ -197,6 +221,15 @@ static JXL_INLINE void StoreLE64(const uint64_t native, uint8_t* p) {
 #endif
 }
 
+static JXL_INLINE float BSwapFloat(float x) {
+  uint32_t u;
+  memcpy(&u, &x, 4);
+  uint32_t uswap = JXL_BSWAP32(u);
+  float xswap;
+  memcpy(&xswap, &uswap, 4);
+  return xswap;
+}
+
 // Big/Little Endian order.
 struct OrderBE {};
 struct OrderLE {};
diff --git a/lib/jxl/base/common.h b/lib/jxl/base/common.h
new file mode 100644 (file)
index 0000000..b7fe6ab
--- /dev/null
@@ -0,0 +1,95 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_COMMON_H_
+#define LIB_JXL_BASE_COMMON_H_
+
+// Shared constants and helper functions.
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+// Some enums and typedefs used by more than one header file.
+
+constexpr size_t kBitsPerByte = 8;  // more clear than CHAR_BIT
+
+constexpr inline size_t RoundUpBitsToByteMultiple(size_t bits) {
+  return (bits + 7) & ~size_t(7);
+}
+
+constexpr inline size_t RoundUpToBlockDim(size_t dim) {
+  return (dim + 7) & ~size_t(7);
+}
+
+static inline bool JXL_MAYBE_UNUSED SafeAdd(const uint64_t a, const uint64_t b,
+                                            uint64_t& sum) {
+  sum = a + b;
+  return sum >= a;  // no need to check b - either sum >= both or < both.
+}
+
+template <typename T1, typename T2>
+constexpr inline T1 DivCeil(T1 a, T2 b) {
+  return (a + b - 1) / b;
+}
+
+// Works for any `align`; if a power of two, compiler emits ADD+AND.
+constexpr inline size_t RoundUpTo(size_t what, size_t align) {
+  return DivCeil(what, align) * align;
+}
+
+constexpr double kPi = 3.14159265358979323846264338327950288;
+
+// Reasonable default for sRGB, matches common monitors. We map white to this
+// many nits (cd/m^2) by default. Butteraugli was tuned for 250 nits, which is
+// very close.
+// NB: This constant is not very "base", but it is shared between modules.
+static constexpr float kDefaultIntensityTarget = 255;
+
+template <typename T>
+constexpr T Pi(T multiplier) {
+  return static_cast<T>(multiplier * kPi);
+}
+
+// Prior to C++14 (i.e. C++11): provide our own make_unique
+#if __cplusplus < 201402L
+template <typename T, typename... Args>
+std::unique_ptr<T> make_unique(Args&&... args) {
+  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+#else
+using std::make_unique;
+#endif
+
+template <typename T>
+JXL_INLINE T Clamp1(T val, T low, T hi) {
+  return val < low ? low : val > hi ? hi : val;
+}
+
+// conversion from integer to string.
+template <typename T>
+std::string ToString(T n) {
+  char data[32] = {};
+  if (T(0.1) != T(0)) {
+    // float
+    snprintf(data, sizeof(data), "%g", static_cast<double>(n));
+  } else if (T(-1) > T(0)) {
+    // unsigned
+    snprintf(data, sizeof(data), "%llu", static_cast<unsigned long long>(n));
+  } else {
+    // signed
+    snprintf(data, sizeof(data), "%lld", static_cast<long long>(n));
+  }
+  return data;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_COMMON_H_
index 7aa8b99..702ff8e 100644 (file)
@@ -9,6 +9,7 @@
 // Macros for compiler version + nonstandard keywords, e.g. __builtin_expect.
 
 #include <stdint.h>
+#include <sys/types.h>
 
 #include "lib/jxl/base/sanitizer_definitions.h"
 
 #endif
 
 #if JXL_COMPILER_MSVC
-#define JXL_UNREACHABLE __assume(false)
+#define JXL_UNREACHABLE_BUILTIN __assume(false)
 #elif JXL_COMPILER_CLANG || JXL_COMPILER_GCC >= 405
-#define JXL_UNREACHABLE __builtin_unreachable()
+#define JXL_UNREACHABLE_BUILTIN __builtin_unreachable()
 #else
-#define JXL_UNREACHABLE
+#define JXL_UNREACHABLE_BUILTIN
 #endif
 
 #if JXL_COMPILER_MSVC
diff --git a/lib/jxl/base/data_parallel.cc b/lib/jxl/base/data_parallel.cc
deleted file mode 100644 (file)
index 20a9112..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/base/data_parallel.h"
-
-namespace jxl {
-
-// static
-JxlParallelRetCode ThreadPool::SequentialRunnerStatic(
-    void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
-    JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range) {
-  JxlParallelRetCode init_ret = (*init)(jpegxl_opaque, 1);
-  if (init_ret != 0) return init_ret;
-
-  for (uint32_t i = start_range; i < end_range; i++) {
-    (*func)(jpegxl_opaque, i, 0);
-  }
-  return 0;
-}
-
-}  // namespace jxl
index 666925a..05ccea4 100644 (file)
@@ -9,10 +9,10 @@
 // Portable, low-overhead C++11 ThreadPool alternative to OpenMP for
 // data-parallel computations.
 
+#include <jxl/parallel_runner.h>
 #include <stddef.h>
 #include <stdint.h>
 
-#include "jxl/parallel_runner.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/status.h"
 #if JXL_COMPILER_MSVC
@@ -25,7 +25,7 @@ namespace jxl {
 class ThreadPool {
  public:
   ThreadPool(JxlParallelRunner runner, void* runner_opaque)
-      : runner_(runner ? runner : &ThreadPool::SequentialRunnerStatic),
+      : runner_(runner),
         runner_opaque_(runner ? runner_opaque : static_cast<void*>(this)) {}
 
   ThreadPool(const ThreadPool&) = delete;
@@ -50,6 +50,16 @@ class ThreadPool {
     RunCallState<InitFunc, DataFunc> call_state(init_func, data_func);
     // The runner_ uses the C convention and returns 0 in case of error, so we
     // convert it to a Status.
+    if (!runner_) {
+      void* jpegxl_opaque = static_cast<void*>(&call_state);
+      if (call_state.CallInitFunc(jpegxl_opaque, 1) != 0) {
+        return JXL_FAILURE("Failed to initialize thread");
+      }
+      for (uint32_t i = begin; i < end; i++) {
+        call_state.CallDataFunc(jpegxl_opaque, i, 0);
+      }
+      return true;
+    }
     return (*runner_)(runner_opaque_, static_cast<void*>(&call_state),
                       &call_state.CallInitFunc, &call_state.CallDataFunc, begin,
                       end) == 0;
@@ -89,12 +99,6 @@ class ThreadPool {
     const DataFunc& data_func_;
   };
 
-  // Default JxlParallelRunner used when no runner is provided by the
-  // caller. This runner doesn't use any threading and thread_id is always 0.
-  static JxlParallelRetCode SequentialRunnerStatic(
-      void* runner_opaque, void* jpegxl_opaque, JxlParallelRunInit init,
-      JxlParallelRunFunction func, uint32_t start_range, uint32_t end_range);
-
   // The caller supplied runner function and its opaque void*.
   const JxlParallelRunner runner_;
   void* const runner_opaque_;
similarity index 94%
rename from lib/jxl/fast_math-inl.h
rename to lib/jxl/base/fast_math-inl.h
index 5c48034..fa749cc 100644 (file)
@@ -5,17 +5,17 @@
 
 // Fast SIMD math ops (log2, encoder only, cos, erf for splines)
 
-#if defined(LIB_JXL_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
-#ifdef LIB_JXL_FAST_MATH_INL_H_
-#undef LIB_JXL_FAST_MATH_INL_H_
+#if defined(LIB_JXL_BASE_FAST_MATH_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_BASE_FAST_MATH_INL_H_
+#undef LIB_JXL_BASE_FAST_MATH_INL_H_
 #else
-#define LIB_JXL_FAST_MATH_INL_H_
+#define LIB_JXL_BASE_FAST_MATH_INL_H_
 #endif
 
 #include <hwy/highway.h>
 
-#include "lib/jxl/common.h"
-#include "lib/jxl/rational_polynomial-inl.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/rational_polynomial-inl.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -216,11 +216,11 @@ V CubeRootAndAdd(const V x, const V add) {
 }  // namespace jxl
 HWY_AFTER_NAMESPACE();
 
-#endif  // LIB_JXL_FAST_MATH_INL_H_
+#endif  // LIB_JXL_BASE_FAST_MATH_INL_H_
 
 #if HWY_ONCE
-#ifndef FAST_MATH_ONCE
-#define FAST_MATH_ONCE
+#ifndef LIB_JXL_BASE_FAST_MATH_ONCE
+#define LIB_JXL_BASE_FAST_MATH_ONCE
 
 namespace jxl {
 inline float FastLog2f(float f) { return HWY_STATIC_DISPATCH(FastLog2f)(f); }
@@ -232,5 +232,5 @@ inline float FastCosf(float f) { return HWY_STATIC_DISPATCH(FastCosf)(f); }
 inline float FastErff(float f) { return HWY_STATIC_DISPATCH(FastErff)(f); }
 }  // namespace jxl
 
-#endif  // FAST_MATH_ONCE
+#endif  // LIB_JXL_BASE_FAST_MATH_ONCE
 #endif  // HWY_ONCE
diff --git a/lib/jxl/base/file_io.h b/lib/jxl/base/file_io.h
deleted file mode 100644 (file)
index 8c7777c..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_BASE_FILE_IO_H_
-#define LIB_JXL_BASE_FILE_IO_H_
-
-// Helper functions for reading/writing files.
-
-#include <stdio.h>
-#include <sys/stat.h>
-
-#include <list>
-#include <string>
-#include <vector>
-
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
-
-namespace jxl {
-
-// Returns extension including the dot, or empty string if none. Assumes
-// filename is not a hidden file (e.g. ".bashrc"). May be called with a pathname
-// if the filename contains a dot and/or no other path component does.
-static inline std::string Extension(const std::string& filename) {
-  const size_t pos = filename.rfind('.');
-  if (pos == std::string::npos) return std::string();
-  return filename.substr(pos);
-}
-
-// RAII, ensures files are closed even when returning early.
-class FileWrapper {
- public:
-  FileWrapper(const FileWrapper& other) = delete;
-  FileWrapper& operator=(const FileWrapper& other) = delete;
-
-  explicit FileWrapper(const std::string& pathname, const char* mode)
-      : file_(pathname == "-" ? (mode[0] == 'r' ? stdin : stdout)
-                              : fopen(pathname.c_str(), mode)),
-        close_on_delete_(pathname != "-") {
-#ifdef _WIN32
-    struct __stat64 s = {};
-    const int err = _stat64(pathname.c_str(), &s);
-    const bool is_file = (s.st_mode & S_IFREG) != 0;
-#else
-    struct stat s = {};
-    const int err = stat(pathname.c_str(), &s);
-    const bool is_file = S_ISREG(s.st_mode);
-#endif
-    if (err == 0 && is_file) {
-      size_ = s.st_size;
-    }
-  }
-
-  ~FileWrapper() {
-    if (file_ != nullptr && close_on_delete_) {
-      const int err = fclose(file_);
-      JXL_CHECK(err == 0);
-    }
-  }
-
-  // We intend to use FileWrapper as a replacement of FILE.
-  // NOLINTNEXTLINE(google-explicit-constructor)
-  operator FILE*() const { return file_; }
-
-  int64_t size() { return size_; }
-
- private:
-  FILE* const file_;
-  bool close_on_delete_ = true;
-  int64_t size_ = -1;
-};
-
-template <typename ContainerType>
-static inline Status ReadFile(const std::string& pathname,
-                              ContainerType* JXL_RESTRICT bytes) {
-  FileWrapper f(pathname, "rb");
-  if (f == nullptr)
-    return JXL_FAILURE("Failed to open file for reading: %s", pathname.c_str());
-
-  // Get size of file in bytes
-  const int64_t size = f.size();
-  if (size < 0) {
-    // Size is unknown, loop reading chunks until EOF.
-    bytes->clear();
-    std::list<std::vector<uint8_t>> chunks;
-
-    size_t total_size = 0;
-    while (true) {
-      std::vector<uint8_t> chunk(16 * 1024);
-      const size_t bytes_read = fread(chunk.data(), 1, chunk.size(), f);
-      if (ferror(f) || bytes_read > chunk.size()) {
-        return JXL_FAILURE("Error reading %s", pathname.c_str());
-      }
-
-      chunk.resize(bytes_read);
-      total_size += bytes_read;
-      if (bytes_read != 0) {
-        chunks.emplace_back(std::move(chunk));
-      }
-      if (feof(f)) {
-        break;
-      }
-    }
-    bytes->resize(total_size);
-    size_t pos = 0;
-    for (const auto& chunk : chunks) {
-      // Needed in case ContainerType is std::string, whose data() is const.
-      char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
-      memcpy(bytes_writable + pos, chunk.data(), chunk.size());
-      pos += chunk.size();
-    }
-  } else {
-    // Size is known, read the file directly.
-    bytes->resize(static_cast<size_t>(size));
-    size_t pos = 0;
-    while (pos < bytes->size()) {
-      // Needed in case ContainerType is std::string, whose data() is const.
-      char* bytes_writable = reinterpret_cast<char*>(&(*bytes)[0]);
-      const size_t bytes_read =
-          fread(bytes_writable + pos, 1, bytes->size() - pos, f);
-      if (bytes_read == 0) return JXL_FAILURE("Failed to read");
-      pos += bytes_read;
-    }
-    JXL_ASSERT(pos == bytes->size());
-  }
-  return true;
-}
-
-template <typename ContainerType>
-static inline Status WriteFile(const ContainerType& bytes,
-                               const std::string& pathname) {
-  FileWrapper f(pathname, "wb");
-  if (f == nullptr) return JXL_FAILURE("Failed to open file for writing");
-
-  size_t pos = 0;
-  while (pos < bytes.size()) {
-    const size_t bytes_written =
-        fwrite(bytes.data() + pos, 1, bytes.size() - pos, f);
-    if (bytes_written == 0) return JXL_FAILURE("Failed to write");
-    pos += bytes_written;
-  }
-  JXL_ASSERT(pos == bytes.size());
-
-  return true;
-}
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_BASE_FILE_IO_H_
diff --git a/lib/jxl/base/float.h b/lib/jxl/base/float.h
new file mode 100644 (file)
index 0000000..b17413f
--- /dev/null
@@ -0,0 +1,99 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_BASE_FLOAT_H_
+#define LIB_JXL_BASE_FLOAT_H_
+
+#include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+namespace {
+// Based on highway scalar implementation, for testing
+float LoadFloat16(uint16_t bits16) {
+  const uint32_t sign = bits16 >> 15;
+  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
+  const uint32_t mantissa = bits16 & 0x3FF;
+
+  // Subnormal or zero
+  if (biased_exp == 0) {
+    const float subnormal =
+        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
+    return sign ? -subnormal : subnormal;
+  }
+
+  // Normalized: convert the representation directly (faster than ldexp/tables).
+  const uint32_t biased_exp32 = biased_exp + (127 - 15);
+  const uint32_t mantissa32 = mantissa << (23 - 10);
+  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+
+  float result;
+  memcpy(&result, &bits32, 4);
+  return result;
+}
+}  // namespace
+
+template <typename SaveFloatAtFn>
+static Status JXL_INLINE LoadFloatRow(const uint8_t* src, size_t count,
+                                      size_t stride, JxlDataType type,
+                                      bool little_endian, float scale,
+                                      SaveFloatAtFn callback) {
+  switch (type) {
+    case JXL_TYPE_FLOAT:
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadLEFloat(src + stride * i));
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadBEFloat(src + stride * i));
+        }
+      }
+      return true;
+
+    case JXL_TYPE_UINT8:
+      for (size_t i = 0; i < count; ++i) {
+        callback(i, src[stride * i] * scale);
+      }
+      return true;
+
+    case JXL_TYPE_UINT16:
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadLE16(src + stride * i) * scale);
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadBE16(src + stride * i) * scale);
+        }
+      }
+      return true;
+
+    case JXL_TYPE_FLOAT16:
+      if (little_endian) {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadFloat16(LoadLE16(src + stride * i)));
+        }
+      } else {
+        for (size_t i = 0; i < count; ++i) {
+          callback(i, LoadFloat16(LoadBE16(src + stride * i)));
+        }
+      }
+      return true;
+
+    default:
+      return JXL_FAILURE("Unsupported sample format");
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_BASE_FLOAT_H_
diff --git a/lib/jxl/base/matrix_ops.h b/lib/jxl/base/matrix_ops.h
new file mode 100644 (file)
index 0000000..1a969bd
--- /dev/null
@@ -0,0 +1,84 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_MATRIX_OPS_H_
+#define LIB_JXL_MATRIX_OPS_H_
+
+// 3x3 matrix operations.
+
+#include <cmath>  // abs
+#include <cstddef>
+
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Computes C = A * B, where A, B, C are 3x3 matrices.
+template <typename T>
+void Mul3x3Matrix(const T* a, const T* b, T* c) {
+  alignas(16) T temp[3];  // For transposed column
+  for (size_t x = 0; x < 3; x++) {
+    for (size_t z = 0; z < 3; z++) {
+      temp[z] = b[z * 3 + x];
+    }
+    for (size_t y = 0; y < 3; y++) {
+      double e = 0;
+      for (size_t z = 0; z < 3; z++) {
+        e += a[y * 3 + z] * temp[z];
+      }
+      c[y * 3 + x] = e;
+    }
+  }
+}
+
+// Computes C = A * B, where A is 3x3 matrix and B is vector.
+template <typename T>
+void Mul3x3Vector(const T* a, const T* b, T* c) {
+  for (size_t y = 0; y < 3; y++) {
+    double e = 0;
+    for (size_t x = 0; x < 3; x++) {
+      e += a[y * 3 + x] * b[x];
+    }
+    c[y] = e;
+  }
+}
+
+// Inverts a 3x3 matrix in place.
+template <typename T>
+Status Inv3x3Matrix(T* matrix) {
+  // Intermediate computation is done in double precision.
+  double temp[9];
+  temp[0] = static_cast<double>(matrix[4]) * matrix[8] -
+            static_cast<double>(matrix[5]) * matrix[7];
+  temp[1] = static_cast<double>(matrix[2]) * matrix[7] -
+            static_cast<double>(matrix[1]) * matrix[8];
+  temp[2] = static_cast<double>(matrix[1]) * matrix[5] -
+            static_cast<double>(matrix[2]) * matrix[4];
+  temp[3] = static_cast<double>(matrix[5]) * matrix[6] -
+            static_cast<double>(matrix[3]) * matrix[8];
+  temp[4] = static_cast<double>(matrix[0]) * matrix[8] -
+            static_cast<double>(matrix[2]) * matrix[6];
+  temp[5] = static_cast<double>(matrix[2]) * matrix[3] -
+            static_cast<double>(matrix[0]) * matrix[5];
+  temp[6] = static_cast<double>(matrix[3]) * matrix[7] -
+            static_cast<double>(matrix[4]) * matrix[6];
+  temp[7] = static_cast<double>(matrix[1]) * matrix[6] -
+            static_cast<double>(matrix[0]) * matrix[7];
+  temp[8] = static_cast<double>(matrix[0]) * matrix[4] -
+            static_cast<double>(matrix[1]) * matrix[3];
+  double det = matrix[0] * temp[0] + matrix[1] * temp[3] + matrix[2] * temp[6];
+  if (std::abs(det) < 1e-10) {
+    return JXL_FAILURE("Matrix determinant is too close to 0");
+  }
+  double idet = 1.0 / det;
+  for (size_t i = 0; i < 9; i++) {
+    matrix[i] = temp[i] * idet;
+  }
+  return true;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_MATRIX_OPS_H_
diff --git a/lib/jxl/base/padded_bytes.cc b/lib/jxl/base/padded_bytes.cc
deleted file mode 100644 (file)
index 11e4bff..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/base/padded_bytes.h"
-
-namespace jxl {
-
-void PaddedBytes::IncreaseCapacityTo(size_t capacity) {
-  JXL_ASSERT(capacity > capacity_);
-
-  size_t new_capacity = std::max(capacity, 3 * capacity_ / 2);
-  new_capacity = std::max<size_t>(64, new_capacity);
-
-  // BitWriter writes up to 7 bytes past the end.
-  CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8);
-  if (new_data == nullptr) {
-    // Allocation failed, discard all data to ensure this is noticed.
-    size_ = capacity_ = 0;
-    return;
-  }
-
-  if (data_ == nullptr) {
-    // First allocation: ensure first byte is initialized (won't be copied).
-    new_data[0] = 0;
-  } else {
-    // Subsequent resize: copy existing data to new location.
-    memcpy(new_data.get(), data_.get(), size_);
-    // Ensure that the first new byte is initialized, to allow write_bits to
-    // safely append to the newly-resized PaddedBytes.
-    new_data[size_] = 0;
-  }
-
-  capacity_ = new_capacity;
-  std::swap(new_data, data_);
-}
-
-void PaddedBytes::assign(const uint8_t* new_begin, const uint8_t* new_end) {
-  JXL_DASSERT(new_begin <= new_end);
-  const size_t new_size = static_cast<size_t>(new_end - new_begin);
-
-  // memcpy requires non-overlapping ranges, and resizing might invalidate the
-  // new range. Neither happens if the new range is completely to the left or
-  // right of the _allocated_ range (irrespective of size_).
-  const uint8_t* allocated_end = begin() + capacity_;
-  const bool outside = new_end <= begin() || new_begin >= allocated_end;
-  if (outside) {
-    resize(new_size);  // grow or shrink
-    memcpy(data(), new_begin, new_size);
-    return;
-  }
-
-  // There is overlap. The new size cannot be larger because we own the memory
-  // and the new range cannot include anything outside the allocated range.
-  JXL_ASSERT(new_size <= capacity_);
-
-  // memmove allows overlap and capacity_ is sufficient.
-  memmove(data(), new_begin, new_size);
-  size_ = new_size;  // shrink
-}
-
-}  // namespace jxl
diff --git a/lib/jxl/base/profiler.h b/lib/jxl/base/profiler.h
deleted file mode 100644 (file)
index 13f95d2..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_BASE_PROFILER_H_
-#define LIB_JXL_BASE_PROFILER_H_
-
-// High precision, low overhead time measurements. Returns exact call counts and
-// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
-//
-// To use the profiler you must set the JPEGXL_ENABLE_PROFILER CMake flag, which
-// defines PROFILER_ENABLED and links against the libjxl_profiler library.
-
-// If zero, this file has no effect and no measurements will be recorded.
-#ifndef PROFILER_ENABLED
-#define PROFILER_ENABLED 0
-#endif  // PROFILER_ENABLED
-
-#if PROFILER_ENABLED
-
-#include "lib/profiler/profiler.h"
-
-#else  // !PROFILER_ENABLED
-
-#define PROFILER_ZONE(name)
-#define PROFILER_FUNC
-#define PROFILER_PRINT_RESULTS()
-
-#endif  // PROFILER_ENABLED
-
-#endif  // LIB_JXL_BASE_PROFILER_H_
diff --git a/lib/jxl/base/random.cc b/lib/jxl/base/random.cc
deleted file mode 100644 (file)
index 0fbe758..0000000
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/base/random.h"
-
-#include "lib/jxl/fast_math-inl.h"
-
-namespace jxl {
-
-Rng::GeometricDistribution::GeometricDistribution(float p)
-    : inv_log_1mp(1.0 / FastLog2f(1 - p)) {}
-
-uint32_t Rng::Geometric(const GeometricDistribution& dist) {
-  float f = UniformF(0, 1);
-  float log = FastLog2f(1 - f) * dist.inv_log_1mp;
-  return static_cast<uint32_t>(log);
-}
-
-}  // namespace jxl
index 663b88c..b27815b 100644 (file)
@@ -14,6 +14,7 @@
 #include <string.h>
 
 #include <algorithm>
+#include <cmath>
 
 #include "lib/jxl/base/status.h"
 
@@ -69,15 +70,18 @@ struct Rng {
   bool Bernoulli(float p) { return UniformF(0, 1) < p; }
 
   // State for geometric distributions.
-  struct GeometricDistribution {
-    explicit GeometricDistribution(float p);
-
-   private:
-    float inv_log_1mp;
-    friend struct Rng;
-  };
+  // The stored value is inv_log_1mp
+  using GeometricDistribution = float;
+  static GeometricDistribution MakeGeometric(float p) {
+    return 1.0 / std::log(1 - p);
+  }
 
-  uint32_t Geometric(const GeometricDistribution& dist);
+  uint32_t Geometric(const GeometricDistribution& dist) {
+    float f = UniformF(0, 1);
+    float inv_log_1mp = dist;
+    float log = std::log(1 - f) * inv_log_1mp;
+    return static_cast<uint32_t>(log);
+  }
 
   template <typename T>
   void Shuffle(T* t, size_t n) {
similarity index 89%
rename from lib/jxl/rational_polynomial-inl.h
rename to lib/jxl/base/rational_polynomial-inl.h
index 176e240..e073937 100644 (file)
@@ -5,11 +5,12 @@
 
 // Fast SIMD evaluation of rational polynomials for approximating functions.
 
-#if defined(LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_) == defined(HWY_TARGET_TOGGLE)
-#ifdef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
-#undef LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#if defined(LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_) == \
+    defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_
+#undef LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_
 #else
-#define LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#define LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_
 #endif
 
 #include <stddef.h>
@@ -87,6 +88,9 @@ HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
   if (kDegP >= 7) yp = MulAdd(yp, x, LoadDup128(d, p + ((kDegP - 7) * 4)));
   if (kDegQ >= 7) yq = MulAdd(yq, x, LoadDup128(d, q + ((kDegQ - 7) * 4)));
 
+  static_assert(kDegP < 8, "Polynomial degree is too high");
+  static_assert(kDegQ < 8, "Polynomial degree is too high");
+
   return FastDivision<T, V>()(yp, yq);
 }
 
@@ -95,4 +99,4 @@ HWY_INLINE HWY_MAYBE_UNUSED V EvalRationalPolynomial(const D d, const V x,
 }  // namespace HWY_NAMESPACE
 }  // namespace jxl
 HWY_AFTER_NAMESPACE();
-#endif  // LIB_JXL_RATIONAL_POLYNOMIAL_INL_H_
+#endif  // LIB_JXL_BASE_RATIONAL_POLYNOMIAL_INL_H_
index 41c3623..dc1c781 100644 (file)
@@ -9,7 +9,9 @@
 // Span (array view) is a non-owning container that provides cheap "cut"
 // operations and could be used as "ArrayLike" data source for PaddedBytes.
 
-#include <stddef.h>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
 
 #include "lib/jxl/base/status.h"
 
@@ -26,6 +28,12 @@ class Span {
   template <size_t N>
   explicit constexpr Span(T (&a)[N]) noexcept : Span(a, N) {}
 
+  template <typename U>
+  constexpr Span(U* array, size_t length) noexcept
+      : ptr_(reinterpret_cast<T*>(array)), len_(length) {
+    static_assert(sizeof(U) == sizeof(T), "Incompatible type of source.");
+  }
+
   template <typename ArrayLike>
   explicit constexpr Span(const ArrayLike& other) noexcept
       : Span(reinterpret_cast<T*>(other.data()), other.size()) {
@@ -39,6 +47,10 @@ class Span {
 
   constexpr bool empty() const noexcept { return len_ == 0; }
 
+  constexpr T* begin() const noexcept { return data(); }
+
+  constexpr T* end() const noexcept { return data() + size(); }
+
   constexpr T& operator[](size_t i) const noexcept {
     // MSVC 2015 accepts this as constexpr, but not ptr_[i]
     return *(data() + i);
@@ -50,11 +62,19 @@ class Span {
     len_ -= n;
   }
 
+  // NCT == non-const-T; compiler will complain if NCT is not compatible with T.
+  template <typename NCT>
+  void AppendTo(std::vector<NCT>* dst) const {
+    dst->insert(dst->end(), begin(), end());
+  }
+
  private:
   T* ptr_;
   size_t len_;
 };
 
+typedef Span<const uint8_t> Bytes;
+
 }  // namespace jxl
 
 #endif  // LIB_JXL_BASE_SPAN_H_
index 682f440..26390ad 100644 (file)
@@ -13,6 +13,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <type_traits>
+#include <utility>
+
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/sanitizer_definitions.h"
 
@@ -65,10 +68,10 @@ namespace jxl {
 #define JXL_DEBUG_V_LEVEL 0
 #endif  // JXL_DEBUG_V_LEVEL
 
-// Pass -DJXL_DEBUG_ON_ABORT=0 to disable the debug messages on JXL_ASSERT,
-// JXL_CHECK and JXL_ABORT.
+// Pass -DJXL_DEBUG_ON_ABORT={0,1} to force disable/enable the debug messages on
+// JXL_ASSERT, JXL_CHECK and JXL_ABORT.
 #ifndef JXL_DEBUG_ON_ABORT
-#define JXL_DEBUG_ON_ABORT 1
+#define JXL_DEBUG_ON_ABORT JXL_DEBUG_ON_ERROR
 #endif  // JXL_DEBUG_ON_ABORT
 
 // Print a debug message on standard error. You should use the JXL_DEBUG macro
@@ -94,12 +97,14 @@ inline JXL_NOINLINE bool Debug(const char* format, ...) {
 //   #ifndef JXL_DEBUG_MYMODULE
 //   #define JXL_DEBUG_MYMODULE 0
 //   #endif JXL_DEBUG_MYMODULE
-#define JXL_DEBUG(enabled, format, ...)                         \
-  do {                                                          \
-    if (enabled) {                                              \
-      ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, \
-                   ##__VA_ARGS__);                              \
-    }                                                           \
+#define JXL_DEBUG_TMP(format, ...) \
+  ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define JXL_DEBUG(enabled, format, ...)     \
+  do {                                      \
+    if (enabled) {                          \
+      JXL_DEBUG_TMP(format, ##__VA_ARGS__); \
+    }                                       \
   } while (0)
 
 // JXL_DEBUG version that prints the debug message if the global verbose level
@@ -146,6 +151,21 @@ JXL_NORETURN inline JXL_NOINLINE bool Abort() {
                                         __FILE__, __LINE__, ##__VA_ARGS__), \
    ::jxl::Abort())
 
+// Use this for code paths that are unreachable unless the code would change
+// to make it reachable, in which case it will print a warning and abort in
+// debug builds. In release builds no code is produced for this, so only use
+// this if this path is really unreachable.
+#define JXL_UNREACHABLE(format, ...)                                   \
+  do {                                                                 \
+    if (JXL_DEBUG_WARNING) {                                           \
+      ::jxl::Debug(("%s:%d: JXL_UNREACHABLE: " format "\n"), __FILE__, \
+                   __LINE__, ##__VA_ARGS__);                           \
+      ::jxl::Abort();                                                  \
+    } else {                                                           \
+      JXL_UNREACHABLE_BUILTIN;                                         \
+    }                                                                  \
+  } while (0)
+
 // Does not guarantee running the code, use only for debug mode checks.
 #if JXL_ENABLE_ASSERT
 #define JXL_ASSERT(condition)                                      \
@@ -297,6 +317,8 @@ class JXL_MUST_USE_RESULT Status {
   StatusCode code_;
 };
 
+static constexpr Status OkStatus() { return Status(StatusCode::kOk); }
+
 // Helper function to create a Status and print the debug message or abort when
 // needed.
 inline JXL_FORMAT(2, 3) Status
@@ -319,6 +341,91 @@ inline JXL_FORMAT(2, 3) Status
   return status;
 }
 
+template <typename T>
+class JXL_MUST_USE_RESULT StatusOr {
+  static_assert(!std::is_convertible<StatusCode, T>::value &&
+                    !std::is_convertible<T, StatusCode>::value,
+                "You cannot make a StatusOr with a type convertible from or to "
+                "StatusCode");
+  static_assert(std::is_move_constructible<T>::value &&
+                    std::is_move_assignable<T>::value,
+                "T must be move constructible and move assignable");
+
+ public:
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  StatusOr(StatusCode code) : code_(code) {
+    JXL_ASSERT(code_ != StatusCode::kOk);
+  }
+
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  StatusOr(Status status) : StatusOr(status.code()) {}
+
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  StatusOr(T&& value) : code_(StatusCode::kOk) {
+    new (&storage_.data_) T(std::move(value));
+  }
+
+  StatusOr(StatusOr&& other) noexcept {
+    if (other.ok()) {
+      new (&storage_.data_) T(std::move(other.storage_.data_));
+    }
+    code_ = other.code_;
+  }
+
+  StatusOr& operator=(StatusOr&& other) noexcept {
+    if (this == &other) return *this;
+    if (ok() && other.ok()) {
+      storage_.data_ = std::move(other.storage_.data_);
+    } else if (other.ok()) {
+      new (&storage_.data_) T(std::move(other.storage_.data_));
+    } else if (ok()) {
+      storage_.data_.~T();
+    }
+    code_ = other.code_;
+    return *this;
+  }
+
+  StatusOr(const StatusOr&) = delete;
+  StatusOr operator=(const StatusOr&) = delete;
+
+  bool ok() const { return code_ == StatusCode::kOk; }
+  Status status() const { return code_; }
+
+  // Only call this if you are absolutely sure that `ok()` is true.
+  // Ideally, never call this manually and rely on JXL_ASSIGN_OR_RETURN.
+  T value() && {
+    JXL_ASSERT(ok());
+    return std::move(storage_.data_);
+  }
+
+  ~StatusOr() {
+    if (code_ == StatusCode::kOk) {
+      storage_.data_.~T();
+    }
+  }
+
+ private:
+  union Storage {
+    char placeholder_;
+    T data_;
+    Storage() {}
+    ~Storage() {}
+  } storage_;
+
+  StatusCode code_;
+};
+
+#define JXL_ASSIGN_OR_RETURN(lhs, statusor) \
+  PRIVATE_JXL_ASSIGN_OR_RETURN_IMPL(        \
+      assign_or_return_temporary_variable##__LINE__, lhs, statusor)
+
+// NOLINTBEGIN(bugprone-macro-parentheses)
+#define PRIVATE_JXL_ASSIGN_OR_RETURN_IMPL(name, lhs, statusor) \
+  auto name = statusor;                                        \
+  JXL_RETURN_IF_ERROR(name.status());                          \
+  lhs = std::move(name).value();
+// NOLINTEND(bugprone-macro-parentheses)
+
 }  // namespace jxl
 
 #endif  // LIB_JXL_BASE_STATUS_H_
diff --git a/lib/jxl/base/thread_pool_internal.h b/lib/jxl/base/thread_pool_internal.h
deleted file mode 100644 (file)
index 6e23a33..0000000
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_BASE_THREAD_POOL_INTERNAL_H_
-#define LIB_JXL_BASE_THREAD_POOL_INTERNAL_H_
-
-#include <stddef.h>
-
-#include <cmath>
-
-#include "jxl/parallel_runner.h"
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/threads/thread_parallel_runner_internal.h"
-
-namespace jxl {
-
-// Helper class to pass an internal ThreadPool-like object using threads. This
-// is only suitable for tests or tools that access the internal API of JPEG XL.
-// In other cases the caller will provide a JxlParallelRunner() for handling
-// this. This class uses jpegxl::ThreadParallelRunner (from jpegxl_threads
-// library). For interface details check jpegxl::ThreadParallelRunner.
-class ThreadPoolInternal : public ThreadPool {
- public:
-  // Starts the given number of worker threads and blocks until they are ready.
-  // "num_worker_threads" defaults to one per hyperthread. If zero, all tasks
-  // run on the main thread.
-  explicit ThreadPoolInternal(
-      int num_worker_threads = std::thread::hardware_concurrency())
-      : ThreadPool(&jpegxl::ThreadParallelRunner::Runner,
-                   static_cast<void*>(&runner_)),
-        runner_(num_worker_threads) {}
-
-  ThreadPoolInternal(const ThreadPoolInternal&) = delete;
-  ThreadPoolInternal& operator&(const ThreadPoolInternal&) = delete;
-
-  size_t NumThreads() const { return runner_.NumThreads(); }
-  size_t NumWorkerThreads() const { return runner_.NumWorkerThreads(); }
-
-  template <class Func>
-  void RunOnEachThread(const Func& func) {
-    runner_.RunOnEachThread(func);
-  }
-
- private:
-  jpegxl::ThreadParallelRunner runner_;
-};
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_BASE_THREAD_POOL_INTERNAL_H_
index dbe93d4..22a2064 100644 (file)
@@ -9,16 +9,15 @@
 #include <array>
 #include <vector>
 
-#include "gtest/gtest.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -28,7 +27,7 @@ TEST(BitReaderTest, ExtendsWithZeroes) {
     std::vector<uint8_t> data(size, 0xff);
 
     for (size_t n_bytes = 0; n_bytes < size; n_bytes++) {
-      BitReader br(Span<const uint8_t>(data.data(), n_bytes));
+      BitReader br(Bytes(data.data(), n_bytes));
       // Read all the bits
       for (size_t i = 0; i < n_bytes * kBitsPerByte; i++) {
         ASSERT_EQ(br.ReadBits(1), 1u) << "n_bytes=" << n_bytes << " i=" << i;
@@ -52,7 +51,7 @@ struct Symbol {
 
 // Reading from output gives the same values.
 TEST(BitReaderTest, TestRoundTrip) {
-  ThreadPoolInternal pool(8);
+  test::ThreadPoolForTests pool(8);
   EXPECT_TRUE(RunOnPool(
       &pool, 0, 1000, ThreadPool::NoInit,
       [](const uint32_t task, size_t /* thread */) {
@@ -74,7 +73,7 @@ TEST(BitReaderTest, TestRoundTrip) {
         }
 
         writer.ZeroPadToByte();
-        ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+        allotment.ReclaimAndCharge(&writer, 0, nullptr);
         BitReader reader(writer.GetSpan());
         for (const Symbol& s : symbols) {
           EXPECT_EQ(s.value, reader.ReadBits(s.num_bits));
@@ -86,7 +85,7 @@ TEST(BitReaderTest, TestRoundTrip) {
 
 // SkipBits is the same as reading that many bits.
 TEST(BitReaderTest, TestSkip) {
-  ThreadPoolInternal pool(8);
+  test::ThreadPoolForTests pool(8);
   EXPECT_TRUE(RunOnPool(
       &pool, 0, 96, ThreadPool::NoInit,
       [](const uint32_t task, size_t /* thread */) {
@@ -110,7 +109,7 @@ TEST(BitReaderTest, TestSkip) {
           EXPECT_EQ(task + skip + 3, writer.BitsWritten());
           writer.ZeroPadToByte();
           AuxOut aux_out;
-          ReclaimAndCharge(&writer, &allotment, 0, &aux_out);
+          allotment.ReclaimAndCharge(&writer, 0, &aux_out);
           EXPECT_LT(aux_out.layers[0].total_bits, kSize * 8);
 
           BitReader reader1(writer.GetSpan());
@@ -159,7 +158,7 @@ TEST(BitReaderTest, TestOrder) {
     }
 
     writer.ZeroPadToByte();
-    ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
     BitReader reader(writer.GetSpan());
     EXPECT_EQ(0x1Fu, reader.ReadFixedBits<8>());
     EXPECT_EQ(0xFCu, reader.ReadFixedBits<8>());
@@ -174,7 +173,7 @@ TEST(BitReaderTest, TestOrder) {
     writer.Write(8, 0x3F);
 
     writer.ZeroPadToByte();
-    ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
     BitReader reader(writer.GetSpan());
     EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>());
     EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>());
@@ -188,7 +187,7 @@ TEST(BitReaderTest, TestOrder) {
     writer.Write(16, 0xF83F);
 
     writer.ZeroPadToByte();
-    ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
     BitReader reader(writer.GetSpan());
     EXPECT_EQ(0x3Fu, reader.ReadFixedBits<8>());
     EXPECT_EQ(0xF8u, reader.ReadFixedBits<8>());
@@ -205,7 +204,7 @@ TEST(BitReaderTest, TestOrder) {
     writer.Write(4, 8);
 
     writer.ZeroPadToByte();
-    ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+    allotment.ReclaimAndCharge(&writer, 0, nullptr);
     BitReader reader(writer.GetSpan());
     EXPECT_EQ(0xBDu, reader.ReadFixedBits<8>());
     EXPECT_EQ(0x8Du, reader.ReadFixedBits<8>());
@@ -215,7 +214,7 @@ TEST(BitReaderTest, TestOrder) {
 
 TEST(BitReaderTest, TotalCountersTest) {
   uint8_t buf[8] = {1, 2, 3, 4};
-  BitReader reader(Span<const uint8_t>(buf, sizeof(buf)));
+  BitReader reader(Bytes(buf, sizeof(buf)));
 
   EXPECT_EQ(sizeof(buf), reader.TotalBytes());
   EXPECT_EQ(0u, reader.TotalBitsConsumed());
@@ -241,7 +240,7 @@ TEST(BitReaderTest, MoveTest) {
   uint8_t buf[8] = {1, 2, 3, 4};
   BitReader reader2;
   {
-    BitReader reader1(Span<const uint8_t>(buf, sizeof(buf)));
+    BitReader reader1(Bytes(buf, sizeof(buf)));
 
     EXPECT_EQ(0u, reader1.TotalBitsConsumed());
     reader1.ReadFixedBits<16>();
index 699090c..bd7aa54 100644 (file)
@@ -5,7 +5,7 @@
 
 #include "lib/jxl/base/bits.h"
 
-#include "gtest/gtest.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
index ab37fda..291e3ba 100644 (file)
@@ -80,7 +80,7 @@ void PerformBlending(const float* const* bg, const float* const* fg,
     } else if (ec_blending[i].mode == PatchBlendMode::kNone) {
       if (xsize) memcpy(tmp.Row(3 + i), bg[3 + i] + x0, xsize * sizeof(**fg));
     } else {
-      JXL_ABORT("Unreachable");
+      JXL_UNREACHABLE("new PatchBlendMode?");
     }
   }
   size_t alpha = color_blending.alpha_channel;
@@ -142,7 +142,7 @@ void PerformBlending(const float* const* bg, const float* const* fg,
       memcpy(tmp.Row(p), bg[p] + x0, xsize * sizeof(**fg));
     }
   } else {
-    JXL_ABORT("Unreachable");
+    JXL_UNREACHABLE("new PatchBlendMode?");
   }
   for (size_t i = 0; i < 3 + num_ec; i++) {
     if (xsize != 0) memcpy(out[i] + x0, tmp.Row(i), xsize * sizeof(**out));
index e032b99..73d6f44 100644 (file)
@@ -3,10 +3,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <cstdint>
+#include <vector>
+
 #include "lib/extras/codec.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -14,22 +17,21 @@ namespace {
 using ::testing::SizeIs;
 
 TEST(BlendingTest, Crops) {
-  ThreadPool* pool = nullptr;
-
-  const PaddedBytes compressed =
-      ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+  const std::vector<uint8_t> compressed =
+      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
   CodecInOut decoded;
-  ASSERT_TRUE(test::DecodeFile({}, compressed, &decoded, pool));
+  ASSERT_TRUE(test::DecodeFile({}, Bytes(compressed), &decoded));
   ASSERT_THAT(decoded.frames, SizeIs(4));
 
   int i = 0;
   for (const ImageBundle& ib : decoded.frames) {
     std::ostringstream filename;
     filename << "jxl/blending/cropped_traffic_light_frame-" << i << ".png";
-    const PaddedBytes compressed_frame = ReadTestData(filename.str());
+    const std::vector<uint8_t> compressed_frame =
+        jxl::test::ReadTestData(filename.str());
     CodecInOut frame;
-    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(compressed_frame), &frame));
-    EXPECT_TRUE(SamePixels(ib.color(), *frame.Main().color()));
+    ASSERT_TRUE(SetFromBytes(Bytes(compressed_frame), &frame));
+    JXL_EXPECT_OK(SamePixels(ib.color(), *frame.Main().color(), _));
     ++i;
   }
 }
index 41d878c..6153360 100644 (file)
@@ -7,14 +7,13 @@
 #define LIB_JXL_BOX_CONTENT_DECODER_H_
 
 #include <brotli/decode.h>
+#include <jxl/decode.h>
 #include <stdint.h>
 #include <stdlib.h>
 
 #include <memory>
 #include <vector>
 
-#include "jxl/decode.h"
-
 namespace jxl {
 
 /** Outputs the contents of a box in a streaming fashion, either directly, or
index ee1a530..4011b1e 100644 (file)
 #include <new>
 #include <vector>
 
-#if PROFILER_ENABLED
-#include <chrono>
-#endif  // PROFILER_ENABLED
-
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "lib/jxl/butteraugli/butteraugli.cc"
 #include <hwy/foreach_target.h>
 
+#include "lib/jxl/base/fast_math-inl.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/convolve.h"
-#include "lib/jxl/fast_math-inl.h"
 #include "lib/jxl/gauss_blur.h"
 #include "lib/jxl/image_ops.h"
 
 
 namespace jxl {
 
+static const double wMfMalta = 37.0819870399;
+static const double norm1Mf = 130262059.556;
+static const double wMfMaltaX = 8246.75321353;
+static const double norm1MfX = 1009002.70582;
+static const double wHfMalta = 18.7237414387;
+static const double norm1Hf = 4498534.45232;
+static const double wHfMaltaX = 6923.99476109;
+static const double norm1HfX = 8051.15833247;
+static const double wUhfMalta = 1.10039032555;
+static const double norm1Uhf = 71.7800275169;
+static const double wUhfMaltaX = 173.5;
+static const double norm1UhfX = 5.0;
+static const double wmul[9] = {
+    400.0,         1.50815703118,  0,
+    2150.0,        10.6195433239,  16.2176043152,
+    29.2353797994, 0.844626970982, 0.703646627719,
+};
+
 std::vector<float> ComputeKernel(float sigma) {
   const float m = 2.25;  // Accuracy increases when m is increased.
   const double scaler = -1.0 / (2.0 * sigma * sigma);
@@ -89,7 +102,6 @@ void ConvolveBorderColumn(const ImageF& in, const std::vector<float>& kernel,
 void ConvolutionWithTranspose(const ImageF& in,
                               const std::vector<float>& kernel,
                               ImageF* BUTTERAUGLI_RESTRICT out) {
-  PROFILER_FUNC;
   JXL_CHECK(out->xsize() == in.ysize());
   JXL_CHECK(out->ysize() == in.xsize());
   const size_t len = kernel.size();
@@ -109,7 +121,6 @@ void ConvolutionWithTranspose(const ImageF& in,
   // middle
   switch (len) {
     case 7: {
-      PROFILER_ZONE("conv7");
       const float sk0 = scaled_kernel[0];
       const float sk1 = scaled_kernel[1];
       const float sk2 = scaled_kernel[2];
@@ -127,7 +138,6 @@ void ConvolutionWithTranspose(const ImageF& in,
       }
     } break;
     case 13: {
-      PROFILER_ZONE("conv15");
       for (size_t y = 0; y < in.ysize(); ++y) {
         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
         for (size_t x = border1; x < border2; ++x, ++row_in) {
@@ -145,7 +155,6 @@ void ConvolutionWithTranspose(const ImageF& in,
       break;
     }
     case 15: {
-      PROFILER_ZONE("conv15");
       for (size_t y = 0; y < in.ysize(); ++y) {
         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
         for (size_t x = border1; x < border2; ++x, ++row_in) {
@@ -164,7 +173,6 @@ void ConvolutionWithTranspose(const ImageF& in,
       break;
     }
     case 33: {
-      PROFILER_ZONE("conv33");
       for (size_t y = 0; y < in.ysize(); ++y) {
         const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y) + border1 - offset;
         for (size_t x = border1; x < border2; ++x, ++row_in) {
@@ -192,23 +200,7 @@ void ConvolutionWithTranspose(const ImageF& in,
       break;
     }
     default:
-      printf("Warning: Unexpected kernel size! %" PRIuS "\n", len);
-      for (size_t y = 0; y < in.ysize(); ++y) {
-        const float* BUTTERAUGLI_RESTRICT row_in = in.Row(y);
-        for (size_t x = border1; x < border2; ++x) {
-          const int d = x - offset;
-          float* BUTTERAUGLI_RESTRICT row_out = out->Row(x);
-          float sum = 0.0f;
-          size_t j;
-          for (j = 0; j <= len / 2; ++j) {
-            sum += row_in[d + j] * scaled_kernel[j];
-          }
-          for (; j < len; ++j) {
-            sum += row_in[d + j] * scaled_kernel[len - 1 - j];
-          }
-          row_out[y] = sum;
-        }
-      }
+      JXL_UNREACHABLE("Kernel size %" PRIuS " not implemented", len);
   }
   // left border
   for (size_t x = 0; x < border1; ++x) {
@@ -322,95 +314,118 @@ template <class D, class V>
 HWY_INLINE void XybLowFreqToVals(const D d, const V& x, const V& y,
                                  const V& b_arg, V* HWY_RESTRICT valx,
                                  V* HWY_RESTRICT valy, V* HWY_RESTRICT valb) {
-  static const double xmuli = 32.2217497012;
-  static const double ymuli = 13.7697791434;
-  static const double bmuli = 47.504615728;
-  static const double y_to_b_muli = -0.362267051518;
-  const V xmul = Set(d, xmuli);
-  const V ymul = Set(d, ymuli);
-  const V bmul = Set(d, bmuli);
-  const V y_to_b_mul = Set(d, y_to_b_muli);
+  static const double xmul_scalar = 33.832837186260;
+  static const double ymul_scalar = 14.458268100570;
+  static const double bmul_scalar = 49.87984651440;
+  static const double y_to_b_mul_scalar = -0.362267051518;
+  const V xmul = Set(d, xmul_scalar);
+  const V ymul = Set(d, ymul_scalar);
+  const V bmul = Set(d, bmul_scalar);
+  const V y_to_b_mul = Set(d, y_to_b_mul_scalar);
   const V b = MulAdd(y_to_b_mul, y, b_arg);
   *valb = Mul(b, bmul);
   *valx = Mul(x, xmul);
   *valy = Mul(y, ymul);
 }
 
-void SuppressXByY(const ImageF& in_x, const ImageF& in_y, const double yw,
-                  ImageF* HWY_RESTRICT out) {
-  JXL_DASSERT(SameSize(in_x, in_y) && SameSize(in_x, *out));
-  const size_t xsize = in_x.xsize();
-  const size_t ysize = in_x.ysize();
+void XybLowFreqToVals(Image3F* xyb_lf) {
+  // Modify range around zero code only concerns the high frequency
+  // planes and only the X and Y channels.
+  // Convert low freq xyb to vals space so that we can do a simple squared sum
+  // diff on the low frequencies later.
+  const HWY_FULL(float) d;
+  for (size_t y = 0; y < xyb_lf->ysize(); ++y) {
+    float* BUTTERAUGLI_RESTRICT row_x = xyb_lf->PlaneRow(0, y);
+    float* BUTTERAUGLI_RESTRICT row_y = xyb_lf->PlaneRow(1, y);
+    float* BUTTERAUGLI_RESTRICT row_b = xyb_lf->PlaneRow(2, y);
+    for (size_t x = 0; x < xyb_lf->xsize(); x += Lanes(d)) {
+      auto valx = Undefined(d);
+      auto valy = Undefined(d);
+      auto valb = Undefined(d);
+      XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
+                       Load(d, row_b + x), &valx, &valy, &valb);
+      Store(valx, d, row_x + x);
+      Store(valy, d, row_y + x);
+      Store(valb, d, row_b + x);
+    }
+  }
+}
 
+void SuppressXByY(const ImageF& in_y, ImageF* HWY_RESTRICT inout_x) {
+  JXL_DASSERT(SameSize(*inout_x, in_y));
+  const size_t xsize = in_y.xsize();
+  const size_t ysize = in_y.ysize();
   const HWY_FULL(float) d;
+  static const double suppress = 46.0;
   static const double s = 0.653020556257;
   const auto sv = Set(d, s);
   const auto one_minus_s = Set(d, 1.0 - s);
-  const auto ywv = Set(d, yw);
+  const auto ywv = Set(d, suppress);
 
   for (size_t y = 0; y < ysize; ++y) {
-    const float* HWY_RESTRICT row_x = in_x.ConstRow(y);
     const float* HWY_RESTRICT row_y = in_y.ConstRow(y);
-    float* HWY_RESTRICT row_out = out->Row(y);
-
+    float* HWY_RESTRICT row_x = inout_x->Row(y);
     for (size_t x = 0; x < xsize; x += Lanes(d)) {
       const auto vx = Load(d, row_x + x);
       const auto vy = Load(d, row_y + x);
       const auto scaler =
           MulAdd(Div(ywv, MulAdd(vy, vy, ywv)), one_minus_s, sv);
-      Store(Mul(scaler, vx), d, row_out + x);
+      Store(Mul(scaler, vx), d, row_x + x);
     }
   }
 }
 
-static void SeparateFrequencies(size_t xsize, size_t ysize,
-                                const ButteraugliParams& params,
-                                BlurTemp* blur_temp, const Image3F& xyb,
-                                PsychoImage& ps) {
-  PROFILER_FUNC;
+void Subtract(const ImageF& a, const ImageF& b, ImageF* c) {
   const HWY_FULL(float) d;
+  for (size_t y = 0; y < a.ysize(); ++y) {
+    const float* row_a = a.ConstRow(y);
+    const float* row_b = b.ConstRow(y);
+    float* row_c = c->Row(y);
+    for (size_t x = 0; x < a.xsize(); x += Lanes(d)) {
+      Store(Sub(Load(d, row_a + x), Load(d, row_b + x)), d, row_c + x);
+    }
+  }
+}
 
-  // Extract lf ...
+void SeparateLFAndMF(const ButteraugliParams& params, const Image3F& xyb,
+                     Image3F* lf, Image3F* mf, BlurTemp* blur_temp) {
   static const double kSigmaLf = 7.15593339443;
-  static const double kSigmaHf = 3.22489901262;
-  static const double kSigmaUhf = 1.56416327805;
-  ps.mf = Image3F(xsize, ysize);
-  ps.hf[0] = ImageF(xsize, ysize);
-  ps.hf[1] = ImageF(xsize, ysize);
-  ps.lf = Image3F(xyb.xsize(), xyb.ysize());
-  ps.mf = Image3F(xyb.xsize(), xyb.ysize());
   for (int i = 0; i < 3; ++i) {
-    Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &ps.lf.Plane(i));
-
+    // Extract lf ...
+    Blur(xyb.Plane(i), kSigmaLf, params, blur_temp, &lf->Plane(i));
     // ... and keep everything else in mf.
-    for (size_t y = 0; y < ysize; ++y) {
-      const float* BUTTERAUGLI_RESTRICT row_xyb = xyb.PlaneRow(i, y);
-      const float* BUTTERAUGLI_RESTRICT row_lf = ps.lf.ConstPlaneRow(i, y);
-      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
-      for (size_t x = 0; x < xsize; x += Lanes(d)) {
-        const auto mf = Sub(Load(d, row_xyb + x), Load(d, row_lf + x));
-        Store(mf, d, row_mf + x);
-      }
-    }
+    Subtract(xyb.Plane(i), lf->Plane(i), &mf->Plane(i));
+  }
+  XybLowFreqToVals(lf);
+}
+
+void SeparateMFAndHF(const ButteraugliParams& params, Image3F* mf, ImageF* hf,
+                     BlurTemp* blur_temp) {
+  const HWY_FULL(float) d;
+  static const double kSigmaHf = 3.22489901262;
+  const size_t xsize = mf->xsize();
+  const size_t ysize = mf->ysize();
+  hf[0] = ImageF(xsize, ysize);
+  hf[1] = ImageF(xsize, ysize);
+  for (int i = 0; i < 3; ++i) {
     if (i == 2) {
-      Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
+      Blur(mf->Plane(i), kSigmaHf, params, blur_temp, &mf->Plane(i));
       break;
     }
-    // Divide mf into mf and hf.
     for (size_t y = 0; y < ysize; ++y) {
-      float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(i, y);
-      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
+      float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(i, y);
+      float* BUTTERAUGLI_RESTRICT row_hf = hf[i].Row(y);
       for (size_t x = 0; x < xsize; x += Lanes(d)) {
         Store(Load(d, row_mf + x), d, row_hf + x);
       }
     }
-    Blur(ps.mf.Plane(i), kSigmaHf, params, blur_temp, &ps.mf.Plane(i));
+    Blur(mf->Plane(i), kSigmaHf, params, blur_temp, &mf->Plane(i));
     static const double kRemoveMfRange = 0.29;
     static const double kAddMfRange = 0.1;
     if (i == 0) {
       for (size_t y = 0; y < ysize; ++y) {
-        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(0, y);
-        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(0, y);
+        float* BUTTERAUGLI_RESTRICT row_hf = hf[0].Row(y);
         for (size_t x = 0; x < xsize; x += Lanes(d)) {
           auto mf = Load(d, row_mf + x);
           auto hf = Sub(Load(d, row_hf + x), mf);
@@ -421,8 +436,8 @@ static void SeparateFrequencies(size_t xsize, size_t ysize,
       }
     } else {
       for (size_t y = 0; y < ysize; ++y) {
-        float* BUTTERAUGLI_RESTRICT row_mf = ps.mf.PlaneRow(1, y);
-        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_mf = mf->PlaneRow(1, y);
+        float* BUTTERAUGLI_RESTRICT row_hf = hf[1].Row(y);
         for (size_t x = 0; x < xsize; x += Lanes(d)) {
           auto mf = Load(d, row_mf + x);
           auto hf = Sub(Load(d, row_hf + x), mf);
@@ -434,27 +449,28 @@ static void SeparateFrequencies(size_t xsize, size_t ysize,
       }
     }
   }
-
-  // Temporarily used as output of SuppressXByY
-  ps.uhf[0] = ImageF(xsize, ysize);
-  ps.uhf[1] = ImageF(xsize, ysize);
-
   // Suppress red-green by intensity change in the high freq channels.
-  static const double suppress = 46.0;
-  SuppressXByY(ps.hf[0], ps.hf[1], suppress, &ps.uhf[0]);
-  // hf is the SuppressXByY output, uhf will be written below.
-  ps.hf[0].Swap(ps.uhf[0]);
+  SuppressXByY(hf[1], &hf[0]);
+}
 
+void SeparateHFAndUHF(const ButteraugliParams& params, ImageF* hf, ImageF* uhf,
+                      BlurTemp* blur_temp) {
+  const HWY_FULL(float) d;
+  const size_t xsize = hf[0].xsize();
+  const size_t ysize = hf[0].ysize();
+  static const double kSigmaUhf = 1.56416327805;
+  uhf[0] = ImageF(xsize, ysize);
+  uhf[1] = ImageF(xsize, ysize);
   for (int i = 0; i < 2; ++i) {
     // Divide hf into hf and uhf.
     for (size_t y = 0; y < ysize; ++y) {
-      float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[i].Row(y);
-      float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[i].Row(y);
+      float* BUTTERAUGLI_RESTRICT row_uhf = uhf[i].Row(y);
+      float* BUTTERAUGLI_RESTRICT row_hf = hf[i].Row(y);
       for (size_t x = 0; x < xsize; ++x) {
         row_uhf[x] = row_hf[x];
       }
     }
-    Blur(ps.hf[i], kSigmaUhf, params, blur_temp, &ps.hf[i]);
+    Blur(hf[i], kSigmaUhf, params, blur_temp, &hf[i]);
     static const double kRemoveHfRange = 1.5;
     static const double kAddHfRange = 0.132;
     static const double kRemoveUhfRange = 0.04;
@@ -464,8 +480,8 @@ static void SeparateFrequencies(size_t xsize, size_t ysize,
     static double kMulYUhf = 2.69313763794;
     if (i == 0) {
       for (size_t y = 0; y < ysize; ++y) {
-        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[0].Row(y);
-        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[0].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_uhf = uhf[0].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_hf = hf[0].Row(y);
         for (size_t x = 0; x < xsize; x += Lanes(d)) {
           auto hf = Load(d, row_hf + x);
           auto uhf = Sub(Load(d, row_uhf + x), hf);
@@ -477,8 +493,8 @@ static void SeparateFrequencies(size_t xsize, size_t ysize,
       }
     } else {
       for (size_t y = 0; y < ysize; ++y) {
-        float* BUTTERAUGLI_RESTRICT row_uhf = ps.uhf[1].Row(y);
-        float* BUTTERAUGLI_RESTRICT row_hf = ps.hf[1].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_uhf = uhf[1].Row(y);
+        float* BUTTERAUGLI_RESTRICT row_hf = hf[1].Row(y);
         for (size_t x = 0; x < xsize; x += Lanes(d)) {
           auto hf = Load(d, row_hf + x);
           hf = MaximumClamp(d, hf, kMaxclampHf);
@@ -495,27 +511,26 @@ static void SeparateFrequencies(size_t xsize, size_t ysize,
       }
     }
   }
-  // Modify range around zero code only concerns the high frequency
-  // planes and only the X and Y channels.
-  // Convert low freq xyb to vals space so that we can do a simple squared sum
-  // diff on the low frequencies later.
-  for (size_t y = 0; y < ysize; ++y) {
-    float* BUTTERAUGLI_RESTRICT row_x = ps.lf.PlaneRow(0, y);
-    float* BUTTERAUGLI_RESTRICT row_y = ps.lf.PlaneRow(1, y);
-    float* BUTTERAUGLI_RESTRICT row_b = ps.lf.PlaneRow(2, y);
-    for (size_t x = 0; x < xsize; x += Lanes(d)) {
-      auto valx = Undefined(d);
-      auto valy = Undefined(d);
-      auto valb = Undefined(d);
-      XybLowFreqToVals(d, Load(d, row_x + x), Load(d, row_y + x),
-                       Load(d, row_b + x), &valx, &valy, &valb);
-      Store(valx, d, row_x + x);
-      Store(valy, d, row_y + x);
-      Store(valb, d, row_b + x);
-    }
+}
+
+void DeallocateHFAndUHF(ImageF* hf, ImageF* uhf) {
+  for (int i = 0; i < 2; ++i) {
+    hf[i] = ImageF();
+    uhf[i] = ImageF();
   }
 }
 
+static void SeparateFrequencies(size_t xsize, size_t ysize,
+                                const ButteraugliParams& params,
+                                BlurTemp* blur_temp, const Image3F& xyb,
+                                PsychoImage& ps) {
+  ps.lf = Image3F(xyb.xsize(), xyb.ysize());
+  ps.mf = Image3F(xyb.xsize(), xyb.ysize());
+  SeparateLFAndMF(params, xyb, &ps.lf, &ps.mf, blur_temp);
+  SeparateMFAndHF(params, &ps.mf, &ps.hf[0], blur_temp);
+  SeparateHFAndUHF(params, &ps.hf[0], &ps.uhf[0], blur_temp);
+}
+
 namespace {
 template <typename V>
 BUTTERAUGLI_INLINE V Sum(V a, V b, V c, V d) {
@@ -919,7 +934,6 @@ static BUTTERAUGLI_INLINE float PaddedMaltaUnit(const ImageF& diffs,
     return GetLane(MaltaUnit(Tag(), df, d, diffs.PixelsPerRow()));
   }
 
-  PROFILER_ZONE("Padded Malta");
   float borderimage[12 * 9];  // round up to 4
   for (int dy = 0; dy < 9; ++dy) {
     int y = y0 + dy - 4;
@@ -949,7 +963,7 @@ static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
                           const double w_0gt1, const double w_0lt1,
                           const double norm1, const double len,
                           const double mulli, ImageF* HWY_RESTRICT diffs,
-                          Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+                          ImageF* HWY_RESTRICT block_diff_ac) {
   JXL_DASSERT(SameSize(lum0, lum1) && SameSize(lum0, *diffs));
   const size_t xsize_ = lum0.xsize();
   const size_t ysize_ = lum0.ysize();
@@ -1004,7 +1018,7 @@ static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
   size_t y0 = 0;
   // Top
   for (; y0 < 4; ++y0) {
-    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
     for (size_t x0 = 0; x0 < xsize_; ++x0) {
       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
     }
@@ -1017,7 +1031,7 @@ static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
   // Middle
   for (; y0 < ysize_ - 4; ++y0) {
     const float* BUTTERAUGLI_RESTRICT row_in = diffs->ConstRow(y0);
-    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
     size_t x0 = 0;
     for (; x0 < aligned_x; ++x0) {
       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
@@ -1035,7 +1049,7 @@ static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
 
   // Bottom
   for (; y0 < ysize_; ++y0) {
-    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->PlaneRow(c, y0);
+    float* BUTTERAUGLI_RESTRICT row_diff = block_diff_ac->Row(y0);
     for (size_t x0 = 0; x0 < xsize_; ++x0) {
       row_diff[x0] += PaddedMaltaUnit<Tag>(*diffs, x0, y0);
     }
@@ -1044,23 +1058,52 @@ static void MaltaDiffMapT(const Tag tag, const ImageF& lum0, const ImageF& lum1,
 
 // Need non-template wrapper functions for HWY_EXPORT.
 void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
-                  const double w_0lt1, const double norm1, const double len,
-                  const double mulli, ImageF* HWY_RESTRICT diffs,
-                  Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+                  const double w_0lt1, const double norm1,
+                  ImageF* HWY_RESTRICT diffs,
+                  ImageF* HWY_RESTRICT block_diff_ac) {
+  const double len = 3.75;
+  static const double mulli = 0.39905817637;
   MaltaDiffMapT(MaltaTag(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
-                diffs, block_diff_ac, c);
+                diffs, block_diff_ac);
 }
 
 void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
-                    const double w_0lt1, const double norm1, const double len,
-                    const double mulli, ImageF* HWY_RESTRICT diffs,
-                    Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
+                    const double w_0lt1, const double norm1,
+                    ImageF* HWY_RESTRICT diffs,
+                    ImageF* HWY_RESTRICT block_diff_ac) {
+  const double len = 3.75;
+  static const double mulli = 0.611612573796;
   MaltaDiffMapT(MaltaTagLF(), lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli,
-                diffs, block_diff_ac, c);
+                diffs, block_diff_ac);
+}
+
+void CombineChannelsForMasking(const ImageF* hf, const ImageF* uhf,
+                               ImageF* out) {
+  // Only X and Y components are involved in masking. B's influence
+  // is considered less important in the high frequency area, and we
+  // don't model masking from lower frequency signals.
+  static const float muls[3] = {
+      2.5f,
+      0.4f,
+      0.4f,
+  };
+  // Silly and unoptimized approach here. TODO(jyrki): rework this.
+  for (size_t y = 0; y < hf[0].ysize(); ++y) {
+    const float* BUTTERAUGLI_RESTRICT row_y_hf = hf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_y_uhf = uhf[1].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_x_hf = hf[0].Row(y);
+    const float* BUTTERAUGLI_RESTRICT row_x_uhf = uhf[0].Row(y);
+    float* BUTTERAUGLI_RESTRICT row = out->Row(y);
+    for (size_t x = 0; x < hf[0].xsize(); ++x) {
+      float xdiff = (row_x_uhf[x] + row_x_hf[x]) * muls[0];
+      float ydiff = row_y_uhf[x] * muls[1] + row_y_hf[x] * muls[2];
+      row[x] = xdiff * xdiff + ydiff * ydiff;
+      row[x] = sqrt(row[x]);
+    }
+  }
 }
 
 void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
-  PROFILER_FUNC;
   const size_t xsize = xyb.xsize();
   const size_t ysize = xyb.ysize();
   const float bias = mul * bias_arg;
@@ -1078,7 +1121,7 @@ void DiffPrecompute(const ImageF& xyb, float mul, float bias_arg, ImageF* out) {
 // std::log(80.0) / std::log(255.0);
 constexpr float kIntensityTargetNormalizationHack = 0.79079917404f;
 static const float kInternalGoodQualityThreshold =
-    17.8f * kIntensityTargetNormalizationHack;
+    17.83f * kIntensityTargetNormalizationHack;
 static const float kGlobalScale = 1.0 / kInternalGoodQualityThreshold;
 
 void StoreMin3(const float v, float& min0, float& min1, float& min2) {
@@ -1150,10 +1193,6 @@ void Mask(const ImageF& mask0, const ImageF& mask1,
           const ButteraugliParams& params, BlurTemp* blur_temp,
           ImageF* BUTTERAUGLI_RESTRICT mask,
           ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
-  // Only X and Y components are involved in masking. B's influence
-  // is considered less important in the high frequency area, and we
-  // don't model masking from lower frequency signals.
-  PROFILER_FUNC;
   const size_t xsize = mask0.xsize();
   const size_t ysize = mask0.ysize();
   *mask = ImageF(xsize, ysize);
@@ -1169,7 +1208,6 @@ void Mask(const ImageF& mask0, const ImageF& mask1,
   Blur(diff0, kRadius, params, blur_temp, &blurred0);
   FuzzyErosion(blurred0, &diff0);
   Blur(diff1, kRadius, params, blur_temp, &blurred1);
-  FuzzyErosion(blurred1, &diff1);
   for (size_t y = 0; y < ysize; ++y) {
     for (size_t x = 0; x < xsize; ++x) {
       mask->Row(y)[x] = diff0.Row(y)[x];
@@ -1185,39 +1223,13 @@ void Mask(const ImageF& mask0, const ImageF& mask1,
 // `diff_ac` may be null.
 void MaskPsychoImage(const PsychoImage& pi0, const PsychoImage& pi1,
                      const size_t xsize, const size_t ysize,
-                     const ButteraugliParams& params, Image3F* temp,
-                     BlurTemp* blur_temp, ImageF* BUTTERAUGLI_RESTRICT mask,
+                     const ButteraugliParams& params, BlurTemp* blur_temp,
+                     ImageF* BUTTERAUGLI_RESTRICT mask,
                      ImageF* BUTTERAUGLI_RESTRICT diff_ac) {
   ImageF mask0(xsize, ysize);
   ImageF mask1(xsize, ysize);
-  static const float muls[3] = {
-      2.5f,
-      0.4f,
-      0.4f,
-  };
-  // Silly and unoptimized approach here. TODO(jyrki): rework this.
-  for (size_t y = 0; y < ysize; ++y) {
-    const float* BUTTERAUGLI_RESTRICT row_y_hf0 = pi0.hf[1].Row(y);
-    const float* BUTTERAUGLI_RESTRICT row_y_hf1 = pi1.hf[1].Row(y);
-    const float* BUTTERAUGLI_RESTRICT row_y_uhf0 = pi0.uhf[1].Row(y);
-    const float* BUTTERAUGLI_RESTRICT row_y_uhf1 = pi1.uhf[1].Row(y);
-    const float* BUTTERAUGLI_RESTRICT row_x_hf0 = pi0.hf[0].Row(y);
-    const float* BUTTERAUGLI_RESTRICT row_x_hf1 = pi1.hf[0].Row(y);
-    const float* BUTTERAUGLI_RESTRICT row_x_uhf0 = pi0.uhf[0].Row(y);
-    const float* BUTTERAUGLI_RESTRICT row_x_uhf1 = pi1.uhf[0].Row(y);
-    float* BUTTERAUGLI_RESTRICT row0 = mask0.Row(y);
-    float* BUTTERAUGLI_RESTRICT row1 = mask1.Row(y);
-    for (size_t x = 0; x < xsize; ++x) {
-      float xdiff0 = (row_x_uhf0[x] + row_x_hf0[x]) * muls[0];
-      float xdiff1 = (row_x_uhf1[x] + row_x_hf1[x]) * muls[0];
-      float ydiff0 = row_y_uhf0[x] * muls[1] + row_y_hf0[x] * muls[2];
-      float ydiff1 = row_y_uhf1[x] * muls[1] + row_y_hf1[x] * muls[2];
-      row0[x] = xdiff0 * xdiff0 + ydiff0 * ydiff0;
-      row0[x] = sqrt(row0[x]);
-      row1[x] = xdiff1 * xdiff1 + ydiff1 * ydiff1;
-      row1[x] = sqrt(row1[x]);
-    }
-  }
+  CombineChannelsForMasking(&pi0.hf[0], &pi0.uhf[0], &mask0);
+  CombineChannelsForMasking(&pi1.hf[0], &pi1.uhf[0], &mask1);
   Mask(mask0, mask1, params, blur_temp, mask, diff_ac);
 }
 
@@ -1247,7 +1259,6 @@ inline float MaskColor(const float color[3], const float mask) {
 void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc,
                               const Image3F& block_diff_ac, float xmul,
                               ImageF* result) {
-  PROFILER_FUNC;
   JXL_CHECK(SameSize(mask, *result));
   size_t xsize = mask.xsize();
   size_t ysize = mask.ysize();
@@ -1273,7 +1284,7 @@ void CombineChannelsToDiffmap(const ImageF& mask, const Image3F& block_diff_dc,
 
 // Adds weighted L2 difference between i0 and i1 to diffmap.
 static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
-                   Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+                   ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   if (w == 0) return;
 
   const HWY_FULL(float) d;
@@ -1282,7 +1293,7 @@ static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
   for (size_t y = 0; y < i0.ysize(); ++y) {
     const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
     const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
-    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
+    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
 
     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
       const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
@@ -1295,7 +1306,7 @@ static void L2Diff(const ImageF& i0, const ImageF& i1, const float w,
 
 // Initializes diffmap to the weighted L2 difference between i0 and i1.
 static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
-                      Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+                      ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   if (w == 0) return;
 
   const HWY_FULL(float) d;
@@ -1304,7 +1315,7 @@ static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
   for (size_t y = 0; y < i0.ysize(); ++y) {
     const float* BUTTERAUGLI_RESTRICT row0 = i0.ConstRow(y);
     const float* BUTTERAUGLI_RESTRICT row1 = i1.ConstRow(y);
-    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
+    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
 
     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
       const auto diff = Sub(Load(d, row0 + x), Load(d, row1 + x));
@@ -1318,7 +1329,7 @@ static void SetL2Diff(const ImageF& i0, const ImageF& i1, const float w,
 // i1 is the deformed copy.
 static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
                              float w_0lt1,
-                             Image3F* BUTTERAUGLI_RESTRICT diffmap, size_t c) {
+                             ImageF* BUTTERAUGLI_RESTRICT diffmap) {
   if (w_0gt1 == 0 && w_0lt1 == 0) {
     return;
   }
@@ -1330,7 +1341,7 @@ static void L2DiffAsymmetric(const ImageF& i0, const ImageF& i1, float w_0gt1,
   for (size_t y = 0; y < i0.ysize(); ++y) {
     const float* BUTTERAUGLI_RESTRICT row0 = i0.Row(y);
     const float* BUTTERAUGLI_RESTRICT row1 = i1.Row(y);
-    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->PlaneRow(c, y);
+    float* BUTTERAUGLI_RESTRICT row_diff = diffmap->Row(y);
 
     for (size_t x = 0; x < i0.xsize(); x += Lanes(d)) {
       const auto val0 = Load(d, row0 + x);
@@ -1420,10 +1431,8 @@ BUTTERAUGLI_INLINE void OpsinAbsorbance(const DF df, const V& in0, const V& in1,
 }
 
 // `blurred` is a temporary image used inside this function and not returned.
-Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
-                           Image3F* blurred, BlurTemp* blur_temp) {
-  PROFILER_FUNC;
-  Image3F xyb(rgb.xsize(), rgb.ysize());
+void OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
+                        Image3F* blurred, BlurTemp* blur_temp, Image3F* xyb) {
   const double kSigma = 1.2;
   Blur(rgb.Plane(0), kSigma, params, blur_temp, &blurred->Plane(0));
   Blur(rgb.Plane(1), kSigma, params, blur_temp, &blurred->Plane(1));
@@ -1431,18 +1440,15 @@ Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
   const HWY_FULL(float) df;
   const auto intensity_target_multiplier = Set(df, params.intensity_target);
   for (size_t y = 0; y < rgb.ysize(); ++y) {
-    const float* BUTTERAUGLI_RESTRICT row_r = rgb.ConstPlaneRow(0, y);
-    const float* BUTTERAUGLI_RESTRICT row_g = rgb.ConstPlaneRow(1, y);
-    const float* BUTTERAUGLI_RESTRICT row_b = rgb.ConstPlaneRow(2, y);
-    const float* BUTTERAUGLI_RESTRICT row_blurred_r =
-        blurred->ConstPlaneRow(0, y);
-    const float* BUTTERAUGLI_RESTRICT row_blurred_g =
-        blurred->ConstPlaneRow(1, y);
-    const float* BUTTERAUGLI_RESTRICT row_blurred_b =
-        blurred->ConstPlaneRow(2, y);
-    float* BUTTERAUGLI_RESTRICT row_out_x = xyb.PlaneRow(0, y);
-    float* BUTTERAUGLI_RESTRICT row_out_y = xyb.PlaneRow(1, y);
-    float* BUTTERAUGLI_RESTRICT row_out_b = xyb.PlaneRow(2, y);
+    const float* row_r = rgb.ConstPlaneRow(0, y);
+    const float* row_g = rgb.ConstPlaneRow(1, y);
+    const float* row_b = rgb.ConstPlaneRow(2, y);
+    const float* row_blurred_r = blurred->ConstPlaneRow(0, y);
+    const float* row_blurred_g = blurred->ConstPlaneRow(1, y);
+    const float* row_blurred_b = blurred->ConstPlaneRow(2, y);
+    float* row_out_x = xyb->PlaneRow(0, y);
+    float* row_out_y = xyb->PlaneRow(1, y);
+    float* row_out_b = xyb->PlaneRow(2, y);
     const auto min = Set(df, 1e-4f);
     for (size_t x = 0; x < rgb.xsize(); x += Lanes(df)) {
       auto sensitivity0 = Undefined(df);
@@ -1492,7 +1498,108 @@ Image3F OpsinDynamicsImage(const Image3F& rgb, const ButteraugliParams& params,
       Store(cur_mixed2, df, row_out_b + x);
     }
   }
-  return xyb;
+}
+
+void ButteraugliDiffmapInPlace(Image3F& image0, Image3F& image1,
+                               const ButteraugliParams& params,
+                               ImageF& diffmap) {
+  // image0 and image1 are in linear sRGB color space
+  const size_t xsize = image0.xsize();
+  const size_t ysize = image0.ysize();
+  BlurTemp blur_temp;
+  {
+    // Convert image0 and image1 to XYB in-place
+    Image3F temp(xsize, ysize);
+    OpsinDynamicsImage(image0, params, &temp, &blur_temp, &image0);
+    OpsinDynamicsImage(image1, params, &temp, &blur_temp, &image1);
+  }
+  // image0 and image1 are in XYB color space
+  ImageF block_diff_dc(xsize, ysize);
+  ZeroFillImage(&block_diff_dc);
+  {
+    // separate out LF components from image0 and image1 and compute the dc
+    // diff image from them
+    Image3F lf0 = Image3F(xsize, ysize);
+    Image3F lf1 = Image3F(xsize, ysize);
+    SeparateLFAndMF(params, image0, &lf0, &image0, &blur_temp);
+    SeparateLFAndMF(params, image1, &lf1, &image1, &blur_temp);
+    for (size_t c = 0; c < 3; ++c) {
+      L2Diff(lf0.Plane(c), lf1.Plane(c), wmul[6 + c], &block_diff_dc);
+    }
+  }
+  // image0 and image1 are MF residuals (before blurring) in XYB color space
+  ImageF hf0[2];
+  ImageF hf1[2];
+  SeparateMFAndHF(params, &image0, &hf0[0], &blur_temp);
+  SeparateMFAndHF(params, &image1, &hf1[0], &blur_temp);
+  // image0 and image1 are MF-images in XYB color space
+
+  ImageF block_diff_ac(xsize, ysize);
+  ZeroFillImage(&block_diff_ac);
+  // start accumulating ac diff image from MF images
+  {
+    ImageF diffs(xsize, ysize);
+    MaltaDiffMapLF(image0.Plane(1), image1.Plane(1), wMfMalta, wMfMalta,
+                   norm1Mf, &diffs, &block_diff_ac);
+    MaltaDiffMapLF(image0.Plane(0), image1.Plane(0), wMfMaltaX, wMfMaltaX,
+                   norm1MfX, &diffs, &block_diff_ac);
+  }
+  for (size_t c = 0; c < 3; ++c) {
+    L2Diff(image0.Plane(c), image1.Plane(c), wmul[3 + c], &block_diff_ac);
+  }
+  // we will not need the MF-images and more, so we deallocate them to reduce
+  // peak memory usage
+  image0 = Image3F();
+  image1 = Image3F();
+
+  ImageF uhf0[2];
+  ImageF uhf1[2];
+  SeparateHFAndUHF(params, &hf0[0], &uhf0[0], &blur_temp);
+  SeparateHFAndUHF(params, &hf1[0], &uhf1[0], &blur_temp);
+
+  // continue accumulating ac diff image from HF and UHF images
+  const float hf_asymmetry = params.hf_asymmetry;
+  {
+    ImageF diffs(xsize, ysize);
+    MaltaDiffMap(uhf0[1], uhf1[1], wUhfMalta * hf_asymmetry,
+                 wUhfMalta / hf_asymmetry, norm1Uhf, &diffs, &block_diff_ac);
+    MaltaDiffMap(uhf0[0], uhf1[0], wUhfMaltaX * hf_asymmetry,
+                 wUhfMaltaX / hf_asymmetry, norm1UhfX, &diffs, &block_diff_ac);
+    MaltaDiffMapLF(hf0[1], hf1[1], wHfMalta * std::sqrt(hf_asymmetry),
+                   wHfMalta / std::sqrt(hf_asymmetry), norm1Hf, &diffs,
+                   &block_diff_ac);
+    MaltaDiffMapLF(hf0[0], hf1[0], wHfMaltaX * std::sqrt(hf_asymmetry),
+                   wHfMaltaX / std::sqrt(hf_asymmetry), norm1HfX, &diffs,
+                   &block_diff_ac);
+  }
+  for (size_t c = 0; c < 2; ++c) {
+    L2DiffAsymmetric(hf0[c], hf1[c], wmul[c] * hf_asymmetry,
+                     wmul[c] / hf_asymmetry, &block_diff_ac);
+  }
+
+  // compute mask image from HF and UHF X and Y images
+  ImageF mask(xsize, ysize);
+  {
+    ImageF mask0(xsize, ysize);
+    ImageF mask1(xsize, ysize);
+    CombineChannelsForMasking(&hf0[0], &uhf0[0], &mask0);
+    CombineChannelsForMasking(&hf1[0], &uhf1[0], &mask1);
+    DeallocateHFAndUHF(&hf1[0], &uhf1[0]);
+    DeallocateHFAndUHF(&hf0[0], &uhf0[0]);
+    Mask(mask0, mask1, params, &blur_temp, &mask, &block_diff_ac);
+  }
+
+  // compute final diffmap from mask image and ac and dc diff images
+  diffmap = ImageF(xsize, ysize);
+  for (size_t y = 0; y < ysize; ++y) {
+    const float* row_dc = block_diff_dc.Row(y);
+    const float* row_ac = block_diff_ac.Row(y);
+    float* row_out = diffmap.Row(y);
+    for (size_t x = 0; x < xsize; ++x) {
+      const float val = mask.Row(y)[x];
+      row_out[x] = sqrt(row_dc[x] * MaskDcY(val) + row_ac[x] * MaskY(val));
+    }
+  }
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -1512,6 +1619,7 @@ HWY_EXPORT(CombineChannelsToDiffmap);  // Local function.
 HWY_EXPORT(MaltaDiffMap);              // Local function.
 HWY_EXPORT(MaltaDiffMapLF);            // Local function.
 HWY_EXPORT(OpsinDynamicsImage);        // Local function.
+HWY_EXPORT(ButteraugliDiffmapInPlace);  // Local function.
 
 #if BUTTERAUGLI_ENABLE_CHECKS
 
@@ -1530,7 +1638,6 @@ static inline bool IsNan(const double x) {
 }
 
 static inline void CheckImage(const ImageF& image, const char* name) {
-  PROFILER_FUNC;
   for (size_t y = 0; y < image.ysize(); ++y) {
     const float* BUTTERAUGLI_RESTRICT row = image.Row(y);
     for (size_t x = 0; x < image.xsize(); ++x) {
@@ -1628,8 +1735,9 @@ ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0,
     return;
   }
 
-  Image3F xyb0 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(rgb0, params, Temp(),
-                                                          &blur_temp_);
+  Image3F xyb0(xsize_, ysize_);
+  HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)
+  (rgb0, params, Temp(), &blur_temp_, &xyb0);
   ReleaseTemp();
   HWY_DYNAMIC_DISPATCH(SeparateFrequencies)
   (xsize_, ysize_, params_, &blur_temp_, xyb0, pi0_);
@@ -1642,26 +1750,26 @@ ButteraugliComparator::ButteraugliComparator(const Image3F& rgb0,
 
 void ButteraugliComparator::Mask(ImageF* BUTTERAUGLI_RESTRICT mask) const {
   HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
-  (pi0_, pi0_, xsize_, ysize_, params_, Temp(), &blur_temp_, mask, nullptr);
-  ReleaseTemp();
+  (pi0_, pi0_, xsize_, ysize_, params_, &blur_temp_, mask, nullptr);
 }
 
 void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const {
-  PROFILER_FUNC;
   if (xsize_ < 8 || ysize_ < 8) {
     ZeroFillImage(&result);
     return;
   }
-  const Image3F xyb1 = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
-      rgb1, params_, Temp(), &blur_temp_);
+  Image3F xyb1(xsize_, ysize_);
+  HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)
+  (rgb1, params_, Temp(), &blur_temp_, &xyb1);
   ReleaseTemp();
   DiffmapOpsinDynamicsImage(xyb1, result);
   if (sub_) {
     if (sub_->xsize_ < 8 || sub_->ysize_ < 8) {
       return;
     }
-    const Image3F sub_xyb = HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)(
-        SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_);
+    Image3F sub_xyb(sub_->xsize_, sub_->ysize_);
+    HWY_DYNAMIC_DISPATCH(OpsinDynamicsImage)
+    (SubSample2x(rgb1), params_, sub_->Temp(), &sub_->blur_temp_, &sub_xyb);
     sub_->ReleaseTemp();
     ImageF subresult;
     sub_->DiffmapOpsinDynamicsImage(sub_xyb, subresult);
@@ -1671,7 +1779,6 @@ void ButteraugliComparator::Diffmap(const Image3F& rgb1, ImageF& result) const {
 
 void ButteraugliComparator::DiffmapOpsinDynamicsImage(const Image3F& xyb1,
                                                       ImageF& result) const {
-  PROFILER_FUNC;
   if (xsize_ < 8 || ysize_ < 8) {
     ZeroFillImage(&result);
     return;
@@ -1689,29 +1796,22 @@ void MaltaDiffMap(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
                   const double w_0lt1, const double norm1,
                   ImageF* HWY_RESTRICT diffs,
                   Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
-  PROFILER_FUNC;
-  const double len = 3.75;
-  static const double mulli = 0.39905817637;
   HWY_DYNAMIC_DISPATCH(MaltaDiffMap)
-  (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
+  (lum0, lum1, w_0gt1, w_0lt1, norm1, diffs, &block_diff_ac->Plane(c));
 }
 
 void MaltaDiffMapLF(const ImageF& lum0, const ImageF& lum1, const double w_0gt1,
                     const double w_0lt1, const double norm1,
                     ImageF* HWY_RESTRICT diffs,
                     Image3F* HWY_RESTRICT block_diff_ac, size_t c) {
-  PROFILER_FUNC;
-  const double len = 3.75;
-  static const double mulli = 0.611612573796;
   HWY_DYNAMIC_DISPATCH(MaltaDiffMapLF)
-  (lum0, lum1, w_0gt1, w_0lt1, norm1, len, mulli, diffs, block_diff_ac, c);
+  (lum0, lum1, w_0gt1, w_0lt1, norm1, diffs, &block_diff_ac->Plane(c));
 }
 
 }  // namespace
 
 void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
                                                ImageF& diffmap) const {
-  PROFILER_FUNC;
   if (xsize_ < 8 || ysize_ < 8) {
     ZeroFillImage(&diffmap);
     return;
@@ -1723,62 +1823,39 @@ void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
   ImageF diffs(xsize_, ysize_);
   Image3F block_diff_ac(xsize_, ysize_);
   ZeroFillImage(&block_diff_ac);
-  static const double wUhfMalta = 1.10039032555;
-  static const double norm1Uhf = 71.7800275169;
   MaltaDiffMap(pi0_.uhf[1], pi1.uhf[1], wUhfMalta * hf_asymmetry_,
                wUhfMalta / hf_asymmetry_, norm1Uhf, &diffs, &block_diff_ac, 1);
-
-  static const double wUhfMaltaX = 173.5;
-  static const double norm1UhfX = 5.0;
   MaltaDiffMap(pi0_.uhf[0], pi1.uhf[0], wUhfMaltaX * hf_asymmetry_,
                wUhfMaltaX / hf_asymmetry_, norm1UhfX, &diffs, &block_diff_ac,
                0);
-
-  static const double wHfMalta = 18.7237414387;
-  static const double norm1Hf = 4498534.45232;
   MaltaDiffMapLF(pi0_.hf[1], pi1.hf[1], wHfMalta * std::sqrt(hf_asymmetry_),
                  wHfMalta / std::sqrt(hf_asymmetry_), norm1Hf, &diffs,
                  &block_diff_ac, 1);
-
-  static const double wHfMaltaX = 6923.99476109;
-  static const double norm1HfX = 8051.15833247;
   MaltaDiffMapLF(pi0_.hf[0], pi1.hf[0], wHfMaltaX * std::sqrt(hf_asymmetry_),
                  wHfMaltaX / std::sqrt(hf_asymmetry_), norm1HfX, &diffs,
                  &block_diff_ac, 0);
-
-  static const double wMfMalta = 37.0819870399;
-  static const double norm1Mf = 130262059.556;
   MaltaDiffMapLF(pi0_.mf.Plane(1), pi1.mf.Plane(1), wMfMalta, wMfMalta, norm1Mf,
                  &diffs, &block_diff_ac, 1);
-
-  static const double wMfMaltaX = 8246.75321353;
-  static const double norm1MfX = 1009002.70582;
   MaltaDiffMapLF(pi0_.mf.Plane(0), pi1.mf.Plane(0), wMfMaltaX, wMfMaltaX,
                  norm1MfX, &diffs, &block_diff_ac, 0);
 
-  static const double wmul[9] = {
-      400.0,         1.50815703118,  0,
-      2150.0,        10.6195433239,  16.2176043152,
-      29.2353797994, 0.844626970982, 0.703646627719,
-  };
   Image3F block_diff_dc(xsize_, ysize_);
   for (size_t c = 0; c < 3; ++c) {
     if (c < 2) {  // No blue channel error accumulated at HF.
       HWY_DYNAMIC_DISPATCH(L2DiffAsymmetric)
       (pi0_.hf[c], pi1.hf[c], wmul[c] * hf_asymmetry_, wmul[c] / hf_asymmetry_,
-       &block_diff_ac, c);
+       &block_diff_ac.Plane(c));
     }
     HWY_DYNAMIC_DISPATCH(L2Diff)
-    (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac, c);
+    (pi0_.mf.Plane(c), pi1.mf.Plane(c), wmul[3 + c], &block_diff_ac.Plane(c));
     HWY_DYNAMIC_DISPATCH(SetL2Diff)
-    (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc, c);
+    (pi0_.lf.Plane(c), pi1.lf.Plane(c), wmul[6 + c], &block_diff_dc.Plane(c));
   }
 
   ImageF mask;
   HWY_DYNAMIC_DISPATCH(MaskPsychoImage)
-  (pi0_, pi1, xsize_, ysize_, params_, Temp(), &blur_temp_, &mask,
+  (pi0_, pi1, xsize_, ysize_, params_, &blur_temp_, &mask,
    &block_diff_ac.Plane(1));
-  ReleaseTemp();
 
   HWY_DYNAMIC_DISPATCH(CombineChannelsToDiffmap)
   (mask, block_diff_dc, block_diff_ac, xmul_, &diffmap);
@@ -1786,7 +1863,6 @@ void ButteraugliComparator::DiffmapPsychoImage(const PsychoImage& pi1,
 
 double ButteraugliScoreFromDiffmap(const ImageF& diffmap,
                                    const ButteraugliParams* params) {
-  PROFILER_FUNC;
   float retval = 0.0f;
   for (size_t y = 0; y < diffmap.ysize(); ++y) {
     const float* BUTTERAUGLI_RESTRICT row = diffmap.ConstRow(y);
@@ -1805,9 +1881,44 @@ bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
   return ButteraugliDiffmap(rgb0, rgb1, params, diffmap);
 }
 
+template <size_t kMax>
+bool ButteraugliDiffmapSmall(const Image3F& rgb0, const Image3F& rgb1,
+                             const ButteraugliParams& params, ImageF& diffmap) {
+  const size_t xsize = rgb0.xsize();
+  const size_t ysize = rgb0.ysize();
+  // Butteraugli values for small (where xsize or ysize is smaller
+  // than 8 pixels) images are non-sensical, but most likely it is
+  // less disruptive to try to compute something than just give up.
+  // Temporarily extend the borders of the image to fit 8 x 8 size.
+  size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;
+  size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;
+  size_t xscaled = std::max<size_t>(kMax, xsize);
+  size_t yscaled = std::max<size_t>(kMax, ysize);
+  Image3F scaled0(xscaled, yscaled);
+  Image3F scaled1(xscaled, yscaled);
+  for (int i = 0; i < 3; ++i) {
+    for (size_t y = 0; y < yscaled; ++y) {
+      for (size_t x = 0; x < xscaled; ++x) {
+        size_t x2 = std::min<size_t>(xsize - 1, x > xborder ? x - xborder : 0);
+        size_t y2 = std::min<size_t>(ysize - 1, y > yborder ? y - yborder : 0);
+        scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
+        scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
+      }
+    }
+  }
+  ImageF diffmap_scaled;
+  const bool ok = ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
+  diffmap = ImageF(xsize, ysize);
+  for (size_t y = 0; y < ysize; ++y) {
+    for (size_t x = 0; x < xsize; ++x) {
+      diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
+    }
+  }
+  return ok;
+}
+
 bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
                         const ButteraugliParams& params, ImageF& diffmap) {
-  PROFILER_FUNC;
   const size_t xsize = rgb0.xsize();
   const size_t ysize = rgb0.ysize();
   if (xsize < 1 || ysize < 1) {
@@ -1818,38 +1929,7 @@ bool ButteraugliDiffmap(const Image3F& rgb0, const Image3F& rgb1,
   }
   static const int kMax = 8;
   if (xsize < kMax || ysize < kMax) {
-    // Butteraugli values for small (where xsize or ysize is smaller
-    // than 8 pixels) images are non-sensical, but most likely it is
-    // less disruptive to try to compute something than just give up.
-    // Temporarily extend the borders of the image to fit 8 x 8 size.
-    size_t xborder = xsize < kMax ? (kMax - xsize) / 2 : 0;
-    size_t yborder = ysize < kMax ? (kMax - ysize) / 2 : 0;
-    size_t xscaled = std::max<size_t>(kMax, xsize);
-    size_t yscaled = std::max<size_t>(kMax, ysize);
-    Image3F scaled0(xscaled, yscaled);
-    Image3F scaled1(xscaled, yscaled);
-    for (int i = 0; i < 3; ++i) {
-      for (size_t y = 0; y < yscaled; ++y) {
-        for (size_t x = 0; x < xscaled; ++x) {
-          size_t x2 =
-              std::min<size_t>(xsize - 1, x > xborder ? x - xborder : 0);
-          size_t y2 =
-              std::min<size_t>(ysize - 1, y > yborder ? y - yborder : 0);
-          scaled0.PlaneRow(i, y)[x] = rgb0.PlaneRow(i, y2)[x2];
-          scaled1.PlaneRow(i, y)[x] = rgb1.PlaneRow(i, y2)[x2];
-        }
-      }
-    }
-    ImageF diffmap_scaled;
-    const bool ok =
-        ButteraugliDiffmap(scaled0, scaled1, params, diffmap_scaled);
-    diffmap = ImageF(xsize, ysize);
-    for (size_t y = 0; y < ysize; ++y) {
-      for (size_t x = 0; x < xsize; ++x) {
-        diffmap.Row(y)[x] = diffmap_scaled.Row(y + yborder)[x + xborder];
-      }
-    }
-    return ok;
+    return ButteraugliDiffmapSmall<kMax>(rgb0, rgb1, params, diffmap);
   }
   ButteraugliComparator butteraugli(rgb0, params);
   butteraugli.Diffmap(rgb1, diffmap);
@@ -1868,18 +1948,41 @@ bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
 bool ButteraugliInterface(const Image3F& rgb0, const Image3F& rgb1,
                           const ButteraugliParams& params, ImageF& diffmap,
                           double& diffvalue) {
-#if PROFILER_ENABLED
-  auto trace_start = std::chrono::steady_clock::now();
-#endif
   if (!ButteraugliDiffmap(rgb0, rgb1, params, diffmap)) {
     return false;
   }
-#if PROFILER_ENABLED
-  auto trace_end = std::chrono::steady_clock::now();
-  std::chrono::duration<double> elapsed = trace_end - trace_start;
-  const size_t mp = rgb0.xsize() * rgb0.ysize();
-  printf("diff MP/s %f\n", mp / elapsed.count() * 1E-6);
-#endif
+  diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
+  return true;
+}
+
+bool ButteraugliInterfaceInPlace(Image3F&& rgb0, Image3F&& rgb1,
+                                 const ButteraugliParams& params,
+                                 ImageF& diffmap, double& diffvalue) {
+  const size_t xsize = rgb0.xsize();
+  const size_t ysize = rgb0.ysize();
+  if (xsize < 1 || ysize < 1) {
+    return JXL_FAILURE("Zero-sized image");
+  }
+  if (!SameSize(rgb0, rgb1)) {
+    return JXL_FAILURE("Size mismatch");
+  }
+  static const int kMax = 8;
+  if (xsize < kMax || ysize < kMax) {
+    bool ok = ButteraugliDiffmapSmall<kMax>(rgb0, rgb1, params, diffmap);
+    diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
+    return ok;
+  }
+  ImageF subdiffmap;
+  if (xsize >= 15 && ysize >= 15) {
+    Image3F rgb0_sub = SubSample2x(rgb0);
+    Image3F rgb1_sub = SubSample2x(rgb1);
+    HWY_DYNAMIC_DISPATCH(ButteraugliDiffmapInPlace)
+    (rgb0_sub, rgb1_sub, params, subdiffmap);
+  }
+  HWY_DYNAMIC_DISPATCH(ButteraugliDiffmapInPlace)(rgb0, rgb1, params, diffmap);
+  if (xsize >= 15 && ysize >= 15) {
+    AddSupersampled2x(subdiffmap, 0.5, diffmap);
+  }
   diffvalue = ButteraugliScoreFromDiffmap(diffmap, &params);
   return true;
 }
index 652b952..29130e8 100644 (file)
@@ -9,7 +9,6 @@
 #define LIB_JXL_BUTTERAUGLI_BUTTERAUGLI_H_
 
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -19,7 +18,6 @@
 #include <vector>
 
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_ops.h"
 
@@ -86,6 +84,13 @@ bool ButteraugliInterface(const Image3F &rgb0, const Image3F &rgb1,
                           float hf_asymmetry, float xmul, ImageF &diffmap,
                           double &diffvalue);
 
+// Same as ButteraugliInterface, but reuses rgb0 and rgb1 for other purposes
+// inside the function after they are not needed any more, and it ignores
+// params.xmul.
+bool ButteraugliInterfaceInPlace(Image3F &&rgb0, Image3F &&rgb1,
+                                 const ButteraugliParams &params,
+                                 ImageF &diffmap, double &diffvalue);
+
 // Converts the butteraugli score into fuzzy class values that are continuous
 // at the class boundary. The class boundary location is based on human
 // raters, but the slope is arbitrary. Particularly, it does not reflect
diff --git a/lib/jxl/butteraugli/butteraugli_test.cc b/lib/jxl/butteraugli/butteraugli_test.cc
new file mode 100644 (file)
index 0000000..68ee8c2
--- /dev/null
@@ -0,0 +1,117 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/butteraugli/butteraugli.h"
+
+#include <jxl/types.h>
+#include <stddef.h>
+
+#include <algorithm>
+#include <utility>
+
+#include "lib/extras/metrics.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/test_image.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+using extras::PackedImage;
+using extras::PackedPixelFile;
+using test::TestImage;
+
+Image3F SinglePixelImage(float red, float green, float blue) {
+  Image3F img(1, 1);
+  img.PlaneRow(0, 0)[0] = red;
+  img.PlaneRow(1, 0)[0] = green;
+  img.PlaneRow(2, 0)[0] = blue;
+  return img;
+}
+
+Image3F GetColorImage(const PackedPixelFile& ppf) {
+  JXL_CHECK(!ppf.frames.empty());
+  const PackedImage& image = ppf.frames[0].color;
+  const JxlPixelFormat& format = image.format;
+  const uint8_t* pixels = reinterpret_cast<const uint8_t*>(image.pixels());
+  Image3F color(image.xsize, image.ysize);
+  for (size_t c = 0; c < format.num_channels; ++c) {
+    JXL_CHECK(ConvertFromExternal(Bytes(pixels, image.pixels_size), image.xsize,
+                                  image.ysize, ppf.info.bits_per_sample, format,
+                                  c, nullptr, &color.Plane(c)));
+  }
+  return color;
+}
+
+void AddUniformNoise(Image3F* img, float d, size_t seed) {
+  Rng generator(seed);
+  for (size_t y = 0; y < img->ysize(); ++y) {
+    for (int c = 0; c < 3; ++c) {
+      for (size_t x = 0; x < img->xsize(); ++x) {
+        img->PlaneRow(c, y)[x] += generator.UniformF(-d, d);
+      }
+    }
+  }
+}
+
+void AddEdge(Image3F* img, float d, size_t x0, size_t y0) {
+  const size_t h = std::min<size_t>(img->ysize() - y0, 100);
+  const size_t w = std::min<size_t>(img->xsize() - x0, 5);
+  for (size_t dy = 0; dy < h; ++dy) {
+    for (size_t dx = 0; dx < w; ++dx) {
+      img->PlaneRow(1, y0 + dy)[x0 + dx] += d;
+    }
+  }
+}
+
+TEST(ButteraugliInPlaceTest, SinglePixel) {
+  Image3F rgb0 = SinglePixelImage(0.5f, 0.5f, 0.5f);
+  Image3F rgb1 = SinglePixelImage(0.5f, 0.49f, 0.5f);
+  ButteraugliParams ba;
+  ImageF diffmap;
+  double diffval;
+  EXPECT_TRUE(ButteraugliInterface(rgb0, rgb1, ba, diffmap, diffval));
+  EXPECT_NEAR(diffval, 2.5, 0.5);
+  ImageF diffmap2;
+  double diffval2;
+  EXPECT_TRUE(ButteraugliInterfaceInPlace(std::move(rgb0), std::move(rgb1), ba,
+                                          diffmap2, diffval2));
+  EXPECT_NEAR(diffval, diffval2, 1e-10);
+}
+
+TEST(ButteraugliInPlaceTest, LargeImage) {
+  const size_t xsize = 1024;
+  const size_t ysize = 1024;
+  TestImage img;
+  img.SetDimensions(xsize, ysize).AddFrame().RandomFill(777);
+  Image3F rgb0 = GetColorImage(img.ppf());
+  Image3F rgb1(xsize, ysize);
+  CopyImageTo(rgb0, &rgb1);
+  AddUniformNoise(&rgb1, 0.02f, 7777);
+  AddEdge(&rgb1, 0.1f, xsize / 2, xsize / 2);
+  ButteraugliParams ba;
+  ImageF diffmap;
+  double diffval;
+  EXPECT_TRUE(ButteraugliInterface(rgb0, rgb1, ba, diffmap, diffval));
+  double distp = ComputeDistanceP(diffmap, ba, 3.0);
+  EXPECT_NEAR(diffval, 4.0, 0.5);
+  EXPECT_NEAR(distp, 1.5, 0.5);
+  ImageF diffmap2;
+  double diffval2;
+  EXPECT_TRUE(ButteraugliInterfaceInPlace(std::move(rgb0), std::move(rgb1), ba,
+                                          diffmap2, diffval2));
+  double distp2 = ComputeDistanceP(diffmap2, ba, 3.0);
+  EXPECT_NEAR(diffval, diffval2, 1e-10);
+  EXPECT_NEAR(distp, distp2, 1e-7);
+}
+
+}  // namespace
+}  // namespace jxl
diff --git a/lib/jxl/butteraugli_test.cc b/lib/jxl/butteraugli_test.cc
deleted file mode 100644 (file)
index 98ec788..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "jxl/butteraugli.h"
-
-#include "gtest/gtest.h"
-#include "jxl/butteraugli_cxx.h"
-#include "lib/jxl/test_utils.h"
-
-TEST(ButteraugliTest, Lossless) {
-  uint32_t xsize = 171;
-  uint32_t ysize = 219;
-  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
-
-  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
-  JxlButteraugliResultPtr result(JxlButteraugliCompute(
-      api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(),
-      &pixel_format, pixels.data(), pixels.size()));
-  EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));
-}
-
-TEST(ButteraugliTest, Distmap) {
-  uint32_t xsize = 171;
-  uint32_t ysize = 219;
-  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
-
-  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
-  JxlButteraugliResultPtr result(JxlButteraugliCompute(
-      api.get(), xsize, ysize, &pixel_format, pixels.data(), pixels.size(),
-      &pixel_format, pixels.data(), pixels.size()));
-  EXPECT_EQ(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));
-  const float* distmap;
-  uint32_t row_stride;
-  JxlButteraugliResultGetDistmap(result.get(), &distmap, &row_stride);
-  for (uint32_t y = 0; y < ysize; y++) {
-    for (uint32_t x = 0; x < xsize; x++) {
-      EXPECT_EQ(0.0, distmap[y * row_stride + x]);
-    }
-  }
-}
-
-TEST(ButteraugliTest, Distorted) {
-  uint32_t xsize = 171;
-  uint32_t ysize = 219;
-  std::vector<uint8_t> orig_pixels =
-      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-  std::vector<uint8_t> dist_pixels =
-      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-  dist_pixels[0] += 128;
-
-  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
-
-  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
-  JxlButteraugliResultPtr result(JxlButteraugliCompute(
-      api.get(), xsize, ysize, &pixel_format, orig_pixels.data(),
-      orig_pixels.size(), &pixel_format, dist_pixels.data(),
-      dist_pixels.size()));
-  EXPECT_NE(0.0, JxlButteraugliResultGetDistance(result.get(), 8.0));
-}
-
-TEST(ButteraugliTest, Api) {
-  uint32_t xsize = 171;
-  uint32_t ysize = 219;
-  std::vector<uint8_t> orig_pixels =
-      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-  std::vector<uint8_t> dist_pixels =
-      jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-  dist_pixels[0] += 128;
-
-  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
-
-  JxlButteraugliApiPtr api(JxlButteraugliApiCreate(nullptr));
-  JxlButteraugliApiSetHFAsymmetry(api.get(), 1.0f);
-  JxlButteraugliApiSetIntensityTarget(api.get(), 250.0f);
-  JxlButteraugliResultPtr result(JxlButteraugliCompute(
-      api.get(), xsize, ysize, &pixel_format, orig_pixels.data(),
-      orig_pixels.size(), &pixel_format, dist_pixels.data(),
-      dist_pixels.size()));
-  double distance0 = JxlButteraugliResultGetDistance(result.get(), 8.0);
-
-  JxlButteraugliApiSetHFAsymmetry(api.get(), 2.0f);
-  result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format,
-                                     orig_pixels.data(), orig_pixels.size(),
-                                     &pixel_format, dist_pixels.data(),
-                                     dist_pixels.size()));
-  double distance1 = JxlButteraugliResultGetDistance(result.get(), 8.0);
-
-  EXPECT_NE(distance0, distance1);
-
-  JxlButteraugliApiSetIntensityTarget(api.get(), 80.0f);
-  result.reset(JxlButteraugliCompute(api.get(), xsize, ysize, &pixel_format,
-                                     orig_pixels.data(), orig_pixels.size(),
-                                     &pixel_format, dist_pixels.data(),
-                                     dist_pixels.size()));
-  double distance2 = JxlButteraugliResultGetDistance(result.get(), 8.0);
-
-  EXPECT_NE(distance1, distance2);
-}
diff --git a/lib/jxl/butteraugli_wrapper.cc b/lib/jxl/butteraugli_wrapper.cc
deleted file mode 100644 (file)
index 836b798..0000000
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <atomic>
-
-#include "jxl/butteraugli.h"
-#include "jxl/parallel_runner.h"
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/butteraugli/butteraugli.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_butteraugli_pnorm.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_external_image.h"
-#include "lib/jxl/image_bundle.h"
-#include "lib/jxl/memory_manager_internal.h"
-
-namespace {
-
-void SetMetadataFromPixelFormat(const JxlPixelFormat* pixel_format,
-                                jxl::ImageMetadata* metadata) {
-  uint32_t potential_alpha_bits = 0;
-  switch (pixel_format->data_type) {
-    case JXL_TYPE_FLOAT:
-      metadata->SetFloat32Samples();
-      potential_alpha_bits = 16;
-      break;
-    case JXL_TYPE_FLOAT16:
-      metadata->SetFloat16Samples();
-      potential_alpha_bits = 16;
-      break;
-    case JXL_TYPE_UINT16:
-      metadata->SetUintSamples(16);
-      potential_alpha_bits = 16;
-      break;
-    case JXL_TYPE_UINT8:
-      metadata->SetUintSamples(8);
-      potential_alpha_bits = 8;
-      break;
-    default:
-      JXL_ABORT("Unhandled JxlDataType");
-  }
-  if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) {
-    metadata->SetAlphaBits(potential_alpha_bits);
-  }
-}
-
-}  // namespace
-
-struct JxlButteraugliResultStruct {
-  JxlMemoryManager memory_manager;
-
-  jxl::ImageF distmap;
-  jxl::ButteraugliParams params;
-};
-
-struct JxlButteraugliApiStruct {
-  // Multiplier for penalizing new HF artifacts more than blurring away
-  // features. 1.0=neutral.
-  float hf_asymmetry = 1.0f;
-
-  // Multiplier for the psychovisual difference in the X channel.
-  float xmul = 1.0f;
-
-  // Number of nits that correspond to 1.0f input values.
-  float intensity_target = jxl::kDefaultIntensityTarget;
-
-  JxlCmsInterface cms;
-  JxlMemoryManager memory_manager;
-  std::unique_ptr<jxl::ThreadPool> thread_pool{nullptr};
-};
-
-JxlButteraugliApi* JxlButteraugliApiCreate(
-    const JxlMemoryManager* memory_manager) {
-  JxlMemoryManager local_memory_manager;
-  if (!jxl::MemoryManagerInit(&local_memory_manager, memory_manager))
-    return nullptr;
-
-  void* alloc =
-      jxl::MemoryManagerAlloc(&local_memory_manager, sizeof(JxlButteraugliApi));
-  if (!alloc) return nullptr;
-  // Placement new constructor on allocated memory
-  JxlButteraugliApi* ret = new (alloc) JxlButteraugliApi();
-  ret->cms = jxl::GetJxlCms();
-  ret->memory_manager = local_memory_manager;
-  return ret;
-}
-
-void JxlButteraugliApiSetParallelRunner(JxlButteraugliApi* api,
-                                        JxlParallelRunner parallel_runner,
-                                        void* parallel_runner_opaque) {
-  api->thread_pool = jxl::make_unique<jxl::ThreadPool>(parallel_runner,
-                                                       parallel_runner_opaque);
-}
-
-void JxlButteraugliApiSetHFAsymmetry(JxlButteraugliApi* api, float v) {
-  api->hf_asymmetry = v;
-}
-
-void JxlButteraugliApiSetIntensityTarget(JxlButteraugliApi* api, float v) {
-  api->intensity_target = v;
-}
-
-void JxlButteraugliApiDestroy(JxlButteraugliApi* api) {
-  if (api) {
-    JxlMemoryManager local_memory_manager = api->memory_manager;
-    // Call destructor directly since custom free function is used.
-    api->~JxlButteraugliApi();
-    jxl::MemoryManagerFree(&local_memory_manager, api);
-  }
-}
-
-JxlButteraugliResult* JxlButteraugliCompute(
-    const JxlButteraugliApi* api, uint32_t xsize, uint32_t ysize,
-    const JxlPixelFormat* pixel_format_orig, const void* buffer_orig,
-    size_t size_orig, const JxlPixelFormat* pixel_format_dist,
-    const void* buffer_dist, size_t size_dist) {
-  jxl::ImageMetadata orig_metadata;
-  SetMetadataFromPixelFormat(pixel_format_orig, &orig_metadata);
-  jxl::ImageBundle orig_ib(&orig_metadata);
-  jxl::ColorEncoding c_current;
-  if (pixel_format_orig->data_type == JXL_TYPE_FLOAT) {
-    c_current =
-        jxl::ColorEncoding::LinearSRGB(pixel_format_orig->num_channels < 3);
-  } else {
-    c_current = jxl::ColorEncoding::SRGB(pixel_format_orig->num_channels < 3);
-  }
-  if (!jxl::BufferToImageBundle(*pixel_format_orig, xsize, ysize, buffer_orig,
-                                size_orig, api->thread_pool.get(), c_current,
-                                &orig_ib)) {
-    return nullptr;
-  }
-
-  jxl::ImageMetadata dist_metadata;
-  SetMetadataFromPixelFormat(pixel_format_dist, &dist_metadata);
-  jxl::ImageBundle dist_ib(&dist_metadata);
-  if (pixel_format_dist->data_type == JXL_TYPE_FLOAT) {
-    c_current =
-        jxl::ColorEncoding::LinearSRGB(pixel_format_dist->num_channels < 3);
-  } else {
-    c_current = jxl::ColorEncoding::SRGB(pixel_format_dist->num_channels < 3);
-  }
-  if (!jxl::BufferToImageBundle(*pixel_format_dist, xsize, ysize, buffer_dist,
-                                size_dist, api->thread_pool.get(), c_current,
-                                &dist_ib)) {
-    return nullptr;
-  }
-
-  void* alloc = jxl::MemoryManagerAlloc(&api->memory_manager,
-                                        sizeof(JxlButteraugliResult));
-  if (!alloc) return nullptr;
-  // Placement new constructor on allocated memory
-  JxlButteraugliResult* result = new (alloc) JxlButteraugliResult();
-  result->memory_manager = api->memory_manager;
-  result->params.hf_asymmetry = api->hf_asymmetry;
-  result->params.xmul = api->xmul;
-  result->params.intensity_target = api->intensity_target;
-  jxl::ButteraugliDistance(orig_ib, dist_ib, result->params, api->cms,
-                           &result->distmap, api->thread_pool.get());
-
-  return result;
-}
-
-float JxlButteraugliResultGetDistance(const JxlButteraugliResult* result,
-                                      float pnorm) {
-  return static_cast<float>(
-      jxl::ComputeDistanceP(result->distmap, result->params, pnorm));
-}
-
-void JxlButteraugliResultGetDistmap(const JxlButteraugliResult* result,
-                                    const float** buffer,
-                                    uint32_t* row_stride) {
-  *buffer = result->distmap.Row(0);
-  *row_stride = result->distmap.PixelsPerRow();
-}
-
-float JxlButteraugliResultGetMaxDistance(const JxlButteraugliResult* result) {
-  float max_distance = 0.0;
-  for (uint32_t y = 0; y < result->distmap.ysize(); y++) {
-    for (uint32_t x = 0; x < result->distmap.xsize(); x++) {
-      if (result->distmap.ConstRow(y)[x] > max_distance) {
-        max_distance = result->distmap.ConstRow(y)[x];
-      }
-    }
-  }
-  return max_distance;
-}
-
-void JxlButteraugliResultDestroy(JxlButteraugliResult* result) {
-  if (result) {
-    JxlMemoryManager local_memory_manager = result->memory_manager;
-    // Call destructor directly since custom free function is used.
-    result->~JxlButteraugliResult();
-    jxl::MemoryManagerFree(&local_memory_manager, result);
-  }
-}
index c1ea19f..17d7ef6 100644 (file)
@@ -5,7 +5,7 @@
 
 #include "lib/jxl/base/byte_order.h"
 
-#include "gtest/gtest.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
similarity index 99%
rename from lib/jxl/base/cache_aligned.cc
rename to lib/jxl/cache_aligned.cc
index 9a9cc58..992efc4 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/cache_aligned.h"
 
 #include <stdio.h>
 #include <stdlib.h>
similarity index 88%
rename from lib/jxl/base/cache_aligned.h
rename to lib/jxl/cache_aligned.h
index e57df14..d79d7be 100644 (file)
@@ -62,13 +62,6 @@ static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes) {
       CacheAlignedDeleter());
 }
 
-static inline CacheAlignedUniquePtr AllocateArray(const size_t bytes,
-                                                  const size_t offset) {
-  return CacheAlignedUniquePtr(
-      static_cast<uint8_t*>(CacheAligned::Allocate(bytes, offset)),
-      CacheAlignedDeleter());
-}
-
 }  // namespace jxl
 
 #endif  // LIB_JXL_BASE_CACHE_ALIGNED_H_
index cf2f90e..cb3b710 100644 (file)
 
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/field_encodings.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/opsin_params.h"
 #include "lib/jxl/quant_weights.h"
 
 namespace jxl {
@@ -141,7 +137,7 @@ struct ColorCorrelationMap {
   uint32_t color_factor_ = kDefaultColorFactor;
   float color_scale_ = 1.0f / color_factor_;
   float base_correlation_x_ = 0.0f;
-  float base_correlation_b_ = kYToBRatio;
+  float base_correlation_b_ = jxl::cms::kYToBRatio;
   int32_t ytox_dc_ = 0;
   int32_t ytob_dc_ = 0;
 };
diff --git a/lib/jxl/cms/color_encoding_cms.h b/lib/jxl/cms/color_encoding_cms.h
new file mode 100644 (file)
index 0000000..81b3289
--- /dev/null
@@ -0,0 +1,628 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CMS_COLOR_ENCODING_CMS_H_
+#define LIB_JXL_CMS_COLOR_ENCODING_CMS_H_
+
+#include <jxl/cms.h>
+#include <jxl/cms_interface.h>
+#include <jxl/color_encoding.h>
+#include <jxl/types.h>
+
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/cms/jxl_cms_internal.h"
+
+namespace jxl {
+namespace cms {
+
+using IccBytes = std::vector<uint8_t>;
+
+// Returns whether the two inputs are approximately equal.
+static inline bool ApproxEq(const double a, const double b,
+                            double max_l1 = 1E-3) {
+  // Threshold should be sufficient for ICC's 15-bit fixed-point numbers.
+  // We have seen differences of 7.1E-5 with lcms2 and 1E-3 with skcms.
+  return std::abs(a - b) <= max_l1;
+}
+
+// (All CIE units are for the standard 1931 2 degree observer)
+
+// Color space the color pixel data is encoded in. The color pixel data is
+// 3-channel in all cases except in case of kGray, where it uses only 1 channel.
+// This also determines the amount of channels used in modular encoding.
+enum class ColorSpace : uint32_t {
+  // Trichromatic color data. This also includes CMYK if a kBlack
+  // ExtraChannelInfo is present. This implies, if there is an ICC profile, that
+  // the ICC profile uses a 3-channel color space if no kBlack extra channel is
+  // present, or uses color space 'CMYK' if a kBlack extra channel is present.
+  kRGB,
+  // Single-channel data. This implies, if there is an ICC profile, that the ICC
+  // profile also represents single-channel data and has the appropriate color
+  // space ('GRAY').
+  kGray,
+  // Like kRGB, but implies fixed values for primaries etc.
+  kXYB,
+  // For non-RGB/gray data, e.g. from non-electro-optical sensors. Otherwise
+  // the same conditions as kRGB apply.
+  kUnknown
+  // NB: don't forget to update EnumBits!
+};
+
+// Values from CICP ColourPrimaries.
+enum class WhitePoint : uint32_t {
+  kD65 = 1,     // sRGB/BT.709/Display P3/BT.2020
+  kCustom = 2,  // Actual values encoded in separate fields
+  kE = 10,      // XYZ
+  kDCI = 11,    // DCI-P3
+  // NB: don't forget to update EnumBits!
+};
+
+// Values from CICP ColourPrimaries
+enum class Primaries : uint32_t {
+  kSRGB = 1,    // Same as BT.709
+  kCustom = 2,  // Actual values encoded in separate fields
+  k2100 = 9,    // Same as BT.2020
+  kP3 = 11,
+  // NB: don't forget to update EnumBits!
+};
+
+// Values from CICP TransferCharacteristics
+enum class TransferFunction : uint32_t {
+  k709 = 1,
+  kUnknown = 2,
+  kLinear = 8,
+  kSRGB = 13,
+  kPQ = 16,   // from BT.2100
+  kDCI = 17,  // from SMPTE RP 431-2 reference projector
+  kHLG = 18,  // from BT.2100
+  // NB: don't forget to update EnumBits!
+};
+
+enum class RenderingIntent : uint32_t {
+  // Values match ICC sRGB encodings.
+  kPerceptual = 0,  // good for photos, requires a profile with LUT.
+  kRelative,        // good for logos.
+  kSaturation,      // perhaps useful for CG with fully saturated colors.
+  kAbsolute,        // leaves white point unchanged; good for proofing.
+  // NB: don't forget to update EnumBits!
+};
+
+// Chromaticity (Y is omitted because it is 1 for white points and implicit for
+// primaries)
+struct CIExy {
+  double x = 0.0;
+  double y = 0.0;
+};
+
+struct PrimariesCIExy {
+  CIExy r;
+  CIExy g;
+  CIExy b;
+};
+
+// Serializable form of CIExy.
+struct Customxy {
+  static constexpr uint32_t kMul = 1000000;
+  static constexpr double kRoughLimit = 4.0;
+  static constexpr int32_t kMin = -0x200000;
+  static constexpr int32_t kMax = 0x1FFFFF;
+
+  int32_t x = 0;
+  int32_t y = 0;
+
+  CIExy GetValue() const {
+    CIExy xy;
+    xy.x = x * (1.0 / kMul);
+    xy.y = y * (1.0 / kMul);
+    return xy;
+  }
+
+  Status SetValue(const CIExy& xy) {
+    bool ok = (std::abs(xy.x) < kRoughLimit) && (std::abs(xy.y) < kRoughLimit);
+    if (!ok) return JXL_FAILURE("X or Y is out of bounds");
+    x = static_cast<int32_t>(roundf(xy.x * kMul));
+    if (x < kMin || x > kMax) return JXL_FAILURE("X is out of bounds");
+    y = static_cast<int32_t>(roundf(xy.y * kMul));
+    if (y < kMin || y > kMax) return JXL_FAILURE("Y is out of bounds");
+    return true;
+  }
+
+  bool IsSame(const Customxy& other) const {
+    return (x == other.x) && (y == other.y);
+  }
+};
+
+static inline Status WhitePointFromExternal(const JxlWhitePoint external,
+                                            WhitePoint* out) {
+  switch (external) {
+    case JXL_WHITE_POINT_D65:
+      *out = WhitePoint::kD65;
+      return true;
+    case JXL_WHITE_POINT_CUSTOM:
+      *out = WhitePoint::kCustom;
+      return true;
+    case JXL_WHITE_POINT_E:
+      *out = WhitePoint::kE;
+      return true;
+    case JXL_WHITE_POINT_DCI:
+      *out = WhitePoint::kDCI;
+      return true;
+  }
+  return JXL_FAILURE("Invalid WhitePoint enum value %d",
+                     static_cast<int>(external));
+}
+
+static inline Status PrimariesFromExternal(const JxlPrimaries external,
+                                           Primaries* out) {
+  switch (external) {
+    case JXL_PRIMARIES_SRGB:
+      *out = Primaries::kSRGB;
+      return true;
+    case JXL_PRIMARIES_CUSTOM:
+      *out = Primaries::kCustom;
+      return true;
+    case JXL_PRIMARIES_2100:
+      *out = Primaries::k2100;
+      return true;
+    case JXL_PRIMARIES_P3:
+      *out = Primaries::kP3;
+      return true;
+  }
+  return JXL_FAILURE("Invalid Primaries enum value");
+}
+
+static inline Status RenderingIntentFromExternal(
+    const JxlRenderingIntent external, RenderingIntent* out) {
+  switch (external) {
+    case JXL_RENDERING_INTENT_PERCEPTUAL:
+      *out = RenderingIntent::kPerceptual;
+      return true;
+    case JXL_RENDERING_INTENT_RELATIVE:
+      *out = RenderingIntent::kRelative;
+      return true;
+    case JXL_RENDERING_INTENT_SATURATION:
+      *out = RenderingIntent::kSaturation;
+      return true;
+    case JXL_RENDERING_INTENT_ABSOLUTE:
+      *out = RenderingIntent::kAbsolute;
+      return true;
+  }
+  return JXL_FAILURE("Invalid RenderingIntent enum value");
+}
+
+struct CustomTransferFunction {
+  // Highest reasonable value for the gamma of a transfer curve.
+  static constexpr uint32_t kMaxGamma = 8192;
+  static constexpr uint32_t kGammaMul = 10000000;
+
+  bool have_gamma = false;
+
+  // OETF exponent to go from linear to gamma-compressed.
+  uint32_t gamma = 0;  // Only used if have_gamma_.
+
+  // Can be kUnknown.
+  TransferFunction transfer_function =
+      TransferFunction::kSRGB;  // Only used if !have_gamma_.
+
+  TransferFunction GetTransferFunction() const {
+    JXL_ASSERT(!have_gamma);
+    return transfer_function;
+  }
+  void SetTransferFunction(const TransferFunction tf) {
+    have_gamma = false;
+    transfer_function = tf;
+  }
+
+  bool IsUnknown() const {
+    return !have_gamma && (transfer_function == TransferFunction::kUnknown);
+  }
+  bool IsSRGB() const {
+    return !have_gamma && (transfer_function == TransferFunction::kSRGB);
+  }
+  bool IsLinear() const {
+    return !have_gamma && (transfer_function == TransferFunction::kLinear);
+  }
+  bool IsPQ() const {
+    return !have_gamma && (transfer_function == TransferFunction::kPQ);
+  }
+  bool IsHLG() const {
+    return !have_gamma && (transfer_function == TransferFunction::kHLG);
+  }
+  bool Is709() const {
+    return !have_gamma && (transfer_function == TransferFunction::k709);
+  }
+  bool IsDCI() const {
+    return !have_gamma && (transfer_function == TransferFunction::kDCI);
+  }
+
+  double GetGamma() const {
+    JXL_ASSERT(have_gamma);
+    return gamma * (1.0 / kGammaMul);  // (0, 1)
+  }
+  Status SetGamma(double new_gamma) {
+    if (new_gamma < (1.0 / kMaxGamma) || new_gamma > 1.0) {
+      return JXL_FAILURE("Invalid gamma %f", new_gamma);
+    }
+
+    have_gamma = false;
+    if (ApproxEq(new_gamma, 1.0)) {
+      transfer_function = TransferFunction::kLinear;
+      return true;
+    }
+    if (ApproxEq(new_gamma, 1.0 / 2.6)) {
+      transfer_function = TransferFunction::kDCI;
+      return true;
+    }
+    // Don't translate 0.45.. to kSRGB nor k709 - that might change pixel
+    // values because those curves also have a linear part.
+
+    have_gamma = true;
+    gamma = roundf(new_gamma * kGammaMul);
+    transfer_function = TransferFunction::kUnknown;
+    return true;
+  }
+
+  bool IsSame(const CustomTransferFunction& other) const {
+    if (have_gamma != other.have_gamma) {
+      return false;
+    }
+    if (have_gamma) {
+      if (gamma != other.gamma) {
+        return false;
+      }
+    } else {
+      if (transfer_function != other.transfer_function) {
+        return false;
+      }
+    }
+    return true;
+  }
+};
+
+static inline Status ConvertExternalToInternalTransferFunction(
+    const JxlTransferFunction external, TransferFunction* internal) {
+  switch (external) {
+    case JXL_TRANSFER_FUNCTION_709:
+      *internal = TransferFunction::k709;
+      return true;
+    case JXL_TRANSFER_FUNCTION_UNKNOWN:
+      *internal = TransferFunction::kUnknown;
+      return true;
+    case JXL_TRANSFER_FUNCTION_LINEAR:
+      *internal = TransferFunction::kLinear;
+      return true;
+    case JXL_TRANSFER_FUNCTION_SRGB:
+      *internal = TransferFunction::kSRGB;
+      return true;
+    case JXL_TRANSFER_FUNCTION_PQ:
+      *internal = TransferFunction::kPQ;
+      return true;
+    case JXL_TRANSFER_FUNCTION_DCI:
+      *internal = TransferFunction::kDCI;
+      return true;
+    case JXL_TRANSFER_FUNCTION_HLG:
+      *internal = TransferFunction::kHLG;
+      return true;
+    case JXL_TRANSFER_FUNCTION_GAMMA:
+      return JXL_FAILURE("Gamma should be handled separately");
+  }
+  return JXL_FAILURE("Invalid TransferFunction enum value");
+}
+
+// Compact encoding of data required to interpret and translate pixels to a
+// known color space. Stored in Metadata. Thread-compatible.
+struct ColorEncoding {
+  // Only valid if HaveFields()
+  WhitePoint white_point = WhitePoint::kD65;
+  Primaries primaries = Primaries::kSRGB;  // Only valid if HasPrimaries()
+  RenderingIntent rendering_intent = RenderingIntent::kRelative;
+
+  // When false, fields such as white_point and tf are invalid and must not be
+  // used. This occurs after setting a raw bytes-only ICC profile, only the
+  // ICC bytes may be used. The color_space_ field is still valid.
+  bool have_fields = true;
+
+  IccBytes icc;  // Valid ICC profile
+
+  ColorSpace color_space = ColorSpace::kRGB;  // Can be kUnknown
+  bool cmyk = false;
+
+  // "late sync" fields
+  CustomTransferFunction tf;
+  Customxy white;  // Only used if white_point == kCustom
+  Customxy red;    // Only used if primaries == kCustom
+  Customxy green;  // Only used if primaries == kCustom
+  Customxy blue;   // Only used if primaries == kCustom
+
+  // Returns false if the field is invalid and unusable.
+  bool HasPrimaries() const {
+    return (color_space != ColorSpace::kGray) &&
+           (color_space != ColorSpace::kXYB);
+  }
+
+  size_t Channels() const { return (color_space == ColorSpace::kGray) ? 1 : 3; }
+
+  PrimariesCIExy GetPrimaries() const {
+    JXL_DASSERT(have_fields);
+    JXL_ASSERT(HasPrimaries());
+    PrimariesCIExy xy;
+    switch (primaries) {
+      case Primaries::kCustom:
+        xy.r = red.GetValue();
+        xy.g = green.GetValue();
+        xy.b = blue.GetValue();
+        return xy;
+
+      case Primaries::kSRGB:
+        xy.r.x = 0.639998686;
+        xy.r.y = 0.330010138;
+        xy.g.x = 0.300003784;
+        xy.g.y = 0.600003357;
+        xy.b.x = 0.150002046;
+        xy.b.y = 0.059997204;
+        return xy;
+
+      case Primaries::k2100:
+        xy.r.x = 0.708;
+        xy.r.y = 0.292;
+        xy.g.x = 0.170;
+        xy.g.y = 0.797;
+        xy.b.x = 0.131;
+        xy.b.y = 0.046;
+        return xy;
+
+      case Primaries::kP3:
+        xy.r.x = 0.680;
+        xy.r.y = 0.320;
+        xy.g.x = 0.265;
+        xy.g.y = 0.690;
+        xy.b.x = 0.150;
+        xy.b.y = 0.060;
+        return xy;
+    }
+    JXL_UNREACHABLE("Invalid Primaries %u", static_cast<uint32_t>(primaries));
+  }
+
+  Status SetPrimaries(const PrimariesCIExy& xy) {
+    JXL_DASSERT(have_fields);
+    JXL_ASSERT(HasPrimaries());
+    if (xy.r.x == 0.0 || xy.r.y == 0.0 || xy.g.x == 0.0 || xy.g.y == 0.0 ||
+        xy.b.x == 0.0 || xy.b.y == 0.0) {
+      return JXL_FAILURE("Invalid primaries %f %f %f %f %f %f", xy.r.x, xy.r.y,
+                         xy.g.x, xy.g.y, xy.b.x, xy.b.y);
+    }
+
+    if (ApproxEq(xy.r.x, 0.64) && ApproxEq(xy.r.y, 0.33) &&
+        ApproxEq(xy.g.x, 0.30) && ApproxEq(xy.g.y, 0.60) &&
+        ApproxEq(xy.b.x, 0.15) && ApproxEq(xy.b.y, 0.06)) {
+      primaries = Primaries::kSRGB;
+      return true;
+    }
+
+    if (ApproxEq(xy.r.x, 0.708) && ApproxEq(xy.r.y, 0.292) &&
+        ApproxEq(xy.g.x, 0.170) && ApproxEq(xy.g.y, 0.797) &&
+        ApproxEq(xy.b.x, 0.131) && ApproxEq(xy.b.y, 0.046)) {
+      primaries = Primaries::k2100;
+      return true;
+    }
+    if (ApproxEq(xy.r.x, 0.680) && ApproxEq(xy.r.y, 0.320) &&
+        ApproxEq(xy.g.x, 0.265) && ApproxEq(xy.g.y, 0.690) &&
+        ApproxEq(xy.b.x, 0.150) && ApproxEq(xy.b.y, 0.060)) {
+      primaries = Primaries::kP3;
+      return true;
+    }
+
+    primaries = Primaries::kCustom;
+    JXL_RETURN_IF_ERROR(red.SetValue(xy.r));
+    JXL_RETURN_IF_ERROR(green.SetValue(xy.g));
+    JXL_RETURN_IF_ERROR(blue.SetValue(xy.b));
+    return true;
+  }
+
+  CIExy GetWhitePoint() const {
+    JXL_DASSERT(have_fields);
+    CIExy xy;
+    switch (white_point) {
+      case WhitePoint::kCustom:
+        return white.GetValue();
+
+      case WhitePoint::kD65:
+        xy.x = 0.3127;
+        xy.y = 0.3290;
+        return xy;
+
+      case WhitePoint::kDCI:
+        // From https://ieeexplore.ieee.org/document/7290729 C.2 page 11
+        xy.x = 0.314;
+        xy.y = 0.351;
+        return xy;
+
+      case WhitePoint::kE:
+        xy.x = xy.y = 1.0 / 3;
+        return xy;
+    }
+    JXL_UNREACHABLE("Invalid WhitePoint %u",
+                    static_cast<uint32_t>(white_point));
+  }
+
+  Status SetWhitePoint(const CIExy& xy) {
+    JXL_DASSERT(have_fields);
+    if (xy.x == 0.0 || xy.y == 0.0) {
+      return JXL_FAILURE("Invalid white point %f %f", xy.x, xy.y);
+    }
+    if (ApproxEq(xy.x, 0.3127) && ApproxEq(xy.y, 0.3290)) {
+      white_point = WhitePoint::kD65;
+      return true;
+    }
+    if (ApproxEq(xy.x, 1.0 / 3) && ApproxEq(xy.y, 1.0 / 3)) {
+      white_point = WhitePoint::kE;
+      return true;
+    }
+    if (ApproxEq(xy.x, 0.314) && ApproxEq(xy.y, 0.351)) {
+      white_point = WhitePoint::kDCI;
+      return true;
+    }
+    white_point = WhitePoint::kCustom;
+    return white.SetValue(xy);
+  }
+
+  // Checks if the color spaces (including white point / primaries) are the
+  // same, but ignores the transfer function, rendering intent and ICC bytes.
+  bool SameColorSpace(const ColorEncoding& other) const {
+    if (color_space != other.color_space) return false;
+
+    if (white_point != other.white_point) return false;
+    if (white_point == WhitePoint::kCustom) {
+      if (!white.IsSame(other.white)) {
+        return false;
+      }
+    }
+
+    if (HasPrimaries() != other.HasPrimaries()) return false;
+    if (HasPrimaries()) {
+      if (primaries != other.primaries) return false;
+      if (primaries == Primaries::kCustom) {
+        if (!red.IsSame(other.red)) return false;
+        if (!green.IsSame(other.green)) return false;
+        if (!blue.IsSame(other.blue)) return false;
+      }
+    }
+    return true;
+  }
+
+  // Checks if the color space and transfer function are the same, ignoring
+  // rendering intent and ICC bytes
+  bool SameColorEncoding(const ColorEncoding& other) const {
+    return SameColorSpace(other) && tf.IsSame(other.tf);
+  }
+
+  // Returns true if all fields have been initialized (possibly to kUnknown).
+  // Returns false if the ICC profile is invalid or decoding it fails.
+  Status SetFieldsFromICC(IccBytes&& new_icc, const JxlCmsInterface& cms) {
+    // In case parsing fails, mark the ColorEncoding as invalid.
+    JXL_ASSERT(!new_icc.empty());
+    color_space = ColorSpace::kUnknown;
+    tf.transfer_function = TransferFunction::kUnknown;
+    icc.clear();
+
+    JxlColorEncoding external;
+    JXL_BOOL new_cmyk;
+    JXL_RETURN_IF_ERROR(cms.set_fields_from_icc(cms.set_fields_data,
+                                                new_icc.data(), new_icc.size(),
+                                                &external, &new_cmyk));
+    cmyk = new_cmyk;
+    if (cmyk) return true;
+    JXL_RETURN_IF_ERROR(FromExternal(external));
+    icc = std::move(new_icc);
+    return true;
+  }
+
+  JxlColorEncoding ToExternal() const {
+    JxlColorEncoding external = {};
+    if (!have_fields) {
+      external.color_space = JXL_COLOR_SPACE_UNKNOWN;
+      external.primaries = JXL_PRIMARIES_CUSTOM;
+      external.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;  //?
+      external.transfer_function = JXL_TRANSFER_FUNCTION_UNKNOWN;
+      external.white_point = JXL_WHITE_POINT_CUSTOM;
+      return external;
+    }
+    external.color_space = static_cast<JxlColorSpace>(color_space);
+
+    external.white_point = static_cast<JxlWhitePoint>(white_point);
+
+    CIExy wp = GetWhitePoint();
+    external.white_point_xy[0] = wp.x;
+    external.white_point_xy[1] = wp.y;
+
+    if (external.color_space == JXL_COLOR_SPACE_RGB ||
+        external.color_space == JXL_COLOR_SPACE_UNKNOWN) {
+      external.primaries = static_cast<JxlPrimaries>(primaries);
+      PrimariesCIExy p = GetPrimaries();
+      external.primaries_red_xy[0] = p.r.x;
+      external.primaries_red_xy[1] = p.r.y;
+      external.primaries_green_xy[0] = p.g.x;
+      external.primaries_green_xy[1] = p.g.y;
+      external.primaries_blue_xy[0] = p.b.x;
+      external.primaries_blue_xy[1] = p.b.y;
+    }
+
+    if (tf.have_gamma) {
+      external.transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
+      external.gamma = tf.GetGamma();
+    } else {
+      external.transfer_function =
+          static_cast<JxlTransferFunction>(tf.GetTransferFunction());
+      external.gamma = 0;
+    }
+
+    external.rendering_intent =
+        static_cast<JxlRenderingIntent>(rendering_intent);
+    return external;
+  }
+
+  // NB: does not create ICC.
+  Status FromExternal(const JxlColorEncoding& external) {
+    // TODO(eustas): update non-serializable on call-site
+    color_space = static_cast<ColorSpace>(external.color_space);
+
+    JXL_RETURN_IF_ERROR(
+        WhitePointFromExternal(external.white_point, &white_point));
+    if (external.white_point == JXL_WHITE_POINT_CUSTOM) {
+      CIExy wp;
+      wp.x = external.white_point_xy[0];
+      wp.y = external.white_point_xy[1];
+      JXL_RETURN_IF_ERROR(SetWhitePoint(wp));
+    }
+
+    if (external.color_space == JXL_COLOR_SPACE_RGB ||
+        external.color_space == JXL_COLOR_SPACE_UNKNOWN) {
+      JXL_RETURN_IF_ERROR(
+          PrimariesFromExternal(external.primaries, &primaries));
+      if (external.primaries == JXL_PRIMARIES_CUSTOM) {
+        PrimariesCIExy primaries;
+        primaries.r.x = external.primaries_red_xy[0];
+        primaries.r.y = external.primaries_red_xy[1];
+        primaries.g.x = external.primaries_green_xy[0];
+        primaries.g.y = external.primaries_green_xy[1];
+        primaries.b.x = external.primaries_blue_xy[0];
+        primaries.b.y = external.primaries_blue_xy[1];
+        JXL_RETURN_IF_ERROR(SetPrimaries(primaries));
+      }
+    }
+    CustomTransferFunction tf;
+    if (external.transfer_function == JXL_TRANSFER_FUNCTION_GAMMA) {
+      JXL_RETURN_IF_ERROR(tf.SetGamma(external.gamma));
+    } else {
+      TransferFunction tf_enum;
+      // JXL_TRANSFER_FUNCTION_GAMMA is not handled by this function since
+      // there's no internal enum value for it.
+      JXL_RETURN_IF_ERROR(ConvertExternalToInternalTransferFunction(
+          external.transfer_function, &tf_enum));
+      tf.SetTransferFunction(tf_enum);
+    }
+    this->tf = tf;
+
+    JXL_RETURN_IF_ERROR(RenderingIntentFromExternal(external.rendering_intent,
+                                                    &rendering_intent));
+
+    icc.clear();
+
+    return true;
+  }
+};
+
+}  // namespace cms
+}  // namespace jxl
+
+#endif  // LIB_JXL_CMS_COLOR_ENCODING_CMS_H_
similarity index 76%
rename from lib/jxl/enc_color_management.cc
rename to lib/jxl/cms/jxl_cms.cc
index 0b031d2..69143ae 100644 (file)
@@ -3,38 +3,37 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/enc_color_management.h"
+#include <jxl/cms.h>
 
 #ifndef JPEGXL_ENABLE_SKCMS
 #define JPEGXL_ENABLE_SKCMS 0
 #endif
 
-#include <math.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
+#include <jxl/cms_interface.h>
 
 #include <algorithm>
 #include <array>
-#include <atomic>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
 #include <memory>
-#include <string>
-#include <utility>
 
 #undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "lib/jxl/enc_color_management.cc"
+#define HWY_TARGET_INCLUDE "lib/jxl/cms/jxl_cms.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/matrix_ops.h"
 #include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/field_encodings.h"
-#include "lib/jxl/linalg.h"
-#include "lib/jxl/transfer_functions-inl.h"
+#include "lib/jxl/cms/jxl_cms_internal.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
+#include "lib/jxl/color_encoding_internal.h"
 #if JPEGXL_ENABLE_SKCMS
-#include "lib/jxl/enc_jxl_skcms.h"
+#include "skcms.h"
 #else  // JPEGXL_ENABLE_SKCMS
 #include "lcms2.h"
 #include "lcms2_plugin.h"
 
 // Define these only once. We can't use HWY_ONCE here because it is defined as
 // 1 only on the last pass.
-#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
-#define LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+#ifndef LIB_JXL_JXL_CMS_CC
+#define LIB_JXL_JXL_CMS_CC
 
 namespace jxl {
 namespace {
+
+using ::jxl::cms::ColorEncoding;
+
 struct JxlCms {
 #if JPEGXL_ENABLE_SKCMS
-  PaddedBytes icc_src, icc_dst;
+  IccBytes icc_src, icc_dst;
   skcms_ICCProfile profile_src, profile_dst;
 #else
   void* lcms_transform;
@@ -65,8 +67,12 @@ struct JxlCms {
 
   size_t channels_src;
   size_t channels_dst;
-  ImageF buf_src;
-  ImageF buf_dst;
+
+  std::vector<float> src_storage;
+  std::vector<float*> buf_src;
+  std::vector<float> dst_storage;
+  std::vector<float*> buf_dst;
+
   float intensity_target;
   bool skip_lcms = false;
   ExtraTF preprocess = ExtraTF::kNone;
@@ -78,7 +84,7 @@ Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize,
 }  // namespace
 }  // namespace jxl
 
-#endif  // LIB_JXL_ENC_COLOR_MANAGEMENT_CC_
+#endif  // LIB_JXL_JXL_CMS_CC
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -97,16 +103,11 @@ Status BeforeTransform(JxlCms* t, const float* buf_src, float* xform_src,
       break;
 
     case ExtraTF::kPQ: {
-      // By default, PQ content has an intensity target of 10000, stored
-      // exactly.
       HWY_FULL(float) df;
-      const auto multiplier = Set(df, t->intensity_target == 10000.f
-                                          ? 1.0f
-                                          : 10000.f / t->intensity_target);
+      TF_PQ tf_pq(t->intensity_target);
       for (size_t i = 0; i < buf_size; i += Lanes(df)) {
         const auto val = Load(df, buf_src + i);
-        const auto result =
-            Mul(multiplier, TF_PQ().DisplayFromEncoded(df, val));
+        const auto result = tf_pq.DisplayFromEncoded(df, val);
         Store(result, df, xform_src + i);
       }
 #if JXL_CMS_VERBOSE >= 2
@@ -120,7 +121,7 @@ Status BeforeTransform(JxlCms* t, const float* buf_src, float* xform_src,
     case ExtraTF::kHLG:
       for (size_t i = 0; i < buf_size; ++i) {
         xform_src[i] = static_cast<float>(
-            TF_HLG().DisplayFromEncoded(static_cast<double>(buf_src[i])));
+            TF_HLG_Base::DisplayFromEncoded(static_cast<double>(buf_src[i])));
       }
       if (t->apply_hlg_ootf) {
         JXL_RETURN_IF_ERROR(
@@ -158,13 +159,10 @@ Status AfterTransform(JxlCms* t, float* JXL_RESTRICT buf_dst, size_t buf_size) {
       break;
     case ExtraTF::kPQ: {
       HWY_FULL(float) df;
-      const auto multiplier =
-          Set(df, t->intensity_target == 10000.f ? 1.0f
-                                                 : t->intensity_target * 1e-4f);
+      TF_PQ tf_pq(t->intensity_target);
       for (size_t i = 0; i < buf_size; i += Lanes(df)) {
         const auto val = Load(df, buf_dst + i);
-        const auto result =
-            TF_PQ().EncodedFromDisplay(df, Mul(multiplier, val));
+        const auto result = tf_pq.EncodedFromDisplay(df, val);
         Store(result, df, buf_dst + i);
       }
 #if JXL_CMS_VERBOSE >= 2
@@ -180,7 +178,7 @@ Status AfterTransform(JxlCms* t, float* JXL_RESTRICT buf_dst, size_t buf_size) {
       }
       for (size_t i = 0; i < buf_size; ++i) {
         buf_dst[i] = static_cast<float>(
-            TF_HLG().EncodedFromDisplay(static_cast<double>(buf_dst[i])));
+            TF_HLG_Base::EncodedFromDisplay(static_cast<double>(buf_dst[i])));
       }
 #if JXL_CMS_VERBOSE >= 2
       printf("after HLG enc %.4f %.4f %.4f\n", buf_dst[3 * kX],
@@ -191,8 +189,7 @@ Status AfterTransform(JxlCms* t, float* JXL_RESTRICT buf_dst, size_t buf_size) {
       HWY_FULL(float) df;
       for (size_t i = 0; i < buf_size; i += Lanes(df)) {
         const auto val = Load(df, buf_dst + i);
-        const auto result =
-            TF_SRGB().EncodedFromDisplay(HWY_FULL(float)(), val);
+        const auto result = TF_SRGB().EncodedFromDisplay(df, val);
         Store(result, df, buf_dst + i);
       }
 #if JXL_CMS_VERBOSE >= 2
@@ -212,7 +209,7 @@ Status DoColorSpaceTransform(void* cms_data, const size_t thread,
 
   const float* xform_src = buf_src;  // Read-only.
   if (t->preprocess != ExtraTF::kNone) {
-    float* mutable_xform_src = t->buf_src.Row(thread);  // Writable buffer.
+    float* mutable_xform_src = t->buf_src[thread];  // Writable buffer.
     JXL_RETURN_IF_ERROR(BeforeTransform(t, buf_src, mutable_xform_src,
                                         xsize * t->channels_src));
     xform_src = mutable_xform_src;
@@ -221,8 +218,8 @@ Status DoColorSpaceTransform(void* cms_data, const size_t thread,
 #if JPEGXL_ENABLE_SKCMS
   if (t->channels_src == 1 && !t->skip_lcms) {
     // Expand from 1 to 3 channels, starting from the end in case
-    // xform_src == t->buf_src.Row(thread).
-    float* mutable_xform_src = t->buf_src.Row(thread);
+    // xform_src == t->buf_src[thread].
+    float* mutable_xform_src = t->buf_src[thread];
     for (size_t i = 0; i < xsize; ++i) {
       const size_t x = xsize - i - 1;
       mutable_xform_src[x * 3] = mutable_xform_src[x * 3 + 1] =
@@ -233,7 +230,7 @@ Status DoColorSpaceTransform(void* cms_data, const size_t thread,
 #else
   if (t->channels_src == 4 && !t->skip_lcms) {
     // LCMS does CMYK in a weird way: 0 = white, 100 = max ink
-    float* mutable_xform_src = t->buf_src.Row(thread);
+    float* mutable_xform_src = t->buf_src[thread];
     for (size_t x = 0; x < xsize * 4; ++x) {
       mutable_xform_src[x] = 100.f - 100.f * mutable_xform_src[x];
     }
@@ -275,7 +272,7 @@ Status DoColorSpaceTransform(void* cms_data, const size_t thread,
 #if JPEGXL_ENABLE_SKCMS
   if (t->channels_dst == 1 && !t->skip_lcms) {
     // Contract back from 3 to 1 channel, this time forward.
-    float* grayscale_buf_dst = t->buf_dst.Row(thread);
+    float* grayscale_buf_dst = t->buf_dst[thread];
     for (size_t x = 0; x < xsize; ++x) {
       grayscale_buf_dst[x] = buf_dst[x * 3];
     }
@@ -339,11 +336,6 @@ JXL_MUST_USE_RESULT cmsCIEXYZ D50_XYZ() {
   return {0.96420288, 1.0, 0.82490540};
 }
 
-JXL_MUST_USE_RESULT cmsCIExyY xyYFromCIExy(const CIExy& xy) {
-  const cmsCIExyY xyY = {xy.x, xy.y, 1.0};
-  return xyY;
-}
-
 // RAII
 
 struct ProfileDeleter {
@@ -380,8 +372,8 @@ Status DecodeProfile(const uint8_t* icc, size_t size,
   }
   return true;
 }
-#else  // JPEGXL_ENABLE_SKCMS
-Status DecodeProfile(const cmsContext context, const PaddedBytes& icc,
+#else   // JPEGXL_ENABLE_SKCMS
+Status DecodeProfile(const cmsContext context, Span<const uint8_t> icc,
                      Profile* profile) {
   profile->reset(cmsOpenProfileFromMemTHR(context, icc.data(), icc.size()));
   if (profile->get() == nullptr) {
@@ -412,14 +404,6 @@ ColorSpace ColorSpaceFromProfile(const skcms_ICCProfile& profile) {
   }
 }
 
-// "profile1" is pre-decoded to save time in DetectTransferFunction.
-Status ProfileEquivalentToICC(const skcms_ICCProfile& profile1,
-                              const PaddedBytes& icc) {
-  skcms_ICCProfile profile2;
-  JXL_RETURN_IF_ERROR(skcms_Parse(icc.data(), icc.size(), &profile2));
-  return skcms_ApproximatelyEqualProfiles(&profile1, &profile2);
-}
-
 // vector_out := matmul(matrix, vector_in)
 void MatrixProduct(const skcms_Matrix3x3& matrix, const float vector_in[3],
                    float vector_out[3]) {
@@ -475,7 +459,8 @@ Status IdentifyPrimaries(const skcms_ICCProfile& profile,
          {-0.0085287, 0.0400428, 0.9684867}}};
     static constexpr float kWpD50XYZ[3] = {0.96420288, 1.0, 0.82490540};
     float wp_unadapted_XYZ[3];
-    JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(wp_unadapted, wp_unadapted_XYZ));
+    JXL_RETURN_IF_ERROR(
+        CIEXYZFromWhiteCIExy(wp_unadapted.x, wp_unadapted.y, wp_unadapted_XYZ));
     float wp_D50_LMS[3], wp_unadapted_LMS[3];
     MatrixProduct(kLMSFromXYZ, kWpD50XYZ, wp_D50_LMS);
     MatrixProduct(kLMSFromXYZ, wp_unadapted_XYZ, wp_unadapted_LMS);
@@ -502,23 +487,64 @@ Status IdentifyPrimaries(const skcms_ICCProfile& profile,
   return c->SetPrimaries(primaries);
 }
 
+bool IsApproximatelyEqual(const skcms_ICCProfile& profile,
+                          const ColorEncoding& JXL_RESTRICT c) {
+  IccBytes bytes;
+  if (!MaybeCreateProfile(c.ToExternal(), &bytes)) {
+    return false;
+  }
+
+  skcms_ICCProfile profile_test;
+  if (!DecodeProfile(bytes.data(), bytes.size(), &profile_test)) {
+    return false;
+  }
+
+  if (!skcms_ApproximatelyEqualProfiles(&profile_test, &profile)) {
+    return false;
+  }
+
+  return true;
+}
+
 void DetectTransferFunction(const skcms_ICCProfile& profile,
                             ColorEncoding* JXL_RESTRICT c) {
-  if (c->tf.SetImplicit()) return;
+  JXL_CHECK(c->color_space != ColorSpace::kXYB);
+
+  float gamma[3] = {};
+  if (profile.has_trc) {
+    const auto IsGamma = [](const skcms_TransferFunction& tf) {
+      return tf.a == 1 && tf.b == 0 &&
+             /* if b and d are zero, it is fine for c not to be */ tf.d == 0 &&
+             tf.e == 0 && tf.f == 0;
+    };
+    for (int i = 0; i < 3; ++i) {
+      if (profile.trc[i].table_entries == 0 &&
+          IsGamma(profile.trc->parametric)) {
+        gamma[i] = 1.f / profile.trc->parametric.g;
+      } else {
+        skcms_TransferFunction approximate_tf;
+        float max_error;
+        if (skcms_ApproximateCurve(&profile.trc[i], &approximate_tf,
+                                   &max_error)) {
+          if (IsGamma(approximate_tf)) {
+            gamma[i] = 1.f / approximate_tf.g;
+          }
+        }
+      }
+    }
+  }
+  if (gamma[0] != 0 && std::abs(gamma[0] - gamma[1]) < 1e-4f &&
+      std::abs(gamma[1] - gamma[2]) < 1e-4f) {
+    if (c->tf.SetGamma(gamma[0])) {
+      if (IsApproximatelyEqual(profile, *c)) return;
+    }
+  }
 
   for (TransferFunction tf : Values<TransferFunction>()) {
     // Can only create profile from known transfer function.
     if (tf == TransferFunction::kUnknown) continue;
-
     c->tf.SetTransferFunction(tf);
-
-    skcms_ICCProfile profile_test;
-    PaddedBytes bytes;
-    if (MaybeCreateProfile(*c, &bytes) &&
-        DecodeProfile(bytes.data(), bytes.size(), &profile_test) &&
-        skcms_ApproximatelyEqualProfiles(&profile, &profile_test)) {
-      return;
-    }
+    if (IsApproximatelyEqual(profile, *c)) return;
   }
 
   c->tf.SetTransferFunction(TransferFunction::kUnknown);
@@ -528,12 +554,12 @@ void DetectTransferFunction(const skcms_ICCProfile& profile,
 
 uint32_t Type32(const ColorEncoding& c, bool cmyk) {
   if (cmyk) return TYPE_CMYK_FLT;
-  if (c.IsGray()) return TYPE_GRAY_FLT;
+  if (c.color_space == ColorSpace::kGray) return TYPE_GRAY_FLT;
   return TYPE_RGB_FLT;
 }
 
 uint32_t Type64(const ColorEncoding& c) {
-  if (c.IsGray()) return TYPE_GRAY_DBL;
+  if (c.color_space == ColorSpace::kGray) return TYPE_GRAY_DBL;
   return TYPE_RGB_DBL;
 }
 
@@ -551,11 +577,11 @@ ColorSpace ColorSpaceFromProfile(const Profile& profile) {
 
 // "profile1" is pre-decoded to save time in DetectTransferFunction.
 Status ProfileEquivalentToICC(const cmsContext context, const Profile& profile1,
-                              const PaddedBytes& icc, const ColorEncoding& c) {
+                              const IccBytes& icc, const ColorEncoding& c) {
   const uint32_t type_src = Type64(c);
 
   Profile profile2;
-  JXL_RETURN_IF_ERROR(DecodeProfile(context, icc, &profile2));
+  JXL_RETURN_IF_ERROR(DecodeProfile(context, Bytes(icc), &profile2));
 
   Profile profile_xyz;
   JXL_RETURN_IF_ERROR(CreateProfileXYZ(context, &profile_xyz));
@@ -581,12 +607,12 @@ Status ProfileEquivalentToICC(const cmsContext context, const Profile& profile1,
   const double init = 1E-3;
   const double step = 0.2;
 
-  if (c.IsGray()) {
+  if (c.color_space == ColorSpace::kGray) {
     // Finer sampling and replicate each component.
     for (in[0] = init; in[0] < 1.0; in[0] += step / 8) {
       cmsDoTransform(xform1.get(), in, out1, 1);
       cmsDoTransform(xform2.get(), in, out2, 1);
-      if (!ApproxEq(out1[0], out2[0], 2E-4)) {
+      if (!cms::ApproxEq(out1[0], out2[0], 2E-4)) {
         return false;
       }
     }
@@ -597,7 +623,7 @@ Status ProfileEquivalentToICC(const cmsContext context, const Profile& profile1,
           cmsDoTransform(xform1.get(), in, out1, 1);
           cmsDoTransform(xform2.get(), in, out2, 1);
           for (size_t i = 0; i < 3; ++i) {
-            if (!ApproxEq(out1[i], out2[i], 2E-4)) {
+            if (!cms::ApproxEq(out1[i], out2[i], 2E-4)) {
               return false;
             }
           }
@@ -704,7 +730,44 @@ Status IdentifyPrimaries(const cmsContext context, const Profile& profile,
 
 void DetectTransferFunction(const cmsContext context, const Profile& profile,
                             ColorEncoding* JXL_RESTRICT c) {
-  if (c->tf.SetImplicit()) return;
+  JXL_CHECK(c->color_space != ColorSpace::kXYB);
+
+  float gamma = 0;
+  if (const auto* gray_trc = reinterpret_cast<const cmsToneCurve*>(
+          cmsReadTag(profile.get(), cmsSigGrayTRCTag))) {
+    const double estimated_gamma =
+        cmsEstimateGamma(gray_trc, /*precision=*/1e-4);
+    if (estimated_gamma > 0) {
+      gamma = 1. / estimated_gamma;
+    }
+  } else {
+    float rgb_gamma[3] = {};
+    int i = 0;
+    for (const auto tag :
+         {cmsSigRedTRCTag, cmsSigGreenTRCTag, cmsSigBlueTRCTag}) {
+      if (const auto* trc = reinterpret_cast<const cmsToneCurve*>(
+              cmsReadTag(profile.get(), tag))) {
+        const double estimated_gamma =
+            cmsEstimateGamma(trc, /*precision=*/1e-4);
+        if (estimated_gamma > 0) {
+          rgb_gamma[i] = 1. / estimated_gamma;
+        }
+      }
+      ++i;
+    }
+    if (rgb_gamma[0] != 0 && std::abs(rgb_gamma[0] - rgb_gamma[1]) < 1e-4f &&
+        std::abs(rgb_gamma[1] - rgb_gamma[2]) < 1e-4f) {
+      gamma = rgb_gamma[0];
+    }
+  }
+
+  if (gamma != 0 && c->tf.SetGamma(gamma)) {
+    IccBytes icc_test;
+    if (MaybeCreateProfile(c->ToExternal(), &icc_test) &&
+        ProfileEquivalentToICC(context, profile, icc_test, *c)) {
+      return;
+    }
+  }
 
   for (TransferFunction tf : Values<TransferFunction>()) {
     // Can only create profile from known transfer function.
@@ -712,8 +775,8 @@ void DetectTransferFunction(const cmsContext context, const Profile& profile,
 
     c->tf.SetTransferFunction(tf);
 
-    PaddedBytes icc_test;
-    if (MaybeCreateProfile(*c, &icc_test) &&
+    IccBytes icc_test;
+    if (MaybeCreateProfile(c->ToExternal(), &icc_test) &&
         ProfileEquivalentToICC(context, profile, icc_test, *c)) {
       return;
     }
@@ -767,8 +830,8 @@ Status GetPrimariesLuminances(const ColorEncoding& encoding,
   // that primary.
 
   float white_XYZ[3];
-  JXL_RETURN_IF_ERROR(
-      CIEXYZFromWhiteCIExy(encoding.GetWhitePoint(), white_XYZ));
+  CIExy wp = encoding.GetWhitePoint();
+  JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(wp.x, wp.y, white_XYZ));
 
   const PrimariesCIExy primaries = encoding.GetPrimaries();
   double chromaticities[3][3] = {
@@ -840,51 +903,111 @@ Status ApplyHlgOotf(JxlCms* t, float* JXL_RESTRICT buf, size_t xsize,
   return true;
 }
 
-}  // namespace
+bool IsKnownTransferFunction(jxl::cms::TransferFunction tf) {
+  using TF = jxl::cms::TransferFunction;
+  // All but kUnknown
+  return tf == TF::k709 || tf == TF::kLinear || tf == TF::kSRGB ||
+         tf == TF::kPQ || tf == TF::kDCI || tf == TF::kHLG;
+}
+
+constexpr uint8_t kColorPrimariesP3_D65 = 12;
+
+bool IsKnownColorPrimaries(uint8_t color_primaries) {
+  using P = jxl::cms::Primaries;
+  // All but kCustom
+  if (color_primaries == kColorPrimariesP3_D65) return true;
+  const auto p = static_cast<Primaries>(color_primaries);
+  return p == P::kSRGB || p == P::k2100 || p == P::kP3;
+}
+
+bool ApplyCICP(const uint8_t color_primaries,
+               const uint8_t transfer_characteristics,
+               const uint8_t matrix_coefficients, const uint8_t full_range,
+               ColorEncoding* JXL_RESTRICT c) {
+  if (matrix_coefficients != 0) return false;
+  if (full_range != 1) return false;
+
+  const auto primaries = static_cast<Primaries>(color_primaries);
+  const auto tf = static_cast<TransferFunction>(transfer_characteristics);
+  if (!IsKnownTransferFunction(tf)) return false;
+  if (!IsKnownColorPrimaries(color_primaries)) return false;
+  c->color_space = ColorSpace::kRGB;
+  c->tf.SetTransferFunction(tf);
+  if (primaries == Primaries::kP3) {
+    c->white_point = WhitePoint::kDCI;
+    c->primaries = Primaries::kP3;
+  } else if (color_primaries == kColorPrimariesP3_D65) {
+    c->white_point = WhitePoint::kD65;
+    c->primaries = Primaries::kP3;
+  } else {
+    c->white_point = WhitePoint::kD65;
+    c->primaries = primaries;
+  }
+  return true;
+}
+
+JXL_BOOL JxlCmsSetFieldsFromICC(void* user_data, const uint8_t* icc_data,
+                                size_t icc_size, JxlColorEncoding* c,
+                                JXL_BOOL* cmyk) {
+  if (c == nullptr) return JXL_FALSE;
+  if (cmyk == nullptr) return JXL_FALSE;
+
+  *cmyk = JXL_FALSE;
 
-Status ColorEncoding::SetFieldsFromICC() {
   // In case parsing fails, mark the ColorEncoding as invalid.
-  SetColorSpace(ColorSpace::kUnknown);
-  tf.SetTransferFunction(TransferFunction::kUnknown);
+  c->color_space = JXL_COLOR_SPACE_UNKNOWN;
+  c->transfer_function = JXL_TRANSFER_FUNCTION_UNKNOWN;
+
+  if (icc_size == 0) return JXL_FAILURE("Empty ICC profile");
 
-  if (icc_.empty()) return JXL_FAILURE("Empty ICC profile");
+  ColorEncoding c_enc;
 
 #if JPEGXL_ENABLE_SKCMS
-  if (icc_.size() < 128) {
+  if (icc_size < 128) {
     return JXL_FAILURE("ICC file too small");
   }
 
   skcms_ICCProfile profile;
-  JXL_RETURN_IF_ERROR(skcms_Parse(icc_.data(), icc_.size(), &profile));
+  JXL_RETURN_IF_ERROR(skcms_Parse(icc_data, icc_size, &profile));
 
   // skcms does not return the rendering intent, so get it from the file. It
   // is encoded as big-endian 32-bit integer in bytes 60..63.
-  uint32_t rendering_intent32 = icc_[67];
-  if (rendering_intent32 > 3 || icc_[64] != 0 || icc_[65] != 0 ||
-      icc_[66] != 0) {
+  uint32_t rendering_intent32 = icc_data[67];
+  if (rendering_intent32 > 3 || icc_data[64] != 0 || icc_data[65] != 0 ||
+      icc_data[66] != 0) {
     return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32);
   }
+  // ICC and RenderingIntent have the same values (0..3).
+  c_enc.rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+
+  if (profile.has_CICP &&
+      ApplyCICP(profile.CICP.color_primaries,
+                profile.CICP.transfer_characteristics,
+                profile.CICP.matrix_coefficients,
+                profile.CICP.video_full_range_flag, &c_enc)) {
+    *c = c_enc.ToExternal();
+    return true;
+  }
 
-  SetColorSpace(ColorSpaceFromProfile(profile));
-  cmyk_ = (profile.data_color_space == skcms_Signature_CMYK);
+  c_enc.color_space = ColorSpaceFromProfile(profile);
+  *cmyk = (profile.data_color_space == skcms_Signature_CMYK);
 
   CIExy wp_unadapted;
   JXL_RETURN_IF_ERROR(UnadaptedWhitePoint(profile, &wp_unadapted));
-  JXL_RETURN_IF_ERROR(SetWhitePoint(wp_unadapted));
+  JXL_RETURN_IF_ERROR(c_enc.SetWhitePoint(wp_unadapted));
 
   // Relies on color_space.
-  JXL_RETURN_IF_ERROR(IdentifyPrimaries(profile, wp_unadapted, this));
+  JXL_RETURN_IF_ERROR(IdentifyPrimaries(profile, wp_unadapted, &c_enc));
 
   // Relies on color_space/white point/primaries being set already.
-  DetectTransferFunction(profile, this);
-  // ICC and RenderingIntent have the same values (0..3).
-  rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+  DetectTransferFunction(profile, &c_enc);
 #else  // JPEGXL_ENABLE_SKCMS
 
   const cmsContext context = GetContext();
 
   Profile profile;
-  JXL_RETURN_IF_ERROR(DecodeProfile(context, icc_, &profile));
+  JXL_RETURN_IF_ERROR(
+      DecodeProfile(context, Bytes(icc_data, icc_size), &profile));
 
   const cmsUInt32Number rendering_intent32 =
       cmsGetHeaderRenderingIntent(profile.get());
@@ -892,49 +1015,44 @@ Status ColorEncoding::SetFieldsFromICC() {
     return JXL_FAILURE("Invalid rendering intent %u\n", rendering_intent32);
   }
   // ICC and RenderingIntent have the same values (0..3).
-  rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+  c_enc.rendering_intent = static_cast<RenderingIntent>(rendering_intent32);
+
+  static constexpr size_t kCICPSize = 12;
+  static constexpr auto kCICPSignature =
+      static_cast<cmsTagSignature>(0x63696370);
+  uint8_t cicp_buffer[kCICPSize];
+  if (cmsReadRawTag(profile.get(), kCICPSignature, cicp_buffer, kCICPSize) ==
+          kCICPSize &&
+      ApplyCICP(cicp_buffer[8], cicp_buffer[9], cicp_buffer[10],
+                cicp_buffer[11], &c_enc)) {
+    *c = c_enc.ToExternal();
+    return true;
+  }
 
-  SetColorSpace(ColorSpaceFromProfile(profile));
+  c_enc.color_space = ColorSpaceFromProfile(profile);
   if (cmsGetColorSpace(profile.get()) == cmsSigCmykData) {
-    cmyk_ = true;
+    *cmyk = JXL_TRUE;
+    *c = c_enc.ToExternal();
     return true;
   }
 
-  const cmsCIEXYZ wp_unadapted = UnadaptedWhitePoint(context, profile, *this);
-  JXL_RETURN_IF_ERROR(SetWhitePoint(CIExyFromXYZ(wp_unadapted)));
+  const cmsCIEXYZ wp_unadapted = UnadaptedWhitePoint(context, profile, c_enc);
+  JXL_RETURN_IF_ERROR(c_enc.SetWhitePoint(CIExyFromXYZ(wp_unadapted)));
 
   // Relies on color_space.
-  JXL_RETURN_IF_ERROR(IdentifyPrimaries(context, profile, wp_unadapted, this));
+  JXL_RETURN_IF_ERROR(
+      IdentifyPrimaries(context, profile, wp_unadapted, &c_enc));
 
   // Relies on color_space/white point/primaries being set already.
-  DetectTransferFunction(context, profile, this);
+  DetectTransferFunction(context, profile, &c_enc);
 
 #endif  // JPEGXL_ENABLE_SKCMS
 
+  *c = c_enc.ToExternal();
   return true;
 }
 
-void ColorEncoding::DecideIfWantICC() {
-  PaddedBytes icc_new;
-  bool equivalent;
-#if JPEGXL_ENABLE_SKCMS
-  skcms_ICCProfile profile;
-  if (!DecodeProfile(ICC().data(), ICC().size(), &profile)) return;
-  if (!MaybeCreateProfile(*this, &icc_new)) return;
-  equivalent = ProfileEquivalentToICC(profile, icc_new);
-#else   // JPEGXL_ENABLE_SKCMS
-  const cmsContext context = GetContext();
-  Profile profile;
-  if (!DecodeProfile(context, ICC(), &profile)) return;
-  if (cmsGetColorSpace(profile.get()) == cmsSigCmykData) return;
-  if (!MaybeCreateProfile(*this, &icc_new)) return;
-  equivalent = ProfileEquivalentToICC(context, profile, icc_new, *this);
-#endif  // JPEGXL_ENABLE_SKCMS
-
-  // Successfully created a profile => reconstruction should be equivalent.
-  JXL_ASSERT(equivalent);
-  want_icc_ = false;
-}
+}  // namespace
 
 namespace {
 
@@ -947,20 +1065,45 @@ void JxlCmsDestroy(void* cms_data) {
   delete t;
 }
 
+void AllocateBuffer(size_t length, size_t num_threads,
+                    std::vector<float>* storage, std::vector<float*>* view) {
+  constexpr size_t kAlign = 128 / sizeof(float);
+  size_t stride = RoundUpTo(length, kAlign);
+  storage->resize(stride * num_threads + kAlign);
+  intptr_t addr = reinterpret_cast<intptr_t>(storage->data());
+  size_t offset =
+      (RoundUpTo(addr, kAlign * sizeof(float)) - addr) / sizeof(float);
+  view->clear();
+  view->reserve(num_threads);
+  for (size_t i = 0; i < num_threads; ++i) {
+    view->emplace_back(storage->data() + offset + i * stride);
+  }
+}
+
 void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
                  const JxlColorProfile* input, const JxlColorProfile* output,
                  float intensity_target) {
+  JXL_ASSERT(init_data != nullptr);
+  auto cms = static_cast<const JxlCmsInterface*>(init_data);
   auto t = jxl::make_unique<JxlCms>();
-  PaddedBytes icc_src, icc_dst;
+  IccBytes icc_src, icc_dst;
+  if (input->icc.size == 0) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: empty input ICC");
+    return nullptr;
+  }
+  if (output->icc.size == 0) {
+    JXL_NOTIFY_ERROR("JxlCmsInit: empty OUTPUT ICC");
+    return nullptr;
+  }
   icc_src.assign(input->icc.data, input->icc.data + input->icc.size);
   ColorEncoding c_src;
-  if (!c_src.SetICC(std::move(icc_src))) {
+  if (!c_src.SetFieldsFromICC(std::move(icc_src), *cms)) {
     JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse input ICC");
     return nullptr;
   }
   icc_dst.assign(output->icc.data, output->icc.data + output->icc.size);
   ColorEncoding c_dst;
-  if (!c_dst.SetICC(std::move(icc_dst))) {
+  if (!c_dst.SetFieldsFromICC(std::move(icc_dst), *cms)) {
     JXL_NOTIFY_ERROR("JxlCmsInit: failed to parse output ICC");
     return nullptr;
   }
@@ -980,11 +1123,11 @@ void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
 #else   // JPEGXL_ENABLE_SKCMS
   const cmsContext context = GetContext();
   Profile profile_src, profile_dst;
-  if (!DecodeProfile(context, c_src.ICC(), &profile_src)) {
+  if (!DecodeProfile(context, Bytes(c_src.icc), &profile_src)) {
     JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse input ICC");
     return nullptr;
   }
-  if (!DecodeProfile(context, c_dst.ICC(), &profile_dst)) {
+  if (!DecodeProfile(context, Bytes(c_dst.icc), &profile_dst)) {
     JXL_NOTIFY_ERROR("JxlCmsInit: lcms failed to parse output ICC");
     return nullptr;
   }
@@ -1022,15 +1165,15 @@ void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
     c_linear_src.tf.SetTransferFunction(TransferFunction::kLinear);
 #if JPEGXL_ENABLE_SKCMS
     skcms_ICCProfile new_src;
-#else  // JPEGXL_ENABLE_SKCMS
+#else   // JPEGXL_ENABLE_SKCMS
     Profile new_src;
 #endif  // JPEGXL_ENABLE_SKCMS
         // Only enable ExtraTF if profile creation succeeded.
-    if (MaybeCreateProfile(c_linear_src, &icc_src) &&
+    if (MaybeCreateProfile(c_linear_src.ToExternal(), &icc_src) &&
 #if JPEGXL_ENABLE_SKCMS
         DecodeProfile(icc_src.data(), icc_src.size(), &new_src)) {
 #else   // JPEGXL_ENABLE_SKCMS
-        DecodeProfile(context, icc_src, &new_src)) {
+        DecodeProfile(context, Bytes(icc_src), &new_src)) {
 #endif  // JPEGXL_ENABLE_SKCMS
 #if JXL_CMS_VERBOSE
       printf("Special HLG/PQ/sRGB -> linear\n");
@@ -1067,11 +1210,11 @@ void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
     Profile new_dst;
 #endif  // JPEGXL_ENABLE_SKCMS
     // Only enable ExtraTF if profile creation succeeded.
-    if (MaybeCreateProfile(c_linear_dst, &icc_dst) &&
+    if (MaybeCreateProfile(c_linear_dst.ToExternal(), &icc_dst) &&
 #if JPEGXL_ENABLE_SKCMS
         DecodeProfile(icc_dst.data(), icc_dst.size(), &new_dst)) {
 #else   // JPEGXL_ENABLE_SKCMS
-        DecodeProfile(context, icc_dst, &new_dst)) {
+        DecodeProfile(context, Bytes(icc_dst), &new_dst)) {
 #endif  // JPEGXL_ENABLE_SKCMS
 #if JXL_CMS_VERBOSE
       printf("Special linear -> HLG/PQ/sRGB\n");
@@ -1108,13 +1251,13 @@ void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
   if (!skcms_MakeUsableAsDestination(&t->profile_dst)) {
     JXL_NOTIFY_ERROR(
         "Failed to make %s usable as a color transform destination",
-        Description(c_dst).c_str());
+        ColorEncodingDescription(c_dst.ToExternal()).c_str());
     return nullptr;
   }
 #endif  // JPEGXL_ENABLE_SKCMS
 
   // Not including alpha channel (copied separately).
-  const size_t channels_src = (c_src.IsCMYK() ? 4 : c_src.Channels());
+  const size_t channels_src = (c_src.cmyk ? 4 : c_src.Channels());
   const size_t channels_dst = c_dst.Channels();
   JXL_CHECK(channels_src == channels_dst ||
             (channels_src == 4 && channels_dst == 3));
@@ -1151,41 +1294,50 @@ void* JxlCmsInit(void* init_data, size_t num_threads, size_t xsize,
   // outputs (or vice versa), we use floating point input/output.
   t->channels_src = channels_src;
   t->channels_dst = channels_dst;
+  size_t actual_channels_src = channels_src;
+  size_t actual_channels_dst = channels_dst;
 #if JPEGXL_ENABLE_SKCMS
   // SkiaCMS doesn't support grayscale float buffers, so we create space for RGB
   // float buffers anyway.
-  t->buf_src = ImageF(xsize * (channels_src == 4 ? 4 : 3), num_threads);
-  t->buf_dst = ImageF(xsize * 3, num_threads);
-#else
-  t->buf_src = ImageF(xsize * channels_src, num_threads);
-  t->buf_dst = ImageF(xsize * channels_dst, num_threads);
+  actual_channels_src = (channels_src == 4 ? 4 : 3);
+  actual_channels_dst = 3;
 #endif
+  AllocateBuffer(xsize * actual_channels_src, num_threads, &t->src_storage,
+                 &t->buf_src);
+  AllocateBuffer(xsize * actual_channels_dst, num_threads, &t->dst_storage,
+                 &t->buf_dst);
   t->intensity_target = intensity_target;
   return t.release();
 }
 
 float* JxlCmsGetSrcBuf(void* cms_data, size_t thread) {
   JxlCms* t = reinterpret_cast<JxlCms*>(cms_data);
-  return t->buf_src.Row(thread);
+  return t->buf_src[thread];
 }
 
 float* JxlCmsGetDstBuf(void* cms_data, size_t thread) {
   JxlCms* t = reinterpret_cast<JxlCms*>(cms_data);
-  return t->buf_dst.Row(thread);
+  return t->buf_dst[thread];
 }
 
 }  // namespace
 
-const JxlCmsInterface& GetJxlCms() {
+extern "C" {
+
+const JxlCmsInterface* JxlGetDefaultCms() {
   static constexpr JxlCmsInterface kInterface = {
-      /*init_data=*/nullptr,
+      /*set_fields_data=*/nullptr,
+      /*set_fields_from_icc=*/&JxlCmsSetFieldsFromICC,
+      /*init_data=*/const_cast<void*>(static_cast<const void*>(&kInterface)),
       /*init=*/&JxlCmsInit,
       /*get_src_buf=*/&JxlCmsGetSrcBuf,
       /*get_dst_buf=*/&JxlCmsGetDstBuf,
       /*run=*/&DoColorSpaceTransform,
       /*destroy=*/&JxlCmsDestroy};
-  return kInterface;
+  return &kInterface;
 }
 
+}  // extern "C"
+
 }  // namespace jxl
 #endif  // HWY_ONCE
diff --git a/lib/jxl/cms/jxl_cms_internal.h b/lib/jxl/cms/jxl_cms_internal.h
new file mode 100644 (file)
index 0000000..9b3899e
--- /dev/null
@@ -0,0 +1,1081 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CMS_JXL_CMS_INTERNAL_H_
+#define LIB_JXL_CMS_JXL_CMS_INTERNAL_H_
+
+// ICC profiles and color space conversions.
+
+#include <jxl/color_encoding.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <vector>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/matrix_ops.h"
+#include "lib/jxl/base/span.h"  // Bytes
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/cms/opsin_params.h"
+#include "lib/jxl/cms/tone_mapping.h"
+#include "lib/jxl/cms/transfer_functions.h"
+
+#ifndef JXL_ENABLE_3D_ICC_TONEMAPPING
+#define JXL_ENABLE_3D_ICC_TONEMAPPING 1
+#endif
+
+namespace jxl {
+
+enum class ExtraTF {
+  kNone,
+  kPQ,
+  kHLG,
+  kSRGB,
+};
+
+static Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx,
+                             float by, float wx, float wy, float matrix[9]) {
+  bool ok = (wx >= 0) && (wx <= 1) && (wy > 0) && (wy <= 1);
+  if (!ok) {
+    return JXL_FAILURE("Invalid white point");
+  }
+  // TODO(lode): also require rx, ry, gx, gy, bx, to be in range 0-1? ICC
+  // profiles in theory forbid negative XYZ values, but in practice the ACES P0
+  // color space uses a negative y for the blue primary.
+  float primaries[9] = {
+      rx, gx, bx, ry, gy, by, 1.0f - rx - ry, 1.0f - gx - gy, 1.0f - bx - by};
+  float primaries_inv[9];
+  memcpy(primaries_inv, primaries, sizeof(float) * 9);
+  JXL_RETURN_IF_ERROR(Inv3x3Matrix(primaries_inv));
+
+  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
+  // 1 / tiny float can still overflow
+  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
+  float xyz[3];
+  Mul3x3Vector(primaries_inv, w, xyz);
+
+  float a[9] = {
+      xyz[0], 0, 0, 0, xyz[1], 0, 0, 0, xyz[2],
+  };
+
+  Mul3x3Matrix(primaries, a, matrix);
+  return true;
+}
+
+/* Chromatic adaptation matrices*/
+constexpr float kBradford[9] = {
+    0.8951f, 0.2664f, -0.1614f, -0.7502f, 1.7135f,
+    0.0367f, 0.0389f, -0.0685f, 1.0296f,
+};
+constexpr float kBradfordInv[9] = {
+    0.9869929f, -0.1470543f, 0.1599627f, 0.4323053f, 0.5183603f,
+    0.0492912f, -0.0085287f, 0.0400428f, 0.9684867f,
+};
+
+// Adapts whitepoint x, y to D50
+static Status AdaptToXYZD50(float wx, float wy, float matrix[9]) {
+  bool ok = (wx >= 0) && (wx <= 1) && (wy > 0) && (wy <= 1);
+  if (!ok) {
+    // Out of range values can cause division through zero
+    // further down with the bradford adaptation too.
+    return JXL_FAILURE("Invalid white point");
+  }
+  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
+  // 1 / tiny float can still overflow
+  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
+  float w50[3] = {0.96422f, 1.0f, 0.82521f};
+
+  float lms[3];
+  float lms50[3];
+
+  Mul3x3Vector(kBradford, w, lms);
+  Mul3x3Vector(kBradford, w50, lms50);
+
+  if (lms[0] == 0 || lms[1] == 0 || lms[2] == 0) {
+    return JXL_FAILURE("Invalid white point");
+  }
+  float a[9] = {
+      //       /----> 0, 1, 2, 3,          /----> 4, 5, 6, 7,          /----> 8,
+      lms50[0] / lms[0], 0, 0, 0, lms50[1] / lms[1], 0, 0, 0, lms50[2] / lms[2],
+  };
+  if (!std::isfinite(a[0]) || !std::isfinite(a[4]) || !std::isfinite(a[8])) {
+    return JXL_FAILURE("Invalid white point");
+  }
+
+  float b[9];
+  Mul3x3Matrix(a, kBradford, b);
+  Mul3x3Matrix(kBradfordInv, b, matrix);
+
+  return true;
+}
+
+static Status PrimariesToXYZD50(float rx, float ry, float gx, float gy,
+                                float bx, float by, float wx, float wy,
+                                float matrix[9]) {
+  float toXYZ[9];
+  JXL_RETURN_IF_ERROR(PrimariesToXYZ(rx, ry, gx, gy, bx, by, wx, wy, toXYZ));
+  float d50[9];
+  JXL_RETURN_IF_ERROR(AdaptToXYZD50(wx, wy, d50));
+
+  Mul3x3Matrix(d50, toXYZ, matrix);
+  return true;
+}
+
+static Status ToneMapPixel(const JxlColorEncoding& c, const float in[3],
+                           uint8_t pcslab_out[3]) {
+  float primaries_XYZ[9];
+  JXL_RETURN_IF_ERROR(PrimariesToXYZ(
+      c.primaries_red_xy[0], c.primaries_red_xy[1], c.primaries_green_xy[0],
+      c.primaries_green_xy[1], c.primaries_blue_xy[0], c.primaries_blue_xy[1],
+      c.white_point_xy[0], c.white_point_xy[1], primaries_XYZ));
+  const float luminances[3] = {primaries_XYZ[3], primaries_XYZ[4],
+                               primaries_XYZ[5]};
+  float linear[3];
+  JxlTransferFunction tf = c.transfer_function;
+  if (tf == JXL_TRANSFER_FUNCTION_PQ) {
+    for (size_t i = 0; i < 3; ++i) {
+      linear[i] = TF_PQ_Base::DisplayFromEncoded(
+          /*display_intensity_target=*/10000.0, in[i]);
+    }
+  } else {
+    for (size_t i = 0; i < 3; ++i) {
+      linear[i] = TF_HLG_Base::DisplayFromEncoded(in[i]);
+    }
+  }
+  if (tf == JXL_TRANSFER_FUNCTION_PQ) {
+    Rec2408ToneMapperBase tone_mapper({0, 10000}, {0, 250}, luminances);
+    tone_mapper.ToneMap(&linear[0], &linear[1], &linear[2]);
+  } else {
+    HlgOOTF_Base ootf(/*source_luminance=*/300, /*target_luminance=*/80,
+                      luminances);
+    ootf.Apply(&linear[0], &linear[1], &linear[2]);
+  }
+  GamutMapScalar(&linear[0], &linear[1], &linear[2], luminances,
+                 /*preserve_saturation=*/0.3f);
+
+  float chad[9];
+  JXL_RETURN_IF_ERROR(
+      AdaptToXYZD50(c.white_point_xy[0], c.white_point_xy[1], chad));
+  float to_xyzd50[9];
+  Mul3x3Matrix(chad, primaries_XYZ, to_xyzd50);
+
+  float xyz[3] = {0, 0, 0};
+  for (size_t xyz_c = 0; xyz_c < 3; ++xyz_c) {
+    for (size_t rgb_c = 0; rgb_c < 3; ++rgb_c) {
+      xyz[xyz_c] += linear[rgb_c] * to_xyzd50[3 * xyz_c + rgb_c];
+    }
+  }
+
+  const auto lab_f = [](const float x) {
+    static constexpr float kDelta = 6. / 29;
+    return x <= kDelta * kDelta * kDelta
+               ? x * (1 / (3 * kDelta * kDelta)) + 4.f / 29
+               : std::cbrt(x);
+  };
+  static constexpr float kXn = 0.964212;
+  static constexpr float kYn = 1;
+  static constexpr float kZn = 0.825188;
+
+  const float f_x = lab_f(xyz[0] / kXn);
+  const float f_y = lab_f(xyz[1] / kYn);
+  const float f_z = lab_f(xyz[2] / kZn);
+
+  pcslab_out[0] =
+      static_cast<uint8_t>(.5f + 255.f * Clamp1(1.16f * f_y - .16f, 0.f, 1.f));
+  pcslab_out[1] = static_cast<uint8_t>(
+      .5f + 128.f + Clamp1(500 * (f_x - f_y), -128.f, 127.f));
+  pcslab_out[2] = static_cast<uint8_t>(
+      .5f + 128.f + Clamp1(200 * (f_y - f_z), -128.f, 127.f));
+
+  return true;
+}
+
+static std::vector<uint16_t> CreateTableCurve(uint32_t N, const ExtraTF tf,
+                                              bool tone_map) {
+  // The generated PQ curve will make room for highlights up to this luminance.
+  // TODO(sboukortt): make this variable?
+  static constexpr float kPQIntensityTarget = 10000;
+
+  JXL_ASSERT(N <= 4096);  // ICC MFT2 only allows 4K entries
+  JXL_ASSERT(tf == ExtraTF::kPQ || tf == ExtraTF::kHLG);
+
+  static constexpr float kLuminances[] = {1.f / 3, 1.f / 3, 1.f / 3};
+  Rec2408ToneMapperBase tone_mapper({0, kPQIntensityTarget},
+                                    {0, kDefaultIntensityTarget}, kLuminances);
+  // No point using float - LCMS converts to 16-bit for A2B/MFT.
+  std::vector<uint16_t> table(N);
+  for (uint32_t i = 0; i < N; ++i) {
+    const float x = static_cast<float>(i) / (N - 1);  // 1.0 at index N - 1.
+    const double dx = static_cast<double>(x);
+    // LCMS requires EOTF (e.g. 2.4 exponent).
+    double y = (tf == ExtraTF::kHLG)
+                   ? TF_HLG_Base::DisplayFromEncoded(dx)
+                   : TF_PQ_Base::DisplayFromEncoded(kPQIntensityTarget, dx);
+    if (tone_map && tf == ExtraTF::kPQ &&
+        kPQIntensityTarget > kDefaultIntensityTarget) {
+      float r = y * 10000 / kPQIntensityTarget, g = r, b = r;
+      tone_mapper.ToneMap(&r, &g, &b);
+      y = r;
+    }
+    JXL_ASSERT(y >= 0.0);
+    // Clamp to table range - necessary for HLG.
+    if (y > 1.0) y = 1.0;
+    // 1.0 corresponds to table value 0xFFFF.
+    table[i] = static_cast<uint16_t>(roundf(y * 65535.0));
+  }
+  return table;
+}
+
+static Status CIEXYZFromWhiteCIExy(double wx, double wy, float XYZ[3]) {
+  // Target Y = 1.
+  if (std::abs(wy) < 1e-12) return JXL_FAILURE("Y value is too small");
+  const float factor = 1 / wy;
+  XYZ[0] = wx * factor;
+  XYZ[1] = 1;
+  XYZ[2] = (1 - wx - wy) * factor;
+  return true;
+}
+
+namespace detail {
+
+constexpr bool kEnable3DToneMapping = JXL_ENABLE_3D_ICC_TONEMAPPING;
+
+static bool CanToneMap(const JxlColorEncoding& encoding) {
+  // If the color space cannot be represented by a CICP tag in the ICC profile
+  // then the rest of the profile must unambiguously identify it; we have less
+  // freedom to do use it for tone mapping.
+  JxlTransferFunction tf = encoding.transfer_function;
+  JxlPrimaries p = encoding.primaries;
+  JxlWhitePoint wp = encoding.white_point;
+  return encoding.color_space == JXL_COLOR_SPACE_RGB &&
+         (tf == JXL_TRANSFER_FUNCTION_PQ || tf == JXL_TRANSFER_FUNCTION_HLG) &&
+         ((p == JXL_PRIMARIES_P3 &&
+           (wp == JXL_WHITE_POINT_D65 || wp == JXL_WHITE_POINT_DCI)) ||
+          (p != JXL_PRIMARIES_CUSTOM && wp == JXL_WHITE_POINT_D65));
+}
+
+static void ICCComputeMD5(const std::vector<uint8_t>& data, uint8_t sum[16])
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  std::vector<uint8_t> data64 = data;
+  data64.push_back(128);
+  // Add bytes such that ((size + 8) & 63) == 0.
+  size_t extra = ((64 - ((data64.size() + 8) & 63)) & 63);
+  data64.resize(data64.size() + extra, 0);
+  for (uint64_t i = 0; i < 64; i += 8) {
+    data64.push_back(static_cast<uint64_t>(data.size() << 3u) >> i);
+  }
+
+  static const uint32_t sineparts[64] = {
+      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
+      0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
+      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
+      0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
+      0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
+      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
+      0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
+      0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
+      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
+  };
+  static const uint32_t shift[64] = {
+      7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+      5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
+      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21,
+  };
+
+  uint32_t a0 = 0x67452301, b0 = 0xefcdab89, c0 = 0x98badcfe, d0 = 0x10325476;
+
+  for (size_t i = 0; i < data64.size(); i += 64) {
+    uint32_t a = a0, b = b0, c = c0, d = d0, f, g;
+    for (size_t j = 0; j < 64; j++) {
+      if (j < 16) {
+        f = (b & c) | ((~b) & d);
+        g = j;
+      } else if (j < 32) {
+        f = (d & b) | ((~d) & c);
+        g = (5 * j + 1) & 0xf;
+      } else if (j < 48) {
+        f = b ^ c ^ d;
+        g = (3 * j + 5) & 0xf;
+      } else {
+        f = c ^ (b | (~d));
+        g = (7 * j) & 0xf;
+      }
+      uint32_t dg0 = data64[i + g * 4 + 0], dg1 = data64[i + g * 4 + 1],
+               dg2 = data64[i + g * 4 + 2], dg3 = data64[i + g * 4 + 3];
+      uint32_t u = dg0 | (dg1 << 8u) | (dg2 << 16u) | (dg3 << 24u);
+      f += a + sineparts[j] + u;
+      a = d;
+      d = c;
+      c = b;
+      b += (f << shift[j]) | (f >> (32u - shift[j]));
+    }
+    a0 += a;
+    b0 += b;
+    c0 += c;
+    d0 += d;
+  }
+  sum[0] = a0;
+  sum[1] = a0 >> 8u;
+  sum[2] = a0 >> 16u;
+  sum[3] = a0 >> 24u;
+  sum[4] = b0;
+  sum[5] = b0 >> 8u;
+  sum[6] = b0 >> 16u;
+  sum[7] = b0 >> 24u;
+  sum[8] = c0;
+  sum[9] = c0 >> 8u;
+  sum[10] = c0 >> 16u;
+  sum[11] = c0 >> 24u;
+  sum[12] = d0;
+  sum[13] = d0 >> 8u;
+  sum[14] = d0 >> 16u;
+  sum[15] = d0 >> 24u;
+}
+
+static Status CreateICCChadMatrix(double wx, double wy, float result[9]) {
+  float m[9];
+  if (wy == 0) {  // WhitePoint can not be pitch-black.
+    return JXL_FAILURE("Invalid WhitePoint");
+  }
+  JXL_RETURN_IF_ERROR(AdaptToXYZD50(wx, wy, m));
+  memcpy(result, m, sizeof(float) * 9);
+  return true;
+}
+
+// Creates RGB to XYZ matrix given RGB primaries and whitepoint in xy.
+static Status CreateICCRGBMatrix(double rx, double ry, double gx, double gy,
+                                 double bx, double by, double wx, double wy,
+                                 float result[9]) {
+  float m[9];
+  JXL_RETURN_IF_ERROR(PrimariesToXYZD50(rx, ry, gx, gy, bx, by, wx, wy, m));
+  memcpy(result, m, sizeof(float) * 9);
+  return true;
+}
+
+static void WriteICCUint32(uint32_t value, size_t pos,
+                           std::vector<uint8_t>* icc) {
+  if (icc->size() < pos + 4) icc->resize(pos + 4);
+  (*icc)[pos + 0] = (value >> 24u) & 255;
+  (*icc)[pos + 1] = (value >> 16u) & 255;
+  (*icc)[pos + 2] = (value >> 8u) & 255;
+  (*icc)[pos + 3] = value & 255;
+}
+
+static void WriteICCUint16(uint16_t value, size_t pos,
+                           std::vector<uint8_t>* icc) {
+  if (icc->size() < pos + 2) icc->resize(pos + 2);
+  (*icc)[pos + 0] = (value >> 8u) & 255;
+  (*icc)[pos + 1] = value & 255;
+}
+
+static void WriteICCUint8(uint8_t value, size_t pos,
+                          std::vector<uint8_t>* icc) {
+  if (icc->size() < pos + 1) icc->resize(pos + 1);
+  (*icc)[pos] = value;
+}
+
+// Writes a 4-character tag
+static void WriteICCTag(const char* value, size_t pos,
+                        std::vector<uint8_t>* icc) {
+  if (icc->size() < pos + 4) icc->resize(pos + 4);
+  memcpy(icc->data() + pos, value, 4);
+}
+
+static Status WriteICCS15Fixed16(float value, size_t pos,
+                                 std::vector<uint8_t>* icc) {
+  // "nextafterf" for 32768.0f towards zero are:
+  // 32767.998046875, 32767.99609375, 32767.994140625
+  // Even the first value works well,...
+  bool ok = (-32767.995f <= value) && (value <= 32767.995f);
+  if (!ok) return JXL_FAILURE("ICC value is out of range / NaN");
+  int32_t i = value * 65536.0f + 0.5f;
+  // Use two's complement
+  uint32_t u = static_cast<uint32_t>(i);
+  WriteICCUint32(u, pos, icc);
+  return true;
+}
+
+static Status CreateICCHeader(const JxlColorEncoding& c,
+                              std::vector<uint8_t>* header) {
+  // TODO(lode): choose color management engine name, e.g. "skia" if
+  // integrated in skia.
+  static const char* kCmm = "jxl ";
+
+  header->resize(128, 0);
+
+  WriteICCUint32(0, 0, header);  // size, correct value filled in at end
+  WriteICCTag(kCmm, 4, header);
+  WriteICCUint32(0x04400000u, 8, header);
+  const char* profile_type =
+      c.color_space == JXL_COLOR_SPACE_XYB ? "scnr" : "mntr";
+  WriteICCTag(profile_type, 12, header);
+  WriteICCTag(c.color_space == JXL_COLOR_SPACE_GRAY ? "GRAY" : "RGB ", 16,
+              header);
+  if (kEnable3DToneMapping && CanToneMap(c)) {
+    // We are going to use a 3D LUT for tone mapping, which will be more compact
+    // with an 8-bit LUT to CIELAB than with a 16-bit LUT to XYZ. 8-bit XYZ
+    // would not be viable due to XYZ being linear, whereas it is fine with
+    // CIELAB's ~cube root.
+    WriteICCTag("Lab ", 20, header);
+  } else {
+    WriteICCTag("XYZ ", 20, header);
+  }
+
+  // Three uint32_t's date/time encoding.
+  // TODO(lode): encode actual date and time, this is a placeholder
+  uint32_t year = 2019, month = 12, day = 1;
+  uint32_t hour = 0, minute = 0, second = 0;
+  WriteICCUint16(year, 24, header);
+  WriteICCUint16(month, 26, header);
+  WriteICCUint16(day, 28, header);
+  WriteICCUint16(hour, 30, header);
+  WriteICCUint16(minute, 32, header);
+  WriteICCUint16(second, 34, header);
+
+  WriteICCTag("acsp", 36, header);
+  WriteICCTag("APPL", 40, header);
+  WriteICCUint32(0, 44, header);  // flags
+  WriteICCUint32(0, 48, header);  // device manufacturer
+  WriteICCUint32(0, 52, header);  // device model
+  WriteICCUint32(0, 56, header);  // device attributes
+  WriteICCUint32(0, 60, header);  // device attributes
+  WriteICCUint32(static_cast<uint32_t>(c.rendering_intent), 64, header);
+
+  // Mandatory D50 white point of profile connection space
+  WriteICCUint32(0x0000f6d6, 68, header);
+  WriteICCUint32(0x00010000, 72, header);
+  WriteICCUint32(0x0000d32d, 76, header);
+
+  WriteICCTag(kCmm, 80, header);
+
+  return true;
+}
+
+static void AddToICCTagTable(const char* tag, size_t offset, size_t size,
+                             std::vector<uint8_t>* tagtable,
+                             std::vector<size_t>* offsets) {
+  WriteICCTag(tag, tagtable->size(), tagtable);
+  // writing true offset deferred to later
+  WriteICCUint32(0, tagtable->size(), tagtable);
+  offsets->push_back(offset);
+  WriteICCUint32(size, tagtable->size(), tagtable);
+}
+
+static void FinalizeICCTag(std::vector<uint8_t>* tags, size_t* offset,
+                           size_t* size) {
+  while ((tags->size() & 3) != 0) {
+    tags->push_back(0);
+  }
+  *offset += *size;
+  *size = tags->size() - *offset;
+}
+
+// The input text must be ASCII, writing other characters to UTF-16 is not
+// implemented.
+static void CreateICCMlucTag(const std::string& text,
+                             std::vector<uint8_t>* tags) {
+  WriteICCTag("mluc", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  WriteICCUint32(1, tags->size(), tags);
+  WriteICCUint32(12, tags->size(), tags);
+  WriteICCTag("enUS", tags->size(), tags);
+  WriteICCUint32(text.size() * 2, tags->size(), tags);
+  WriteICCUint32(28, tags->size(), tags);
+  for (size_t i = 0; i < text.size(); i++) {
+    tags->push_back(0);  // prepend 0 for UTF-16
+    tags->push_back(text[i]);
+  }
+}
+
+static Status CreateICCXYZTag(float xyz[3], std::vector<uint8_t>* tags) {
+  WriteICCTag("XYZ ", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  for (size_t i = 0; i < 3; ++i) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[i], tags->size(), tags));
+  }
+  return true;
+}
+
+static Status CreateICCChadTag(float chad[9], std::vector<uint8_t>* tags) {
+  WriteICCTag("sf32", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  for (size_t i = 0; i < 9; i++) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(chad[i], tags->size(), tags));
+  }
+  return true;
+}
+
+static void MaybeCreateICCCICPTag(const JxlColorEncoding& c,
+                                  std::vector<uint8_t>* tags, size_t* offset,
+                                  size_t* size, std::vector<uint8_t>* tagtable,
+                                  std::vector<size_t>* offsets) {
+  if (c.color_space != JXL_COLOR_SPACE_RGB) {
+    return;
+  }
+  uint8_t primaries = 0;
+  if (c.primaries == JXL_PRIMARIES_P3) {
+    if (c.white_point == JXL_WHITE_POINT_D65) {
+      primaries = 12;
+    } else if (c.white_point == JXL_WHITE_POINT_DCI) {
+      primaries = 11;
+    } else {
+      return;
+    }
+  } else if (c.primaries != JXL_PRIMARIES_CUSTOM &&
+             c.white_point == JXL_WHITE_POINT_D65) {
+    primaries = static_cast<uint8_t>(c.primaries);
+  } else {
+    return;
+  }
+  JxlTransferFunction tf = c.transfer_function;
+  if (tf == JXL_TRANSFER_FUNCTION_UNKNOWN ||
+      tf == JXL_TRANSFER_FUNCTION_GAMMA) {
+    return;
+  }
+  WriteICCTag("cicp", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  WriteICCUint8(primaries, tags->size(), tags);
+  WriteICCUint8(static_cast<uint8_t>(tf), tags->size(), tags);
+  // Matrix
+  WriteICCUint8(0, tags->size(), tags);
+  // Full range
+  WriteICCUint8(1, tags->size(), tags);
+  FinalizeICCTag(tags, offset, size);
+  AddToICCTagTable("cicp", *offset, *size, tagtable, offsets);
+}
+
+static void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
+                                 std::vector<uint8_t>* tags) {
+  size_t pos = tags->size();
+  tags->resize(tags->size() + 12 + curve.size() * 2, 0);
+  WriteICCTag("curv", pos, tags);
+  WriteICCUint32(0, pos + 4, tags);
+  WriteICCUint32(curve.size(), pos + 8, tags);
+  for (size_t i = 0; i < curve.size(); i++) {
+    WriteICCUint16(curve[i], pos + 12 + i * 2, tags);
+  }
+}
+
+// Writes 12 + 4*params.size() bytes
+static Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
+                                   std::vector<uint8_t>* tags) {
+  WriteICCTag("para", tags->size(), tags);
+  WriteICCUint32(0, tags->size(), tags);
+  WriteICCUint16(curve_type, tags->size(), tags);
+  WriteICCUint16(0, tags->size(), tags);
+  for (size_t i = 0; i < params.size(); i++) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(params[i], tags->size(), tags));
+  }
+  return true;
+}
+
+static Status CreateICCLutAtoBTagForXYB(std::vector<uint8_t>* tags) {
+  WriteICCTag("mAB ", tags->size(), tags);
+  // 4 reserved bytes set to 0
+  WriteICCUint32(0, tags->size(), tags);
+  // number of input channels
+  WriteICCUint8(3, tags->size(), tags);
+  // number of output channels
+  WriteICCUint8(3, tags->size(), tags);
+  // 2 reserved bytes for padding
+  WriteICCUint16(0, tags->size(), tags);
+  // offset to first B curve
+  WriteICCUint32(32, tags->size(), tags);
+  // offset to matrix
+  WriteICCUint32(244, tags->size(), tags);
+  // offset to first M curve
+  WriteICCUint32(148, tags->size(), tags);
+  // offset to CLUT
+  WriteICCUint32(80, tags->size(), tags);
+  // offset to first A curve
+  // (reuse linear B curves)
+  WriteICCUint32(32, tags->size(), tags);
+
+  // offset = 32
+  // no-op curves
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  // offset = 80
+  // number of grid points for each input channel
+  for (int i = 0; i < 16; ++i) {
+    WriteICCUint8(i < 3 ? 2 : 0, tags->size(), tags);
+  }
+  // precision = 2
+  WriteICCUint8(2, tags->size(), tags);
+  // 3 bytes of padding
+  WriteICCUint8(0, tags->size(), tags);
+  WriteICCUint16(0, tags->size(), tags);
+  // 2*2*2*3 entries of 2 bytes each = 48 bytes
+  const jxl::cms::ColorCube3D& cube = jxl::cms::UnscaledA2BCube();
+  for (size_t ix = 0; ix < 2; ++ix) {
+    for (size_t iy = 0; iy < 2; ++iy) {
+      for (size_t ib = 0; ib < 2; ++ib) {
+        const jxl::cms::ColorCube0D& out_f = cube[ix][iy][ib];
+        for (int i = 0; i < 3; ++i) {
+          int32_t val = static_cast<int32_t>(0.5f + 65535 * out_f[i]);
+          JXL_DASSERT(val >= 0 && val <= 65535);
+          WriteICCUint16(val, tags->size(), tags);
+        }
+      }
+    }
+  }
+  // offset = 148
+  // 3 curves with 5 parameters = 3 * (12 + 5 * 4) = 96 bytes
+  for (size_t i = 0; i < 3; ++i) {
+    const float b = -jxl::cms::kXYBOffset[i] -
+                    std::cbrt(jxl::cms::kNegOpsinAbsorbanceBiasRGB[i]);
+    std::vector<float> params = {
+        3,
+        1.0f / jxl::cms::kXYBScale[i],
+        b,
+        0,                                           // unused
+        std::max(0.f, -b * jxl::cms::kXYBScale[i]),  // make skcms happy
+    };
+    JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(params, 3, tags));
+  }
+  // offset = 244
+  const double matrix[] = {1.5170095, -1.1065225, 0.071623,
+                           -0.050022, 0.5683655,  -0.018344,
+                           -1.387676, 1.1145555,  0.6857255};
+  // 12 * 4 = 48 bytes
+  for (size_t i = 0; i < 9; ++i) {
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(matrix[i], tags->size(), tags));
+  }
+  for (size_t i = 0; i < 3; ++i) {
+    float intercept = 0;
+    for (size_t j = 0; j < 3; ++j) {
+      intercept += matrix[i * 3 + j] * jxl::cms::kNegOpsinAbsorbanceBiasRGB[j];
+    }
+    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(intercept, tags->size(), tags));
+  }
+  return true;
+}
+
+static Status CreateICCLutAtoBTagForHDR(JxlColorEncoding c,
+                                        std::vector<uint8_t>* tags) {
+  static constexpr size_t k3DLutDim = 9;
+  WriteICCTag("mft1", tags->size(), tags);
+  // 4 reserved bytes set to 0
+  WriteICCUint32(0, tags->size(), tags);
+  // number of input channels
+  WriteICCUint8(3, tags->size(), tags);
+  // number of output channels
+  WriteICCUint8(3, tags->size(), tags);
+  // number of CLUT grid points
+  WriteICCUint8(k3DLutDim, tags->size(), tags);
+  // 1 reserved bytes for padding
+  WriteICCUint8(0, tags->size(), tags);
+
+  // Matrix (per specification, must be identity if input is not XYZ)
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 3; ++j) {
+      JXL_RETURN_IF_ERROR(
+          WriteICCS15Fixed16(i == j ? 1.f : 0.f, tags->size(), tags));
+    }
+  }
+
+  // Input tables
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t i = 0; i < 256; ++i) {
+      WriteICCUint8(i, tags->size(), tags);
+    }
+  }
+
+  for (size_t ix = 0; ix < k3DLutDim; ++ix) {
+    for (size_t iy = 0; iy < k3DLutDim; ++iy) {
+      for (size_t ib = 0; ib < k3DLutDim; ++ib) {
+        float f[3] = {ix * (1.0f / (k3DLutDim - 1)),
+                      iy * (1.0f / (k3DLutDim - 1)),
+                      ib * (1.0f / (k3DLutDim - 1))};
+        uint8_t pcslab_out[3];
+        JXL_RETURN_IF_ERROR(ToneMapPixel(c, f, pcslab_out));
+        for (uint8_t val : pcslab_out) {
+          WriteICCUint8(val, tags->size(), tags);
+        }
+      }
+    }
+  }
+
+  // Output tables
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t i = 0; i < 256; ++i) {
+      WriteICCUint8(i, tags->size(), tags);
+    }
+  }
+
+  return true;
+}
+
+// Some software (Apple Safari, Preview) requires this.
+static Status CreateICCNoOpBToATag(std::vector<uint8_t>* tags) {
+  WriteICCTag("mBA ", tags->size(), tags);
+  // 4 reserved bytes set to 0
+  WriteICCUint32(0, tags->size(), tags);
+  // number of input channels
+  WriteICCUint8(3, tags->size(), tags);
+  // number of output channels
+  WriteICCUint8(3, tags->size(), tags);
+  // 2 reserved bytes for padding
+  WriteICCUint16(0, tags->size(), tags);
+  // offset to first B curve
+  WriteICCUint32(32, tags->size(), tags);
+  // offset to matrix
+  WriteICCUint32(0, tags->size(), tags);
+  // offset to first M curve
+  WriteICCUint32(0, tags->size(), tags);
+  // offset to CLUT
+  WriteICCUint32(0, tags->size(), tags);
+  // offset to first A curve
+  WriteICCUint32(0, tags->size(), tags);
+
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+  JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({1.0f}, 0, tags));
+
+  return true;
+}
+
+// These strings are baked into Description - do not change.
+
+static std::string ToString(JxlColorSpace color_space) {
+  switch (color_space) {
+    case JXL_COLOR_SPACE_RGB:
+      return "RGB";
+    case JXL_COLOR_SPACE_GRAY:
+      return "Gra";
+    case JXL_COLOR_SPACE_XYB:
+      return "XYB";
+    case JXL_COLOR_SPACE_UNKNOWN:
+      return "CS?";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_UNREACHABLE("Invalid ColorSpace %u", static_cast<uint32_t>(color_space));
+}
+
+static std::string ToString(JxlWhitePoint white_point) {
+  switch (white_point) {
+    case JXL_WHITE_POINT_D65:
+      return "D65";
+    case JXL_WHITE_POINT_CUSTOM:
+      return "Cst";
+    case JXL_WHITE_POINT_E:
+      return "EER";
+    case JXL_WHITE_POINT_DCI:
+      return "DCI";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_UNREACHABLE("Invalid WhitePoint %u", static_cast<uint32_t>(white_point));
+}
+
+static std::string ToString(JxlPrimaries primaries) {
+  switch (primaries) {
+    case JXL_PRIMARIES_SRGB:
+      return "SRG";
+    case JXL_PRIMARIES_2100:
+      return "202";
+    case JXL_PRIMARIES_P3:
+      return "DCI";
+    case JXL_PRIMARIES_CUSTOM:
+      return "Cst";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_UNREACHABLE("Invalid Primaries %u", static_cast<uint32_t>(primaries));
+}
+
+static std::string ToString(JxlTransferFunction transfer_function) {
+  switch (transfer_function) {
+    case JXL_TRANSFER_FUNCTION_SRGB:
+      return "SRG";
+    case JXL_TRANSFER_FUNCTION_LINEAR:
+      return "Lin";
+    case JXL_TRANSFER_FUNCTION_709:
+      return "709";
+    case JXL_TRANSFER_FUNCTION_PQ:
+      return "PeQ";
+    case JXL_TRANSFER_FUNCTION_HLG:
+      return "HLG";
+    case JXL_TRANSFER_FUNCTION_DCI:
+      return "DCI";
+    case JXL_TRANSFER_FUNCTION_UNKNOWN:
+      return "TF?";
+    case JXL_TRANSFER_FUNCTION_GAMMA:
+      JXL_UNREACHABLE("Invalid TransferFunction: gamma");
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_UNREACHABLE("Invalid TransferFunction %u",
+                  static_cast<uint32_t>(transfer_function));
+}
+
+static std::string ToString(JxlRenderingIntent rendering_intent) {
+  switch (rendering_intent) {
+    case JXL_RENDERING_INTENT_PERCEPTUAL:
+      return "Per";
+    case JXL_RENDERING_INTENT_RELATIVE:
+      return "Rel";
+    case JXL_RENDERING_INTENT_SATURATION:
+      return "Sat";
+    case JXL_RENDERING_INTENT_ABSOLUTE:
+      return "Abs";
+  }
+  // Should not happen - visitor fails if enum is invalid.
+  JXL_UNREACHABLE("Invalid RenderingIntent %u",
+                  static_cast<uint32_t>(rendering_intent));
+}
+
+static std::string ColorEncodingDescriptionImpl(const JxlColorEncoding& c) {
+  std::string d = ToString(c.color_space);
+
+  bool explicit_wp_tf = (c.color_space != JXL_COLOR_SPACE_XYB);
+  if (explicit_wp_tf) {
+    d += '_';
+    if (c.white_point == JXL_WHITE_POINT_CUSTOM) {
+      d += jxl::ToString(c.white_point_xy[0]) + ';';
+      d += jxl::ToString(c.white_point_xy[1]);
+    } else {
+      d += ToString(c.white_point);
+    }
+  }
+
+  if ((c.color_space != JXL_COLOR_SPACE_GRAY) &&
+      (c.color_space != JXL_COLOR_SPACE_XYB)) {
+    d += '_';
+    if (c.primaries == JXL_PRIMARIES_CUSTOM) {
+      d += jxl::ToString(c.primaries_red_xy[0]) + ';';
+      d += jxl::ToString(c.primaries_red_xy[1]) + ';';
+      d += jxl::ToString(c.primaries_green_xy[0]) + ';';
+      d += jxl::ToString(c.primaries_green_xy[1]) + ';';
+      d += jxl::ToString(c.primaries_blue_xy[0]) + ';';
+      d += jxl::ToString(c.primaries_blue_xy[1]);
+    } else {
+      d += ToString(c.primaries);
+    }
+  }
+
+  d += '_';
+  d += ToString(c.rendering_intent);
+
+  if (explicit_wp_tf) {
+    JxlTransferFunction tf = c.transfer_function;
+    d += '_';
+    if (tf == JXL_TRANSFER_FUNCTION_GAMMA) {
+      d += 'g';
+      d += jxl::ToString(c.gamma);
+    } else {
+      d += ToString(tf);
+    }
+  }
+  return d;
+}
+
+static Status MaybeCreateProfileImpl(const JxlColorEncoding& c,
+                                     std::vector<uint8_t>* icc) {
+  std::vector<uint8_t> header, tagtable, tags;
+  JxlTransferFunction tf = c.transfer_function;
+  if (c.color_space == JXL_COLOR_SPACE_UNKNOWN ||
+      tf == JXL_TRANSFER_FUNCTION_UNKNOWN) {
+    return false;  // Not an error
+  }
+
+  switch (c.color_space) {
+    case JXL_COLOR_SPACE_RGB:
+    case JXL_COLOR_SPACE_GRAY:
+    case JXL_COLOR_SPACE_XYB:
+      break;  // OK
+    default:
+      return JXL_FAILURE("Invalid CS %u",
+                         static_cast<unsigned int>(c.color_space));
+  }
+
+  if (c.color_space == JXL_COLOR_SPACE_XYB &&
+      c.rendering_intent != JXL_RENDERING_INTENT_PERCEPTUAL) {
+    return JXL_FAILURE(
+        "Only perceptual rendering intent implemented for XYB "
+        "ICC profile.");
+  }
+
+  JXL_RETURN_IF_ERROR(CreateICCHeader(c, &header));
+
+  std::vector<size_t> offsets;
+  // tag count, deferred to later
+  WriteICCUint32(0, tagtable.size(), &tagtable);
+
+  size_t tag_offset = 0, tag_size = 0;
+
+  CreateICCMlucTag(ColorEncodingDescriptionImpl(c), &tags);
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets);
+
+  const std::string copyright = "CC0";
+  CreateICCMlucTag(copyright, &tags);
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets);
+
+  // TODO(eustas): isn't it the other way round: gray image has d50 WhitePoint?
+  if (c.color_space == JXL_COLOR_SPACE_GRAY) {
+    float wtpt[3];
+    JXL_RETURN_IF_ERROR(
+        CIEXYZFromWhiteCIExy(c.white_point_xy[0], c.white_point_xy[1], wtpt));
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(wtpt, &tags));
+  } else {
+    float d50[3] = {0.964203, 1.0, 0.824905};
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(d50, &tags));
+  }
+  FinalizeICCTag(&tags, &tag_offset, &tag_size);
+  AddToICCTagTable("wtpt", tag_offset, tag_size, &tagtable, &offsets);
+
+  if (c.color_space != JXL_COLOR_SPACE_GRAY) {
+    // Chromatic adaptation matrix
+    float chad[9];
+    JXL_RETURN_IF_ERROR(
+        CreateICCChadMatrix(c.white_point_xy[0], c.white_point_xy[1], chad));
+
+    JXL_RETURN_IF_ERROR(CreateICCChadTag(chad, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("chad", tag_offset, tag_size, &tagtable, &offsets);
+  }
+
+  if (c.color_space == JXL_COLOR_SPACE_RGB) {
+    MaybeCreateICCCICPTag(c, &tags, &tag_offset, &tag_size, &tagtable,
+                          &offsets);
+
+    float m[9];
+    JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(
+        c.primaries_red_xy[0], c.primaries_red_xy[1], c.primaries_green_xy[0],
+        c.primaries_green_xy[1], c.primaries_blue_xy[0], c.primaries_blue_xy[1],
+        c.white_point_xy[0], c.white_point_xy[1], m));
+    float r[3] = {m[0], m[3], m[6]};
+    float g[3] = {m[1], m[4], m[7]};
+    float b[3] = {m[2], m[5], m[8]};
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(r, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("rXYZ", tag_offset, tag_size, &tagtable, &offsets);
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(g, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("gXYZ", tag_offset, tag_size, &tagtable, &offsets);
+
+    JXL_RETURN_IF_ERROR(CreateICCXYZTag(b, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("bXYZ", tag_offset, tag_size, &tagtable, &offsets);
+  }
+
+  if (c.color_space == JXL_COLOR_SPACE_XYB) {
+    JXL_RETURN_IF_ERROR(CreateICCLutAtoBTagForXYB(&tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("A2B0", tag_offset, tag_size, &tagtable, &offsets);
+    JXL_RETURN_IF_ERROR(CreateICCNoOpBToATag(&tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("B2A0", tag_offset, tag_size, &tagtable, &offsets);
+  } else if (kEnable3DToneMapping && CanToneMap(c)) {
+    JXL_RETURN_IF_ERROR(CreateICCLutAtoBTagForHDR(c, &tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("A2B0", tag_offset, tag_size, &tagtable, &offsets);
+    JXL_RETURN_IF_ERROR(CreateICCNoOpBToATag(&tags));
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    AddToICCTagTable("B2A0", tag_offset, tag_size, &tagtable, &offsets);
+  } else {
+    if (tf == JXL_TRANSFER_FUNCTION_GAMMA) {
+      float gamma = 1.0 / c.gamma;
+      JXL_RETURN_IF_ERROR(CreateICCCurvParaTag({gamma}, 0, &tags));
+    } else if (c.color_space != JXL_COLOR_SPACE_XYB) {
+      switch (tf) {
+        case JXL_TRANSFER_FUNCTION_HLG:
+          CreateICCCurvCurvTag(
+              CreateTableCurve(64, ExtraTF::kHLG, CanToneMap(c)), &tags);
+          break;
+        case JXL_TRANSFER_FUNCTION_PQ:
+          CreateICCCurvCurvTag(
+              CreateTableCurve(64, ExtraTF::kPQ, CanToneMap(c)), &tags);
+          break;
+        case JXL_TRANSFER_FUNCTION_SRGB:
+          JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
+              {2.4, 1.0 / 1.055, 0.055 / 1.055, 1.0 / 12.92, 0.04045}, 3,
+              &tags));
+          break;
+        case JXL_TRANSFER_FUNCTION_709:
+          JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
+              {1.0 / 0.45, 1.0 / 1.099, 0.099 / 1.099, 1.0 / 4.5, 0.081}, 3,
+              &tags));
+          break;
+        case JXL_TRANSFER_FUNCTION_LINEAR:
+          JXL_RETURN_IF_ERROR(
+              CreateICCCurvParaTag({1.0, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
+          break;
+        case JXL_TRANSFER_FUNCTION_DCI:
+          JXL_RETURN_IF_ERROR(
+              CreateICCCurvParaTag({2.6, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
+          break;
+        default:
+          JXL_UNREACHABLE("Unknown TF %u", static_cast<unsigned int>(tf));
+      }
+    }
+    FinalizeICCTag(&tags, &tag_offset, &tag_size);
+    if (c.color_space == JXL_COLOR_SPACE_GRAY) {
+      AddToICCTagTable("kTRC", tag_offset, tag_size, &tagtable, &offsets);
+    } else {
+      AddToICCTagTable("rTRC", tag_offset, tag_size, &tagtable, &offsets);
+      AddToICCTagTable("gTRC", tag_offset, tag_size, &tagtable, &offsets);
+      AddToICCTagTable("bTRC", tag_offset, tag_size, &tagtable, &offsets);
+    }
+  }
+
+  // Tag count
+  WriteICCUint32(offsets.size(), 0, &tagtable);
+  for (size_t i = 0; i < offsets.size(); i++) {
+    WriteICCUint32(offsets[i] + header.size() + tagtable.size(), 4 + 12 * i + 4,
+                   &tagtable);
+  }
+
+  // ICC profile size
+  WriteICCUint32(header.size() + tagtable.size() + tags.size(), 0, &header);
+
+  *icc = header;
+  Bytes(tagtable).AppendTo(icc);
+  Bytes(tags).AppendTo(icc);
+
+  // The MD5 checksum must be computed on the profile with profile flags,
+  // rendering intent, and region of the checksum itself, set to 0.
+  // TODO(lode): manually verify with a reliable tool that this creates correct
+  // signature (profile id) for ICC profiles.
+  std::vector<uint8_t> icc_sum = *icc;
+  if (icc_sum.size() >= 64 + 4) {
+    memset(icc_sum.data() + 44, 0, 4);
+    memset(icc_sum.data() + 64, 0, 4);
+  }
+  uint8_t checksum[16];
+  detail::ICCComputeMD5(icc_sum, checksum);
+
+  memcpy(icc->data() + 84, checksum, sizeof(checksum));
+
+  return true;
+}
+
+}  // namespace detail
+
+// Returns a representation of the ColorEncoding fields (not icc).
+// Example description: "RGB_D65_SRG_Rel_Lin"
+static std::string ColorEncodingDescription(const JxlColorEncoding& c) {
+  return detail::ColorEncodingDescriptionImpl(c);
+}
+
+// NOTE: for XYB colorspace, the created profile can be used to transform a
+// *scaled* XYB image (created by ScaleXYB()) to another colorspace.
+static Status MaybeCreateProfile(const JxlColorEncoding& c,
+                                 std::vector<uint8_t>* icc) {
+  return detail::MaybeCreateProfileImpl(c, icc);
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CMS_JXL_CMS_INTERNAL_H_
diff --git a/lib/jxl/cms/opsin_params.h b/lib/jxl/cms/opsin_params.h
new file mode 100644 (file)
index 0000000..48e8e25
--- /dev/null
@@ -0,0 +1,160 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CMS_OPSIN_PARAMS_H_
+#define LIB_JXL_CMS_OPSIN_PARAMS_H_
+
+#include <array>
+
+// Constants that define the XYB color space.
+
+namespace jxl {
+namespace cms {
+
+// Parameters for opsin absorbance.
+constexpr float kM02 = 0.078f;
+constexpr float kM00 = 0.30f;
+constexpr float kM01 = 1.0f - kM02 - kM00;
+
+constexpr float kM12 = 0.078f;
+constexpr float kM10 = 0.23f;
+constexpr float kM11 = 1.0f - kM12 - kM10;
+
+constexpr float kM20 = 0.24342268924547819f;
+constexpr float kM21 = 0.20476744424496821f;
+constexpr float kM22 = 1.0f - kM20 - kM21;
+
+constexpr float kBScale = 1.0f;
+constexpr float kYToBRatio = 1.0f;  // works better with 0.50017729543783418
+constexpr float kBToYRatio = 1.0f / kYToBRatio;
+
+constexpr float kOpsinAbsorbanceBias0 = 0.0037930732552754493f;
+constexpr float kOpsinAbsorbanceBias1 = kOpsinAbsorbanceBias0;
+constexpr float kOpsinAbsorbanceBias2 = kOpsinAbsorbanceBias0;
+
+// Opsin absorbance matrix is now frozen.
+constexpr std::array<float, 9> kOpsinAbsorbanceMatrix = {
+    kM00, kM01, kM02, kM10, kM11, kM12, kM20, kM21, kM22,
+};
+
+constexpr std::array<float, 9> kDefaultInverseOpsinAbsorbanceMatrix = {
+    11.031566901960783f,  -9.866943921568629f, -0.16462299647058826f,
+    -3.254147380392157f,  4.418770392156863f,  -0.16462299647058826f,
+    -3.6588512862745097f, 2.7129230470588235f, 1.9459282392156863f};
+
+// Must be the inverse matrix of kOpsinAbsorbanceMatrix and match the spec.
+static inline const float* DefaultInverseOpsinAbsorbanceMatrix() {
+  return kDefaultInverseOpsinAbsorbanceMatrix.data();
+}
+
+constexpr std::array<float, 3> kOpsinAbsorbanceBias = {
+    kOpsinAbsorbanceBias0,
+    kOpsinAbsorbanceBias1,
+    kOpsinAbsorbanceBias2,
+};
+
+constexpr std::array<float, 4> kNegOpsinAbsorbanceBiasRGB = {
+    -kOpsinAbsorbanceBias0, -kOpsinAbsorbanceBias1, -kOpsinAbsorbanceBias2,
+    1.0f};
+
+constexpr float kScaledXYBOffset0 = 0.015386134f;
+constexpr float kScaledXYBOffset1 = 0.0f;
+constexpr float kScaledXYBOffset2 = 0.27770459f;
+
+constexpr std::array<float, 3> kScaledXYBOffset = {
+    kScaledXYBOffset0, kScaledXYBOffset1, kScaledXYBOffset2};
+
+constexpr float kScaledXYBScale0 = 22.995788804f;
+constexpr float kScaledXYBScale1 = 1.183000077f;
+constexpr float kScaledXYBScale2 = 1.502141333f;
+
+constexpr std::array<float, 3> kScaledXYBScale = {
+    kScaledXYBScale0,
+    kScaledXYBScale1,
+    kScaledXYBScale2,
+};
+
+// NB(eustas): following function/variable names are just "namos".
+
+// More precise calculation of 1 / ((1 / r1) + (1 / r2))
+constexpr float ReciprocialSum(float r1, float r2) {
+  return (r1 * r2) / (r1 + r2);
+}
+
+constexpr float kXYBOffset0 = kScaledXYBOffset0 + kScaledXYBOffset1;
+constexpr float kXYBOffset1 =
+    kScaledXYBOffset1 - kScaledXYBOffset0 + (1.0f / kScaledXYBScale0);
+constexpr float kXYBOffset2 = kScaledXYBOffset1 + kScaledXYBOffset2;
+
+constexpr std::array<float, 3> kXYBOffset = {kXYBOffset0, kXYBOffset1,
+                                             kXYBOffset2};
+
+constexpr float kXYBScale0 = ReciprocialSum(kScaledXYBScale0, kScaledXYBScale1);
+constexpr float kXYBScale1 = ReciprocialSum(kScaledXYBScale0, kScaledXYBScale1);
+constexpr float kXYBScale2 = ReciprocialSum(kScaledXYBScale1, kScaledXYBScale2);
+
+constexpr std::array<float, 3> kXYBScale = {kXYBScale0, kXYBScale1, kXYBScale2};
+
+template <size_t idx>
+constexpr float ScaledXYBScale() {
+  return (idx == 0)   ? kScaledXYBScale0
+         : (idx == 1) ? kScaledXYBScale1
+                      : kScaledXYBScale2;
+}
+
+template <size_t idx>
+constexpr float ScaledXYBOffset() {
+  return (idx == 0)   ? kScaledXYBOffset0
+         : (idx == 1) ? kScaledXYBOffset1
+                      : kScaledXYBOffset2;
+}
+
+template <size_t x, size_t y, size_t b, size_t idx>
+constexpr float XYBCorner() {
+  return (((idx == 0)   ? x
+           : (idx == 1) ? y
+                        : b) /
+          ScaledXYBScale<idx>()) -
+         ScaledXYBOffset<idx>();
+}
+
+template <size_t x, size_t y, size_t b, size_t idx>
+constexpr float ScaledA2BCorner() {
+  return (idx == 0)   ? (XYBCorner<x, y, b, 1>() + XYBCorner<x, y, b, 0>())
+         : (idx == 1) ? (XYBCorner<x, y, b, 1>() - XYBCorner<x, y, b, 0>())
+                      : (XYBCorner<x, y, b, 2>() + XYBCorner<x, y, b, 1>());
+}
+
+typedef std::array<float, 3> ColorCube0D;
+template <size_t x, size_t y, size_t b>
+constexpr ColorCube0D UnscaledA2BCorner() {
+  return {(ScaledA2BCorner<x, y, b, 0>() + kXYBOffset0) * kXYBScale0,
+          (ScaledA2BCorner<x, y, b, 1>() + kXYBOffset1) * kXYBScale1,
+          (ScaledA2BCorner<x, y, b, 2>() + kXYBOffset2) * kXYBScale2};
+}
+
+typedef std::array<ColorCube0D, 2> ColorCube1D;
+template <size_t x, size_t y>
+constexpr ColorCube1D UnscaledA2BCubeXY() {
+  return {UnscaledA2BCorner<x, y, 0>(), UnscaledA2BCorner<x, y, 1>()};
+}
+
+typedef std::array<ColorCube1D, 2> ColorCube2D;
+template <size_t x>
+constexpr ColorCube2D UnscaledA2BCubeX() {
+  return {UnscaledA2BCubeXY<x, 0>(), UnscaledA2BCubeXY<x, 1>()};
+}
+
+typedef std::array<ColorCube2D, 2> ColorCube3D;
+constexpr ColorCube3D UnscaledA2BCube() {
+  return {UnscaledA2BCubeX<0>(), UnscaledA2BCubeX<1>()};
+}
+
+constexpr ColorCube3D kUnscaledA2BCube = UnscaledA2BCube();
+
+}  // namespace cms
+}  // namespace jxl
+
+#endif  // LIB_JXL_CMS_OPSIN_PARAMS_H_
similarity index 56%
rename from lib/jxl/dec_tone_mapping-inl.h
rename to lib/jxl/cms/tone_mapping-inl.h
index a326037..3d94cce 100644 (file)
@@ -3,16 +3,17 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#if defined(LIB_JXL_DEC_TONE_MAPPING_INL_H_) == defined(HWY_TARGET_TOGGLE)
-#ifdef LIB_JXL_DEC_TONE_MAPPING_INL_H_
-#undef LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#if defined(LIB_JXL_CMS_TONE_MAPPING_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_CMS_TONE_MAPPING_INL_H_
+#undef LIB_JXL_CMS_TONE_MAPPING_INL_H_
 #else
-#define LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#define LIB_JXL_CMS_TONE_MAPPING_INL_H_
 #endif
 
 #include <hwy/highway.h>
 
-#include "lib/jxl/transfer_functions-inl.h"
+#include "lib/jxl/cms/tone_mapping.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -25,19 +26,12 @@ using hwy::HWY_NAMESPACE::Max;
 using hwy::HWY_NAMESPACE::ZeroIfNegative;
 
 template <typename D>
-class Rec2408ToneMapper {
+class Rec2408ToneMapper : Rec2408ToneMapperBase {
  private:
   using V = hwy::HWY_NAMESPACE::Vec<D>;
 
  public:
-  explicit Rec2408ToneMapper(std::pair<float, float> source_range,
-                             std::pair<float, float> target_range,
-                             const float primaries_luminances[3])
-      : source_range_(source_range),
-        target_range_(target_range),
-        red_Y_(primaries_luminances[0]),
-        green_Y_(primaries_luminances[1]),
-        blue_Y_(primaries_luminances[2]) {}
+  using Rec2408ToneMapperBase::Rec2408ToneMapperBase;
 
   void ToneMap(V* red, V* green, V* blue) const {
     const V luminance = Mul(Set(df_, source_range_.second),
@@ -61,26 +55,21 @@ class Rec2408ToneMapper {
     const V e4 = MulAdd(e3, pq_mastering_range, pq_mastering_min);
     const V new_luminance =
         Min(Set(df_, target_range_.second),
-            ZeroIfNegative(
-                Mul(Set(df_, 10000), TF_PQ().DisplayFromEncoded(df_, e4))));
-
-    const V ratio = Div(new_luminance, luminance);
-
+            ZeroIfNegative(tf_pq_.DisplayFromEncoded(df_, e4)));
+    const V min_luminance = Set(df_, 1e-6f);
+    const auto use_cap = Le(luminance, min_luminance);
+    const V ratio = Div(new_luminance, Max(luminance, min_luminance));
+    const V cap = Mul(new_luminance, Set(df_, inv_target_peak_));
     const V normalizer = Set(df_, normalizer_);
+    const V multiplier = Mul(ratio, normalizer);
     for (V* const val : {red, green, blue}) {
-      *val = Mul(IfThenElse(Le(luminance, Set(df_, 1e-6f)), new_luminance,
-                            Mul(*val, ratio)),
-                 normalizer);
+      *val = IfThenElse(use_cap, cap, Mul(*val, multiplier));
     }
   }
 
  private:
   V InvEOTF(const V luminance) const {
-    return TF_PQ().EncodedFromDisplay(df_,
-                                      Mul(luminance, Set(df_, 1. / 10000)));
-  }
-  float InvEOTF(const float luminance) const {
-    return TF_PQ().EncodedFromDisplay(luminance / 10000.0f);
+    return tf_pq_.EncodedFromDisplay(df_, luminance);
   }
   V T(const V a) const {
     const V ks = Set(df_, ks_);
@@ -98,42 +87,17 @@ class Rec2408ToneMapper {
         ks,
         MulAdd(Add(t_b_3, MulAdd(Set(df_, -2), t_b_2, t_b)),
                Sub(Set(df_, 1), ks),
-               MulAdd(Set(df_, -2), t_b_3,
-                      Mul(Mul(Set(df_, 3), t_b_2), max_lum))));
+               Mul(MulAdd(Set(df_, -2), t_b_3, Mul(Set(df_, 3), t_b_2)),
+                   max_lum)));
   }
 
   D df_;
-  const std::pair<float, float> source_range_;
-  const std::pair<float, float> target_range_;
-  const float red_Y_;
-  const float green_Y_;
-  const float blue_Y_;
-
-  const float pq_mastering_min_ = InvEOTF(source_range_.first);
-  const float pq_mastering_max_ = InvEOTF(source_range_.second);
-  const float pq_mastering_range_ = pq_mastering_max_ - pq_mastering_min_;
-  const float inv_pq_mastering_range_ = 1.0f / pq_mastering_range_;
-  // TODO(eustas): divide instead of inverse-multiply?
-  const float min_lum_ = (InvEOTF(target_range_.first) - pq_mastering_min_) *
-                         inv_pq_mastering_range_;
-  // TODO(eustas): divide instead of inverse-multiply?
-  const float max_lum_ = (InvEOTF(target_range_.second) - pq_mastering_min_) *
-                         inv_pq_mastering_range_;
-  const float ks_ = 1.5f * max_lum_ - 0.5f;
-  const float b_ = min_lum_;
-
-  const float inv_one_minus_ks_ = 1.0f / std::max(1e-6f, 1.0f - ks_);
-
-  const float normalizer_ = source_range_.second / target_range_.second;
+  const TF_PQ tf_pq_ = TF_PQ(/*display_intensity_target=*/1.0);
 };
 
-class HlgOOTF {
+class HlgOOTF : HlgOOTF_Base {
  public:
-  explicit HlgOOTF(float source_luminance, float target_luminance,
-                   const float primaries_luminances[3])
-      : HlgOOTF(/*gamma=*/std::pow(
-                    1.111f, std::log2(target_luminance / source_luminance)),
-                primaries_luminances) {}
+  using HlgOOTF_Base::HlgOOTF_Base;
 
   static HlgOOTF FromSceneLight(float display_luminance,
                                 const float primaries_luminances[3]) {
@@ -165,18 +129,6 @@ class HlgOOTF {
   }
 
   bool WarrantsGamutMapping() const { return apply_ootf_ && exponent_ < 0; }
-
- private:
-  explicit HlgOOTF(float gamma, const float luminances[3])
-      : exponent_(gamma - 1),
-        red_Y_(luminances[0]),
-        green_Y_(luminances[1]),
-        blue_Y_(luminances[2]) {}
-  const float exponent_;
-  const bool apply_ootf_ = exponent_ < -0.01f || 0.01f < exponent_;
-  const float red_Y_;
-  const float green_Y_;
-  const float blue_Y_;
 };
 
 template <typename V>
@@ -196,30 +148,37 @@ void GamutMap(V* red, V* green, V* blue, const float primaries_luminances[3],
   // That will reduce its luminance.
   // - For luminance preservation, getting all components below 1 is
   // done by mixing in yet more gray. That will desaturate it further.
-  V gray_mix_saturation = Zero(df);
-  V gray_mix_luminance = Zero(df);
+  const V zero = Zero(df);
+  const V one = Set(df, 1);
+  V gray_mix_saturation = zero;
+  V gray_mix_luminance = zero;
   for (const V* ch : {red, green, blue}) {
     const V& val = *ch;
-    const V inv_val_minus_gray = Div(Set(df, 1), (Sub(val, luminance)));
+    const V val_minus_gray = Sub(val, luminance);
+    const V inv_val_minus_gray =
+        Div(one, IfThenElse(Eq(val_minus_gray, zero), one, val_minus_gray));
+    const V val_over_val_minus_gray = Mul(val, inv_val_minus_gray);
     gray_mix_saturation =
-        IfThenElse(Ge(val, luminance), gray_mix_saturation,
-                   Max(gray_mix_saturation, Mul(val, inv_val_minus_gray)));
+        IfThenElse(Ge(val_minus_gray, zero), gray_mix_saturation,
+                   Max(gray_mix_saturation, val_over_val_minus_gray));
     gray_mix_luminance =
         Max(gray_mix_luminance,
-            IfThenElse(Le(val, luminance), gray_mix_saturation,
-                       Mul(Sub(val, Set(df, 1)), inv_val_minus_gray)));
+            IfThenElse(Le(val_minus_gray, zero), gray_mix_saturation,
+                       Sub(val_over_val_minus_gray, inv_val_minus_gray)));
   }
   const V gray_mix = Clamp(
       MulAdd(Set(df, preserve_saturation),
              Sub(gray_mix_saturation, gray_mix_luminance), gray_mix_luminance),
-      Zero(df), Set(df, 1));
-  for (V* const val : {red, green, blue}) {
-    *val = MulAdd(gray_mix, Sub(luminance, *val), *val);
+      zero, one);
+  for (V* const ch : {red, green, blue}) {
+    V& val = *ch;
+    val = MulAdd(gray_mix, Sub(luminance, val), val);
   }
-  const V normalizer =
-      Div(Set(df, 1), Max(Set(df, 1), Max(*red, Max(*green, *blue))));
-  for (V* const val : {red, green, blue}) {
-    *val = Mul(*val, normalizer);
+  const V max_clr = Max(Max(one, *red), Max(*green, *blue));
+  const V normalizer = Div(one, max_clr);
+  for (V* const ch : {red, green, blue}) {
+    V& val = *ch;
+    val = Mul(val, normalizer);
   }
 }
 
@@ -229,4 +188,4 @@ void GamutMap(V* red, V* green, V* blue, const float primaries_luminances[3],
 }  // namespace jxl
 HWY_AFTER_NAMESPACE();
 
-#endif  // LIB_JXL_DEC_TONE_MAPPING_INL_H_
+#endif  // LIB_JXL_CMS_TONE_MAPPING_INL_H_
diff --git a/lib/jxl/cms/tone_mapping.h b/lib/jxl/cms/tone_mapping.h
new file mode 100644 (file)
index 0000000..31b9233
--- /dev/null
@@ -0,0 +1,176 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_CMS_TONE_MAPPING_H_
+#define LIB_JXL_CMS_TONE_MAPPING_H_
+
+#include <algorithm>
+#include <cmath>
+
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/cms/transfer_functions.h"
+
+namespace jxl {
+
+class Rec2408ToneMapperBase {
+ public:
+  explicit Rec2408ToneMapperBase(std::pair<float, float> source_range,
+                                 std::pair<float, float> target_range,
+                                 const float primaries_luminances[3])
+      : source_range_(source_range),
+        target_range_(target_range),
+        red_Y_(primaries_luminances[0]),
+        green_Y_(primaries_luminances[1]),
+        blue_Y_(primaries_luminances[2]) {}
+
+  // TODO(eustas): test me
+  void ToneMap(float* red, float* green, float* blue) const {
+    const float luminance =
+        source_range_.second *
+        (red_Y_ * *red + green_Y_ * *green + blue_Y_ * *blue);
+    const float normalized_pq =
+        std::min(1.f, (InvEOTF(luminance) - pq_mastering_min_) *
+                          inv_pq_mastering_range_);
+    const float e2 = (normalized_pq < ks_) ? normalized_pq : P(normalized_pq);
+    const float one_minus_e2 = 1 - e2;
+    const float one_minus_e2_2 = one_minus_e2 * one_minus_e2;
+    const float one_minus_e2_4 = one_minus_e2_2 * one_minus_e2_2;
+    const float e3 = min_lum_ * one_minus_e2_4 + e2;
+    const float e4 = e3 * pq_mastering_range_ + pq_mastering_min_;
+    const float d4 =
+        TF_PQ_Base::DisplayFromEncoded(/*display_intensity_target=*/1.0, e4);
+    const float new_luminance = Clamp1(d4, 0.f, target_range_.second);
+    const float min_luminance = 1e-6f;
+    const bool use_cap = (luminance <= min_luminance);
+    const float ratio = new_luminance / std::max(luminance, min_luminance);
+    const float cap = new_luminance * inv_target_peak_;
+    const float multiplier = ratio * normalizer_;
+    for (float* const val : {red, green, blue}) {
+      *val = use_cap ? cap : *val * multiplier;
+    }
+  }
+
+ protected:
+  float InvEOTF(const float luminance) const {
+    return TF_PQ_Base::EncodedFromDisplay(/*display_intensity_target=*/1.0,
+                                          luminance);
+  }
+  float T(const float a) const { return (a - ks_) * inv_one_minus_ks_; }
+  float P(const float b) const {
+    const float t_b = T(b);
+    const float t_b_2 = t_b * t_b;
+    const float t_b_3 = t_b_2 * t_b;
+    return (2 * t_b_3 - 3 * t_b_2 + 1) * ks_ +
+           (t_b_3 - 2 * t_b_2 + t_b) * (1 - ks_) +
+           (-2 * t_b_3 + 3 * t_b_2) * max_lum_;
+  }
+
+  const std::pair<float, float> source_range_;
+  const std::pair<float, float> target_range_;
+  const float red_Y_;
+  const float green_Y_;
+  const float blue_Y_;
+
+  const float pq_mastering_min_ = InvEOTF(source_range_.first);
+  const float pq_mastering_max_ = InvEOTF(source_range_.second);
+  const float pq_mastering_range_ = pq_mastering_max_ - pq_mastering_min_;
+  const float inv_pq_mastering_range_ = 1.0f / pq_mastering_range_;
+  // TODO(eustas): divide instead of inverse-multiply?
+  const float min_lum_ = (InvEOTF(target_range_.first) - pq_mastering_min_) *
+                         inv_pq_mastering_range_;
+  // TODO(eustas): divide instead of inverse-multiply?
+  const float max_lum_ = (InvEOTF(target_range_.second) - pq_mastering_min_) *
+                         inv_pq_mastering_range_;
+  const float ks_ = 1.5f * max_lum_ - 0.5f;
+
+  const float inv_one_minus_ks_ = 1.0f / std::max(1e-6f, 1.0f - ks_);
+
+  const float normalizer_ = source_range_.second / target_range_.second;
+  const float inv_target_peak_ = 1.f / target_range_.second;
+};
+
+class HlgOOTF_Base {
+ public:
+  explicit HlgOOTF_Base(float source_luminance, float target_luminance,
+                        const float primaries_luminances[3])
+      : HlgOOTF_Base(/*gamma=*/std::pow(1.111f, std::log2(target_luminance /
+                                                          source_luminance)),
+                     primaries_luminances) {}
+
+  // TODO(eustas): test me
+  void Apply(float* red, float* green, float* blue) const {
+    if (!apply_ootf_) return;
+    const float luminance = red_Y_ * *red + green_Y_ * *green + blue_Y_ * *blue;
+    const float ratio = std::min<float>(powf(luminance, exponent_), 1e9);
+    *red *= ratio;
+    *green *= ratio;
+    *blue *= ratio;
+  }
+
+ protected:
+  explicit HlgOOTF_Base(float gamma, const float luminances[3])
+      : exponent_(gamma - 1),
+        red_Y_(luminances[0]),
+        green_Y_(luminances[1]),
+        blue_Y_(luminances[2]) {}
+  const float exponent_;
+  const bool apply_ootf_ = exponent_ < -0.01f || 0.01f < exponent_;
+  const float red_Y_;
+  const float green_Y_;
+  const float blue_Y_;
+};
+
+static void GamutMapScalar(float* red, float* green, float* blue,
+                           const float primaries_luminances[3],
+                           float preserve_saturation = 0.1f) {
+  const float luminance = primaries_luminances[0] * *red +
+                          primaries_luminances[1] * *green +
+                          primaries_luminances[2] * *blue;
+
+  // Desaturate out-of-gamut pixels. This is done by mixing each pixel
+  // with just enough gray of the target luminance to make all
+  // components non-negative.
+  // - For saturation preservation, if a component is still larger than
+  // 1 then the pixel is normalized to have a maximum component of 1.
+  // That will reduce its luminance.
+  // - For luminance preservation, getting all components below 1 is
+  // done by mixing in yet more gray. That will desaturate it further.
+  float gray_mix_saturation = 0.0f;
+  float gray_mix_luminance = 0.0f;
+  for (const float* ch : {red, green, blue}) {
+    const float& val = *ch;
+    const float val_minus_gray = val - luminance;
+    const float inv_val_minus_gray =
+        1.0f / ((val_minus_gray == 0.0f) ? 1.0f : val_minus_gray);
+    const float val_over_val_minus_gray = val * inv_val_minus_gray;
+    gray_mix_saturation =
+        (val_minus_gray >= 0.0f)
+            ? gray_mix_saturation
+            : std::max(gray_mix_saturation, val_over_val_minus_gray);
+    gray_mix_luminance =
+        std::max(gray_mix_luminance,
+                 (val_minus_gray <= 0.0f)
+                     ? gray_mix_saturation
+                     : (val_over_val_minus_gray - inv_val_minus_gray));
+  }
+  const float gray_mix =
+      Clamp1((preserve_saturation * (gray_mix_saturation - gray_mix_luminance) +
+              gray_mix_luminance),
+             0.0f, 1.0f);
+  for (float* const ch : {red, green, blue}) {
+    float& val = *ch;
+    val = gray_mix * (luminance - val) + val;
+  }
+  const float max_clr = std::max({1.0f, *red, *green, *blue});
+  const float normalizer = 1.0f / max_clr;
+  for (float* const ch : {red, green, blue}) {
+    float& val = *ch;
+    val *= normalizer;
+  }
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CMS_TONE_MAPPING_H_
diff --git a/lib/jxl/cms/tone_mapping_test.cc b/lib/jxl/cms/tone_mapping_test.cc
new file mode 100644 (file)
index 0000000..dda2bbb
--- /dev/null
@@ -0,0 +1,147 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/cms/tone_mapping_test.cc"
+#include "lib/jxl/cms/tone_mapping.h"
+
+#include <cstdio>
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/cms/tone_mapping-inl.h"
+#include "lib/jxl/testing.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/hwy_gtest.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+HWY_NOINLINE void TestRec2408ToneMap() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    float src = 11000.0 + rng.UniformF(-150.0f, 150.0f);
+    float tgt = 250 + rng.UniformF(-5.0f, 5.0f);
+    float luminances[3] = {rng.UniformF(0.2f, 0.4f), rng.UniformF(0.2f, 0.4f),
+                           rng.UniformF(0.2f, 0.4f)};
+    float rgb[3] = {rng.UniformF(0.0f, 1.0f), rng.UniformF(0.0f, 1.0f),
+                    rng.UniformF(0.0f, 1.0f)};
+    Rec2408ToneMapper<decltype(d)> tone_mapper({0, src}, {0, tgt}, luminances);
+    auto r = Set(d, rgb[0]);
+    auto g = Set(d, rgb[1]);
+    auto b = Set(d, rgb[2]);
+    tone_mapper.ToneMap(&r, &g, &b);
+    Rec2408ToneMapperBase tone_mapper_base({0, src}, {0, tgt}, luminances);
+    tone_mapper_base.ToneMap(&rgb[0], &rgb[1], &rgb[2]);
+    const float actual_r = GetLane(r);
+    const float expected_r = rgb[0];
+    const float abs_err_r = std::abs(expected_r - actual_r);
+    EXPECT_LT(abs_err_r, 2.75e-5);
+    const float actual_g = GetLane(g);
+    const float expected_g = rgb[1];
+    const float abs_err_g = std::abs(expected_g - actual_g);
+    EXPECT_LT(abs_err_g, 2.75e-5);
+    const float actual_b = GetLane(b);
+    const float expected_b = rgb[2];
+    const float abs_err_b = std::abs(expected_b - actual_b);
+    EXPECT_LT(abs_err_b, 2.75e-5);
+    max_abs_err = std::max({max_abs_err, abs_err_r, abs_err_g, abs_err_b});
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestHlgOotfApply() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    float src = 300.0 + rng.UniformF(-50.0f, 50.0f);
+    float tgt = 80 + rng.UniformF(-5.0f, 5.0f);
+    float luminances[3] = {rng.UniformF(0.2f, 0.4f), rng.UniformF(0.2f, 0.4f),
+                           rng.UniformF(0.2f, 0.4f)};
+    float rgb[3] = {rng.UniformF(0.0f, 1.0f), rng.UniformF(0.0f, 1.0f),
+                    rng.UniformF(0.0f, 1.0f)};
+    HlgOOTF ootf(src, tgt, luminances);
+    auto r = Set(d, rgb[0]);
+    auto g = Set(d, rgb[1]);
+    auto b = Set(d, rgb[2]);
+    ootf.Apply(&r, &g, &b);
+    HlgOOTF_Base ootf_base(src, tgt, luminances);
+    ootf_base.Apply(&rgb[0], &rgb[1], &rgb[2]);
+    const float actual_r = GetLane(r);
+    const float expected_r = rgb[0];
+    const float abs_err_r = std::abs(expected_r - actual_r);
+    EXPECT_LT(abs_err_r, 7.2e-7);
+    const float actual_g = GetLane(g);
+    const float expected_g = rgb[1];
+    const float abs_err_g = std::abs(expected_g - actual_g);
+    EXPECT_LT(abs_err_g, 7.2e-7);
+    const float actual_b = GetLane(b);
+    const float expected_b = rgb[2];
+    const float abs_err_b = std::abs(expected_b - actual_b);
+    EXPECT_LT(abs_err_b, 7.2e-7);
+    max_abs_err = std::max({max_abs_err, abs_err_r, abs_err_g, abs_err_b});
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestGamutMap() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    float preserve_saturation = rng.UniformF(0.2f, 0.4f);
+    float luminances[3] = {rng.UniformF(0.2f, 0.4f), rng.UniformF(0.2f, 0.4f),
+                           rng.UniformF(0.2f, 0.4f)};
+    float rgb[3] = {rng.UniformF(0.0f, 1.0f), rng.UniformF(0.0f, 1.0f),
+                    rng.UniformF(0.0f, 1.0f)};
+    auto r = Set(d, rgb[0]);
+    auto g = Set(d, rgb[1]);
+    auto b = Set(d, rgb[2]);
+    GamutMap(&r, &g, &b, luminances, preserve_saturation);
+    GamutMapScalar(&rgb[0], &rgb[1], &rgb[2], luminances, preserve_saturation);
+    const float actual_r = GetLane(r);
+    const float expected_r = rgb[0];
+    const float abs_err_r = std::abs(expected_r - actual_r);
+    EXPECT_LT(abs_err_r, 1e-10);
+    const float actual_g = GetLane(g);
+    const float expected_g = rgb[1];
+    const float abs_err_g = std::abs(expected_g - actual_g);
+    EXPECT_LT(abs_err_g, 1e-10);
+    const float actual_b = GetLane(b);
+    const float expected_b = rgb[2];
+    const float abs_err_b = std::abs(expected_b - actual_b);
+    EXPECT_LT(abs_err_b, 1e-10);
+    max_abs_err = std::max({max_abs_err, abs_err_r, abs_err_g, abs_err_b});
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class ToneMappingTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(ToneMappingTargetTest);
+
+HWY_EXPORT_AND_TEST_P(ToneMappingTargetTest, TestRec2408ToneMap);
+HWY_EXPORT_AND_TEST_P(ToneMappingTargetTest, TestHlgOotfApply);
+HWY_EXPORT_AND_TEST_P(ToneMappingTargetTest, TestGamutMap);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
similarity index 78%
rename from lib/jxl/transfer_functions-inl.h
rename to lib/jxl/cms/transfer_functions-inl.h
index 9f4c10c..84bcbb4 100644 (file)
@@ -5,11 +5,11 @@
 
 // Transfer functions for color encodings.
 
-#if defined(LIB_JXL_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE)
-#ifdef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
-#undef LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#if defined(LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
+#undef LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
 #else
-#define LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#define LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
 #endif
 
 #include <algorithm>
 #include <hwy/highway.h>
 
 #include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/fast_math-inl.h"
+#include "lib/jxl/base/rational_polynomial-inl.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/fast_math-inl.h"
-#include "lib/jxl/rational_polynomial-inl.h"
+#include "lib/jxl/cms/transfer_functions.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -48,18 +49,8 @@ using hwy::HWY_NAMESPACE::TableLookupBytes;
 // and extend the function domains above 1.
 
 // Hybrid Log-Gamma.
-class TF_HLG {
+class TF_HLG : TF_HLG_Base {
  public:
-  // EOTF. e = encoded.
-  JXL_INLINE double DisplayFromEncoded(const double e) const {
-    return OOTF(InvOETF(e));
-  }
-
-  // Inverse EOTF. d = display.
-  JXL_INLINE double EncodedFromDisplay(const double d) const {
-    return OETF(InvOOTF(d));
-  }
-
   // Maximum error 5e-7.
   template <class D, class V>
   JXL_INLINE V EncodedFromDisplay(D d, V x) const {
@@ -74,54 +65,6 @@ class TF_HLG {
     const V magnitude = IfThenElse(Le(x, Set(d, kDiv12)), below_div12, e);
     return Or(AndNot(kSign, magnitude), original_sign);
   }
-
- private:
-  // OETF (defines the HLG approach). s = scene, returns encoded.
-  JXL_INLINE double OETF(double s) const {
-    if (s == 0.0) return 0.0;
-    const double original_sign = s;
-    s = std::abs(s);
-
-    if (s <= kDiv12) return copysignf(std::sqrt(3.0 * s), original_sign);
-
-    const double e = kA * std::log(12 * s - kB) + kC;
-    JXL_ASSERT(e > 0.0);
-    return copysignf(e, original_sign);
-  }
-
-  // e = encoded, returns scene.
-  JXL_INLINE double InvOETF(double e) const {
-    if (e == 0.0) return 0.0;
-    const double original_sign = e;
-    e = std::abs(e);
-
-    if (e <= 0.5) return copysignf(e * e * (1.0 / 3), original_sign);
-
-    const double s = (std::exp((e - kC) * kRA) + kB) * kDiv12;
-    JXL_ASSERT(s >= 0);
-    return copysignf(s, original_sign);
-  }
-
-  // s = scene, returns display.
-  JXL_INLINE double OOTF(const double s) const {
-    // The actual (red channel) OOTF is RD = alpha * YS^(gamma-1) * RS, where
-    // YS = 0.2627 * RS + 0.6780 * GS + 0.0593 * BS. Let alpha = 1 so we return
-    // "display" (normalized [0, 1]) instead of nits. Our transfer function
-    // interface does not allow a dependency on YS. Fortunately, the system
-    // gamma at 334 nits is 1.0, so this reduces to RD = RS.
-    return s;
-  }
-
-  // d = display, returns scene.
-  JXL_INLINE double InvOOTF(const double d) const {
-    return d;  // see OOTF().
-  }
-
-  static constexpr double kA = 0.17883277;
-  static constexpr double kRA = 1.0 / kA;
-  static constexpr double kB = 1 - 4 * kA;
-  static constexpr double kC = 0.5599107295;
-  static constexpr double kDiv12 = 1.0 / 12;
 };
 
 class TF_709 {
@@ -163,22 +106,13 @@ class TF_709 {
 };
 
 // Perceptual Quantization
-class TF_PQ {
+class TF_PQ : TF_PQ_Base {
  public:
-  // EOTF (defines the PQ approach). e = encoded.
-  JXL_INLINE double DisplayFromEncoded(double e) const {
-    if (e == 0.0) return 0.0;
-    const double original_sign = e;
-    e = std::abs(e);
-
-    const double xp = std::pow(e, 1.0 / kM2);
-    const double num = std::max(xp - kC1, 0.0);
-    const double den = kC2 - kC3 * xp;
-    JXL_DASSERT(den != 0.0);
-    const double d = std::pow(num / den, 1.0 / kM1);
-    JXL_DASSERT(d >= 0.0);  // Equal for e ~= 1E-9
-    return copysignf(d, original_sign);
-  }
+  explicit TF_PQ(float display_intensity_target = kDefaultIntensityTarget)
+      : display_scaling_factor_to_10000_nits_(display_intensity_target *
+                                              (1.0f / 10000.0f)),
+        display_scaling_factor_from_10000_nits_(10000.0f /
+                                                display_intensity_target) {}
 
   // Maximum error 3e-6
   template <class D, class V>
@@ -201,21 +135,10 @@ class TF_PQ {
         HWY_REP4(2.67718770e+00f),
     };
     auto magnitude = EvalRationalPolynomial(d, xpxx, p, q);
-    return Or(AndNot(kSign, magnitude), original_sign);
-  }
-
-  // Inverse EOTF. d = display.
-  JXL_INLINE double EncodedFromDisplay(double d) const {
-    if (d == 0.0) return 0.0;
-    const double original_sign = d;
-    d = std::abs(d);
-
-    const double xp = std::pow(d, kM1);
-    const double num = kC1 + xp * kC2;
-    const double den = 1.0 + xp * kC3;
-    const double e = std::pow(num / den, kM2);
-    JXL_DASSERT(e > 0.0);
-    return copysignf(e, original_sign);
+    return Or(
+        AndNot(kSign,
+               Mul(magnitude, Set(d, display_scaling_factor_from_10000_nits_))),
+        original_sign);
   }
 
   // Maximum error 7e-7.
@@ -227,7 +150,8 @@ class TF_PQ {
     x = AndNot(kSign, x);  // abs
     // 4-over-4-degree rational polynomial approximation on x**0.25, with two
     // different polynomials above and below 1e-4.
-    auto xto025 = Sqrt(Sqrt(x));
+    auto xto025 =
+        Sqrt(Sqrt(Mul(x, Set(d, display_scaling_factor_to_10000_nits_))));
     HWY_ALIGN constexpr float p[(4 + 1) * 4] = {
         HWY_REP4(1.351392e-02f), HWY_REP4(-1.095778e+00f),
         HWY_REP4(5.522776e+01f), HWY_REP4(1.492516e+02f),
@@ -257,11 +181,8 @@ class TF_PQ {
   }
 
  private:
-  static constexpr double kM1 = 2610.0 / 16384;
-  static constexpr double kM2 = (2523.0 / 4096) * 128;
-  static constexpr double kC1 = 3424.0 / 4096;
-  static constexpr double kC2 = (2413.0 / 4096) * 32;
-  static constexpr double kC3 = (2392.0 / 4096) * 32;
+  const float display_scaling_factor_to_10000_nits_;
+  const float display_scaling_factor_from_10000_nits_;
 };
 
 // sRGB
@@ -410,4 +331,4 @@ V FastLinearToSRGB(D d, V v) {
 }  // namespace jxl
 HWY_AFTER_NAMESPACE();
 
-#endif  // LIB_JXL_TRANSFER_FUNCTIONS_INL_H_
+#endif  // LIB_JXL_CMS_TRANSFER_FUNCTIONS_INL_H_
diff --git a/lib/jxl/cms/transfer_functions.h b/lib/jxl/cms/transfer_functions.h
new file mode 100644 (file)
index 0000000..3165872
--- /dev/null
@@ -0,0 +1,132 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Transfer functions for color encodings.
+
+#ifndef LIB_JXL_CMS_TRANSFER_FUNCTIONS_H_
+#define LIB_JXL_CMS_TRANSFER_FUNCTIONS_H_
+
+#include <algorithm>
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+// Definitions for BT.2100-2 transfer functions (used inside/outside SIMD):
+// "display" is linear light (nits) normalized to [0, 1].
+// "encoded" is a nonlinear encoding (e.g. PQ) in [0, 1].
+// "scene" is a linear function of photon counts, normalized to [0, 1].
+
+// Despite the stated ranges, we need unbounded transfer functions: see
+// http://www.littlecms.com/CIC18_UnboundedCMM.pdf. Inputs can be negative or
+// above 1 due to chromatic adaptation. To avoid severe round-trip errors caused
+// by clamping, we mirror negative inputs via copysign (f(-x) = -f(x), see
+// https://developer.apple.com/documentation/coregraphics/cgcolorspace/1644735-extendedsrgb)
+// and extend the function domains above 1.
+
+// Hybrid Log-Gamma.
+class TF_HLG_Base {
+ public:
+  // EOTF. e = encoded.
+  static double DisplayFromEncoded(const double e) { return OOTF(InvOETF(e)); }
+
+  // Inverse EOTF. d = display.
+  static double EncodedFromDisplay(const double d) { return OETF(InvOOTF(d)); }
+
+ private:
+  // OETF (defines the HLG approach). s = scene, returns encoded.
+  static double OETF(double s) {
+    if (s == 0.0) return 0.0;
+    const double original_sign = s;
+    s = std::abs(s);
+
+    if (s <= kDiv12) return copysignf(std::sqrt(3.0 * s), original_sign);
+
+    const double e = kA * std::log(12 * s - kB) + kC;
+    JXL_ASSERT(e > 0.0);
+    return copysignf(e, original_sign);
+  }
+
+  // e = encoded, returns scene.
+  static double InvOETF(double e) {
+    if (e == 0.0) return 0.0;
+    const double original_sign = e;
+    e = std::abs(e);
+
+    if (e <= 0.5) return copysignf(e * e * (1.0 / 3), original_sign);
+
+    const double s = (std::exp((e - kC) * kRA) + kB) * kDiv12;
+    JXL_ASSERT(s >= 0);
+    return copysignf(s, original_sign);
+  }
+
+  // s = scene, returns display.
+  static double OOTF(const double s) {
+    // The actual (red channel) OOTF is RD = alpha * YS^(gamma-1) * RS, where
+    // YS = 0.2627 * RS + 0.6780 * GS + 0.0593 * BS. Let alpha = 1 so we return
+    // "display" (normalized [0, 1]) instead of nits. Our transfer function
+    // interface does not allow a dependency on YS. Fortunately, the system
+    // gamma at 334 nits is 1.0, so this reduces to RD = RS.
+    return s;
+  }
+
+  // d = display, returns scene.
+  static double InvOOTF(const double d) {
+    return d;  // see OOTF().
+  }
+
+ protected:
+  static constexpr double kA = 0.17883277;
+  static constexpr double kRA = 1.0 / kA;
+  static constexpr double kB = 1 - 4 * kA;
+  static constexpr double kC = 0.5599107295;
+  static constexpr double kDiv12 = 1.0 / 12;
+};
+
+// Perceptual Quantization
+class TF_PQ_Base {
+ public:
+  static double DisplayFromEncoded(float display_intensity_target, double e) {
+    if (e == 0.0) return 0.0;
+    const double original_sign = e;
+    e = std::abs(e);
+
+    const double xp = std::pow(e, 1.0 / kM2);
+    const double num = std::max(xp - kC1, 0.0);
+    const double den = kC2 - kC3 * xp;
+    JXL_DASSERT(den != 0.0);
+    const double d = std::pow(num / den, 1.0 / kM1);
+    JXL_DASSERT(d >= 0.0);  // Equal for e ~= 1E-9
+    return copysignf(d * (10000.0f / display_intensity_target), original_sign);
+  }
+
+  // Inverse EOTF. d = display.
+  static double EncodedFromDisplay(float display_intensity_target, double d) {
+    if (d == 0.0) return 0.0;
+    const double original_sign = d;
+    d = std::abs(d);
+
+    const double xp =
+        std::pow(d * (display_intensity_target * (1.0f / 10000.0f)), kM1);
+    const double num = kC1 + xp * kC2;
+    const double den = 1.0 + xp * kC3;
+    const double e = std::pow(num / den, kM2);
+    JXL_DASSERT(e > 0.0);
+    return copysignf(e, original_sign);
+  }
+
+ protected:
+  static constexpr double kM1 = 2610.0 / 16384;
+  static constexpr double kM2 = (2523.0 / 4096) * 128;
+  static constexpr double kC1 = 3424.0 / 4096;
+  static constexpr double kC2 = (2413.0 / 4096) * 32;
+  static constexpr double kC3 = (2392.0 / 4096) * 32;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_CMS_TRANSFER_FUNCTIONS_H_
diff --git a/lib/jxl/cms/transfer_functions_test.cc b/lib/jxl/cms/transfer_functions_test.cc
new file mode 100644 (file)
index 0000000..26de409
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/cms/transfer_functions_test.cc"
+#include "lib/jxl/cms/transfer_functions.h"
+
+#include <cstdio>
+#include <hwy/foreach_target.h>
+
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
+#include "lib/jxl/testing.h"
+
+// Test utils
+#include <hwy/highway.h>
+#include <hwy/tests/hwy_gtest.h>
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+namespace {
+
+HWY_NOINLINE void TestPqEncodedFromDisplay() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    double intensity = 11000.0 + rng.UniformF(-150.0f, 150.0f);
+    TF_PQ tf_pq(intensity);
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float actual = GetLane(tf_pq.EncodedFromDisplay(d, Set(d, f)));
+    const float expected = TF_PQ_Base::EncodedFromDisplay(intensity, f);
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 5e-7) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestHlgEncodedFromDisplay() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float actual = GetLane(TF_HLG().EncodedFromDisplay(d, Set(d, f)));
+    const float expected = TF_HLG_Base::EncodedFromDisplay(f);
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 4e-7) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+HWY_NOINLINE void TestPqDisplayFromEncoded() {
+  constexpr size_t kNumTrials = 1 << 23;
+  Rng rng(1);
+  float max_abs_err = 0;
+  HWY_FULL(float) d;
+  for (size_t i = 0; i < kNumTrials; i++) {
+    double intensity = 11000.0 + rng.UniformF(-150.0f, 150.0f);
+    TF_PQ tf_pq(intensity);
+    const float f = rng.UniformF(0.0f, 1.0f);
+    const float actual = GetLane(tf_pq.DisplayFromEncoded(d, Set(d, f)));
+    const float expected = TF_PQ_Base::DisplayFromEncoded(intensity, f);
+    const float abs_err = std::abs(expected - actual);
+    EXPECT_LT(abs_err, 3E-6) << "f = " << f;
+    max_abs_err = std::max(max_abs_err, abs_err);
+  }
+  printf("max abs err %e\n", static_cast<double>(max_abs_err));
+}
+
+}  // namespace
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+class TransferFunctionsTargetTest : public hwy::TestWithParamTarget {};
+HWY_TARGET_INSTANTIATE_TEST_SUITE_P(TransferFunctionsTargetTest);
+
+HWY_EXPORT_AND_TEST_P(TransferFunctionsTargetTest, TestPqEncodedFromDisplay);
+HWY_EXPORT_AND_TEST_P(TransferFunctionsTargetTest, TestHlgEncodedFromDisplay);
+HWY_EXPORT_AND_TEST_P(TransferFunctionsTargetTest, TestPqDisplayFromEncoded);
+
+}  // namespace jxl
+#endif  // HWY_ONCE
index 23f0a4a..028f3ec 100644 (file)
@@ -9,52 +9,22 @@
 // Holds inputs/outputs for decoding/encoding images.
 
 #include <stddef.h>
+#include <stdint.h>
 
+#include <type_traits>
 #include <utility>
 #include <vector>
 
-#include "lib/jxl/alpha.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/headers.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/luminance.h"
-#include "lib/jxl/size_constraints.h"
 
 namespace jxl {
 
-// Per-channel interval, used to convert between (full-range) external and
-// (bounded or unbounded) temp values. See external_image.cc for the definitions
-// of temp/external.
-struct CodecInterval {
-  CodecInterval() = default;
-  constexpr CodecInterval(float min, float max) : min(min), width(max - min) {}
-  // Defaults for temp.
-  float min = 0.0f;
-  float width = 1.0f;
-};
-
-template <typename T,
-          class = typename std::enable_if<std::is_unsigned<T>::value>::type>
-Status VerifyDimensions(const SizeConstraints* constraints, T xs, T ys) {
-  if (!constraints) return true;
-
-  if (xs == 0 || ys == 0) return JXL_FAILURE("Empty image.");
-  if (xs > constraints->dec_max_xsize) return JXL_FAILURE("Image too wide.");
-  if (ys > constraints->dec_max_ysize) return JXL_FAILURE("Image too tall.");
-
-  const uint64_t num_pixels = static_cast<uint64_t>(xs) * ys;
-  if (num_pixels > constraints->dec_max_pixels) {
-    return JXL_FAILURE("Image too big.");
-  }
-
-  return true;
-}
-
-using CodecIntervals = std::array<CodecInterval, 4>;  // RGB[A] or Y[A]
-
 // Optional text/EXIF metadata.
 struct Blobs {
   std::vector<uint8_t> exif;
@@ -92,7 +62,7 @@ class CodecInOut {
   // If c_current.IsGray(), all planes must be identical.
   void SetFromImage(Image3F&& color, const ColorEncoding& c_current) {
     Main().SetFromImage(std::move(color), c_current);
-    SetIntensityTarget(this);
+    SetIntensityTarget(&this->metadata.m);
     SetSize(Main().xsize(), Main().ysize());
   }
 
@@ -123,80 +93,6 @@ class CodecInOut {
     SetSize(xsize, ysize);
   }
 
-  // Calls TransformTo for each ImageBundle (preview/frames).
-  Status TransformTo(const ColorEncoding& c_desired, const JxlCmsInterface& cms,
-                     ThreadPool* pool = nullptr) {
-    if (metadata.m.have_preview) {
-      JXL_RETURN_IF_ERROR(preview_frame.TransformTo(c_desired, cms, pool));
-    }
-    for (ImageBundle& ib : frames) {
-      JXL_RETURN_IF_ERROR(ib.TransformTo(c_desired, cms, pool));
-    }
-    return true;
-  }
-  // Performs "PremultiplyAlpha" for each ImageBundle (preview/frames).
-  bool PremultiplyAlpha() {
-    const auto doPremultiplyAlpha = [](ImageBundle& bundle) {
-      if (!bundle.HasAlpha()) return;
-      if (!bundle.HasColor()) return;
-      auto* color = bundle.color();
-      const auto* alpha = bundle.alpha();
-      JXL_CHECK(color->ysize() == alpha->ysize());
-      JXL_CHECK(color->xsize() == alpha->xsize());
-      for (size_t y = 0; y < color->ysize(); y++) {
-        ::jxl::PremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y),
-                                color->PlaneRow(2, y), alpha->Row(y),
-                                color->xsize());
-      }
-    };
-    ExtraChannelInfo* eci = metadata.m.Find(ExtraChannel::kAlpha);
-    if (eci == nullptr || eci->alpha_associated) return false;
-    if (metadata.m.have_preview) {
-      doPremultiplyAlpha(preview_frame);
-    }
-    for (ImageBundle& ib : frames) {
-      doPremultiplyAlpha(ib);
-    }
-    eci->alpha_associated = true;
-    return true;
-  }
-
-  bool UnpremultiplyAlpha() {
-    const auto doUnpremultiplyAlpha = [](ImageBundle& bundle) {
-      if (!bundle.HasAlpha()) return;
-      if (!bundle.HasColor()) return;
-      auto* color = bundle.color();
-      const auto* alpha = bundle.alpha();
-      JXL_CHECK(color->ysize() == alpha->ysize());
-      JXL_CHECK(color->xsize() == alpha->xsize());
-      for (size_t y = 0; y < color->ysize(); y++) {
-        ::jxl::UnpremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y),
-                                  color->PlaneRow(2, y), alpha->Row(y),
-                                  color->xsize());
-      }
-    };
-    ExtraChannelInfo* eci = metadata.m.Find(ExtraChannel::kAlpha);
-    if (eci == nullptr || !eci->alpha_associated) return false;
-    if (metadata.m.have_preview) {
-      doUnpremultiplyAlpha(preview_frame);
-    }
-    for (ImageBundle& ib : frames) {
-      doUnpremultiplyAlpha(ib);
-    }
-    eci->alpha_associated = false;
-    return true;
-  }
-
-  // -- DECODER INPUT:
-
-  SizeConstraints constraints;
-
-  // -- DECODER OUTPUT:
-
-  // Total number of pixels decoded (may differ from #frames * xsize * ysize
-  // if frames are cropped)
-  uint64_t dec_pixels = 0;
-
   // -- DECODER OUTPUT, ENCODER INPUT:
 
   // Metadata stored into / retrieved from bitstreams.
diff --git a/lib/jxl/codec_y4m_testonly.cc b/lib/jxl/codec_y4m_testonly.cc
deleted file mode 100644 (file)
index dfcad9d..0000000
+++ /dev/null
@@ -1,202 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/codec_y4m_testonly.h"
-
-#include <stddef.h>
-
-namespace jxl {
-namespace test {
-
-struct HeaderY4M {
-  size_t xsize;
-  size_t ysize;
-  size_t bits_per_sample;
-  int is_yuv;  // Y4M: where 1 = 444, 2 = 422, 3 = 420
-};
-
-// Decode Y4M images.
-class Y4MParser {
- public:
-  explicit Y4MParser(const Span<const uint8_t> bytes)
-      : pos_(bytes.data()), end_(pos_ + bytes.size()) {}
-
-  // TODO(jon): support multi-frame y4m
-  Status ParseHeader(HeaderY4M* header, const uint8_t** pos) {
-    JXL_RETURN_IF_ERROR(ExpectString("YUV4MPEG2", 9));
-    header->is_yuv = 3;
-    // TODO(jon): check if 4:2:0 is indeed the default
-    header->bits_per_sample = 8;
-    // TODO(jon): check if there's a y4m convention for higher bit depths
-    while (pos_ < end_) {
-      char next = 0;
-      JXL_RETURN_IF_ERROR(ReadChar(&next));
-      if (next == 0x0A) break;
-      if (next != ' ') continue;
-      char field = 0;
-      JXL_RETURN_IF_ERROR(ReadChar(&field));
-      switch (field) {
-        case 'W':
-          JXL_RETURN_IF_ERROR(ParseUnsigned(&header->xsize));
-          break;
-        case 'H':
-          JXL_RETURN_IF_ERROR(ParseUnsigned(&header->ysize));
-          break;
-        case 'I':
-          JXL_RETURN_IF_ERROR(ReadChar(&next));
-          if (next != 'p') {
-            return JXL_FAILURE(
-                "Y4M: only progressive (no frame interlacing) allowed");
-          }
-          break;
-        case 'C': {
-          char c1 = 0;
-          JXL_RETURN_IF_ERROR(ReadChar(&c1));
-          char c2 = 0;
-          JXL_RETURN_IF_ERROR(ReadChar(&c2));
-          char c3 = 0;
-          JXL_RETURN_IF_ERROR(ReadChar(&c3));
-          if (c1 != '4') return JXL_FAILURE("Y4M: invalid C param");
-          if (c2 == '4') {
-            if (c3 != '4') return JXL_FAILURE("Y4M: invalid C param");
-            header->is_yuv = 1;  // 444
-          } else if (c2 == '2') {
-            if (c3 == '2') {
-              header->is_yuv = 2;  // 422
-            } else if (c3 == '0') {
-              header->is_yuv = 3;  // 420
-            } else {
-              return JXL_FAILURE("Y4M: invalid C param");
-            }
-          } else {
-            return JXL_FAILURE("Y4M: invalid C param");
-          }
-        }
-          [[fallthrough]];
-          // no break: fallthrough because this field can have values like
-          // "C420jpeg" (we are ignoring the chroma sample location and treat
-          // everything like C420jpeg)
-        case 'F':  // Framerate in fps as numerator:denominator
-                   // TODO(jon): actually read this and set corresponding jxl
-                   // metadata
-        case 'A':  // Pixel aspect ratio (ignoring it, could perhaps adjust
-                   // intrinsic dimensions based on this?)
-        case 'X':  // Comment, ignore
-          // ignore the field value and go to next one
-          while (pos_ < end_) {
-            if (pos_[0] == ' ' || pos_[0] == 0x0A) break;
-            pos_++;
-          }
-          break;
-        default:
-          return JXL_FAILURE("Y4M: parse error");
-      }
-    }
-    JXL_RETURN_IF_ERROR(ExpectString("FRAME", 5));
-    while (true) {
-      char next = 0;
-      JXL_RETURN_IF_ERROR(ReadChar(&next));
-      if (next == 0x0A) {
-        *pos = pos_;
-        return true;
-      }
-    }
-  }
-
- private:
-  Status ExpectString(const char* str, size_t len) {
-    // Unlikely to happen.
-    if (pos_ + len < pos_) return JXL_FAILURE("Y4M: overflow");
-
-    if (pos_ + len > end_ || strncmp(str, (const char*)pos_, len) != 0) {
-      return JXL_FAILURE("Y4M: expected %s", str);
-    }
-    pos_ += len;
-    return true;
-  }
-
-  Status ReadChar(char* out) {
-    // Unlikely to happen.
-    if (pos_ + 1 < pos_) return JXL_FAILURE("Y4M: overflow");
-
-    if (pos_ >= end_) {
-      return JXL_FAILURE("Y4M: unexpected end of input");
-    }
-    *out = *pos_;
-    pos_++;
-    return true;
-  }
-
-  static bool IsDigit(const uint8_t c) { return '0' <= c && c <= '9'; }
-
-  Status ParseUnsigned(size_t* number) {
-    if (pos_ == end_) return JXL_FAILURE("PNM: reached end before number");
-    if (!IsDigit(*pos_)) return JXL_FAILURE("PNM: expected unsigned number");
-
-    *number = 0;
-    while (pos_ < end_ && *pos_ >= '0' && *pos_ <= '9') {
-      *number *= 10;
-      *number += *pos_ - '0';
-      ++pos_;
-    }
-
-    return true;
-  }
-
-  const uint8_t* pos_;
-  const uint8_t* const end_;
-};
-
-Status DecodeImageY4M(const Span<const uint8_t> bytes, CodecInOut* io) {
-  Y4MParser parser(bytes);
-  HeaderY4M header = {};
-  const uint8_t* pos = nullptr;
-  JXL_RETURN_IF_ERROR(parser.ParseHeader(&header, &pos));
-
-  Image3F yuvdata(header.xsize, header.ysize);
-  ImageBundle bundle(&io->metadata.m);
-  const int hshift[3][3] = {{0, 0, 0}, {0, 1, 1}, {0, 1, 1}};
-  const int vshift[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 1, 1}};
-
-  for (size_t c = 0; c < 3; c++) {
-    for (size_t y = 0; y < header.ysize >> vshift[header.is_yuv - 1][c]; ++y) {
-      float* const JXL_RESTRICT row = yuvdata.PlaneRow((c == 2 ? 2 : 1 - c), y);
-      if (pos + (header.xsize >> hshift[header.is_yuv - 1][c]) >
-          bytes.data() + bytes.size())
-        return JXL_FAILURE("Not enough image data");
-      for (size_t x = 0; x < header.xsize >> hshift[header.is_yuv - 1][c];
-           ++x) {
-        row[x] = (1.f / 255.f) * ((*pos++) - 128.f);
-      }
-    }
-  }
-  bundle.SetFromImage(std::move(yuvdata), io->metadata.m.color_encoding);
-  bundle.color_transform = ColorTransform::kYCbCr;
-
-  YCbCrChromaSubsampling subsampling;
-  uint8_t cssh[3] = {
-      2, static_cast<uint8_t>(hshift[header.is_yuv - 1][1] ? 1 : 2),
-      static_cast<uint8_t>(hshift[header.is_yuv - 1][2] ? 1 : 2)};
-  uint8_t cssv[3] = {
-      2, static_cast<uint8_t>(vshift[header.is_yuv - 1][1] ? 1 : 2),
-      static_cast<uint8_t>(vshift[header.is_yuv - 1][2] ? 1 : 2)};
-
-  JXL_RETURN_IF_ERROR(subsampling.Set(cssh, cssv));
-  bundle.chroma_subsampling = subsampling;
-  io->Main() = std::move(bundle);
-
-  JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetSRGB(ColorSpace::kRGB));
-  io->metadata.m.SetUintSamples(header.bits_per_sample);
-  io->metadata.m.SetAlphaBits(0);
-  io->dec_pixels = header.xsize * header.ysize;
-
-  io->metadata.m.bit_depth.bits_per_sample = io->Main().DetectRealBitdepth();
-  io->SetSize(header.xsize, header.ysize);
-  SetIntensityTarget(io);
-  return true;
-}
-
-}  // namespace test
-}  // namespace jxl
diff --git a/lib/jxl/codec_y4m_testonly.h b/lib/jxl/codec_y4m_testonly.h
deleted file mode 100644 (file)
index f65759d..0000000
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <stdint.h>
-
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
-
-namespace jxl {
-namespace test {
-
-Status DecodeImageY4M(const Span<const uint8_t> bytes, CodecInOut* io);
-
-}  // namespace test
-}  // namespace jxl
index 399febb..73cb7ec 100644 (file)
 #include <vector>
 
 #include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/dec_ans.h"
@@ -82,7 +78,6 @@ Status DecodeCoeffOrder(AcStrategy acs, coeff_order_t* order, BitReader* br,
                         ANSSymbolReader* reader,
                         std::vector<coeff_order_t>& natural_order,
                         const std::vector<uint8_t>& context_map) {
-  PROFILER_FUNC;
   const size_t llf = acs.covered_blocks_x() * acs.covered_blocks_y();
   const size_t size = kDCTBlockSize * llf;
 
index 5061851..75f6f99 100644 (file)
 #include <stdint.h>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_util.h"
-#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/frame_dimensions.h"
 
 namespace jxl {
 
+class BitReader;
+
 // Those offsets get multiplied by kDCTBlockSize.
 static constexpr size_t kCoeffOrderOffset[] = {
     0,    1,    2,    3,    4,    5,    6,    10,   14,   18,
index 700e9a8..2630657 100644 (file)
@@ -11,7 +11,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include "base/compiler_specific.h"
+#include "lib/jxl/base/compiler_specific.h"
 
 namespace jxl {
 
index 810c725..a88dcfa 100644 (file)
@@ -5,20 +5,18 @@
 
 #include "lib/jxl/coeff_order.h"
 
-#include <stdio.h>
-
 #include <algorithm>
 #include <numeric>  // iota
 #include <utility>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/enc_coeff_order.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
index a2eca44..19273da 100644 (file)
 
 #include "lib/jxl/color_encoding_internal.h"
 
-#include <errno.h>
-
 #include <array>
-#include <cmath>
 
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/cms/color_encoding_cms.h"
+#include "lib/jxl/cms/jxl_cms_internal.h"
 #include "lib/jxl/fields.h"
-#include "lib/jxl/linalg.h"
+#include "lib/jxl/pack_signed.h"
 
 namespace jxl {
-namespace {
-
-// Highest reasonable value for the gamma of a transfer curve.
-constexpr uint32_t kMaxGamma = 8192;
-
-// These strings are baked into Description - do not change.
-
-std::string ToString(ColorSpace color_space) {
-  switch (color_space) {
-    case ColorSpace::kRGB:
-      return "RGB";
-    case ColorSpace::kGray:
-      return "Gra";
-    case ColorSpace::kXYB:
-      return "XYB";
-    case ColorSpace::kUnknown:
-      return "CS?";
-  }
-  // Should not happen - visitor fails if enum is invalid.
-  JXL_ABORT("Invalid ColorSpace %u", static_cast<uint32_t>(color_space));
-}
-
-std::string ToString(WhitePoint white_point) {
-  switch (white_point) {
-    case WhitePoint::kD65:
-      return "D65";
-    case WhitePoint::kCustom:
-      return "Cst";
-    case WhitePoint::kE:
-      return "EER";
-    case WhitePoint::kDCI:
-      return "DCI";
-  }
-  // Should not happen - visitor fails if enum is invalid.
-  JXL_ABORT("Invalid WhitePoint %u", static_cast<uint32_t>(white_point));
-}
-
-std::string ToString(Primaries primaries) {
-  switch (primaries) {
-    case Primaries::kSRGB:
-      return "SRG";
-    case Primaries::k2100:
-      return "202";
-    case Primaries::kP3:
-      return "DCI";
-    case Primaries::kCustom:
-      return "Cst";
-  }
-  // Should not happen - visitor fails if enum is invalid.
-  JXL_ABORT("Invalid Primaries %u", static_cast<uint32_t>(primaries));
-}
-
-std::string ToString(TransferFunction transfer_function) {
-  switch (transfer_function) {
-    case TransferFunction::kSRGB:
-      return "SRG";
-    case TransferFunction::kLinear:
-      return "Lin";
-    case TransferFunction::k709:
-      return "709";
-    case TransferFunction::kPQ:
-      return "PeQ";
-    case TransferFunction::kHLG:
-      return "HLG";
-    case TransferFunction::kDCI:
-      return "DCI";
-    case TransferFunction::kUnknown:
-      return "TF?";
-  }
-  // Should not happen - visitor fails if enum is invalid.
-  JXL_ABORT("Invalid TransferFunction %u",
-            static_cast<uint32_t>(transfer_function));
-}
-
-std::string ToString(RenderingIntent rendering_intent) {
-  switch (rendering_intent) {
-    case RenderingIntent::kPerceptual:
-      return "Per";
-    case RenderingIntent::kRelative:
-      return "Rel";
-    case RenderingIntent::kSaturation:
-      return "Sat";
-    case RenderingIntent::kAbsolute:
-      return "Abs";
-  }
-  // Should not happen - visitor fails if enum is invalid.
-  JXL_ABORT("Invalid RenderingIntent %u",
-            static_cast<uint32_t>(rendering_intent));
-}
-
-static double F64FromCustomxyI32(const int32_t i) { return i * 1E-6; }
-static Status F64ToCustomxyI32(const double f, int32_t* JXL_RESTRICT i) {
-  if (!(-4 <= f && f <= 4)) {
-    return JXL_FAILURE("F64 out of bounds for CustomxyI32");
-  }
-  *i = static_cast<int32_t>(roundf(f * 1E6));
-  return true;
-}
-
-Status ConvertExternalToInternalWhitePoint(const JxlWhitePoint external,
-                                           WhitePoint* internal) {
-  switch (external) {
-    case JXL_WHITE_POINT_D65:
-      *internal = WhitePoint::kD65;
-      return true;
-    case JXL_WHITE_POINT_CUSTOM:
-      *internal = WhitePoint::kCustom;
-      return true;
-    case JXL_WHITE_POINT_E:
-      *internal = WhitePoint::kE;
-      return true;
-    case JXL_WHITE_POINT_DCI:
-      *internal = WhitePoint::kDCI;
-      return true;
-  }
-  return JXL_FAILURE("Invalid WhitePoint enum value");
-}
-
-Status ConvertExternalToInternalPrimaries(const JxlPrimaries external,
-                                          Primaries* internal) {
-  switch (external) {
-    case JXL_PRIMARIES_SRGB:
-      *internal = Primaries::kSRGB;
-      return true;
-    case JXL_PRIMARIES_CUSTOM:
-      *internal = Primaries::kCustom;
-      return true;
-    case JXL_PRIMARIES_2100:
-      *internal = Primaries::k2100;
-      return true;
-    case JXL_PRIMARIES_P3:
-      *internal = Primaries::kP3;
-      return true;
-  }
-  return JXL_FAILURE("Invalid Primaries enum value");
-}
-
-Status ConvertExternalToInternalTransferFunction(
-    const JxlTransferFunction external, TransferFunction* internal) {
-  switch (external) {
-    case JXL_TRANSFER_FUNCTION_709:
-      *internal = TransferFunction::k709;
-      return true;
-    case JXL_TRANSFER_FUNCTION_UNKNOWN:
-      *internal = TransferFunction::kUnknown;
-      return true;
-    case JXL_TRANSFER_FUNCTION_LINEAR:
-      *internal = TransferFunction::kLinear;
-      return true;
-    case JXL_TRANSFER_FUNCTION_SRGB:
-      *internal = TransferFunction::kSRGB;
-      return true;
-    case JXL_TRANSFER_FUNCTION_PQ:
-      *internal = TransferFunction::kPQ;
-      return true;
-    case JXL_TRANSFER_FUNCTION_DCI:
-      *internal = TransferFunction::kDCI;
-      return true;
-    case JXL_TRANSFER_FUNCTION_HLG:
-      *internal = TransferFunction::kHLG;
-      return true;
-    case JXL_TRANSFER_FUNCTION_GAMMA:
-      return JXL_FAILURE("Gamma should be handled separately");
-  }
-  return JXL_FAILURE("Invalid TransferFunction enum value");
-}
-
-Status ConvertExternalToInternalRenderingIntent(
-    const JxlRenderingIntent external, RenderingIntent* internal) {
-  switch (external) {
-    case JXL_RENDERING_INTENT_PERCEPTUAL:
-      *internal = RenderingIntent::kPerceptual;
-      return true;
-    case JXL_RENDERING_INTENT_RELATIVE:
-      *internal = RenderingIntent::kRelative;
-      return true;
-    case JXL_RENDERING_INTENT_SATURATION:
-      *internal = RenderingIntent::kSaturation;
-      return true;
-    case JXL_RENDERING_INTENT_ABSOLUTE:
-      *internal = RenderingIntent::kAbsolute;
-      return true;
-  }
-  return JXL_FAILURE("Invalid RenderingIntent enum value");
-}
-
-}  // namespace
-
-CIExy Customxy::Get() const {
-  CIExy xy;
-  xy.x = F64FromCustomxyI32(x);
-  xy.y = F64FromCustomxyI32(y);
-  return xy;
-}
-
-Status Customxy::Set(const CIExy& xy) {
-  JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.x, &x));
-  JXL_RETURN_IF_ERROR(F64ToCustomxyI32(xy.y, &y));
-  size_t extension_bits, total_bits;
-  if (!Bundle::CanEncode(*this, &extension_bits, &total_bits)) {
-    return JXL_FAILURE("Unable to encode XY %f %f", xy.x, xy.y);
-  }
-  return true;
-}
 
 bool CustomTransferFunction::SetImplicit() {
   if (nonserialized_color_space == ColorSpace::kXYB) {
-    if (!SetGamma(1.0 / 3)) JXL_ASSERT(false);
+    if (!storage_.SetGamma(1.0 / 3)) JXL_ASSERT(false);
     return true;
   }
   return false;
 }
 
-Status CustomTransferFunction::SetGamma(double gamma) {
-  if (gamma < (1.0f / kMaxGamma) || gamma > 1.0) {
-    return JXL_FAILURE("Invalid gamma %f", gamma);
-  }
-
-  have_gamma_ = false;
-  if (ApproxEq(gamma, 1.0)) {
-    transfer_function_ = TransferFunction::kLinear;
-    return true;
-  }
-  if (ApproxEq(gamma, 1.0 / 2.6)) {
-    transfer_function_ = TransferFunction::kDCI;
-    return true;
-  }
-  // Don't translate 0.45.. to kSRGB nor k709 - that might change pixel
-  // values because those curves also have a linear part.
-
-  have_gamma_ = true;
-  gamma_ = roundf(gamma * kGammaMul);
-  transfer_function_ = TransferFunction::kUnknown;
-  return true;
-}
-
-namespace {
-
-std::array<ColorEncoding, 2> CreateC2(const Primaries pr,
-                                      const TransferFunction tf) {
+std::array<ColorEncoding, 2> ColorEncoding::CreateC2(Primaries pr,
+                                                     TransferFunction tf) {
   std::array<ColorEncoding, 2> c2;
 
-  {
-    ColorEncoding* c_rgb = c2.data() + 0;
-    c_rgb->SetColorSpace(ColorSpace::kRGB);
-    c_rgb->white_point = WhitePoint::kD65;
-    c_rgb->primaries = pr;
-    c_rgb->tf.SetTransferFunction(tf);
-    JXL_CHECK(c_rgb->CreateICC());
-  }
+  ColorEncoding* c_rgb = c2.data() + 0;
+  c_rgb->SetColorSpace(ColorSpace::kRGB);
+  c_rgb->storage_.white_point = WhitePoint::kD65;
+  c_rgb->storage_.primaries = pr;
+  c_rgb->storage_.tf.SetTransferFunction(tf);
+  JXL_CHECK(c_rgb->CreateICC());
 
-  {
-    ColorEncoding* c_gray = c2.data() + 1;
-    c_gray->SetColorSpace(ColorSpace::kGray);
-    c_gray->white_point = WhitePoint::kD65;
-    c_gray->primaries = pr;
-    c_gray->tf.SetTransferFunction(tf);
-    JXL_CHECK(c_gray->CreateICC());
-  }
+  ColorEncoding* c_gray = c2.data() + 1;
+  c_gray->SetColorSpace(ColorSpace::kGray);
+  c_gray->storage_.white_point = WhitePoint::kD65;
+  c_gray->storage_.primaries = pr;
+  c_gray->storage_.tf.SetTransferFunction(tf);
+  JXL_CHECK(c_gray->CreateICC());
 
   return c2;
 }
 
-}  // namespace
-
 const ColorEncoding& ColorEncoding::SRGB(bool is_gray) {
   static std::array<ColorEncoding, 2> c2 =
       CreateC2(Primaries::kSRGB, TransferFunction::kSRGB);
@@ -292,219 +55,73 @@ const ColorEncoding& ColorEncoding::LinearSRGB(bool is_gray) {
   return c2[is_gray];
 }
 
-CIExy ColorEncoding::GetWhitePoint() const {
-  JXL_DASSERT(have_fields_);
-  CIExy xy;
-  switch (white_point) {
-    case WhitePoint::kCustom:
-      return white_.Get();
-
-    case WhitePoint::kD65:
-      xy.x = 0.3127;
-      xy.y = 0.3290;
-      return xy;
-
-    case WhitePoint::kDCI:
-      // From https://ieeexplore.ieee.org/document/7290729 C.2 page 11
-      xy.x = 0.314;
-      xy.y = 0.351;
-      return xy;
-
-    case WhitePoint::kE:
-      xy.x = xy.y = 1.0 / 3;
-      return xy;
-  }
-  JXL_ABORT("Invalid WhitePoint %u", static_cast<uint32_t>(white_point));
-}
-
-Status ColorEncoding::SetWhitePoint(const CIExy& xy) {
-  JXL_DASSERT(have_fields_);
-  if (xy.x == 0.0 || xy.y == 0.0) {
-    return JXL_FAILURE("Invalid white point %f %f", xy.x, xy.y);
-  }
-  if (ApproxEq(xy.x, 0.3127) && ApproxEq(xy.y, 0.3290)) {
-    white_point = WhitePoint::kD65;
-    return true;
-  }
-  if (ApproxEq(xy.x, 1.0 / 3) && ApproxEq(xy.y, 1.0 / 3)) {
-    white_point = WhitePoint::kE;
-    return true;
-  }
-  if (ApproxEq(xy.x, 0.314) && ApproxEq(xy.y, 0.351)) {
-    white_point = WhitePoint::kDCI;
-    return true;
-  }
-  white_point = WhitePoint::kCustom;
-  return white_.Set(xy);
-}
-
-PrimariesCIExy ColorEncoding::GetPrimaries() const {
-  JXL_DASSERT(have_fields_);
-  JXL_ASSERT(HasPrimaries());
-  PrimariesCIExy xy;
-  switch (primaries) {
-    case Primaries::kCustom:
-      xy.r = red_.Get();
-      xy.g = green_.Get();
-      xy.b = blue_.Get();
-      return xy;
-
-    case Primaries::kSRGB:
-      xy.r.x = 0.639998686;
-      xy.r.y = 0.330010138;
-      xy.g.x = 0.300003784;
-      xy.g.y = 0.600003357;
-      xy.b.x = 0.150002046;
-      xy.b.y = 0.059997204;
-      return xy;
-
-    case Primaries::k2100:
-      xy.r.x = 0.708;
-      xy.r.y = 0.292;
-      xy.g.x = 0.170;
-      xy.g.y = 0.797;
-      xy.b.x = 0.131;
-      xy.b.y = 0.046;
-      return xy;
-
-    case Primaries::kP3:
-      xy.r.x = 0.680;
-      xy.r.y = 0.320;
-      xy.g.x = 0.265;
-      xy.g.y = 0.690;
-      xy.b.x = 0.150;
-      xy.b.y = 0.060;
-      return xy;
-  }
-  JXL_ABORT("Invalid Primaries %u", static_cast<uint32_t>(primaries));
-}
-
-Status ColorEncoding::SetPrimaries(const PrimariesCIExy& xy) {
-  JXL_DASSERT(have_fields_);
-  JXL_ASSERT(HasPrimaries());
-  if (xy.r.x == 0.0 || xy.r.y == 0.0 || xy.g.x == 0.0 || xy.g.y == 0.0 ||
-      xy.b.x == 0.0 || xy.b.y == 0.0) {
-    return JXL_FAILURE("Invalid primaries %f %f %f %f %f %f", xy.r.x, xy.r.y,
-                       xy.g.x, xy.g.y, xy.b.x, xy.b.y);
-  }
-
-  if (ApproxEq(xy.r.x, 0.64) && ApproxEq(xy.r.y, 0.33) &&
-      ApproxEq(xy.g.x, 0.30) && ApproxEq(xy.g.y, 0.60) &&
-      ApproxEq(xy.b.x, 0.15) && ApproxEq(xy.b.y, 0.06)) {
-    primaries = Primaries::kSRGB;
-    return true;
-  }
-
-  if (ApproxEq(xy.r.x, 0.708) && ApproxEq(xy.r.y, 0.292) &&
-      ApproxEq(xy.g.x, 0.170) && ApproxEq(xy.g.y, 0.797) &&
-      ApproxEq(xy.b.x, 0.131) && ApproxEq(xy.b.y, 0.046)) {
-    primaries = Primaries::k2100;
-    return true;
-  }
-  if (ApproxEq(xy.r.x, 0.680) && ApproxEq(xy.r.y, 0.320) &&
-      ApproxEq(xy.g.x, 0.265) && ApproxEq(xy.g.y, 0.690) &&
-      ApproxEq(xy.b.x, 0.150) && ApproxEq(xy.b.y, 0.060)) {
-    primaries = Primaries::kP3;
-    return true;
-  }
-
-  primaries = Primaries::kCustom;
-  JXL_RETURN_IF_ERROR(red_.Set(xy.r));
-  JXL_RETURN_IF_ERROR(green_.Set(xy.g));
-  JXL_RETURN_IF_ERROR(blue_.Set(xy.b));
+Status ColorEncoding::SetWhitePointType(const WhitePoint& wp) {
+  JXL_DASSERT(storage_.have_fields);
+  storage_.white_point = wp;
   return true;
 }
 
-Status ColorEncoding::CreateICC() {
-  InternalRemoveICC();
-  if (!MaybeCreateProfile(*this, &icc_)) {
-    return JXL_FAILURE("Failed to create profile from fields");
-  }
+Status ColorEncoding::SetPrimariesType(const Primaries& p) {
+  JXL_DASSERT(storage_.have_fields);
+  JXL_ASSERT(HasPrimaries());
+  storage_.primaries = p;
   return true;
 }
 
-std::string Description(const ColorEncoding& c_in) {
-  // Copy required for Implicit*
-  ColorEncoding c = c_in;
-
-  std::string d = ToString(c.GetColorSpace());
-
-  if (!c.ImplicitWhitePoint()) {
-    d += '_';
-    if (c.white_point == WhitePoint::kCustom) {
-      const CIExy wp = c.GetWhitePoint();
-      d += ToString(wp.x) + ';';
-      d += ToString(wp.y);
-    } else {
-      d += ToString(c.white_point);
-    }
-  }
+void ColorEncoding::DecideIfWantICC(const JxlCmsInterface& cms) {
+  if (storage_.icc.empty()) return;
 
-  if (c.HasPrimaries()) {
-    d += '_';
-    if (c.primaries == Primaries::kCustom) {
-      const PrimariesCIExy pr = c.GetPrimaries();
-      d += ToString(pr.r.x) + ';';
-      d += ToString(pr.r.y) + ';';
-      d += ToString(pr.g.x) + ';';
-      d += ToString(pr.g.y) + ';';
-      d += ToString(pr.b.x) + ';';
-      d += ToString(pr.b.y);
-    } else {
-      d += ToString(c.primaries);
-    }
+  JxlColorEncoding c;
+  JXL_BOOL cmyk;
+  if (!cms.set_fields_from_icc(cms.set_fields_data, storage_.icc.data(),
+                               storage_.icc.size(), &c, &cmyk)) {
+    return;
   }
+  if (cmyk) return;
 
-  d += '_';
-  d += ToString(c.rendering_intent);
-
-  if (!c.tf.SetImplicit()) {
-    d += '_';
-    if (c.tf.IsGamma()) {
-      d += 'g';
-      d += ToString(c.tf.GetGamma());
-    } else {
-      d += ToString(c.tf.GetTransferFunction());
-    }
-  }
+  std::vector<uint8_t> icc;
+  if (!MaybeCreateProfile(c, &icc)) return;
 
-  return d;
+  want_icc_ = false;
 }
 
 Customxy::Customxy() { Bundle::Init(this); }
 Status Customxy::VisitFields(Visitor* JXL_RESTRICT visitor) {
-  uint32_t ux = PackSigned(x);
+  uint32_t ux = PackSigned(storage_.x);
   JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288),
                                          BitsOffset(20, 1048576),
                                          BitsOffset(21, 2097152), 0, &ux));
-  x = UnpackSigned(ux);
-  uint32_t uy = PackSigned(y);
+  storage_.x = UnpackSigned(ux);
+  uint32_t uy = PackSigned(storage_.y);
   JXL_QUIET_RETURN_IF_ERROR(visitor->U32(Bits(19), BitsOffset(19, 524288),
                                          BitsOffset(20, 1048576),
                                          BitsOffset(21, 2097152), 0, &uy));
-  y = UnpackSigned(uy);
+  storage_.y = UnpackSigned(uy);
   return true;
 }
 
 CustomTransferFunction::CustomTransferFunction() { Bundle::Init(this); }
 Status CustomTransferFunction::VisitFields(Visitor* JXL_RESTRICT visitor) {
   if (visitor->Conditional(!SetImplicit())) {
-    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &have_gamma_));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(false, &storage_.have_gamma));
 
-    if (visitor->Conditional(have_gamma_)) {
+    if (visitor->Conditional(storage_.have_gamma)) {
       // Gamma is represented as a 24-bit int, the exponent used is
       // gamma_ / 1e7. Valid values are (0, 1]. On the low end side, we also
       // limit it to kMaxGamma/1e7.
-      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(24, kGammaMul, &gamma_));
-      if (gamma_ > kGammaMul ||
-          static_cast<uint64_t>(gamma_) * kMaxGamma < kGammaMul) {
-        return JXL_FAILURE("Invalid gamma %u", gamma_);
+      JXL_QUIET_RETURN_IF_ERROR(visitor->Bits(
+          24, ::jxl::cms::CustomTransferFunction::kGammaMul, &storage_.gamma));
+      if (storage_.gamma > ::jxl::cms::CustomTransferFunction::kGammaMul ||
+          static_cast<uint64_t>(storage_.gamma) *
+                  ::jxl::cms::CustomTransferFunction::kMaxGamma <
+              ::jxl::cms::CustomTransferFunction::kGammaMul) {
+        return JXL_FAILURE("Invalid gamma %u", storage_.gamma);
       }
     }
 
-    if (visitor->Conditional(!have_gamma_)) {
+    if (visitor->Conditional(!storage_.have_gamma)) {
       JXL_QUIET_RETURN_IF_ERROR(
-          visitor->Enum(TransferFunction::kSRGB, &transfer_function_));
+          visitor->Enum(TransferFunction::kSRGB, &storage_.transfer_function));
     }
   }
 
@@ -523,41 +140,57 @@ Status ColorEncoding::VisitFields(Visitor* JXL_RESTRICT visitor) {
 
   // Always send even if want_icc_ because this affects decoding.
   // We can skip the white point/primaries because they do not.
-  JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(ColorSpace::kRGB, &color_space_));
+  JXL_QUIET_RETURN_IF_ERROR(
+      visitor->Enum(ColorSpace::kRGB, &storage_.color_space));
 
   if (visitor->Conditional(!WantICC())) {
     // Serialize enums. NOTE: we set the defaults to the most common values so
     // ImageMetadata.all_default is true in the common case.
 
     if (visitor->Conditional(!ImplicitWhitePoint())) {
-      JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(WhitePoint::kD65, &white_point));
-      if (visitor->Conditional(white_point == WhitePoint::kCustom)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Enum(WhitePoint::kD65, &storage_.white_point));
+      if (visitor->Conditional(storage_.white_point == WhitePoint::kCustom)) {
+        white_.storage_ = storage_.white;
         JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&white_));
+        storage_.white = white_.storage_;
       }
     }
 
     if (visitor->Conditional(HasPrimaries())) {
-      JXL_QUIET_RETURN_IF_ERROR(visitor->Enum(Primaries::kSRGB, &primaries));
-      if (visitor->Conditional(primaries == Primaries::kCustom)) {
+      JXL_QUIET_RETURN_IF_ERROR(
+          visitor->Enum(Primaries::kSRGB, &storage_.primaries));
+      if (visitor->Conditional(storage_.primaries == Primaries::kCustom)) {
+        red_.storage_ = storage_.red;
         JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&red_));
+        storage_.red = red_.storage_;
+        green_.storage_ = storage_.green;
         JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&green_));
+        storage_.green = green_.storage_;
+        blue_.storage_ = storage_.blue;
         JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&blue_));
+        storage_.blue = blue_.storage_;
       }
     }
 
-    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tf));
+    tf_.nonserialized_color_space = storage_.color_space;
+    tf_.storage_ = storage_.tf;
+    JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&tf_));
+    storage_.tf = tf_.storage_;
 
     JXL_QUIET_RETURN_IF_ERROR(
-        visitor->Enum(RenderingIntent::kRelative, &rendering_intent));
+        visitor->Enum(RenderingIntent::kRelative, &storage_.rendering_intent));
 
     // We didn't have ICC, so all fields should be known.
-    if (color_space_ == ColorSpace::kUnknown || tf.IsUnknown()) {
+    if (storage_.color_space == ColorSpace::kUnknown ||
+        storage_.tf.IsUnknown()) {
       return JXL_FAILURE(
           "No ICC but cs %u and tf %u%s",
-          static_cast<unsigned int>(color_space_),
-          tf.IsGamma() ? 0
-                       : static_cast<unsigned int>(tf.GetTransferFunction()),
-          tf.IsGamma() ? "(gamma)" : "");
+          static_cast<unsigned int>(storage_.color_space),
+          storage_.tf.have_gamma
+              ? 0
+              : static_cast<unsigned int>(storage_.tf.transfer_function),
+          storage_.tf.have_gamma ? "(gamma)" : "");
     }
 
     JXL_RETURN_IF_ERROR(CreateICC());
@@ -572,181 +205,4 @@ Status ColorEncoding::VisitFields(Visitor* JXL_RESTRICT visitor) {
   return true;
 }
 
-void ConvertInternalToExternalColorEncoding(const ColorEncoding& internal,
-                                            JxlColorEncoding* external) {
-  external->color_space = static_cast<JxlColorSpace>(internal.GetColorSpace());
-
-  external->white_point = static_cast<JxlWhitePoint>(internal.white_point);
-
-  jxl::CIExy whitepoint = internal.GetWhitePoint();
-  external->white_point_xy[0] = whitepoint.x;
-  external->white_point_xy[1] = whitepoint.y;
-
-  if (external->color_space == JXL_COLOR_SPACE_RGB ||
-      external->color_space == JXL_COLOR_SPACE_UNKNOWN) {
-    external->primaries = static_cast<JxlPrimaries>(internal.primaries);
-    jxl::PrimariesCIExy primaries = internal.GetPrimaries();
-    external->primaries_red_xy[0] = primaries.r.x;
-    external->primaries_red_xy[1] = primaries.r.y;
-    external->primaries_green_xy[0] = primaries.g.x;
-    external->primaries_green_xy[1] = primaries.g.y;
-    external->primaries_blue_xy[0] = primaries.b.x;
-    external->primaries_blue_xy[1] = primaries.b.y;
-  }
-
-  if (internal.tf.IsGamma()) {
-    external->transfer_function = JXL_TRANSFER_FUNCTION_GAMMA;
-    external->gamma = internal.tf.GetGamma();
-  } else {
-    external->transfer_function =
-        static_cast<JxlTransferFunction>(internal.tf.GetTransferFunction());
-    external->gamma = 0;
-  }
-
-  external->rendering_intent =
-      static_cast<JxlRenderingIntent>(internal.rendering_intent);
-}
-
-Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external,
-                                              ColorEncoding* internal) {
-  internal->SetColorSpace(static_cast<ColorSpace>(external.color_space));
-
-  JXL_RETURN_IF_ERROR(ConvertExternalToInternalWhitePoint(
-      external.white_point, &internal->white_point));
-  if (external.white_point == JXL_WHITE_POINT_CUSTOM) {
-    CIExy wp;
-    wp.x = external.white_point_xy[0];
-    wp.y = external.white_point_xy[1];
-    JXL_RETURN_IF_ERROR(internal->SetWhitePoint(wp));
-  }
-
-  if (external.color_space == JXL_COLOR_SPACE_RGB ||
-      external.color_space == JXL_COLOR_SPACE_UNKNOWN) {
-    JXL_RETURN_IF_ERROR(ConvertExternalToInternalPrimaries(
-        external.primaries, &internal->primaries));
-    if (external.primaries == JXL_PRIMARIES_CUSTOM) {
-      PrimariesCIExy primaries;
-      primaries.r.x = external.primaries_red_xy[0];
-      primaries.r.y = external.primaries_red_xy[1];
-      primaries.g.x = external.primaries_green_xy[0];
-      primaries.g.y = external.primaries_green_xy[1];
-      primaries.b.x = external.primaries_blue_xy[0];
-      primaries.b.y = external.primaries_blue_xy[1];
-      JXL_RETURN_IF_ERROR(internal->SetPrimaries(primaries));
-    }
-  }
-  CustomTransferFunction tf;
-  if (external.transfer_function == JXL_TRANSFER_FUNCTION_GAMMA) {
-    JXL_RETURN_IF_ERROR(tf.SetGamma(external.gamma));
-  } else {
-    TransferFunction tf_enum;
-    // JXL_TRANSFER_FUNCTION_GAMMA is not handled by this function since there's
-    // no internal enum value for it.
-    JXL_RETURN_IF_ERROR(ConvertExternalToInternalTransferFunction(
-        external.transfer_function, &tf_enum));
-    tf.SetTransferFunction(tf_enum);
-  }
-  internal->tf = tf;
-
-  JXL_RETURN_IF_ERROR(ConvertExternalToInternalRenderingIntent(
-      external.rendering_intent, &internal->rendering_intent));
-
-  // The ColorEncoding caches an ICC profile it created earlier that may no
-  // longer match the profile with the changed fields, so re-create it.
-  if (!(internal->CreateICC())) {
-    // This is not an error: for example, it doesn't have ICC profile creation
-    // implemented for XYB. This should not be returned as error, since
-    // ConvertExternalToInternalColorEncoding still worked correctly, and what
-    // matters is that internal->ICC() will not return the wrong profile.
-  }
-
-  return true;
-}
-
-/* Chromatic adaptation matrices*/
-static const float kBradford[9] = {
-    0.8951f, 0.2664f, -0.1614f, -0.7502f, 1.7135f,
-    0.0367f, 0.0389f, -0.0685f, 1.0296f,
-};
-
-static const float kBradfordInv[9] = {
-    0.9869929f, -0.1470543f, 0.1599627f, 0.4323053f, 0.5183603f,
-    0.0492912f, -0.0085287f, 0.0400428f, 0.9684867f,
-};
-
-// Adapts whitepoint x, y to D50
-Status AdaptToXYZD50(float wx, float wy, float matrix[9]) {
-  if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) {
-    // Out of range values can cause division through zero
-    // further down with the bradford adaptation too.
-    return JXL_FAILURE("Invalid white point");
-  }
-  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
-  // 1 / tiny float can still overflow
-  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
-  float w50[3] = {0.96422f, 1.0f, 0.82521f};
-
-  float lms[3];
-  float lms50[3];
-
-  MatMul(kBradford, w, 3, 3, 1, lms);
-  MatMul(kBradford, w50, 3, 3, 1, lms50);
-
-  if (lms[0] == 0 || lms[1] == 0 || lms[2] == 0) {
-    return JXL_FAILURE("Invalid white point");
-  }
-  float a[9] = {
-      //       /----> 0, 1, 2, 3,          /----> 4, 5, 6, 7,          /----> 8,
-      lms50[0] / lms[0], 0, 0, 0, lms50[1] / lms[1], 0, 0, 0, lms50[2] / lms[2],
-  };
-  if (!std::isfinite(a[0]) || !std::isfinite(a[4]) || !std::isfinite(a[8])) {
-    return JXL_FAILURE("Invalid white point");
-  }
-
-  float b[9];
-  MatMul(a, kBradford, 3, 3, 3, b);
-  MatMul(kBradfordInv, b, 3, 3, 3, matrix);
-
-  return true;
-}
-
-Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx,
-                      float by, float wx, float wy, float matrix[9]) {
-  if (wx < 0 || wx > 1 || wy <= 0 || wy > 1) {
-    return JXL_FAILURE("Invalid white point");
-  }
-  // TODO(lode): also require rx, ry, gx, gy, bx, to be in range 0-1? ICC
-  // profiles in theory forbid negative XYZ values, but in practice the ACES P0
-  // color space uses a negative y for the blue primary.
-  float primaries[9] = {
-      rx, gx, bx, ry, gy, by, 1.0f - rx - ry, 1.0f - gx - gy, 1.0f - bx - by};
-  float primaries_inv[9];
-  memcpy(primaries_inv, primaries, sizeof(float) * 9);
-  JXL_RETURN_IF_ERROR(Inv3x3Matrix(primaries_inv));
-
-  float w[3] = {wx / wy, 1.0f, (1.0f - wx - wy) / wy};
-  // 1 / tiny float can still overflow
-  JXL_RETURN_IF_ERROR(std::isfinite(w[0]) && std::isfinite(w[2]));
-  float xyz[3];
-  MatMul(primaries_inv, w, 3, 3, 1, xyz);
-
-  float a[9] = {
-      xyz[0], 0, 0, 0, xyz[1], 0, 0, 0, xyz[2],
-  };
-
-  MatMul(primaries, a, 3, 3, 3, matrix);
-  return true;
-}
-
-Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx,
-                         float by, float wx, float wy, float matrix[9]) {
-  float toXYZ[9];
-  JXL_RETURN_IF_ERROR(PrimariesToXYZ(rx, ry, gx, gy, bx, by, wx, wy, toXYZ));
-  float d50[9];
-  JXL_RETURN_IF_ERROR(AdaptToXYZD50(wx, wy, d50));
-
-  MatMul(d50, toXYZ, 3, 3, 3, matrix);
-  return true;
-}
-
 }  // namespace jxl
index d9e0448..9296bed 100644 (file)
@@ -8,44 +8,36 @@
 
 // Metadata for color space conversions.
 
+#include <jxl/cms.h>
+#include <jxl/cms_interface.h>
+#include <jxl/color_encoding.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdlib.h>
 
-#include <cmath>  // std::abs
+#include <array>
+#include <cstdlib>  // free
 #include <ostream>
 #include <string>
-#include <vector>
+#include <utility>
 
-#include "jxl/color_encoding.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/cms/color_encoding_cms.h"
+#include "lib/jxl/cms/jxl_cms_internal.h"
 #include "lib/jxl/field_encodings.h"
 
 namespace jxl {
 
-// (All CIE units are for the standard 1931 2 degree observer)
-
-// Color space the color pixel data is encoded in. The color pixel data is
-// 3-channel in all cases except in case of kGray, where it uses only 1 channel.
-// This also determines the amount of channels used in modular encoding.
-enum class ColorSpace : uint32_t {
-  // Trichromatic color data. This also includes CMYK if a kBlack
-  // ExtraChannelInfo is present. This implies, if there is an ICC profile, that
-  // the ICC profile uses a 3-channel color space if no kBlack extra channel is
-  // present, or uses color space 'CMYK' if a kBlack extra channel is present.
-  kRGB,
-  // Single-channel data. This implies, if there is an ICC profile, that the ICC
-  // profile also represents single-channel data and has the appropriate color
-  // space ('GRAY').
-  kGray,
-  // Like kRGB, but implies fixed values for primaries etc.
-  kXYB,
-  // For non-RGB/gray data, e.g. from non-electro-optical sensors. Otherwise
-  // the same conditions as kRGB apply.
-  kUnknown
-};
+using IccBytes = ::jxl::cms::IccBytes;
+using ColorSpace = ::jxl::cms::ColorSpace;
+using WhitePoint = ::jxl::cms::WhitePoint;
+using Primaries = ::jxl::cms::Primaries;
+using TransferFunction = ::jxl::cms::TransferFunction;
+using RenderingIntent = ::jxl::cms::RenderingIntent;
+using CIExy = ::jxl::cms::CIExy;
+using PrimariesCIExy = ::jxl::cms::PrimariesCIExy;
+
+namespace cms {
 
 static inline const char* EnumName(ColorSpace /*unused*/) {
   return "ColorSpace";
@@ -56,14 +48,6 @@ static inline constexpr uint64_t EnumBits(ColorSpace /*unused*/) {
          MakeBit(CS::kUnknown);
 }
 
-// Values from CICP ColourPrimaries.
-enum class WhitePoint : uint32_t {
-  kD65 = 1,     // sRGB/BT.709/Display P3/BT.2020
-  kCustom = 2,  // Actual values encoded in separate fields
-  kE = 10,      // XYZ
-  kDCI = 11,    // DCI-P3
-};
-
 static inline const char* EnumName(WhitePoint /*unused*/) {
   return "WhitePoint";
 }
@@ -72,14 +56,6 @@ static inline constexpr uint64_t EnumBits(WhitePoint /*unused*/) {
          MakeBit(WhitePoint::kE) | MakeBit(WhitePoint::kDCI);
 }
 
-// Values from CICP ColourPrimaries
-enum class Primaries : uint32_t {
-  kSRGB = 1,    // Same as BT.709
-  kCustom = 2,  // Actual values encoded in separate fields
-  k2100 = 9,    // Same as BT.2020
-  kP3 = 11,
-};
-
 static inline const char* EnumName(Primaries /*unused*/) { return "Primaries"; }
 static inline constexpr uint64_t EnumBits(Primaries /*unused*/) {
   using Pr = Primaries;
@@ -87,20 +63,10 @@ static inline constexpr uint64_t EnumBits(Primaries /*unused*/) {
          MakeBit(Pr::kP3);
 }
 
-// Values from CICP TransferCharacteristics
-enum class TransferFunction : uint32_t {
-  k709 = 1,
-  kUnknown = 2,
-  kLinear = 8,
-  kSRGB = 13,
-  kPQ = 16,   // from BT.2100
-  kDCI = 17,  // from SMPTE RP 431-2 reference projector
-  kHLG = 18,  // from BT.2100
-};
-
 static inline const char* EnumName(TransferFunction /*unused*/) {
   return "TransferFunction";
 }
+
 static inline constexpr uint64_t EnumBits(TransferFunction /*unused*/) {
   using TF = TransferFunction;
   return MakeBit(TF::k709) | MakeBit(TF::kLinear) | MakeBit(TF::kSRGB) |
@@ -108,14 +74,6 @@ static inline constexpr uint64_t EnumBits(TransferFunction /*unused*/) {
          MakeBit(TF::kUnknown);
 }
 
-enum class RenderingIntent : uint32_t {
-  // Values match ICC sRGB encodings.
-  kPerceptual = 0,  // good for photos, requires a profile with LUT.
-  kRelative,        // good for logos.
-  kSaturation,      // perhaps useful for CG with fully saturated colors.
-  kAbsolute,        // leaves white point unchanged; good for proofing.
-};
-
 static inline const char* EnumName(RenderingIntent /*unused*/) {
   return "RenderingIntent";
 }
@@ -125,17 +83,9 @@ static inline constexpr uint64_t EnumBits(RenderingIntent /*unused*/) {
          MakeBit(RI::kSaturation) | MakeBit(RI::kAbsolute);
 }
 
-// Chromaticity (Y is omitted because it is 1 for primaries/white points)
-struct CIExy {
-  double x = 0.0;
-  double y = 0.0;
-};
+}  // namespace cms
 
-struct PrimariesCIExy {
-  CIExy r;
-  CIExy g;
-  CIExy b;
-};
+struct ColorEncoding;
 
 // Serializable form of CIExy.
 struct Customxy : public Fields {
@@ -144,12 +94,9 @@ struct Customxy : public Fields {
 
   Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
 
-  CIExy Get() const;
-  // Returns false if x or y do not fit in the encoding.
-  Status Set(const CIExy& xy);
-
-  int32_t x;
-  int32_t y;
+ private:
+  friend struct ColorEncoding;
+  ::jxl::cms::Customxy storage_;
 };
 
 struct CustomTransferFunction : public Fields {
@@ -160,69 +107,14 @@ struct CustomTransferFunction : public Fields {
   // transfer function, otherwise leaves fields unchanged and returns false.
   bool SetImplicit();
 
-  // Gamma: only used for PNG inputs
-  bool IsGamma() const { return have_gamma_; }
-  double GetGamma() const {
-    JXL_ASSERT(IsGamma());
-    return gamma_ * 1E-7;  // (0, 1)
-  }
-  Status SetGamma(double gamma);
-
-  TransferFunction GetTransferFunction() const {
-    JXL_ASSERT(!IsGamma());
-    return transfer_function_;
-  }
-  void SetTransferFunction(const TransferFunction tf) {
-    have_gamma_ = false;
-    transfer_function_ = tf;
-  }
-
-  bool IsUnknown() const {
-    return !have_gamma_ && (transfer_function_ == TransferFunction::kUnknown);
-  }
-  bool IsSRGB() const {
-    return !have_gamma_ && (transfer_function_ == TransferFunction::kSRGB);
-  }
-  bool IsLinear() const {
-    return !have_gamma_ && (transfer_function_ == TransferFunction::kLinear);
-  }
-  bool IsPQ() const {
-    return !have_gamma_ && (transfer_function_ == TransferFunction::kPQ);
-  }
-  bool IsHLG() const {
-    return !have_gamma_ && (transfer_function_ == TransferFunction::kHLG);
-  }
-  bool Is709() const {
-    return !have_gamma_ && (transfer_function_ == TransferFunction::k709);
-  }
-  bool IsDCI() const {
-    return !have_gamma_ && (transfer_function_ == TransferFunction::kDCI);
-  }
-  bool IsSame(const CustomTransferFunction& other) const {
-    if (have_gamma_ != other.have_gamma_) return false;
-    if (have_gamma_) {
-      if (gamma_ != other.gamma_) return false;
-    } else {
-      if (transfer_function_ != other.transfer_function_) return false;
-    }
-    return true;
-  }
-
   Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
 
   // Must be set before calling VisitFields!
   ColorSpace nonserialized_color_space = ColorSpace::kRGB;
 
  private:
-  static constexpr uint32_t kGammaMul = 10000000;
-
-  bool have_gamma_;
-
-  // OETF exponent to go from linear to gamma-compressed.
-  uint32_t gamma_;  // Only used if have_gamma_.
-
-  // Can be kUnknown.
-  TransferFunction transfer_function_;  // Only used if !have_gamma_.
+  friend struct ColorEncoding;
+  ::jxl::cms::CustomTransferFunction storage_;
 };
 
 // Compact encoding of data required to interpret and translate pixels to a
@@ -237,31 +129,29 @@ struct ColorEncoding : public Fields {
 
   // Returns true if an ICC profile was successfully created from fields.
   // Must be called after modifying fields. Defined in color_management.cc.
-  Status CreateICC();
+  Status CreateICC() {
+    storage_.icc.clear();
+    const JxlColorEncoding external = ToExternal();
+    if (!MaybeCreateProfile(external, &storage_.icc)) {
+      storage_.icc.clear();
+      return JXL_FAILURE("Failed to create ICC profile");
+    }
+    return true;
+  }
 
   // Returns non-empty and valid ICC profile, unless:
-  // - between calling InternalRemoveICC() and CreateICC() in tests;
   // - WantICC() == true and SetICC() was not yet called;
   // - after a failed call to SetSRGB(), SetICC(), or CreateICC().
-  const PaddedBytes& ICC() const { return icc_; }
-
-  // Internal only, do not call except from tests.
-  void InternalRemoveICC() { icc_.clear(); }
+  const IccBytes& ICC() const { return storage_.icc; }
 
   // Returns true if `icc` is assigned and decoded successfully. If so,
   // subsequent WantICC() will return true until DecideIfWantICC() changes it.
   // Returning false indicates data has been lost.
-  Status SetICC(PaddedBytes&& icc) {
-    if (icc.empty()) return false;
-    icc_ = std::move(icc);
-
-    if (!SetFieldsFromICC()) {
-      InternalRemoveICC();
-      return false;
-    }
-
-    want_icc_ = true;
-    return true;
+  Status SetICC(IccBytes&& icc, const JxlCmsInterface* cms) {
+    JXL_ASSERT(cms != nullptr);
+    JXL_ASSERT(!icc.empty());
+    want_icc_ = storage_.SetFieldsFromICC(std::move(icc), *cms);
+    return want_icc_;
   }
 
   // Sets the raw ICC profile bytes, without parsing the ICC, and without
@@ -269,13 +159,11 @@ struct ColorEncoding : public Fields {
   // space. Functions to get and set fields, such as SetWhitePoint, cannot be
   // used anymore after this and functions such as IsSRGB return false no matter
   // what the contents of the icc profile.
-  Status SetICCRaw(PaddedBytes&& icc) {
-    if (icc.empty()) return false;
-    icc_ = std::move(icc);
-
+  void SetICCRaw(IccBytes&& icc) {
+    JXL_ASSERT(!icc.empty());
+    storage_.icc = std::move(icc);
+    storage_.have_fields = false;
     want_icc_ = true;
-    have_fields_ = false;
-    return true;
   }
 
   // Returns whether to send the ICC profile in the codestream.
@@ -283,26 +171,24 @@ struct ColorEncoding : public Fields {
 
   // Return whether the direct fields are set, if false but ICC is set, only
   // raw ICC bytes are known.
-  bool HaveFields() const { return have_fields_; }
+  bool HaveFields() const { return storage_.have_fields; }
 
   // Causes WantICC() to return false if ICC() can be reconstructed from fields.
-  // Defined in color_management.cc.
-  void DecideIfWantICC();
+  void DecideIfWantICC(const JxlCmsInterface& cms);
 
-  bool IsGray() const { return color_space_ == ColorSpace::kGray; }
-  bool IsCMYK() const { return cmyk_; }
-  size_t Channels() const { return IsGray() ? 1 : 3; }
+  bool IsGray() const { return storage_.color_space == ColorSpace::kGray; }
+  bool IsCMYK() const { return storage_.cmyk; }
+  size_t Channels() const { return storage_.Channels(); }
 
   // Returns false if the field is invalid and unusable.
-  bool HasPrimaries() const {
-    return !IsGray() && color_space_ != ColorSpace::kXYB;
-  }
+  bool HasPrimaries() const { return storage_.HasPrimaries(); }
 
   // Returns true after setting the field to a value defined by color_space,
   // otherwise false and leaves the field unchanged.
   bool ImplicitWhitePoint() {
-    if (color_space_ == ColorSpace::kXYB) {
-      white_point = WhitePoint::kD65;
+    // TODO(eustas): inline
+    if (storage_.color_space == ColorSpace::kXYB) {
+      storage_.white_point = WhitePoint::kD65;
       return true;
     }
     return false;
@@ -312,11 +198,11 @@ struct ColorEncoding : public Fields {
   // profile is set without the fields being set, this returns false, even if
   // the content of the ICC profile would match sRGB.
   bool IsSRGB() const {
-    if (!have_fields_) return false;
-    if (!IsGray() && color_space_ != ColorSpace::kRGB) return false;
-    if (white_point != WhitePoint::kD65) return false;
-    if (primaries != Primaries::kSRGB) return false;
-    if (!tf.IsSRGB()) return false;
+    if (!storage_.have_fields) return false;
+    if (!IsGray() && storage_.color_space != ColorSpace::kRGB) return false;
+    if (storage_.white_point != WhitePoint::kD65) return false;
+    if (storage_.primaries != Primaries::kSRGB) return false;
+    if (!storage_.tf.IsSRGB()) return false;
     return true;
   }
 
@@ -324,139 +210,152 @@ struct ColorEncoding : public Fields {
   // unparsed ICC profile is set without the fields being set, this returns
   // false, even if the content of the ICC profile would match linear sRGB.
   bool IsLinearSRGB() const {
-    if (!have_fields_) return false;
-    if (!IsGray() && color_space_ != ColorSpace::kRGB) return false;
-    if (white_point != WhitePoint::kD65) return false;
-    if (primaries != Primaries::kSRGB) return false;
-    if (!tf.IsLinear()) return false;
+    if (!storage_.have_fields) return false;
+    if (!IsGray() && storage_.color_space != ColorSpace::kRGB) return false;
+    if (storage_.white_point != WhitePoint::kD65) return false;
+    if (storage_.primaries != Primaries::kSRGB) return false;
+    if (!storage_.tf.IsLinear()) return false;
     return true;
   }
 
   Status SetSRGB(const ColorSpace cs,
                  const RenderingIntent ri = RenderingIntent::kRelative) {
-    InternalRemoveICC();
+    storage_.icc.clear();
     JXL_ASSERT(cs == ColorSpace::kGray || cs == ColorSpace::kRGB);
-    color_space_ = cs;
-    white_point = WhitePoint::kD65;
-    primaries = Primaries::kSRGB;
-    tf.SetTransferFunction(TransferFunction::kSRGB);
-    rendering_intent = ri;
+    storage_.color_space = cs;
+    storage_.white_point = WhitePoint::kD65;
+    storage_.primaries = Primaries::kSRGB;
+    storage_.tf.transfer_function = TransferFunction::kSRGB;
+    storage_.rendering_intent = ri;
     return CreateICC();
   }
 
   Status VisitFields(Visitor* JXL_RESTRICT visitor) override;
 
   // Accessors ensure tf.nonserialized_color_space is updated at the same time.
-  ColorSpace GetColorSpace() const { return color_space_; }
-  void SetColorSpace(const ColorSpace cs) {
-    color_space_ = cs;
-    tf.nonserialized_color_space = cs;
-  }
+  ColorSpace GetColorSpace() const { return storage_.color_space; }
+  void SetColorSpace(const ColorSpace cs) { storage_.color_space = cs; }
+  CIExy GetWhitePoint() const { return storage_.GetWhitePoint(); }
 
-  CIExy GetWhitePoint() const;
-  Status SetWhitePoint(const CIExy& xy);
+  WhitePoint GetWhitePointType() const { return storage_.white_point; }
+  Status SetWhitePointType(const WhitePoint& wp);
+  PrimariesCIExy GetPrimaries() const { return storage_.GetPrimaries(); }
 
-  PrimariesCIExy GetPrimaries() const;
-  Status SetPrimaries(const PrimariesCIExy& xy);
+  Primaries GetPrimariesType() const { return storage_.primaries; }
+  Status SetPrimariesType(const Primaries& p);
 
-  // Checks if the color spaces (including white point / primaries) are the
-  // same, but ignores the transfer function, rendering intent and ICC bytes.
-  bool SameColorSpace(const ColorEncoding& other) const {
-    if (color_space_ != other.color_space_) return false;
+  jxl::cms::CustomTransferFunction& Tf() { return storage_.tf; }
+  const jxl::cms::CustomTransferFunction& Tf() const { return storage_.tf; }
 
-    if (white_point != other.white_point) return false;
-    if (white_point == WhitePoint::kCustom) {
-      if (white_.x != other.white_.x || white_.y != other.white_.y)
-        return false;
-    }
-
-    if (HasPrimaries() != other.HasPrimaries()) return false;
-    if (HasPrimaries()) {
-      if (primaries != other.primaries) return false;
-      if (primaries == Primaries::kCustom) {
-        if (red_.x != other.red_.x || red_.y != other.red_.y) return false;
-        if (green_.x != other.green_.x || green_.y != other.green_.y)
-          return false;
-        if (blue_.x != other.blue_.x || blue_.y != other.blue_.y) return false;
-      }
-    }
-    return true;
+  RenderingIntent GetRenderingIntent() const {
+    return storage_.rendering_intent;
+  }
+  void SetRenderingIntent(const RenderingIntent& ri) {
+    storage_.rendering_intent = ri;
   }
 
-  // Checks if the color space and transfer function are the same, ignoring
-  // rendering intent and ICC bytes
   bool SameColorEncoding(const ColorEncoding& other) const {
-    return SameColorSpace(other) && tf.IsSame(other.tf);
+    return storage_.SameColorEncoding(other.storage_);
   }
 
   mutable bool all_default;
 
-  // Only valid if HaveFields()
-  WhitePoint white_point;
-  Primaries primaries;  // Only valid if HasPrimaries()
-  CustomTransferFunction tf;
-  RenderingIntent rendering_intent;
+  JxlColorEncoding ToExternal() const { return storage_.ToExternal(); }
+  Status FromExternal(const JxlColorEncoding& external) {
+    JXL_RETURN_IF_ERROR(storage_.FromExternal(external));
+    (void)CreateICC();
+    return true;
+  }
+  const jxl::cms::ColorEncoding& View() const { return storage_; }
+  std::string Description() const;
 
  private:
-  // Returns true if all fields have been initialized (possibly to kUnknown).
-  // Returns false if the ICC profile is invalid or decoding it fails.
-  // Defined in enc_color_management.cc.
-  Status SetFieldsFromICC();
+  static std::array<ColorEncoding, 2> CreateC2(Primaries pr,
+                                               TransferFunction tf);
 
   // If true, the codestream contains an ICC profile and we do not serialize
   // fields. Otherwise, fields are serialized and we create an ICC profile.
   bool want_icc_;
 
-  // When false, fields such as white_point and tf are invalid and must not be
-  // used. This occurs after setting a raw bytes-only ICC profile, only the
-  // ICC bytes may be used. The color_space_ field is still valid.
-  bool have_fields_ = true;
-
-  PaddedBytes icc_;  // Valid ICC profile
-
-  ColorSpace color_space_;  // Can be kUnknown
-  bool cmyk_ = false;
-
+  ::jxl::cms::ColorEncoding storage_;
   // Only used if white_point == kCustom.
   Customxy white_;
 
+  // Only valid if HaveFields()
+  CustomTransferFunction tf_;
+
   // Only used if primaries == kCustom.
   Customxy red_;
   Customxy green_;
   Customxy blue_;
 };
 
-// Returns whether the two inputs are approximately equal.
-static inline bool ApproxEq(const double a, const double b,
-#if JPEGXL_ENABLE_SKCMS
-                            double max_l1 = 1E-3) {
-#else
-                            double max_l1 = 8E-5) {
-#endif
-  // Threshold should be sufficient for ICC's 15-bit fixed-point numbers.
-  // We have seen differences of 7.1E-5 with lcms2 and 1E-3 with skcms.
-  return std::abs(a - b) <= max_l1;
+static inline std::string Description(const ColorEncoding& c) {
+  const JxlColorEncoding external = c.View().ToExternal();
+  return ColorEncodingDescription(external);
 }
 
-// Returns a representation of the ColorEncoding fields (not icc).
-// Example description: "RGB_D65_SRG_Rel_Lin"
-std::string Description(const ColorEncoding& c);
 static inline std::ostream& operator<<(std::ostream& os,
                                        const ColorEncoding& c) {
   return os << Description(c);
 }
 
-void ConvertInternalToExternalColorEncoding(const jxl::ColorEncoding& internal,
-                                            JxlColorEncoding* external);
+class ColorSpaceTransform {
+ public:
+  explicit ColorSpaceTransform(const JxlCmsInterface& cms) : cms_(cms) {}
+  ~ColorSpaceTransform() {
+    if (cms_data_ != nullptr) {
+      cms_.destroy(cms_data_);
+    }
+  }
+
+  // Cannot copy.
+  ColorSpaceTransform(const ColorSpaceTransform&) = delete;
+  ColorSpaceTransform& operator=(const ColorSpaceTransform&) = delete;
+
+  Status Init(const ColorEncoding& c_src, const ColorEncoding& c_dst,
+              float intensity_target, size_t xsize, size_t num_threads) {
+    xsize_ = xsize;
+    JxlColorProfile input_profile;
+    icc_src_ = c_src.ICC();
+    input_profile.icc.data = icc_src_.data();
+    input_profile.icc.size = icc_src_.size();
+    input_profile.color_encoding = c_src.ToExternal();
+    input_profile.num_channels = c_src.IsCMYK() ? 4 : c_src.Channels();
+    JxlColorProfile output_profile;
+    icc_dst_ = c_dst.ICC();
+    output_profile.icc.data = icc_dst_.data();
+    output_profile.icc.size = icc_dst_.size();
+    output_profile.color_encoding = c_dst.ToExternal();
+    if (c_dst.IsCMYK())
+      return JXL_FAILURE("Conversion to CMYK is not supported");
+    output_profile.num_channels = c_dst.Channels();
+    cms_data_ = cms_.init(cms_.init_data, num_threads, xsize, &input_profile,
+                          &output_profile, intensity_target);
+    JXL_RETURN_IF_ERROR(cms_data_ != nullptr);
+    return true;
+  }
 
-Status ConvertExternalToInternalColorEncoding(const JxlColorEncoding& external,
-                                              jxl::ColorEncoding* internal);
+  float* BufSrc(const size_t thread) const {
+    return cms_.get_src_buf(cms_data_, thread);
+  }
+
+  float* BufDst(const size_t thread) const {
+    return cms_.get_dst_buf(cms_data_, thread);
+  }
 
-Status PrimariesToXYZ(float rx, float ry, float gx, float gy, float bx,
-                      float by, float wx, float wy, float matrix[9]);
-Status PrimariesToXYZD50(float rx, float ry, float gx, float gy, float bx,
-                         float by, float wx, float wy, float matrix[9]);
-Status AdaptToXYZD50(float wx, float wy, float matrix[9]);
+  Status Run(const size_t thread, const float* buf_src, float* buf_dst) {
+    return cms_.run(cms_data_, thread, buf_src, buf_dst, xsize_);
+  }
+
+ private:
+  JxlCmsInterface cms_;
+  void* cms_data_ = nullptr;
+  // The interface may retain pointers into these.
+  IccBytes icc_src_;
+  IccBytes icc_dst_;
+  size_t xsize_;
+};
 
 }  // namespace jxl
 
index 32bd0cc..008b3ce 100644 (file)
@@ -5,18 +5,23 @@
 
 #include "lib/jxl/color_encoding_internal.h"
 
-#include <stdio.h>
+#include <jxl/color_encoding.h>
 
-#include "gtest/gtest.h"
+#include <cstdlib>  // rand
+
+#include "lib/jxl/cms/color_encoding_cms.h"
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
 
+using jxl::cms::ColorEncoding;
+
 TEST(ColorEncodingTest, RoundTripAll) {
   for (const test::ColorEncodingDescriptor& cdesc : test::AllEncodings()) {
-    const ColorEncoding c_original = test::ColorEncodingFromDescriptor(cdesc);
+    ColorEncoding c_original = test::ColorEncodingFromDescriptor(cdesc).View();
     // Verify Set(Get) yields the same white point/primaries/gamma.
     {
       ColorEncoding c;
@@ -28,7 +33,7 @@ TEST(ColorEncodingTest, RoundTripAll) {
       EXPECT_TRUE(c.SetPrimaries(c_original.GetPrimaries()));
       EXPECT_EQ(c_original.primaries, c.primaries);
     }
-    if (c_original.tf.IsGamma()) {
+    if (c_original.tf.have_gamma) {
       ColorEncoding c;
       EXPECT_TRUE(c.tf.SetGamma(c_original.tf.GetGamma()));
       EXPECT_TRUE(c_original.tf.IsSame(c.tf));
@@ -76,26 +81,25 @@ TEST(ColorEncodingTest, CustomGamma) {
   EXPECT_FALSE(c.tf.SetGamma(1.001));
 #endif
   EXPECT_TRUE(c.tf.SetGamma(1.0));
-  EXPECT_FALSE(c.tf.IsGamma());
+  EXPECT_FALSE(c.tf.have_gamma);
   EXPECT_TRUE(c.tf.IsLinear());
 
   EXPECT_TRUE(c.tf.SetGamma(0.123));
-  EXPECT_TRUE(c.tf.IsGamma());
+  EXPECT_TRUE(c.tf.have_gamma);
   const double gamma = c.tf.GetGamma();
 
   ColorEncoding c2;
   EXPECT_TRUE(c2.tf.SetGamma(gamma));
   EXPECT_TRUE(c.SameColorEncoding(c2));
-  EXPECT_TRUE(c2.tf.IsGamma());
+  EXPECT_TRUE(c2.tf.have_gamma);
 }
 
 TEST(ColorEncodingTest, InternalExternalConversion) {
   ColorEncoding source_internal;
-  JxlColorEncoding external;
   ColorEncoding destination_internal;
 
   for (int i = 0; i < 100; i++) {
-    source_internal.SetColorSpace(static_cast<ColorSpace>(rand() % 4));
+    source_internal.color_space = static_cast<ColorSpace>(rand() % 4);
     CIExy wp;
     wp.x = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
     wp.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
@@ -110,38 +114,33 @@ TEST(ColorEncodingTest, InternalExternalConversion) {
       primaries.b.y = (float(rand()) / float((RAND_MAX)) * 0.5) + 0.25;
       EXPECT_TRUE(source_internal.SetPrimaries(primaries));
     }
-    CustomTransferFunction tf;
+    jxl::cms::CustomTransferFunction tf;
     EXPECT_TRUE(tf.SetGamma((float(rand()) / float((RAND_MAX)) * 0.5) + 0.25));
     source_internal.tf = tf;
     source_internal.rendering_intent = static_cast<RenderingIntent>(rand() % 4);
 
-    ConvertInternalToExternalColorEncoding(source_internal, &external);
-    EXPECT_TRUE(ConvertExternalToInternalColorEncoding(external,
-                                                       &destination_internal));
+    JxlColorEncoding external = source_internal.ToExternal();
+    EXPECT_TRUE(destination_internal.FromExternal(external));
 
-    EXPECT_EQ(source_internal.GetColorSpace(),
-              destination_internal.GetColorSpace());
+    EXPECT_EQ(source_internal.color_space, destination_internal.color_space);
     EXPECT_EQ(source_internal.white_point, destination_internal.white_point);
-    EXPECT_EQ(source_internal.GetWhitePoint().x,
-              destination_internal.GetWhitePoint().x);
-    EXPECT_EQ(source_internal.GetWhitePoint().y,
-              destination_internal.GetWhitePoint().y);
+    CIExy src_wp = source_internal.GetWhitePoint();
+    CIExy dst_wp = destination_internal.GetWhitePoint();
+    EXPECT_EQ(src_wp.x, dst_wp.x);
+    EXPECT_EQ(src_wp.y, dst_wp.y);
     if (source_internal.HasPrimaries()) {
-      EXPECT_EQ(source_internal.GetPrimaries().r.x,
-                destination_internal.GetPrimaries().r.x);
-      EXPECT_EQ(source_internal.GetPrimaries().r.y,
-                destination_internal.GetPrimaries().r.y);
-      EXPECT_EQ(source_internal.GetPrimaries().g.x,
-                destination_internal.GetPrimaries().g.x);
-      EXPECT_EQ(source_internal.GetPrimaries().g.y,
-                destination_internal.GetPrimaries().g.y);
-      EXPECT_EQ(source_internal.GetPrimaries().b.x,
-                destination_internal.GetPrimaries().b.x);
-      EXPECT_EQ(source_internal.GetPrimaries().b.y,
-                destination_internal.GetPrimaries().b.y);
+      PrimariesCIExy src_p = source_internal.GetPrimaries();
+      PrimariesCIExy dst_p = destination_internal.GetPrimaries();
+      EXPECT_EQ(src_p.r.x, dst_p.r.x);
+      EXPECT_EQ(src_p.r.y, dst_p.r.y);
+      EXPECT_EQ(src_p.g.x, dst_p.g.x);
+      EXPECT_EQ(src_p.g.y, dst_p.g.y);
+      EXPECT_EQ(src_p.b.x, dst_p.b.x);
+      EXPECT_EQ(src_p.b.y, dst_p.b.y);
     }
-    EXPECT_EQ(source_internal.tf.IsGamma(), destination_internal.tf.IsGamma());
-    if (source_internal.tf.IsGamma()) {
+    EXPECT_EQ(source_internal.tf.have_gamma,
+              destination_internal.tf.have_gamma);
+    if (source_internal.tf.have_gamma) {
       EXPECT_EQ(source_internal.tf.GetGamma(),
                 destination_internal.tf.GetGamma());
     } else {
diff --git a/lib/jxl/color_management.cc b/lib/jxl/color_management.cc
deleted file mode 100644 (file)
index 521a75a..0000000
+++ /dev/null
@@ -1,516 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/color_management.h"
-
-#include <math.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <algorithm>
-#include <array>
-#include <atomic>
-#include <memory>
-#include <string>
-#include <utility>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "lib/jxl/color_management.cc"
-#include <hwy/foreach_target.h>
-#include <hwy/highway.h>
-
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/field_encodings.h"
-#include "lib/jxl/linalg.h"  // MatMul, Inv3x3Matrix
-#include "lib/jxl/transfer_functions-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace jxl {
-namespace HWY_NAMESPACE {
-
-// NOTE: this is only used to provide a reasonable ICC profile that other
-// software can read. Our own transforms use ExtraTF instead because that is
-// more precise and supports unbounded mode.
-std::vector<uint16_t> CreateTableCurve(uint32_t N, const ExtraTF tf) {
-  JXL_ASSERT(N <= 4096);  // ICC MFT2 only allows 4K entries
-  JXL_ASSERT(tf == ExtraTF::kPQ || tf == ExtraTF::kHLG);
-  // No point using float - LCMS converts to 16-bit for A2B/MFT.
-  std::vector<uint16_t> table(N);
-  for (uint32_t i = 0; i < N; ++i) {
-    const float x = static_cast<float>(i) / (N - 1);  // 1.0 at index N - 1.
-    const double dx = static_cast<double>(x);
-    // LCMS requires EOTF (e.g. 2.4 exponent).
-    double y = (tf == ExtraTF::kHLG) ? TF_HLG().DisplayFromEncoded(dx)
-                                     : TF_PQ().DisplayFromEncoded(dx);
-    JXL_ASSERT(y >= 0.0);
-    // Clamp to table range - necessary for HLG.
-    if (y > 1.0) y = 1.0;
-    // 1.0 corresponds to table value 0xFFFF.
-    table[i] = static_cast<uint16_t>(roundf(y * 65535.0));
-  }
-  return table;
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace jxl
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace jxl {
-
-HWY_EXPORT(CreateTableCurve);  // Local function.
-
-Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]) {
-  // Target Y = 1.
-  if (std::abs(xy.y) < 1e-12) return JXL_FAILURE("Y value is too small");
-  const float factor = 1 / xy.y;
-  XYZ[0] = xy.x * factor;
-  XYZ[1] = 1;
-  XYZ[2] = (1 - xy.x - xy.y) * factor;
-  return true;
-}
-
-namespace {
-
-// NOTE: this is only used to provide a reasonable ICC profile that other
-// software can read. Our own transforms use ExtraTF instead because that is
-// more precise and supports unbounded mode.
-template <class Func>
-std::vector<uint16_t> CreateTableCurve(uint32_t N, const Func& func) {
-  JXL_ASSERT(N <= 4096);  // ICC MFT2 only allows 4K entries
-  // No point using float - LCMS converts to 16-bit for A2B/MFT.
-  std::vector<uint16_t> table(N);
-  for (uint32_t i = 0; i < N; ++i) {
-    const float x = static_cast<float>(i) / (N - 1);  // 1.0 at index N - 1.
-    // LCMS requires EOTF (e.g. 2.4 exponent).
-    double y = func.DisplayFromEncoded(static_cast<double>(x));
-    JXL_ASSERT(y >= 0.0);
-    // Clamp to table range - necessary for HLG.
-    if (y > 1.0) y = 1.0;
-    // 1.0 corresponds to table value 0xFFFF.
-    table[i] = static_cast<uint16_t>(roundf(y * 65535.0));
-  }
-  return table;
-}
-
-void ICCComputeMD5(const PaddedBytes& data, uint8_t sum[16])
-    JXL_NO_SANITIZE("unsigned-integer-overflow") {
-  PaddedBytes data64 = data;
-  data64.push_back(128);
-  // Add bytes such that ((size + 8) & 63) == 0.
-  size_t extra = ((64 - ((data64.size() + 8) & 63)) & 63);
-  data64.resize(data64.size() + extra, 0);
-  for (uint64_t i = 0; i < 64; i += 8) {
-    data64.push_back(static_cast<uint64_t>(data.size() << 3u) >> i);
-  }
-
-  static const uint32_t sineparts[64] = {
-      0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
-      0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
-      0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
-      0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
-      0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
-      0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
-      0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
-      0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
-      0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
-      0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
-      0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
-  };
-  static const uint32_t shift[64] = {
-      7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
-      5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20, 5, 9,  14, 20,
-      4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
-      6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21,
-  };
-
-  uint32_t a0 = 0x67452301, b0 = 0xefcdab89, c0 = 0x98badcfe, d0 = 0x10325476;
-
-  for (size_t i = 0; i < data64.size(); i += 64) {
-    uint32_t a = a0, b = b0, c = c0, d = d0, f, g;
-    for (size_t j = 0; j < 64; j++) {
-      if (j < 16) {
-        f = (b & c) | ((~b) & d);
-        g = j;
-      } else if (j < 32) {
-        f = (d & b) | ((~d) & c);
-        g = (5 * j + 1) & 0xf;
-      } else if (j < 48) {
-        f = b ^ c ^ d;
-        g = (3 * j + 5) & 0xf;
-      } else {
-        f = c ^ (b | (~d));
-        g = (7 * j) & 0xf;
-      }
-      uint32_t dg0 = data64[i + g * 4 + 0], dg1 = data64[i + g * 4 + 1],
-               dg2 = data64[i + g * 4 + 2], dg3 = data64[i + g * 4 + 3];
-      uint32_t u = dg0 | (dg1 << 8u) | (dg2 << 16u) | (dg3 << 24u);
-      f += a + sineparts[j] + u;
-      a = d;
-      d = c;
-      c = b;
-      b += (f << shift[j]) | (f >> (32u - shift[j]));
-    }
-    a0 += a;
-    b0 += b;
-    c0 += c;
-    d0 += d;
-  }
-  sum[0] = a0;
-  sum[1] = a0 >> 8u;
-  sum[2] = a0 >> 16u;
-  sum[3] = a0 >> 24u;
-  sum[4] = b0;
-  sum[5] = b0 >> 8u;
-  sum[6] = b0 >> 16u;
-  sum[7] = b0 >> 24u;
-  sum[8] = c0;
-  sum[9] = c0 >> 8u;
-  sum[10] = c0 >> 16u;
-  sum[11] = c0 >> 24u;
-  sum[12] = d0;
-  sum[13] = d0 >> 8u;
-  sum[14] = d0 >> 16u;
-  sum[15] = d0 >> 24u;
-}
-
-Status CreateICCChadMatrix(CIExy w, float result[9]) {
-  float m[9];
-  if (w.y == 0) {  // WhitePoint can not be pitch-black.
-    return JXL_FAILURE("Invalid WhitePoint");
-  }
-  JXL_RETURN_IF_ERROR(AdaptToXYZD50(w.x, w.y, m));
-  memcpy(result, m, sizeof(float) * 9);
-  return true;
-}
-
-// Creates RGB to XYZ matrix given RGB primaries and whitepoint in xy.
-Status CreateICCRGBMatrix(CIExy r, CIExy g, CIExy b, CIExy w, float result[9]) {
-  float m[9];
-  JXL_RETURN_IF_ERROR(
-      PrimariesToXYZD50(r.x, r.y, g.x, g.y, b.x, b.y, w.x, w.y, m));
-  memcpy(result, m, sizeof(float) * 9);
-  return true;
-}
-
-void WriteICCUint32(uint32_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
-  if (icc->size() < pos + 4) icc->resize(pos + 4);
-  (*icc)[pos + 0] = (value >> 24u) & 255;
-  (*icc)[pos + 1] = (value >> 16u) & 255;
-  (*icc)[pos + 2] = (value >> 8u) & 255;
-  (*icc)[pos + 3] = value & 255;
-}
-
-void WriteICCUint16(uint16_t value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
-  if (icc->size() < pos + 2) icc->resize(pos + 2);
-  (*icc)[pos + 0] = (value >> 8u) & 255;
-  (*icc)[pos + 1] = value & 255;
-}
-
-// Writes a 4-character tag
-void WriteICCTag(const char* value, size_t pos, PaddedBytes* JXL_RESTRICT icc) {
-  if (icc->size() < pos + 4) icc->resize(pos + 4);
-  memcpy(icc->data() + pos, value, 4);
-}
-
-Status WriteICCS15Fixed16(float value, size_t pos,
-                          PaddedBytes* JXL_RESTRICT icc) {
-  // "nextafterf" for 32768.0f towards zero are:
-  // 32767.998046875, 32767.99609375, 32767.994140625
-  // Even the first value works well,...
-  bool ok = (-32767.995f <= value) && (value <= 32767.995f);
-  if (!ok) return JXL_FAILURE("ICC value is out of range / NaN");
-  int32_t i = value * 65536.0f + 0.5f;
-  // Use two's complement
-  uint32_t u = static_cast<uint32_t>(i);
-  WriteICCUint32(u, pos, icc);
-  return true;
-}
-
-Status CreateICCHeader(const ColorEncoding& c,
-                       PaddedBytes* JXL_RESTRICT header) {
-  // TODO(lode): choose color management engine name, e.g. "skia" if
-  // integrated in skia.
-  static const char* kCmm = "jxl ";
-
-  header->resize(128, 0);
-
-  WriteICCUint32(0, 0, header);  // size, correct value filled in at end
-  WriteICCTag(kCmm, 4, header);
-  WriteICCUint32(0x04300000u, 8, header);
-  WriteICCTag("mntr", 12, header);
-  WriteICCTag(c.IsGray() ? "GRAY" : "RGB ", 16, header);
-  WriteICCTag("XYZ ", 20, header);
-
-  // Three uint32_t's date/time encoding.
-  // TODO(lode): encode actual date and time, this is a placeholder
-  uint32_t year = 2019, month = 12, day = 1;
-  uint32_t hour = 0, minute = 0, second = 0;
-  WriteICCUint16(year, 24, header);
-  WriteICCUint16(month, 26, header);
-  WriteICCUint16(day, 28, header);
-  WriteICCUint16(hour, 30, header);
-  WriteICCUint16(minute, 32, header);
-  WriteICCUint16(second, 34, header);
-
-  WriteICCTag("acsp", 36, header);
-  WriteICCTag("APPL", 40, header);
-  WriteICCUint32(0, 44, header);  // flags
-  WriteICCUint32(0, 48, header);  // device manufacturer
-  WriteICCUint32(0, 52, header);  // device model
-  WriteICCUint32(0, 56, header);  // device attributes
-  WriteICCUint32(0, 60, header);  // device attributes
-  WriteICCUint32(static_cast<uint32_t>(c.rendering_intent), 64, header);
-
-  // Mandatory D50 white point of profile connection space
-  WriteICCUint32(0x0000f6d6, 68, header);
-  WriteICCUint32(0x00010000, 72, header);
-  WriteICCUint32(0x0000d32d, 76, header);
-
-  WriteICCTag(kCmm, 80, header);
-
-  return true;
-}
-
-void AddToICCTagTable(const char* tag, size_t offset, size_t size,
-                      PaddedBytes* JXL_RESTRICT tagtable,
-                      std::vector<size_t>* offsets) {
-  WriteICCTag(tag, tagtable->size(), tagtable);
-  // writing true offset deferred to later
-  WriteICCUint32(0, tagtable->size(), tagtable);
-  offsets->push_back(offset);
-  WriteICCUint32(size, tagtable->size(), tagtable);
-}
-
-void FinalizeICCTag(PaddedBytes* JXL_RESTRICT tags, size_t* offset,
-                    size_t* size) {
-  while ((tags->size() & 3) != 0) {
-    tags->push_back(0);
-  }
-  *offset += *size;
-  *size = tags->size() - *offset;
-}
-
-// The input text must be ASCII, writing other characters to UTF-16 is not
-// implemented.
-void CreateICCMlucTag(const std::string& text, PaddedBytes* JXL_RESTRICT tags) {
-  WriteICCTag("mluc", tags->size(), tags);
-  WriteICCUint32(0, tags->size(), tags);
-  WriteICCUint32(1, tags->size(), tags);
-  WriteICCUint32(12, tags->size(), tags);
-  WriteICCTag("enUS", tags->size(), tags);
-  WriteICCUint32(text.size() * 2, tags->size(), tags);
-  WriteICCUint32(28, tags->size(), tags);
-  for (size_t i = 0; i < text.size(); i++) {
-    tags->push_back(0);  // prepend 0 for UTF-16
-    tags->push_back(text[i]);
-  }
-}
-
-Status CreateICCXYZTag(float xyz[3], PaddedBytes* JXL_RESTRICT tags) {
-  WriteICCTag("XYZ ", tags->size(), tags);
-  WriteICCUint32(0, tags->size(), tags);
-  for (size_t i = 0; i < 3; ++i) {
-    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(xyz[i], tags->size(), tags));
-  }
-  return true;
-}
-
-Status CreateICCChadTag(float chad[9], PaddedBytes* JXL_RESTRICT tags) {
-  WriteICCTag("sf32", tags->size(), tags);
-  WriteICCUint32(0, tags->size(), tags);
-  for (size_t i = 0; i < 9; i++) {
-    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(chad[i], tags->size(), tags));
-  }
-  return true;
-}
-
-void CreateICCCurvCurvTag(const std::vector<uint16_t>& curve,
-                          PaddedBytes* JXL_RESTRICT tags) {
-  size_t pos = tags->size();
-  tags->resize(tags->size() + 12 + curve.size() * 2, 0);
-  WriteICCTag("curv", pos, tags);
-  WriteICCUint32(0, pos + 4, tags);
-  WriteICCUint32(curve.size(), pos + 8, tags);
-  for (size_t i = 0; i < curve.size(); i++) {
-    WriteICCUint16(curve[i], pos + 12 + i * 2, tags);
-  }
-}
-
-Status CreateICCCurvParaTag(std::vector<float> params, size_t curve_type,
-                            PaddedBytes* JXL_RESTRICT tags) {
-  WriteICCTag("para", tags->size(), tags);
-  WriteICCUint32(0, tags->size(), tags);
-  WriteICCUint16(curve_type, tags->size(), tags);
-  WriteICCUint16(0, tags->size(), tags);
-  for (size_t i = 0; i < params.size(); i++) {
-    JXL_RETURN_IF_ERROR(WriteICCS15Fixed16(params[i], tags->size(), tags));
-  }
-  return true;
-}
-}  // namespace
-
-Status MaybeCreateProfile(const ColorEncoding& c,
-                          PaddedBytes* JXL_RESTRICT icc) {
-  PaddedBytes header, tagtable, tags;
-
-  if (c.GetColorSpace() == ColorSpace::kUnknown || c.tf.IsUnknown()) {
-    return false;  // Not an error
-  }
-
-  switch (c.GetColorSpace()) {
-    case ColorSpace::kRGB:
-    case ColorSpace::kGray:
-      break;  // OK
-    case ColorSpace::kXYB:
-      return JXL_FAILURE("XYB ICC not yet implemented");
-    default:
-      return JXL_FAILURE("Invalid CS %u",
-                         static_cast<unsigned int>(c.GetColorSpace()));
-  }
-
-  JXL_RETURN_IF_ERROR(CreateICCHeader(c, &header));
-
-  std::vector<size_t> offsets;
-  // tag count, deferred to later
-  WriteICCUint32(0, tagtable.size(), &tagtable);
-
-  size_t tag_offset = 0, tag_size = 0;
-
-  CreateICCMlucTag(Description(c), &tags);
-  FinalizeICCTag(&tags, &tag_offset, &tag_size);
-  AddToICCTagTable("desc", tag_offset, tag_size, &tagtable, &offsets);
-
-  const std::string copyright =
-      "Copyright 2019 Google LLC, CC-BY-SA 3.0 Unported "
-      "license(https://creativecommons.org/licenses/by-sa/3.0/legalcode)";
-  CreateICCMlucTag(copyright, &tags);
-  FinalizeICCTag(&tags, &tag_offset, &tag_size);
-  AddToICCTagTable("cprt", tag_offset, tag_size, &tagtable, &offsets);
-
-  // TODO(eustas): isn't it the other way round: gray image has d50 WhitePoint?
-  if (c.IsGray()) {
-    float wtpt[3];
-    JXL_RETURN_IF_ERROR(CIEXYZFromWhiteCIExy(c.GetWhitePoint(), wtpt));
-    JXL_RETURN_IF_ERROR(CreateICCXYZTag(wtpt, &tags));
-  } else {
-    float d50[3] = {0.964203, 1.0, 0.824905};
-    JXL_RETURN_IF_ERROR(CreateICCXYZTag(d50, &tags));
-  }
-  FinalizeICCTag(&tags, &tag_offset, &tag_size);
-  AddToICCTagTable("wtpt", tag_offset, tag_size, &tagtable, &offsets);
-
-  if (!c.IsGray()) {
-    // Chromatic adaptation matrix
-    float chad[9];
-    JXL_RETURN_IF_ERROR(CreateICCChadMatrix(c.GetWhitePoint(), chad));
-
-    const PrimariesCIExy primaries = c.GetPrimaries();
-    float m[9];
-    JXL_RETURN_IF_ERROR(CreateICCRGBMatrix(primaries.r, primaries.g,
-                                           primaries.b, c.GetWhitePoint(), m));
-    float r[3] = {m[0], m[3], m[6]};
-    float g[3] = {m[1], m[4], m[7]};
-    float b[3] = {m[2], m[5], m[8]};
-
-    JXL_RETURN_IF_ERROR(CreateICCChadTag(chad, &tags));
-    FinalizeICCTag(&tags, &tag_offset, &tag_size);
-    AddToICCTagTable("chad", tag_offset, tag_size, &tagtable, &offsets);
-
-    JXL_RETURN_IF_ERROR(CreateICCXYZTag(r, &tags));
-    FinalizeICCTag(&tags, &tag_offset, &tag_size);
-    AddToICCTagTable("rXYZ", tag_offset, tag_size, &tagtable, &offsets);
-
-    JXL_RETURN_IF_ERROR(CreateICCXYZTag(g, &tags));
-    FinalizeICCTag(&tags, &tag_offset, &tag_size);
-    AddToICCTagTable("gXYZ", tag_offset, tag_size, &tagtable, &offsets);
-
-    JXL_RETURN_IF_ERROR(CreateICCXYZTag(b, &tags));
-    FinalizeICCTag(&tags, &tag_offset, &tag_size);
-    AddToICCTagTable("bXYZ", tag_offset, tag_size, &tagtable, &offsets);
-  }
-
-  if (c.tf.IsGamma()) {
-    float gamma = 1.0 / c.tf.GetGamma();
-    JXL_RETURN_IF_ERROR(
-        CreateICCCurvParaTag({gamma, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
-  } else {
-    switch (c.tf.GetTransferFunction()) {
-      case TransferFunction::kHLG:
-        CreateICCCurvCurvTag(
-            HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kHLG), &tags);
-        break;
-      case TransferFunction::kPQ:
-        CreateICCCurvCurvTag(
-            HWY_DYNAMIC_DISPATCH(CreateTableCurve)(4096, ExtraTF::kPQ), &tags);
-        break;
-      case TransferFunction::kSRGB:
-        JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
-            {2.4, 1.0 / 1.055, 0.055 / 1.055, 1.0 / 12.92, 0.04045}, 3, &tags));
-        break;
-      case TransferFunction::k709:
-        JXL_RETURN_IF_ERROR(CreateICCCurvParaTag(
-            {1.0 / 0.45, 1.0 / 1.099, 0.099 / 1.099, 1.0 / 4.5, 0.081}, 3,
-            &tags));
-        break;
-      case TransferFunction::kLinear:
-        JXL_RETURN_IF_ERROR(
-            CreateICCCurvParaTag({1.0, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
-        break;
-      case TransferFunction::kDCI:
-        JXL_RETURN_IF_ERROR(
-            CreateICCCurvParaTag({2.6, 1.0, 0.0, 1.0, 0.0}, 3, &tags));
-        break;
-      default:
-        JXL_ABORT("Unknown TF %u",
-                  static_cast<unsigned int>(c.tf.GetTransferFunction()));
-    }
-  }
-  FinalizeICCTag(&tags, &tag_offset, &tag_size);
-  if (c.IsGray()) {
-    AddToICCTagTable("kTRC", tag_offset, tag_size, &tagtable, &offsets);
-  } else {
-    AddToICCTagTable("rTRC", tag_offset, tag_size, &tagtable, &offsets);
-    AddToICCTagTable("gTRC", tag_offset, tag_size, &tagtable, &offsets);
-    AddToICCTagTable("bTRC", tag_offset, tag_size, &tagtable, &offsets);
-  }
-
-  // Tag count
-  WriteICCUint32(offsets.size(), 0, &tagtable);
-  for (size_t i = 0; i < offsets.size(); i++) {
-    WriteICCUint32(offsets[i] + header.size() + tagtable.size(), 4 + 12 * i + 4,
-                   &tagtable);
-  }
-
-  // ICC profile size
-  WriteICCUint32(header.size() + tagtable.size() + tags.size(), 0, &header);
-
-  *icc = header;
-  icc->append(tagtable);
-  icc->append(tags);
-
-  // The MD5 checksum must be computed on the profile with profile flags,
-  // rendering intent, and region of the checksum itself, set to 0.
-  // TODO(lode): manually verify with a reliable tool that this creates correct
-  // signature (profile id) for ICC profiles.
-  PaddedBytes icc_sum = *icc;
-  if (icc_sum.size() >= 64 + 4) {
-    memset(icc_sum.data() + 44, 0, 4);
-    memset(icc_sum.data() + 64, 0, 4);
-  }
-  uint8_t checksum[16];
-  ICCComputeMD5(icc_sum, checksum);
-
-  memcpy(icc->data() + 84, checksum, sizeof(checksum));
-
-  return true;
-}
-
-}  // namespace jxl
-#endif  // HWY_ONCE
diff --git a/lib/jxl/color_management.h b/lib/jxl/color_management.h
deleted file mode 100644 (file)
index f728fe5..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_COLOR_MANAGEMENT_H_
-#define LIB_JXL_COLOR_MANAGEMENT_H_
-
-// ICC profiles and color space conversions.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <vector>
-
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/image.h"
-
-namespace jxl {
-
-enum class ExtraTF {
-  kNone,
-  kPQ,
-  kHLG,
-  kSRGB,
-};
-
-Status MaybeCreateProfile(const ColorEncoding& c,
-                          PaddedBytes* JXL_RESTRICT icc);
-
-Status CIEXYZFromWhiteCIExy(const CIExy& xy, float XYZ[3]);
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_COLOR_MANAGEMENT_H_
index 99382ca..c3a6640 100644 (file)
@@ -3,24 +3,28 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/color_management.h"
-
+#include <jxl/cms.h>
+#include <jxl/cms_interface.h>
 #include <stdint.h>
-#include <stdio.h>
 
 #include <algorithm>
+#include <cstdint>
 #include <new>
 #include <string>
 #include <utility>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/cms/color_encoding_cms.h"
+#include "lib/jxl/cms/opsin_params.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 
@@ -41,6 +45,50 @@ using ::testing::FloatNear;
 // Small enough to be fast. If changed, must update Generate*.
 static constexpr size_t kWidth = 16;
 
+static constexpr size_t kNumThreads = 1;  // only have a single row.
+
+MATCHER_P(HasSameFieldsAs, expected, "") {
+  if (arg.GetRenderingIntent() != expected.GetRenderingIntent()) {
+    *result_listener << "which has a different rendering intent: "
+                     << ToString(arg.GetRenderingIntent()) << " instead of "
+                     << ToString(expected.GetRenderingIntent());
+    return false;
+  }
+  if (arg.GetColorSpace() != expected.GetColorSpace()) {
+    *result_listener << "which has a different color space: "
+                     << ToString(arg.GetColorSpace()) << " instead of "
+                     << ToString(expected.GetColorSpace());
+    return false;
+  }
+  if (arg.GetWhitePointType() != expected.GetWhitePointType()) {
+    *result_listener << "which has a different white point: "
+                     << ToString(arg.GetWhitePointType()) << " instead of "
+                     << ToString(expected.GetWhitePointType());
+    return false;
+  }
+  if (arg.HasPrimaries() &&
+      arg.GetPrimariesType() != expected.GetPrimariesType()) {
+    *result_listener << "which has different primaries: "
+                     << ToString(arg.GetPrimariesType()) << " instead of "
+                     << ToString(expected.GetPrimariesType());
+    return false;
+  }
+  if (!arg.Tf().IsSame(expected.Tf())) {
+    static const auto tf_to_string =
+        [](const jxl::cms::CustomTransferFunction& tf) {
+          if (tf.have_gamma) {
+            return "g" + ToString(tf.GetGamma());
+          }
+          return ToString(tf.transfer_function);
+        };
+    *result_listener << "which has a different transfer function: "
+                     << tf_to_string(arg.Tf()) << " instead of "
+                     << tf_to_string(expected.Tf());
+    return false;
+  }
+  return true;
+}
+
 struct Globals {
   // TODO(deymo): Make this a const.
   static Globals* GetInstance() {
@@ -49,9 +97,7 @@ struct Globals {
   }
 
  private:
-  static constexpr size_t kNumThreads = 0;  // only have a single row.
-
-  Globals() : pool(kNumThreads) {
+  Globals() {
     in_gray = GenerateGray();
     in_color = GenerateColor();
     out_gray = ImageF(kWidth, 1);
@@ -101,8 +147,6 @@ struct Globals {
   }
 
  public:
-  ThreadPoolInternal pool;
-
   // ImageF so we can use VerifyRelativeError; all are interleaved RGB.
   ImageF in_gray;
   ImageF in_color;
@@ -115,30 +159,19 @@ struct Globals {
 class ColorManagementTest
     : public ::testing::TestWithParam<test::ColorEncodingDescriptor> {
  public:
-  static void VerifySameFields(const ColorEncoding& c,
-                               const ColorEncoding& c2) {
-    ASSERT_EQ(c.rendering_intent, c2.rendering_intent);
-    ASSERT_EQ(c.GetColorSpace(), c2.GetColorSpace());
-    ASSERT_EQ(c.white_point, c2.white_point);
-    if (c.HasPrimaries()) {
-      ASSERT_EQ(c.primaries, c2.primaries);
-    }
-    ASSERT_TRUE(c.tf.IsSame(c2.tf));
-  }
-
   // "Same" pixels after converting g->c_native -> c -> g->c_native.
   static void VerifyPixelRoundTrip(const ColorEncoding& c) {
     Globals* g = Globals::GetInstance();
     const ColorEncoding& c_native = c.IsGray() ? g->c_gray : g->c_native;
-    const JxlCmsInterface& cms = GetJxlCms();
+    const JxlCmsInterface& cms = *JxlGetDefaultCms();
     ColorSpaceTransform xform_fwd(cms);
     ColorSpaceTransform xform_rev(cms);
     const float intensity_target =
-        c.tf.IsHLG() ? 1000 : kDefaultIntensityTarget;
-    ASSERT_TRUE(xform_fwd.Init(c_native, c, intensity_target, kWidth,
-                               g->pool.NumThreads()));
-    ASSERT_TRUE(xform_rev.Init(c, c_native, intensity_target, kWidth,
-                               g->pool.NumThreads()));
+        c.Tf().IsHLG() ? 1000 : kDefaultIntensityTarget;
+    ASSERT_TRUE(
+        xform_fwd.Init(c_native, c, intensity_target, kWidth, kNumThreads));
+    ASSERT_TRUE(
+        xform_rev.Init(c, c_native, intensity_target, kWidth, kNumThreads));
 
     const size_t thread = 0;
     const ImageF& in = c.IsGray() ? g->in_gray : g->in_color;
@@ -146,16 +179,12 @@ class ColorManagementTest
     ASSERT_TRUE(xform_fwd.Run(thread, in.Row(0), xform_fwd.BufDst(thread)));
     ASSERT_TRUE(xform_rev.Run(thread, xform_fwd.BufDst(thread), out->Row(0)));
 
-#if JPEGXL_ENABLE_SKCMS
+    // With lcms2, this value is lower: 5E-5
     double max_l1 = 7E-4;
-    double max_rel = 4E-7;
-#else
-    double max_l1 = 5E-5;
     // Most are lower; reached 3E-7 with D60 AP0.
     double max_rel = 4E-7;
-#endif
     if (c.IsGray()) max_rel = 2E-5;
-    VerifyRelativeError(in, *out, max_l1, max_rel);
+    JXL_ASSERT_OK(VerifyRelativeError(in, *out, max_l1, max_rel, _));
   }
 };
 JXL_GTEST_INSTANTIATE_TEST_SUITE_P(ColorManagementTestInstantiation,
@@ -173,8 +202,8 @@ TEST_P(ColorManagementTest, VerifyAllProfiles) {
 
   // Can set an equivalent ColorEncoding from the generated ICC profile.
   ColorEncoding c3;
-  ASSERT_TRUE(c3.SetICC(PaddedBytes(c.ICC())));
-  VerifySameFields(c, c3);
+  ASSERT_TRUE(c3.SetICC(IccBytes(c.ICC()), JxlGetDefaultCms()));
+  EXPECT_THAT(c3, HasSameFieldsAs(c));
 
   VerifyPixelRoundTrip(c);
 }
@@ -203,9 +232,12 @@ TEST_F(ColorManagementTest, sRGBChromaticity) {
 }
 
 TEST_F(ColorManagementTest, D2700Chromaticity) {
-  PaddedBytes icc = ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  std::vector<uint8_t> icc_data =
+      jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  IccBytes icc;
+  Bytes(icc_data).AppendTo(&icc);
   ColorEncoding sRGB_D2700;
-  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc)));
+  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc), JxlGetDefaultCms()));
 
   EXPECT_THAT(sRGB_D2700.GetWhitePoint(), CIExyIs(0.45986, 0.41060));
   // The illuminant-relative chromaticities of this profile's primaries are the
@@ -217,11 +249,14 @@ TEST_F(ColorManagementTest, D2700Chromaticity) {
 }
 
 TEST_F(ColorManagementTest, D2700ToSRGB) {
-  PaddedBytes icc = ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  std::vector<uint8_t> icc_data =
+      jxl::test::ReadTestData("jxl/color_management/sRGB-D2700.icc");
+  IccBytes icc;
+  Bytes(icc_data).AppendTo(&icc);
   ColorEncoding sRGB_D2700;
-  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc)));
+  ASSERT_TRUE(sRGB_D2700.SetICC(std::move(icc), JxlGetDefaultCms()));
 
-  ColorSpaceTransform transform(GetJxlCms());
+  ColorSpaceTransform transform(*JxlGetDefaultCms());
   ASSERT_TRUE(transform.Init(sRGB_D2700, ColorEncoding::SRGB(),
                              kDefaultIntensityTarget, 1, 1));
   const float sRGB_D2700_values[3] = {0.863, 0.737, 0.490};
@@ -235,16 +270,16 @@ TEST_F(ColorManagementTest, D2700ToSRGB) {
 TEST_F(ColorManagementTest, P3HlgTo2020Hlg) {
   ColorEncoding p3_hlg;
   p3_hlg.SetColorSpace(ColorSpace::kRGB);
-  p3_hlg.white_point = WhitePoint::kD65;
-  p3_hlg.primaries = Primaries::kP3;
-  p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(p3_hlg.SetWhitePointType(WhitePoint::kD65));
+  ASSERT_TRUE(p3_hlg.SetPrimariesType(Primaries::kP3));
+  p3_hlg.Tf().SetTransferFunction(TransferFunction::kHLG);
   ASSERT_TRUE(p3_hlg.CreateICC());
 
   ColorEncoding rec2020_hlg = p3_hlg;
-  rec2020_hlg.primaries = Primaries::k2100;
+  ASSERT_TRUE(rec2020_hlg.SetPrimariesType(Primaries::k2100));
   ASSERT_TRUE(rec2020_hlg.CreateICC());
 
-  ColorSpaceTransform transform(GetJxlCms());
+  ColorSpaceTransform transform(*JxlGetDefaultCms());
   ASSERT_TRUE(transform.Init(p3_hlg, rec2020_hlg, 1000, 1, 1));
   const float p3_hlg_values[3] = {0., 0.75, 0.};
   float rec2020_hlg_values[3];
@@ -257,12 +292,12 @@ TEST_F(ColorManagementTest, P3HlgTo2020Hlg) {
 TEST_F(ColorManagementTest, HlgOotf) {
   ColorEncoding p3_hlg;
   p3_hlg.SetColorSpace(ColorSpace::kRGB);
-  p3_hlg.white_point = WhitePoint::kD65;
-  p3_hlg.primaries = Primaries::kP3;
-  p3_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(p3_hlg.SetWhitePointType(WhitePoint::kD65));
+  ASSERT_TRUE(p3_hlg.SetPrimariesType(Primaries::kP3));
+  p3_hlg.Tf().SetTransferFunction(TransferFunction::kHLG);
   ASSERT_TRUE(p3_hlg.CreateICC());
 
-  ColorSpaceTransform transform_to_1000(GetJxlCms());
+  ColorSpaceTransform transform_to_1000(*JxlGetDefaultCms());
   ASSERT_TRUE(
       transform_to_1000.Init(p3_hlg, ColorEncoding::LinearSRGB(), 1000, 1, 1));
   // HDR reference white: https://www.itu.int/pub/R-REP-BT.2408-4-2021
@@ -275,7 +310,7 @@ TEST_F(ColorManagementTest, HlgOotf) {
               ElementsAre(FloatNear(0.203, 1e-3), FloatNear(0.203, 1e-3),
                           FloatNear(0.203, 1e-3)));
 
-  ColorSpaceTransform transform_to_400(GetJxlCms());
+  ColorSpaceTransform transform_to_400(*JxlGetDefaultCms());
   ASSERT_TRUE(
       transform_to_400.Init(p3_hlg, ColorEncoding::LinearSRGB(), 400, 1, 1));
   ASSERT_TRUE(transform_to_400.Run(0, p3_hlg_values, linear_srgb_values));
@@ -290,7 +325,7 @@ TEST_F(ColorManagementTest, HlgOotf) {
               ElementsAre(FloatNear(0.201, 1e-3), FloatNear(0.201, 1e-3),
                           FloatNear(0.050, 1e-3)));
 
-  ColorSpaceTransform transform_from_400(GetJxlCms());
+  ColorSpaceTransform transform_from_400(*JxlGetDefaultCms());
   ASSERT_TRUE(
       transform_from_400.Init(ColorEncoding::LinearSRGB(), p3_hlg, 400, 1, 1));
   linear_srgb_values[0] = linear_srgb_values[1] = linear_srgb_values[2] = 0.250;
@@ -301,11 +336,11 @@ TEST_F(ColorManagementTest, HlgOotf) {
 
   ColorEncoding grayscale_hlg;
   grayscale_hlg.SetColorSpace(ColorSpace::kGray);
-  grayscale_hlg.white_point = WhitePoint::kD65;
-  grayscale_hlg.tf.SetTransferFunction(TransferFunction::kHLG);
+  ASSERT_TRUE(grayscale_hlg.SetWhitePointType(WhitePoint::kD65));
+  grayscale_hlg.Tf().SetTransferFunction(TransferFunction::kHLG);
   ASSERT_TRUE(grayscale_hlg.CreateICC());
 
-  ColorSpaceTransform grayscale_transform(GetJxlCms());
+  ColorSpaceTransform grayscale_transform(*JxlGetDefaultCms());
   ASSERT_TRUE(grayscale_transform.Init(
       grayscale_hlg, ColorEncoding::LinearSRGB(/*is_gray=*/true), 1000, 1, 1));
   const float grayscale_hlg_value = 0.75;
@@ -315,5 +350,114 @@ TEST_F(ColorManagementTest, HlgOotf) {
   EXPECT_THAT(linear_grayscale_value, FloatNear(0.203, 1e-3));
 }
 
+TEST_F(ColorManagementTest, XYBProfile) {
+  ColorEncoding c_xyb;
+  c_xyb.SetColorSpace(ColorSpace::kXYB);
+  c_xyb.SetRenderingIntent(RenderingIntent::kPerceptual);
+  ASSERT_TRUE(c_xyb.CreateICC());
+  ColorEncoding c_native = ColorEncoding::LinearSRGB(false);
+
+  static const size_t kGridDim = 17;
+  static const size_t kNumColors = kGridDim * kGridDim * kGridDim;
+  const JxlCmsInterface& cms = *JxlGetDefaultCms();
+  ColorSpaceTransform xform(cms);
+  ASSERT_TRUE(
+      xform.Init(c_xyb, c_native, kDefaultIntensityTarget, kNumColors, 1));
+
+  ImageMetadata metadata;
+  metadata.color_encoding = c_native;
+  ImageBundle ib(&metadata);
+  Image3F native(kNumColors, 1);
+  float mul = 1.0f / (kGridDim - 1);
+  for (size_t ir = 0, x = 0; ir < kGridDim; ++ir) {
+    for (size_t ig = 0; ig < kGridDim; ++ig) {
+      for (size_t ib = 0; ib < kGridDim; ++ib, ++x) {
+        native.PlaneRow(0, 0)[x] = ir * mul;
+        native.PlaneRow(1, 0)[x] = ig * mul;
+        native.PlaneRow(2, 0)[x] = ib * mul;
+      }
+    }
+  }
+  ib.SetFromImage(std::move(native), c_native);
+  const Image3F& in = *ib.color();
+  Image3F opsin(kNumColors, 1);
+  ToXYB(ib, nullptr, &opsin, cms, nullptr);
+
+  Image3F opsin2(kNumColors, 1);
+  CopyImageTo(opsin, &opsin2);
+  ScaleXYB(&opsin2);
+
+  float* src = xform.BufSrc(0);
+  for (size_t i = 0; i < kNumColors; ++i) {
+    for (size_t c = 0; c < 3; ++c) {
+      src[3 * i + c] = opsin2.PlaneRow(c, 0)[i];
+    }
+  }
+
+  float* dst = xform.BufDst(0);
+  ASSERT_TRUE(xform.Run(0, src, dst));
+
+  Image3F out(kNumColors, 1);
+  for (size_t i = 0; i < kNumColors; ++i) {
+    for (size_t c = 0; c < 3; ++c) {
+      out.PlaneRow(c, 0)[i] = dst[3 * i + c];
+    }
+  }
+
+  auto debug_print_color = [&](size_t i) {
+    printf(
+        "(%f, %f, %f) -> (%9.6f, %f, %f) -> (%f, %f, %f) -> "
+        "(%9.6f, %9.6f, %9.6f)",
+        in.PlaneRow(0, 0)[i], in.PlaneRow(1, 0)[i], in.PlaneRow(2, 0)[i],
+        opsin.PlaneRow(0, 0)[i], opsin.PlaneRow(1, 0)[i],
+        opsin.PlaneRow(2, 0)[i], opsin2.PlaneRow(0, 0)[i],
+        opsin2.PlaneRow(1, 0)[i], opsin2.PlaneRow(2, 0)[i],
+        out.PlaneRow(0, 0)[i], out.PlaneRow(1, 0)[i], out.PlaneRow(2, 0)[i]);
+  };
+
+  float max_err[3] = {};
+  size_t max_err_i[3] = {};
+  for (size_t i = 0; i < kNumColors; ++i) {
+    for (size_t c = 0; c < 3; ++c) {
+      // debug_print_color(i); printf("\n");
+      float err = std::abs(in.PlaneRow(c, 0)[i] - out.PlaneRow(c, 0)[i]);
+      if (err > max_err[c]) {
+        max_err[c] = err;
+        max_err_i[c] = i;
+      }
+    }
+  }
+  static float kMaxError[3] = {9e-4, 4e-4, 5e-4};
+  printf("Maximum errors:\n");
+  for (size_t c = 0; c < 3; ++c) {
+    debug_print_color(max_err_i[c]);
+    printf("    %f\n", max_err[c]);
+    EXPECT_LT(max_err[c], kMaxError[c]);
+  }
+}
+
+TEST_F(ColorManagementTest, GoldenXYBCube) {
+  std::vector<int32_t> actual;
+  const jxl::cms::ColorCube3D& cube = jxl::cms::UnscaledA2BCube();
+  for (size_t ix = 0; ix < 2; ++ix) {
+    for (size_t iy = 0; iy < 2; ++iy) {
+      for (size_t ib = 0; ib < 2; ++ib) {
+        const jxl::cms::ColorCube0D& out_f = cube[ix][iy][ib];
+        for (int i = 0; i < 3; ++i) {
+          int32_t val = static_cast<int32_t>(0.5f + 65535 * out_f[i]);
+          ASSERT_TRUE(val >= 0 && val <= 65535);
+          actual.push_back(val);
+        }
+      }
+    }
+  }
+
+  std::vector<int32_t> expected = {0,     3206,  0,     0,     3206,  28873,
+                                   62329, 65535, 36662, 62329, 65535, 65535,
+                                   3206,  0,     0,     3206,  0,     28873,
+                                   65535, 62329, 36662, 65535, 62329, 65535};
+  EXPECT_EQ(actual, expected);
+}
+
 }  // namespace
 }  // namespace jxl
index b213c8d..d619711 100644 (file)
@@ -6,18 +6,9 @@
 #ifndef LIB_JXL_COMMON_H_
 #define LIB_JXL_COMMON_H_
 
-// Shared constants and helper functions.
+// Shared constants.
 
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdio.h>
-
-#include <limits>  // numeric_limits
-#include <memory>  // unique_ptr
-#include <string>
-
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
+#include <cstddef>
 
 #ifndef JXL_HIGH_PRECISION
 #define JXL_HIGH_PRECISION 1
 #define JPEGXL_ENABLE_TRANSCODE_JPEG 1
 #endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
 
+// Macro that defines whether support for decoding boxes is enabled.
+#ifndef JPEGXL_ENABLE_BOXES
+#define JPEGXL_ENABLE_BOXES 1
+#endif  // JPEGXL_ENABLE_BOXES
+
 namespace jxl {
 // Some enums and typedefs used by more than one header file.
 
-constexpr size_t kBitsPerByte = 8;  // more clear than CHAR_BIT
-
-constexpr inline size_t RoundUpBitsToByteMultiple(size_t bits) {
-  return (bits + 7) & ~size_t(7);
-}
-
-constexpr inline size_t RoundUpToBlockDim(size_t dim) {
-  return (dim + 7) & ~size_t(7);
-}
-
-static inline bool JXL_MAYBE_UNUSED SafeAdd(const uint64_t a, const uint64_t b,
-                                            uint64_t& sum) {
-  sum = a + b;
-  return sum >= a;  // no need to check b - either sum >= both or < both.
-}
-
-template <typename T1, typename T2>
-constexpr inline T1 DivCeil(T1 a, T2 b) {
-  return (a + b - 1) / b;
-}
-
-// Works for any `align`; if a power of two, compiler emits ADD+AND.
-constexpr inline size_t RoundUpTo(size_t what, size_t align) {
-  return DivCeil(what, align) * align;
-}
-
-constexpr double kPi = 3.14159265358979323846264338327950288;
-
-// Reasonable default for sRGB, matches common monitors. We map white to this
-// many nits (cd/m^2) by default. Butteraugli was tuned for 250 nits, which is
-// very close.
-static constexpr float kDefaultIntensityTarget = 255;
-
-template <typename T>
-constexpr T Pi(T multiplier) {
-  return static_cast<T>(multiplier * kPi);
-}
-
-// Block is the square grid of pixels to which an "energy compaction"
-// transformation (e.g. DCT) is applied. Each block has its own AC quantizer.
-constexpr size_t kBlockDim = 8;
-
-constexpr size_t kDCTBlockSize = kBlockDim * kBlockDim;
-
-constexpr size_t kGroupDim = 256;
-static_assert(kGroupDim % kBlockDim == 0,
-              "Group dim should be divisible by block dim");
-constexpr size_t kGroupDimInBlocks = kGroupDim / kBlockDim;
-
 // Maximum number of passes in an image.
 constexpr size_t kMaxNumPasses = 11;
 
 // Maximum number of reference frames.
 constexpr size_t kMaxNumReferenceFrames = 4;
 
-// Dimensions of a frame, in pixels, and other derived dimensions.
-// Computed from FrameHeader.
-// TODO(veluca): add extra channels.
-struct FrameDimensions {
-  void Set(size_t xsize, size_t ysize, size_t group_size_shift,
-           size_t max_hshift, size_t max_vshift, bool modular_mode,
-           size_t upsampling) {
-    group_dim = (kGroupDim >> 1) << group_size_shift;
-    dc_group_dim = group_dim * kBlockDim;
-    xsize_upsampled = xsize;
-    ysize_upsampled = ysize;
-    this->xsize = DivCeil(xsize, upsampling);
-    this->ysize = DivCeil(ysize, upsampling);
-    xsize_blocks = DivCeil(this->xsize, kBlockDim << max_hshift) << max_hshift;
-    ysize_blocks = DivCeil(this->ysize, kBlockDim << max_vshift) << max_vshift;
-    xsize_padded = xsize_blocks * kBlockDim;
-    ysize_padded = ysize_blocks * kBlockDim;
-    if (modular_mode) {
-      // Modular mode doesn't have any padding.
-      xsize_padded = this->xsize;
-      ysize_padded = this->ysize;
-    }
-    xsize_upsampled_padded = xsize_padded * upsampling;
-    ysize_upsampled_padded = ysize_padded * upsampling;
-    xsize_groups = DivCeil(this->xsize, group_dim);
-    ysize_groups = DivCeil(this->ysize, group_dim);
-    xsize_dc_groups = DivCeil(xsize_blocks, group_dim);
-    ysize_dc_groups = DivCeil(ysize_blocks, group_dim);
-    num_groups = xsize_groups * ysize_groups;
-    num_dc_groups = xsize_dc_groups * ysize_dc_groups;
-  }
-
-  // Image size without any upsampling, i.e. original_size / upsampling.
-  size_t xsize;
-  size_t ysize;
-  // Original image size.
-  size_t xsize_upsampled;
-  size_t ysize_upsampled;
-  // Image size after upsampling the padded image.
-  size_t xsize_upsampled_padded;
-  size_t ysize_upsampled_padded;
-  // Image size after padding to a multiple of kBlockDim (if VarDCT mode).
-  size_t xsize_padded;
-  size_t ysize_padded;
-  // Image size in kBlockDim blocks.
-  size_t xsize_blocks;
-  size_t ysize_blocks;
-  // Image size in number of groups.
-  size_t xsize_groups;
-  size_t ysize_groups;
-  // Image size in number of DC groups.
-  size_t xsize_dc_groups;
-  size_t ysize_dc_groups;
-  // Number of AC or DC groups.
-  size_t num_groups;
-  size_t num_dc_groups;
-  // Size of a group.
-  size_t group_dim;
-  size_t dc_group_dim;
-};
-
-// Prior to C++14 (i.e. C++11): provide our own make_unique
-#if __cplusplus < 201402L
-template <typename T, typename... Args>
-std::unique_ptr<T> make_unique(Args&&... args) {
-  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
-}
-#else
-using std::make_unique;
-#endif
-
-template <typename T>
-JXL_INLINE T Clamp1(T val, T low, T hi) {
-  return val < low ? low : val > hi ? hi : val;
-}
-
-// Encodes non-negative (X) into (2 * X), negative (-X) into (2 * X - 1)
-constexpr uint32_t PackSigned(int32_t value)
-    JXL_NO_SANITIZE("unsigned-integer-overflow") {
-  return (static_cast<uint32_t>(value) << 1) ^
-         ((static_cast<uint32_t>(~value) >> 31) - 1);
-}
-
-// Reverse to PackSigned, i.e. UnpackSigned(PackSigned(X)) == X.
-// (((~value) & 1) - 1) is either 0 or 0xFF...FF and it will have an expected
-// unsigned-integer-overflow.
-constexpr intptr_t UnpackSigned(size_t value)
-    JXL_NO_SANITIZE("unsigned-integer-overflow") {
-  return static_cast<intptr_t>((value >> 1) ^ (((~value) & 1) - 1));
-}
-
-// conversion from integer to string.
-template <typename T>
-std::string ToString(T n) {
-  char data[32] = {};
-  if (T(0.1) != T(0)) {
-    // float
-    snprintf(data, sizeof(data), "%g", static_cast<double>(n));
-  } else if (T(-1) > T(0)) {
-    // unsigned
-    snprintf(data, sizeof(data), "%llu", static_cast<unsigned long long>(n));
-  } else {
-    // signed
-    snprintf(data, sizeof(data), "%lld", static_cast<long long>(n));
-  }
-  return data;
-}
-
-namespace {
-static inline uint64_t DecodeVarInt(const uint8_t* input, size_t inputSize,
-                                    size_t* pos) {
-  size_t i;
-  uint64_t ret = 0;
-  for (i = 0; *pos + i < inputSize && i < 10; ++i) {
-    ret |= uint64_t(input[*pos + i] & 127) << uint64_t(7 * i);
-    // If the next-byte flag is not set, stop
-    if ((input[*pos + i] & 128) == 0) break;
-  }
-  // TODO: Return a decoding error if i == 10.
-  *pos += i + 1;
-  return ret;
-}
-
-static inline bool EncodeVarInt(uint64_t value, size_t output_size,
-                                size_t* output_pos, uint8_t* output) {
-  // While more than 7 bits of data are left,
-  // store 7 bits and set the next byte flag
-  while (value > 127) {
-    if (*output_pos > output_size) return false;
-    // |128: Set the next byte flag
-    output[(*output_pos)++] = ((uint8_t)(value & 127)) | 128;
-    // Remove the seven bits we just wrote
-    value >>= 7;
-  }
-  if (*output_pos > output_size) return false;
-  output[(*output_pos)++] = ((uint8_t)value) & 127;
-  return true;
-}
-
-static inline void EncodeVarInt(uint64_t value, PaddedBytes* data) {
-  size_t pos = data->size();
-  data->resize(data->size() + 9);
-  JXL_CHECK(EncodeVarInt(value, data->size(), &pos, data->data()));
-  data->resize(pos);
-}
-}  // namespace
-
 }  // namespace jxl
 
 #endif  // LIB_JXL_COMMON_H_
index 3b2c323..b21b1da 100644 (file)
 
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_cache.h"
@@ -147,8 +142,6 @@ void AdaptiveDCSmoothing(const float* dc_factors, Image3F* dc,
   // the x and b channels through color correlation.
   JXL_ASSERT(w1 + w2 < 0.25f);
 
-  PROFILER_FUNC;
-
   Image3F smoothed(xsize, ysize);
   // Fill in borders that the loop below will not. First and last are unused.
   for (size_t c = 0; c < 3; c++) {
index 054c9c6..cd79153 100644 (file)
@@ -12,7 +12,6 @@
 
 #include <hwy/highway.h>
 
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/image_ops.h"
 
@@ -179,7 +178,6 @@ class ConvolveT {
   template <class Image, class Weights>
   static void Run(const Image& in, const Rect& rect, const Weights& weights,
                   ThreadPool* pool, Image* out) {
-    PROFILER_ZONE("ConvolveT::Run");
     JXL_CHECK(SameSize(rect, *out));
     JXL_CHECK(rect.xsize() >= MinWidth());
 
index fffe5f7..91e11dc 100644 (file)
@@ -114,8 +114,6 @@ void SlowSymmetric3Row(const ImageF& in, const int64_t iy, const int64_t xsize,
 void SlowSymmetric3(const ImageF& in, const Rect& rect,
                     const WeightsSymmetric3& weights, ThreadPool* pool,
                     ImageF* JXL_RESTRICT out) {
-  PROFILER_FUNC;
-
   const int64_t xsize = static_cast<int64_t>(rect.xsize());
   const int64_t ysize = static_cast<int64_t>(rect.ysize());
   const int64_t kRadius = 1;
@@ -168,7 +166,6 @@ float SlowSeparablePixel(const ImageF& in, const Rect& rect, const int64_t x,
 void SlowSeparable5(const ImageF& in, const Rect& rect,
                     const WeightsSeparable5& weights, ThreadPool* pool,
                     ImageF* out) {
-  PROFILER_FUNC;
   const float* horz_weights = &weights.horz[0];
   const float* vert_weights = &weights.vert[0];
 
@@ -190,7 +187,6 @@ void SlowSeparable5(const ImageF& in, const Rect& rect,
 void SlowSeparable7(const ImageF& in, const Rect& rect,
                     const WeightsSeparable7& weights, ThreadPool* pool,
                     ImageF* out) {
-  PROFILER_FUNC;
   const float* horz_weights = &weights.horz[0];
   const float* vert_weights = &weights.vert[0];
 
index 55a1689..5636a82 100644 (file)
@@ -10,7 +10,7 @@
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/common.h"  // RoundUpTo
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/convolve-inl.h"
 
 HWY_BEFORE_NAMESPACE();
@@ -149,8 +149,6 @@ static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect,
 void Symmetric5(const ImageF& in, const Rect& rect,
                 const WeightsSymmetric5& weights, ThreadPool* pool,
                 ImageF* JXL_RESTRICT out) {
-  PROFILER_FUNC;
-
   const size_t ysize = rect.ysize();
   JXL_CHECK(RunOnPool(
       pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
index 2d75c31..49fe846 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 #include <hwy/nanobenchmark.h>
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 #include <vector>
 
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/base/random.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 #ifndef JXL_DEBUG_CONVOLVE
 #define JXL_DEBUG_CONVOLVE 0
@@ -35,24 +37,27 @@ namespace HWY_NAMESPACE {
 void TestNeighbors() {
   const Neighbors::D d;
   const Neighbors::V v = Iota(d, 0);
-  HWY_ALIGN float actual[hwy::kTestMaxVectorSize / sizeof(float)] = {0};
+  constexpr size_t kMaxVectorSize = 64;
+  constexpr size_t M = kMaxVectorSize / sizeof(float);
+  HWY_ALIGN float actual[M] = {0};
 
-  HWY_ALIGN float first_l1[hwy::kTestMaxVectorSize / sizeof(float)] = {
-      0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14};
+  HWY_ALIGN float first_l1[M] = {0, 0, 1, 2,  3,  4,  5,  6,
+                                 7, 8, 9, 10, 11, 12, 13, 14};
   Store(Neighbors::FirstL1(v), d, actual);
   const size_t N = Lanes(d);
+  ASSERT_LE(N, M);
   EXPECT_EQ(std::vector<float>(first_l1, first_l1 + N),
             std::vector<float>(actual, actual + N));
 
 #if HWY_TARGET != HWY_SCALAR
-  HWY_ALIGN float first_l2[hwy::kTestMaxVectorSize / sizeof(float)] = {
-      1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13};
+  HWY_ALIGN float first_l2[M] = {1, 0, 0, 1, 2,  3,  4,  5,
+                                 6, 7, 8, 9, 10, 11, 12, 13};
   Store(Neighbors::FirstL2(v), d, actual);
   EXPECT_EQ(std::vector<float>(first_l2, first_l2 + N),
             std::vector<float>(actual, actual + N));
 
-  HWY_ALIGN float first_l3[hwy::kTestMaxVectorSize / sizeof(float)] = {
-      2, 1, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+  HWY_ALIGN float first_l3[] = {2, 1, 0, 0, 1, 2,  3,  4,
+                                5, 6, 7, 8, 9, 10, 11, 12};
   Store(Neighbors::FirstL3(v), d, actual);
   EXPECT_EQ(std::vector<float>(first_l3, first_l3 + N),
             std::vector<float>(actual, actual + N));
@@ -73,7 +78,7 @@ void VerifySymmetric3(const size_t xsize, const size_t ysize, ThreadPool* pool,
   Symmetric3(in, rect, weights, pool, &out_expected);
   SlowSymmetric3(in, rect, weights, pool, &out_actual);
 
-  VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f);
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }
 
 // Ensures Symmetric and Separable give the same result.
@@ -90,7 +95,7 @@ void VerifySymmetric5(const size_t xsize, const size_t ysize, ThreadPool* pool,
   Separable5(in, Rect(in), WeightsSeparable5Lowpass(), pool, &out_expected);
   Symmetric5(in, rect, WeightsSymmetric5Lowpass(), pool, &out_actual);
 
-  VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f);
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }
 
 void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool,
@@ -107,7 +112,7 @@ void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool,
   Separable5(in, Rect(in), weights, pool, &out_expected);
   SlowSeparable5(in, rect, weights, pool, &out_actual);
 
-  VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f);
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }
 
 void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool,
@@ -129,14 +134,14 @@ void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool,
   SlowSeparable7(in, rect, weights, pool, &out_expected);
   Separable7(in, Rect(in), weights, pool, &out_actual);
 
-  VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f);
+  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }
 
 // For all xsize/ysize and kernels:
 void TestConvolve() {
   TestNeighbors();
 
-  ThreadPoolInternal pool(4);
+  test::ThreadPoolForTests pool(4);
   EXPECT_EQ(true,
             RunOnPool(
                 &pool, kConvolveMaxRadius, 40, ThreadPool::NoInit,
@@ -145,7 +150,7 @@ void TestConvolve() {
                   Rng rng(129 + 13 * xsize);
 
                   ThreadPool* null_pool = nullptr;
-                  ThreadPoolInternal pool3(3);
+                  test::ThreadPoolForTests pool3(3);
                   for (size_t ysize = kConvolveMaxRadius; ysize < 16; ++ysize) {
                     JXL_DEBUG(JXL_DEBUG_CONVOLVE,
                               "%" PRIuS " x %" PRIuS " (target %" PRIx64
index dd6ea62..ee2a97f 100644 (file)
@@ -5,9 +5,8 @@
 
 #include "lib/jxl/base/data_parallel.h"
 
-#include "gtest/gtest.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -47,7 +46,6 @@ class DataParallelTest : public ::testing::Test {
 
 // JxlParallelRunInit interface.
 typedef int (*JxlParallelRunInit)();
-int TestInit(void* jpegxl_opaque, size_t num_threads) { return 0; }
 
 }  // namespace
 
index 5326060..cb6c54b 100644 (file)
@@ -154,12 +154,12 @@ struct DCT1DImpl;
 
 template <size_t SZ>
 struct DCT1DImpl<1, SZ> {
-  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem, float*) {}
 };
 
 template <size_t SZ>
 struct DCT1DImpl<2, SZ> {
-  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
+  JXL_INLINE void operator()(float* JXL_RESTRICT mem, float*) {
     auto in1 = Load(FV<SZ>(), mem);
     auto in2 = Load(FV<SZ>(), mem + SZ);
     Store(Add(in1, in2), FV<SZ>(), mem);
@@ -169,14 +169,12 @@ struct DCT1DImpl<2, SZ> {
 
 template <size_t N, size_t SZ>
 struct DCT1DImpl {
-  void operator()(float* JXL_RESTRICT mem) {
-    // This is relatively small (4kB with 64-DCT and AVX-512)
-    HWY_ALIGN float tmp[N * SZ];
+  void operator()(float* JXL_RESTRICT mem, float* JXL_RESTRICT tmp) {
     CoeffBundle<N / 2, SZ>::AddReverse(mem, mem + N / 2 * SZ, tmp);
-    DCT1DImpl<N / 2, SZ>()(tmp);
+    DCT1DImpl<N / 2, SZ>()(tmp, tmp + N * SZ);
     CoeffBundle<N / 2, SZ>::SubReverse(mem, mem + N / 2 * SZ, tmp + N / 2 * SZ);
     CoeffBundle<N, SZ>::Multiply(tmp);
-    DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ);
+    DCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, tmp + N * SZ);
     CoeffBundle<N / 2, SZ>::B(tmp + N / 2 * SZ);
     CoeffBundle<N, SZ>::InverseEvenOdd(tmp, mem);
   }
@@ -188,7 +186,7 @@ struct IDCT1DImpl;
 template <size_t SZ>
 struct IDCT1DImpl<1, SZ> {
   JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
-                             size_t to_stride) {
+                             size_t to_stride, float* JXL_RESTRICT) {
     StoreU(LoadU(FV<SZ>(), from), FV<SZ>(), to);
   }
 };
@@ -196,7 +194,7 @@ struct IDCT1DImpl<1, SZ> {
 template <size_t SZ>
 struct IDCT1DImpl<2, SZ> {
   JXL_INLINE void operator()(const float* from, size_t from_stride, float* to,
-                             size_t to_stride) {
+                             size_t to_stride, float* JXL_RESTRICT) {
     JXL_DASSERT(from_stride >= SZ);
     JXL_DASSERT(to_stride >= SZ);
     auto in1 = LoadU(FV<SZ>(), from);
@@ -209,74 +207,79 @@ struct IDCT1DImpl<2, SZ> {
 template <size_t N, size_t SZ>
 struct IDCT1DImpl {
   void operator()(const float* from, size_t from_stride, float* to,
-                  size_t to_stride) {
+                  size_t to_stride, float* JXL_RESTRICT tmp) {
     JXL_DASSERT(from_stride >= SZ);
     JXL_DASSERT(to_stride >= SZ);
-    // This is relatively small (4kB with 64-DCT and AVX-512)
-    HWY_ALIGN float tmp[N * SZ];
     CoeffBundle<N, SZ>::ForwardEvenOdd(from, from_stride, tmp);
-    IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ);
+    IDCT1DImpl<N / 2, SZ>()(tmp, SZ, tmp, SZ, tmp + N * SZ);
     CoeffBundle<N / 2, SZ>::BTranspose(tmp + N / 2 * SZ);
-    IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ);
+    IDCT1DImpl<N / 2, SZ>()(tmp + N / 2 * SZ, SZ, tmp + N / 2 * SZ, SZ,
+                            tmp + N * SZ);
     CoeffBundle<N, SZ>::MultiplyAndAdd(tmp, to, to_stride);
   }
 };
 
 template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
-void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+void DCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
+                  float* JXL_RESTRICT tmp) {
   size_t M = M_or_0 != 0 ? M_or_0 : Mp;
   constexpr size_t SZ = MaxLanes(FV<M_or_0>());
-  HWY_ALIGN float tmp[N * SZ];
   for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
     // TODO(veluca): consider removing the temporary memory here (as is done in
     // IDCT), if it turns out that some compilers don't optimize away the loads
     // and this is performance-critical.
     CoeffBundle<N, SZ>::LoadFromBlock(from, i, tmp);
-    DCT1DImpl<N, SZ>()(tmp);
+    DCT1DImpl<N, SZ>()(tmp, tmp + N * SZ);
     CoeffBundle<N, SZ>::StoreToBlockAndScale(tmp, to, i);
   }
 }
 
 template <size_t N, size_t M_or_0, typename FromBlock, typename ToBlock>
-void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp) {
+void IDCT1DWrapper(const FromBlock& from, const ToBlock& to, size_t Mp,
+                   float* JXL_RESTRICT tmp) {
   size_t M = M_or_0 != 0 ? M_or_0 : Mp;
   constexpr size_t SZ = MaxLanes(FV<M_or_0>());
   for (size_t i = 0; i < M; i += Lanes(FV<M_or_0>())) {
     IDCT1DImpl<N, SZ>()(from.Address(0, i), from.Stride(), to.Address(0, i),
-                        to.Stride());
+                        to.Stride(), tmp);
   }
 }
 
 template <size_t N, size_t M, typename = void>
 struct DCT1D {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return DCT1DWrapper<N, M>(from, to, M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return DCT1DWrapper<N, M>(from, to, M, tmp);
   }
 };
 
 template <size_t N, size_t M>
 struct DCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return NoInlineWrapper(DCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
+                           tmp);
   }
 };
 
 template <size_t N, size_t M, typename = void>
 struct IDCT1D {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return IDCT1DWrapper<N, M>(from, to, M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return IDCT1DWrapper<N, M>(from, to, M, tmp);
   }
 };
 
 template <size_t N, size_t M>
 struct IDCT1D<N, M, typename std::enable_if<(M > MaxLanes(FV<0>()))>::type> {
   template <typename FromBlock, typename ToBlock>
-  void operator()(const FromBlock& from, const ToBlock& to) {
-    return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to,
-                           M);
+  void operator()(const FromBlock& from, const ToBlock& to,
+                  float* JXL_RESTRICT tmp) {
+    return NoInlineWrapper(IDCT1DWrapper<N, 0, FromBlock, ToBlock>, from, to, M,
+                           tmp);
   }
 };
 
@@ -290,15 +293,16 @@ struct ComputeScaledDCT {
   HWY_MAYBE_UNUSED void operator()(const From& from, float* to,
                                    float* JXL_RESTRICT scratch_space) {
     float* JXL_RESTRICT block = scratch_space;
+    float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
     if (ROWS < COLS) {
-      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS));
+      DCT1D<ROWS, COLS>()(from, DCTTo(block, COLS), tmp);
       Transpose<ROWS, COLS>::Run(DCTFrom(block, COLS), DCTTo(to, ROWS));
-      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(to, ROWS), DCTTo(block, ROWS), tmp);
       Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(to, COLS));
     } else {
-      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS));
+      DCT1D<ROWS, COLS>()(from, DCTTo(to, COLS), tmp);
       Transpose<ROWS, COLS>::Run(DCTFrom(to, COLS), DCTTo(block, ROWS));
-      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS));
+      DCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(to, ROWS), tmp);
     }
   }
 };
@@ -312,16 +316,17 @@ struct ComputeScaledIDCT {
   HWY_MAYBE_UNUSED void operator()(float* JXL_RESTRICT from, const To& to,
                                    float* JXL_RESTRICT scratch_space) {
     float* JXL_RESTRICT block = scratch_space;
+    float* JXL_RESTRICT tmp = scratch_space + ROWS * COLS;
     // Reverse the steps done in ComputeScaledDCT.
     if (ROWS < COLS) {
       Transpose<ROWS, COLS>::Run(DCTFrom(from, COLS), DCTTo(block, ROWS));
-      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS));
+      IDCT1D<COLS, ROWS>()(DCTFrom(block, ROWS), DCTTo(from, ROWS), tmp);
       Transpose<COLS, ROWS>::Run(DCTFrom(from, ROWS), DCTTo(block, COLS));
-      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to);
+      IDCT1D<ROWS, COLS>()(DCTFrom(block, COLS), to, tmp);
     } else {
-      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS));
+      IDCT1D<COLS, ROWS>()(DCTFrom(from, ROWS), DCTTo(block, ROWS), tmp);
       Transpose<COLS, ROWS>::Run(DCTFrom(block, ROWS), DCTTo(from, COLS));
-      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to);
+      IDCT1D<ROWS, COLS>()(DCTFrom(from, COLS), to, tmp);
     }
   }
 };
index 8e32aa7..58dd75e 100644 (file)
@@ -13,7 +13,7 @@
 #include <cmath>
 #include <vector>
 
-#include "lib/jxl/common.h"  // Pi
+#include "lib/jxl/base/common.h"
 
 namespace jxl {
 
index 8c9bc27..e4982e2 100644 (file)
 #define HWY_TARGET_INCLUDE "lib/jxl/dct_test.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct-inl.h"
 #include "lib/jxl/dct_for_test.h"
 #include "lib/jxl/dct_scales.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -34,7 +33,7 @@ namespace HWY_NAMESPACE {
 template <size_t N>
 void ComputeDCT(float block[N * N]) {
   HWY_ALIGN float tmp_block[N * N];
-  HWY_ALIGN float scratch_space[N * N];
+  HWY_ALIGN float scratch_space[4 * N * N];
   ComputeScaledDCT<N, N>()(DCTFrom(block, N), tmp_block, scratch_space);
 
   // Untranspose.
@@ -46,7 +45,7 @@ void ComputeDCT(float block[N * N]) {
 template <int N>
 void ComputeIDCT(float block[N * N]) {
   HWY_ALIGN float tmp_block[N * N];
-  HWY_ALIGN float scratch_space[N * N];
+  HWY_ALIGN float scratch_space[4 * N * N];
   // Untranspose.
   Transpose<N, N>::Run(DCTFrom(block, N), DCTTo(tmp_block, N));
 
@@ -89,6 +88,7 @@ void ColumnDctRoundtripT(float accuracy) {
   // regular 8x8 block transformation. On the bright side - we could check all
   // 8 basis vectors at once.
   HWY_ALIGN float block[kBlockSize];
+  HWY_ALIGN float scratch[3 * kBlockSize];
   DCTTo to(block, N);
   DCTFrom from(block, N);
   for (size_t i = 0; i < N; ++i) {
@@ -103,8 +103,8 @@ void ColumnDctRoundtripT(float accuracy) {
   DCTTo to_tmp(tmp, N);
   DCTFrom from_tmp(tmp, N);
 
-  DCT1D<N, N>()(from, to_tmp);
-  IDCT1D<N, N>()(from_tmp, to);
+  DCT1D<N, N>()(from, to_tmp, scratch);
+  IDCT1D<N, N>()(from_tmp, to, scratch);
 
   for (size_t i = 0; i < N; ++i) {
     for (size_t j = 0; j < N; ++j) {
@@ -157,7 +157,7 @@ void TestIdctAccuracy(float accuracy, size_t start = 0, size_t end = N * N) {
 
 template <size_t N>
 void TestInverseT(float accuracy) {
-  ThreadPoolInternal pool(N < 32 ? 0 : 8);
+  test::ThreadPoolForTests pool(N < 32 ? 0 : 8);
   enum { kBlockSize = N * N };
   EXPECT_TRUE(RunOnPool(
       &pool, 0, kBlockSize, ThreadPool::NoInit,
@@ -230,7 +230,7 @@ void TestRectInverseT(float accuracy) {
     HWY_ALIGN float out[kBlockSize] = {0.0f};
     x[i] = 1.0;
     HWY_ALIGN float coeffs[kBlockSize] = {0.0f};
-    HWY_ALIGN float scratch_space[kBlockSize * 2];
+    HWY_ALIGN float scratch_space[kBlockSize * 5];
 
     ComputeScaledDCT<ROWS, COLS>()(DCTFrom(x, COLS), coeffs, scratch_space);
     ComputeScaledIDCT<ROWS, COLS>()(coeffs, DCTTo(out, COLS), scratch_space);
@@ -264,7 +264,7 @@ void TestRectInverse() {
 template <size_t ROWS, size_t COLS>
 void TestRectTransposeT(float accuracy) {
   constexpr size_t kBlockSize = ROWS * COLS;
-  HWY_ALIGN float scratch_space[kBlockSize * 2];
+  HWY_ALIGN float scratch_space[kBlockSize * 5];
   for (size_t px = 0; px < COLS; ++px) {
     for (size_t py = 0; py < ROWS; ++py) {
       HWY_ALIGN float x1[kBlockSize] = {0.0f};
index fb6ce3b..2f29449 100644 (file)
@@ -11,7 +11,6 @@
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_ops.h"
 
index c914547..4268c1f 100644 (file)
@@ -13,9 +13,7 @@
 #include "lib/jxl/ans_params.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_context_map.h"
 #include "lib/jxl/fields.h"
 
@@ -329,7 +327,6 @@ void ANSCode::UpdateMaxNumBits(size_t ctx, size_t symbol) {
 
 Status DecodeHistograms(BitReader* br, size_t num_contexts, ANSCode* code,
                         std::vector<uint8_t>* context_map, bool disallow_lz77) {
-  PROFILER_FUNC;
   JXL_RETURN_IF_ERROR(Bundle::Read(br, &code->lz77));
   if (code->lz77.enabled) {
     num_contexts++;
index 0f44067..57faad2 100644 (file)
@@ -19,8 +19,8 @@
 #include "lib/jxl/ans_params.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/base/cache_aligned.h"
 #include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/cache_aligned.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_huffman.h"
 #include "lib/jxl/field_encodings.h"
@@ -276,9 +276,10 @@ class ANSSymbolReader {
   }
 
   // Takes a *clustered* idx. Can only use if HuffRleOnly() is true.
-  void ReadHybridUintClusteredHuffRleOnly(size_t ctx,
-                                          BitReader* JXL_RESTRICT br,
-                                          uint32_t* value, uint32_t* run) {
+  JXL_INLINE void ReadHybridUintClusteredHuffRleOnly(size_t ctx,
+                                                     BitReader* JXL_RESTRICT br,
+                                                     uint32_t* value,
+                                                     uint32_t* run) {
     JXL_DASSERT(HuffRleOnly());
     br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
     size_t token = ReadSymbolHuffWithoutRefill(ctx, br);
@@ -300,55 +301,97 @@ class ANSSymbolReader {
     if (configs[lz77_ctx_].split_token > 1) return false;
     return true;
   }
-
-  // Takes a *clustered* idx.
-  size_t ReadHybridUintClustered(size_t ctx, BitReader* JXL_RESTRICT br) {
-    if (JXL_UNLIKELY(num_to_copy_ > 0)) {
-      size_t ret = lz77_window_[(copy_pos_++) & kWindowMask];
-      num_to_copy_--;
-      lz77_window_[(num_decoded_++) & kWindowMask] = ret;
-      return ret;
+  bool UsesLZ77() { return lz77_window_ != nullptr; }
+
+  // Takes a *clustered* idx. Inlined, for use in hot paths.
+  template <bool uses_lz77>
+  JXL_INLINE size_t ReadHybridUintClusteredInlined(size_t ctx,
+                                                   BitReader* JXL_RESTRICT br) {
+    if (uses_lz77) {
+      if (JXL_UNLIKELY(num_to_copy_ > 0)) {
+        size_t ret = lz77_window_[(copy_pos_++) & kWindowMask];
+        num_to_copy_--;
+        lz77_window_[(num_decoded_++) & kWindowMask] = ret;
+        return ret;
+      }
     }
+
     br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
     size_t token = ReadSymbolWithoutRefill(ctx, br);
-    if (JXL_UNLIKELY(token >= lz77_threshold_)) {
-      num_to_copy_ =
-          ReadHybridUintConfig(lz77_length_uint_, token - lz77_threshold_, br) +
-          lz77_min_length_;
-      br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
-      // Distance code.
-      size_t token = ReadSymbolWithoutRefill(lz77_ctx_, br);
-      size_t distance = ReadHybridUintConfig(configs[lz77_ctx_], token, br);
-      if (JXL_LIKELY(distance < num_special_distances_)) {
-        distance = special_distances_[distance];
-      } else {
-        distance = distance + 1 - num_special_distances_;
-      }
-      if (JXL_UNLIKELY(distance > num_decoded_)) {
-        distance = num_decoded_;
-      }
-      if (JXL_UNLIKELY(distance > kWindowSize)) {
-        distance = kWindowSize;
-      }
-      copy_pos_ = num_decoded_ - distance;
-      if (JXL_UNLIKELY(distance == 0)) {
-        JXL_DASSERT(lz77_window_ != nullptr);
-        // distance 0 -> num_decoded_ == copy_pos_ == 0
-        size_t to_fill = std::min<size_t>(num_to_copy_, kWindowSize);
-        memset(lz77_window_, 0, to_fill * sizeof(lz77_window_[0]));
+    if (uses_lz77) {
+      if (JXL_UNLIKELY(token >= lz77_threshold_)) {
+        num_to_copy_ = ReadHybridUintConfig(lz77_length_uint_,
+                                            token - lz77_threshold_, br) +
+                       lz77_min_length_;
+        br->Refill();  // covers ReadSymbolWithoutRefill + PeekBits
+        // Distance code.
+        size_t token = ReadSymbolWithoutRefill(lz77_ctx_, br);
+        size_t distance = ReadHybridUintConfig(configs[lz77_ctx_], token, br);
+        if (JXL_LIKELY(distance < num_special_distances_)) {
+          distance = special_distances_[distance];
+        } else {
+          distance = distance + 1 - num_special_distances_;
+        }
+        if (JXL_UNLIKELY(distance > num_decoded_)) {
+          distance = num_decoded_;
+        }
+        if (JXL_UNLIKELY(distance > kWindowSize)) {
+          distance = kWindowSize;
+        }
+        copy_pos_ = num_decoded_ - distance;
+        if (JXL_UNLIKELY(distance == 0)) {
+          JXL_DASSERT(lz77_window_ != nullptr);
+          // distance 0 -> num_decoded_ == copy_pos_ == 0
+          size_t to_fill = std::min<size_t>(num_to_copy_, kWindowSize);
+          memset(lz77_window_, 0, to_fill * sizeof(lz77_window_[0]));
+        }
+        // TODO(eustas): overflow; mark BitReader as unhealthy
+        if (num_to_copy_ < lz77_min_length_) return 0;
+        // the code below is the same as doing this:
+        //        return ReadHybridUintClustered<uses_lz77>(ctx, br);
+        // but gcc doesn't like recursive inlining
+
+        size_t ret = lz77_window_[(copy_pos_++) & kWindowMask];
+        num_to_copy_--;
+        lz77_window_[(num_decoded_++) & kWindowMask] = ret;
+        return ret;
       }
-      // TODO(eustas): overflow; mark BitReader as unhealthy
-      if (num_to_copy_ < lz77_min_length_) return 0;
-      return ReadHybridUintClustered(ctx, br);  // will trigger a copy.
     }
     size_t ret = ReadHybridUintConfig(configs[ctx], token, br);
-    if (lz77_window_) lz77_window_[(num_decoded_++) & kWindowMask] = ret;
+    if (uses_lz77 && lz77_window_)
+      lz77_window_[(num_decoded_++) & kWindowMask] = ret;
     return ret;
   }
 
-  JXL_INLINE size_t ReadHybridUint(size_t ctx, BitReader* JXL_RESTRICT br,
-                                   const std::vector<uint8_t>& context_map) {
-    return ReadHybridUintClustered(context_map[ctx], br);
+  // same but not inlined
+  template <bool uses_lz77>
+  size_t ReadHybridUintClustered(size_t ctx, BitReader* JXL_RESTRICT br) {
+    return ReadHybridUintClusteredInlined<uses_lz77>(ctx, br);
+  }
+
+  // inlined only in the no-lz77 case
+  template <bool uses_lz77>
+  JXL_INLINE size_t
+  ReadHybridUintClusteredMaybeInlined(size_t ctx, BitReader* JXL_RESTRICT br) {
+    if (uses_lz77) {
+      return ReadHybridUintClustered<uses_lz77>(ctx, br);
+    } else {
+      return ReadHybridUintClusteredInlined<uses_lz77>(ctx, br);
+    }
+  }
+
+  // inlined, for use in hot paths
+  template <bool uses_lz77>
+  JXL_INLINE size_t
+  ReadHybridUintInlined(size_t ctx, BitReader* JXL_RESTRICT br,
+                        const std::vector<uint8_t>& context_map) {
+    return ReadHybridUintClustered<uses_lz77>(context_map[ctx], br);
+  }
+
+  // not inlined, for use in non-hot paths
+  size_t ReadHybridUint(size_t ctx, BitReader* JXL_RESTRICT br,
+                        const std::vector<uint8_t>& context_map) {
+    return ReadHybridUintClustered</*uses_lz77=*/true>(context_map[ctx], br);
   }
 
   // ctx is a *clustered* context!
index df70284..ea71d75 100644 (file)
 #endif
 
 #include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 
 namespace jxl {
 
@@ -228,7 +227,7 @@ class BitReader {
     JXL_ASSERT(TotalBitsConsumed() % kBitsPerByte == 0);
     const size_t offset = TotalBitsConsumed() / kBitsPerByte;  // no remainder
     JXL_ASSERT(offset <= TotalBytes());
-    return Span<const uint8_t>(first_byte_ + offset, TotalBytes() - offset);
+    return Bytes(first_byte_ + offset, TotalBytes() - offset);
   }
 
   // Returns whether all the bits read so far have been within the input bounds.
@@ -264,7 +263,6 @@ class BitReader {
  private:
   // Separate function avoids inlining this relatively cold code into callers.
   JXL_NOINLINE void BoundsCheckedRefill() {
-    PROFILER_FUNC;
     const uint8_t* end = end_minus_8_ + 8;
 
     // Read whole bytes until we have [56, 64) bits (same as LoadLE64)
index b819b51..d5e15bc 100644 (file)
@@ -6,8 +6,10 @@
 #include "lib/jxl/dec_cache.h"
 
 #include "lib/jxl/blending.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/render_pipeline/stage_blending.h"
 #include "lib/jxl/render_pipeline/stage_chroma_upsampling.h"
+#include "lib/jxl/render_pipeline/stage_cms.h"
 #include "lib/jxl/render_pipeline/stage_epf.h"
 #include "lib/jxl/render_pipeline/stage_from_linear.h"
 #include "lib/jxl/render_pipeline/stage_gaborish.h"
@@ -28,7 +30,7 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
                                            PipelineOptions options) {
   const FrameHeader& frame_header = shared->frame_header;
   size_t num_c = 3 + frame_header.nonserialized_metadata->m.num_extra_channels;
-  if ((frame_header.flags & FrameHeader::kNoise) != 0) {
+  if (options.render_noise && (frame_header.flags & FrameHeader::kNoise) != 0) {
     num_c += 3;
   }
 
@@ -110,8 +112,7 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
           CeilLog2Nonzero(frame_header.upsampling)));
     }
   }
-
-  if ((frame_header.flags & FrameHeader::kNoise) != 0) {
+  if (options.render_noise && (frame_header.flags & FrameHeader::kNoise) != 0) {
     builder.AddStage(GetConvolveNoiseStage(num_c - 3));
     builder.AddStage(GetAddNoiseStage(shared->image_features.noise_params,
                                       shared->cmap, num_c - 3));
@@ -138,29 +139,29 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
     }
   }
 
-  size_t width = options.coalescing
-                     ? frame_header.nonserialized_metadata->xsize()
-                     : shared->frame_dim.xsize_upsampled;
-  size_t height = options.coalescing
-                      ? frame_header.nonserialized_metadata->ysize()
-                      : shared->frame_dim.ysize_upsampled;
-
   if (fast_xyb_srgb8_conversion) {
+#if !JXL_HIGH_PRECISION
     JXL_ASSERT(!NeedsBlending(this));
     JXL_ASSERT(!frame_header.CanBeReferenced() ||
                frame_header.save_before_color_transform);
     JXL_ASSERT(!options.render_spotcolors ||
                !decoded->metadata()->Find(ExtraChannel::kSpotColor));
-    builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, rgb_stride, width,
-                                            height, rgb_output_is_rgba,
-                                            has_alpha, alpha_c));
+    bool is_rgba = (main_output.format.num_channels == 4);
+    uint8_t* rgb_output = reinterpret_cast<uint8_t*>(main_output.buffer);
+    builder.AddStage(GetFastXYBTosRGB8Stage(rgb_output, main_output.stride,
+                                            width, height, is_rgba, has_alpha,
+                                            alpha_c));
+#endif
   } else {
     bool linear = false;
     if (frame_header.color_transform == ColorTransform::kYCbCr) {
       builder.AddStage(GetYCbCrStage());
     } else if (frame_header.color_transform == ColorTransform::kXYB) {
-      builder.AddStage(GetXYBStage(output_encoding_info.opsin_params));
-      linear = true;
+      builder.AddStage(GetXYBStage(output_encoding_info));
+      if (output_encoding_info.color_encoding.GetColorSpace() !=
+          ColorSpace::kXYB) {
+        linear = true;
+      }
     }  // Nothing to do for kNone.
 
     if (options.coalescing && NeedsBlending(this)) {
@@ -200,29 +201,57 @@ Status PassesDecoderState::PreparePipeline(ImageBundle* decoded,
       if (!linear) {
         auto to_linear_stage = GetToLinearStage(output_encoding_info);
         if (!to_linear_stage) {
-          return JXL_FAILURE(
-              "attempting to perform tone mapping on colorspace not "
-              "convertible to linear");
+          if (!output_encoding_info.cms_set) {
+            return JXL_FAILURE("Cannot tonemap this colorspace without a CMS");
+          }
+          auto cms_stage = GetCmsStage(output_encoding_info);
+          if (cms_stage) {
+            builder.AddStage(std::move(cms_stage));
+          }
+        } else {
+          builder.AddStage(std::move(to_linear_stage));
         }
-        builder.AddStage(std::move(to_linear_stage));
         linear = true;
       }
       builder.AddStage(std::move(tone_mapping_stage));
     }
 
     if (linear) {
-      builder.AddStage(GetFromLinearStage(output_encoding_info));
+      const size_t channels_src =
+          (output_encoding_info.orig_color_encoding.IsCMYK()
+               ? 4
+               : output_encoding_info.orig_color_encoding.Channels());
+      const size_t channels_dst =
+          output_encoding_info.color_encoding.Channels();
+      bool mixing_color_and_grey = (channels_dst != channels_src);
+      if ((output_encoding_info.color_encoding_is_original) ||
+          (!output_encoding_info.cms_set) || mixing_color_and_grey) {
+        // in those cases we only need a linear stage in other cases we attempt
+        // to obtain an cms stage: the cases are
+        // - output_encoding_info.color_encoding_is_original: no cms stage
+        // needed because it would be a no-op
+        // - !output_encoding_info.cms_set: can't use the cms, so no point in
+        // trying to add a cms stage
+        // - mixing_color_and_grey: cms stage can't handle that
+        // TODO(firsching): remove "mixing_color_and_grey" condition after
+        // adding support for greyscale to cms stage.
+        builder.AddStage(GetFromLinearStage(output_encoding_info));
+      } else {
+        if (!output_encoding_info.linear_color_encoding.CreateICC()) {
+          return JXL_FAILURE("Failed to create ICC");
+        }
+        auto cms_stage = GetCmsStage(output_encoding_info);
+        if (cms_stage) {
+          builder.AddStage(std::move(cms_stage));
+        }
+      }
       linear = false;
     }
 
-    if (pixel_callback.IsPresent()) {
-      builder.AddStage(GetWriteToPixelCallbackStage(
-          pixel_callback, width, height, rgb_output_is_rgba, has_alpha,
-          unpremul_alpha, alpha_c));
-    } else if (rgb_output) {
-      builder.AddStage(GetWriteToU8Stage(rgb_output, rgb_stride, height,
-                                         rgb_output_is_rgba, has_alpha,
-                                         alpha_c));
+    if (main_output.callback.IsPresent() || main_output.buffer) {
+      builder.AddStage(GetWriteToOutputStage(main_output, width, height,
+                                             has_alpha, unpremul_alpha, alpha_c,
+                                             undo_orientation, extra_output));
     } else {
       builder.AddStage(GetWriteToImageBundleStage(
           decoded, output_encoding_info.color_encoding));
index 7105ba8..30c75bb 100644 (file)
@@ -6,17 +6,17 @@
 #ifndef LIB_JXL_DEC_CACHE_H_
 #define LIB_JXL_DEC_CACHE_H_
 
+#include <jxl/decode.h>
 #include <stdint.h>
 
 #include <atomic>
 #include <hwy/base.h>  // HWY_ALIGN_MAX
 
-#include "jxl/decode.h"
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/common.h"  // kMaxNumPasses
 #include "lib/jxl/coeff_order.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/convolve.h"
+#include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_group_border.h"
 #include "lib/jxl/dec_noise.h"
 #include "lib/jxl/image.h"
@@ -56,6 +56,20 @@ struct PixelCallback {
   void* init_opaque = nullptr;
 };
 
+struct ImageOutput {
+  // Pixel format of the output pixels, used for buffer and callback output.
+  JxlPixelFormat format;
+  // Output bit depth for unsigned data types, used for float to int conversion.
+  size_t bits_per_sample;
+  // Callback for line-by-line output.
+  PixelCallback callback;
+  // Pixel buffer for image output.
+  void* buffer;
+  size_t buffer_size;
+  // Length of a row of image_buffer in bytes (based on oriented width).
+  size_t stride;
+};
+
 // Per-frame decoder state. All the images here should be accessed through a
 // group rect (either with block units or pixel units).
 struct PassesDecoderState {
@@ -77,30 +91,22 @@ struct PassesDecoderState {
   // Sigma values for EPF.
   ImageF sigma;
 
-  // RGB8 output buffer. If not nullptr, image data will be written to this
-  // buffer instead of being written to the output ImageBundle. The image data
-  // is assumed to have the stride given by `rgb_stride`, hence row `i` starts
-  // at position `i * rgb_stride`.
-  uint8_t* rgb_output;
-  size_t rgb_stride = 0;
+  // Image dimensions before applying undo_orientation.
+  size_t width;
+  size_t height;
+  ImageOutput main_output;
+  std::vector<ImageOutput> extra_output;
 
   // Whether to use int16 float-XYB-to-uint8-srgb conversion.
   bool fast_xyb_srgb8_conversion;
 
-  // If true, rgb_output or callback output is RGBA using 4 instead of 3 bytes
-  // per pixel.
-  bool rgb_output_is_rgba;
   // If true, the RGBA output will be unpremultiplied before writing to the
-  // output callback (the output buffer case is handled in ConvertToExternal).
+  // output.
   bool unpremul_alpha;
 
-  // Callback for line-by-line output.
-  PixelCallback pixel_callback;
-
-  // Buffer of upsampling * kApplyImageFeaturesTileDim ones.
-  std::vector<float> opaque_alpha;
-  // One row per thread
-  std::vector<std::vector<float>> pixel_callback_rows;
+  // The render pipeline will apply this orientation to bring the image to the
+  // intended display orientation.
+  Orientation undo_orientation;
 
   // Used for seeding noise.
   size_t visible_frame_index = 0;
@@ -122,6 +128,7 @@ struct PassesDecoderState {
     bool use_slow_render_pipeline;
     bool coalescing;
     bool render_spotcolors;
+    bool render_noise;
   };
 
   Status PreparePipeline(ImageBundle* decoded, PipelineOptions options);
@@ -136,11 +143,14 @@ struct PassesDecoderState {
     b_dm_multiplier =
         std::pow(1 / (1.25f), shared->frame_header.b_qm_scale - 2.0f);
 
-    rgb_output = nullptr;
-    rgb_output_is_rgba = false;
-    unpremul_alpha = false;
+    main_output.callback = PixelCallback();
+    main_output.buffer = nullptr;
+    extra_output.clear();
+
     fast_xyb_srgb8_conversion = false;
-    pixel_callback = PixelCallback();
+    unpremul_alpha = false;
+    undo_orientation = Orientation::kIdentity;
+
     used_acs = 0;
 
     upsampler8x = GetUpsamplingStage(shared->metadata->transform_data, 0, 3);
@@ -179,8 +189,6 @@ struct PassesDecoderState {
 // for large images because we only initialize min(#threads, #groups) instances.
 struct GroupDecCache {
   void InitOnce(size_t num_passes, size_t used_acs) {
-    PROFILER_FUNC;
-
     for (size_t i = 0; i < num_passes; i++) {
       if (num_nzeroes[i].xsize() == 0) {
         // Allocate enough for a whole group - partial groups on the
@@ -204,7 +212,7 @@ struct GroupDecCache {
       max_block_area_ = max_block_area;
       // We need 3x float blocks for dequantized coefficients and 1x for scratch
       // space for transforms.
-      float_memory_ = hwy::AllocateAligned<float>(max_block_area_ * 4);
+      float_memory_ = hwy::AllocateAligned<float>(max_block_area_ * 7);
       // We need 3x int32 or int16 blocks for quantized coefficients.
       int32_memory_ = hwy::AllocateAligned<int32_t>(max_block_area_ * 3);
       int16_memory_ = hwy::AllocateAligned<int16_t>(max_block_area_ * 3);
index 93c59f7..2c93672 100644 (file)
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/inverse_mtf-inl.h"
 
 namespace jxl {
 
 namespace {
 
-void MoveToFront(uint8_t* v, uint8_t index) {
-  uint8_t value = v[index];
-  uint8_t i = index;
-  for (; i; --i) v[i] = v[i - 1];
-  v[0] = value;
-}
-
-void InverseMoveToFrontTransform(uint8_t* v, int v_len) {
-  uint8_t mtf[256];
-  int i;
-  for (i = 0; i < 256; ++i) {
-    mtf[i] = static_cast<uint8_t>(i);
-  }
-  for (i = 0; i < v_len; ++i) {
-    uint8_t index = v[i];
-    v[i] = mtf[index];
-    if (index) MoveToFront(mtf, index);
-  }
-}
-
 Status VerifyContextMap(const std::vector<uint8_t>& context_map,
                         const size_t num_htrees) {
   std::vector<bool> have_htree(num_htrees);
@@ -73,24 +54,27 @@ Status DecodeContextMap(std::vector<uint8_t>* context_map, size_t* num_htrees,
   } else {
     bool use_mtf = input->ReadFixedBits<1>();
     ANSCode code;
-    std::vector<uint8_t> dummy_ctx_map;
+    std::vector<uint8_t> sink_ctx_map;
     // Usage of LZ77 is disallowed if decoding only two symbols. This doesn't
     // make sense in non-malicious bitstreams, and could cause a stack overflow
     // in malicious bitstreams by making every context map require its own
     // context map.
     JXL_RETURN_IF_ERROR(
-        DecodeHistograms(input, 1, &code, &dummy_ctx_map,
+        DecodeHistograms(input, 1, &code, &sink_ctx_map,
                          /*disallow_lz77=*/context_map->size() <= 2));
     ANSSymbolReader reader(&code, input);
     size_t i = 0;
+    uint32_t maxsym = 0;
     while (i < context_map->size()) {
-      uint32_t sym = reader.ReadHybridUint(0, input, dummy_ctx_map);
-      if (sym >= kMaxClusters) {
-        return JXL_FAILURE("Invalid cluster ID");
-      }
+      uint32_t sym = reader.ReadHybridUintInlined</*uses_lz77=*/true>(
+          0, input, sink_ctx_map);
+      maxsym = sym > maxsym ? sym : maxsym;
       (*context_map)[i] = sym;
       i++;
     }
+    if (maxsym >= kMaxClusters) {
+      return JXL_FAILURE("Invalid cluster ID");
+    }
     if (!reader.CheckANSFinalState()) {
       return JXL_FAILURE("Invalid context map");
     }
index abf3ed4..37f7abf 100644 (file)
@@ -5,6 +5,7 @@
 
 #include "lib/jxl/dec_external_image.h"
 
+#include <jxl/types.h>
 #include <string.h>
 
 #include <algorithm>
 
 #include "lib/jxl/alpha.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/base/cache_aligned.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/sanitizers.h"
-#include "lib/jxl/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -34,6 +32,7 @@ namespace HWY_NAMESPACE {
 
 // These templates are not found via ADL.
 using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Mul;
 using hwy::HWY_NAMESPACE::NearestInt;
 
 // TODO(jon): check if this can be replaced by a FloatToU16 function
@@ -235,33 +234,25 @@ void StoreFloatRow(const float* JXL_RESTRICT* rows_in, size_t num_channels,
 
 void JXL_INLINE Store8(uint32_t value, uint8_t* dest) { *dest = value & 0xff; }
 
-// Maximum number of channels for the ConvertChannelsToExternal function.
-const size_t kConvertMaxChannels = 4;
+}  // namespace
 
-// Converts a list of channels to an interleaved image, applying transformations
-// when needed.
-// The input channels are given as a (non-const!) array of channel pointers and
-// interleaved in that order.
-//
-// Note: if a pointer in channels[] is nullptr, a 1.0 value will be used
-// instead. This is useful for handling when a user requests an alpha channel
-// from an image that doesn't have one. The first channel in the list may not
-// be nullptr, since it is used to determine the image size.
-Status ConvertChannelsToExternal(const ImageF* channels[], size_t num_channels,
-                                 size_t bits_per_sample, bool float_out,
-                                 JxlEndianness endianness, size_t stride,
-                                 jxl::ThreadPool* pool, void* out_image,
-                                 size_t out_size,
+Status ConvertChannelsToExternal(const ImageF* in_channels[],
+                                 size_t num_channels, size_t bits_per_sample,
+                                 bool float_out, JxlEndianness endianness,
+                                 size_t stride, jxl::ThreadPool* pool,
+                                 void* out_image, size_t out_size,
                                  const PixelCallback& out_callback,
                                  jxl::Orientation undo_orientation) {
   JXL_DASSERT(num_channels != 0 && num_channels <= kConvertMaxChannels);
-  JXL_DASSERT(channels[0] != nullptr);
+  JXL_DASSERT(in_channels[0] != nullptr);
   JXL_CHECK(float_out ? bits_per_sample == 16 || bits_per_sample == 32
                       : bits_per_sample > 0 && bits_per_sample <= 16);
   if (!!out_image == out_callback.IsPresent()) {
     return JXL_FAILURE(
         "Must provide either an out_image or an out_callback, but not both.");
   }
+  std::vector<const ImageF*> channels;
+  channels.assign(in_channels, in_channels + num_channels);
 
   const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
   const size_t bytes_per_pixel = num_channels * bytes_per_channel;
@@ -448,8 +439,6 @@ Status ConvertChannelsToExternal(const ImageF* channels[], size_t num_channels,
   return true;
 }
 
-}  // namespace
-
 Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample,
                          bool float_out, size_t num_channels,
                          JxlEndianness endianness, size_t stride,
@@ -489,17 +478,5 @@ Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample,
       pool, out_image, out_size, out_callback, undo_orientation);
 }
 
-Status ConvertToExternal(const jxl::ImageF& channel, size_t bits_per_sample,
-                         bool float_out, JxlEndianness endianness,
-                         size_t stride, jxl::ThreadPool* pool, void* out_image,
-                         size_t out_size, const PixelCallback& out_callback,
-                         jxl::Orientation undo_orientation) {
-  const ImageF* channels[1];
-  channels[0] = &channel;
-  return ConvertChannelsToExternal(channels, 1, bits_per_sample, float_out,
-                                   endianness, stride, pool, out_image,
-                                   out_size, out_callback, undo_orientation);
-}
-
 }  // namespace jxl
 #endif  // HWY_ONCE
index 9b3b8bf..d33370e 100644 (file)
@@ -8,11 +8,12 @@
 
 // Interleaved image for color transforms and Codec.
 
+#include <jxl/decode.h>
+#include <jxl/types.h>
 #include <stddef.h>
 #include <stdint.h>
 
-#include "jxl/decode.h"
-#include "jxl/types.h"
+#include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/dec_cache.h"
 
 namespace jxl {
 
+// Maximum number of channels for the ConvertChannelsToExternal function.
+const size_t kConvertMaxChannels = 4;
+
+// Converts a list of channels to an interleaved image, applying transformations
+// when needed.
+// The input channels are given as a (non-const!) array of channel pointers and
+// interleaved in that order.
+//
+// Note: if a pointer in channels[] is nullptr, a 1.0 value will be used
+// instead. This is useful for handling when a user requests an alpha channel
+// from an image that doesn't have one. The first channel in the list may not
+// be nullptr, since it is used to determine the image size.
+Status ConvertChannelsToExternal(const ImageF* in_channels[],
+                                 size_t num_channels, size_t bits_per_sample,
+                                 bool float_out, JxlEndianness endianness,
+                                 size_t stride, jxl::ThreadPool* pool,
+                                 void* out_image, size_t out_size,
+                                 const PixelCallback& out_callback,
+                                 jxl::Orientation undo_orientation);
+
 // Converts ib to interleaved void* pixel buffer with the given format.
 // bits_per_sample: must be 16 or 32 if float_out is true, and at most 16
 // if it is false. No bit packing is done.
@@ -41,16 +62,6 @@ Status ConvertToExternal(const jxl::ImageBundle& ib, size_t bits_per_sample,
                          jxl::Orientation undo_orientation,
                          bool unpremul_alpha = false);
 
-// Converts single-channel image to interleaved void* pixel buffer with the
-// given format, with a single channel.
-// This supports the features needed for the C API to get extra channels.
-// Arguments are similar to the multi-channel function above.
-Status ConvertToExternal(const jxl::ImageF& channel, size_t bits_per_sample,
-                         bool float_out, JxlEndianness endianness,
-                         size_t stride_out, jxl::ThreadPool* thread_pool,
-                         void* out_image, size_t out_size,
-                         const PixelCallback& out_callback,
-                         jxl::Orientation undo_orientation);
 }  // namespace jxl
 
 #endif  // LIB_JXL_DEC_EXTERNAL_IMAGE_H_
index 0011792..c87a4d5 100644 (file)
@@ -25,7 +25,7 @@ void BM_DecExternalImage_ConvertImageRGBA(benchmark::State& state) {
   ib.SetFromImage(std::move(color), ColorEncoding::SRGB());
   ImageF alpha(xsize, ysize);
   ZeroFillImage(&alpha);
-  ib.SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false);
+  ib.SetAlpha(std::move(alpha));
 
   const size_t bytes_per_row = xsize * num_channels;
   std::vector<uint8_t> interleaved(bytes_per_row * ysize);
index c90bb47..881a1f6 100644 (file)
@@ -5,6 +5,7 @@
 
 #include "lib/jxl/dec_frame.h"
 
+#include <jxl/types.h>
 #include <stddef.h>
 #include <stdint.h>
 
 #include <utility>
 #include <vector>
 
-#include "jxl/types.h"
 #include "lib/jxl/ac_context.h"
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/ans_params.h"
 #include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/compressed_dc.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_xyb.h"
 #include "lib/jxl/epf.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 #include "lib/jxl/loop_filter.h"
-#include "lib/jxl/luminance.h"
 #include "lib/jxl/passes_state.h"
 #include "lib/jxl/quant_weights.h"
 #include "lib/jxl/quantizer.h"
@@ -59,7 +58,6 @@ namespace jxl {
 namespace {
 Status DecodeGlobalDCInfo(BitReader* reader, bool is_jpeg,
                           PassesDecoderState* state, ThreadPool* pool) {
-  PROFILER_FUNC;
   JXL_RETURN_IF_ERROR(state->shared_storage.quantizer.Decode(reader));
 
   JXL_RETURN_IF_ERROR(
@@ -84,10 +82,11 @@ Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool,
   FrameDecoder frame_decoder(dec_state, metadata, pool,
                              use_slow_rendering_pipeline);
 
-  BitReader reader(Span<const uint8_t>(next_in, avail_in));
+  BitReader reader(Bytes(next_in, avail_in));
   JXL_RETURN_IF_ERROR(frame_decoder.InitFrame(&reader, decoded,
-                                              /*is_preview=*/false,
-                                              /*output_needed=*/true));
+                                              /*is_preview=*/false));
+  JXL_RETURN_IF_ERROR(frame_decoder.InitFrameOutput());
+
   JXL_RETURN_IF_ERROR(reader.AllReadsWithinBounds());
   size_t header_bytes = reader.TotalBitsConsumed() / kBitsPerByte;
   JXL_RETURN_IF_ERROR(reader.Close());
@@ -100,12 +99,12 @@ Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool,
     std::vector<FrameDecoder::SectionInfo> section_info;
     std::vector<FrameDecoder::SectionStatus> section_status;
     size_t pos = header_bytes;
+    size_t index = 0;
     for (auto toc_entry : frame_decoder.Toc()) {
       JXL_RETURN_IF_ERROR(pos + toc_entry.size <= avail_in);
-      auto br = make_unique<BitReader>(
-          Span<const uint8_t>(next_in + pos, toc_entry.size));
+      auto br = make_unique<BitReader>(Bytes(next_in + pos, toc_entry.size));
       section_info.emplace_back(
-          FrameDecoder::SectionInfo{br.get(), toc_entry.id});
+          FrameDecoder::SectionInfo{br.get(), toc_entry.id, index++});
       section_closers.emplace_back(
           make_unique<BitReaderScopedCloser>(br.get(), &close_ok));
       section_readers.emplace_back(std::move(br));
@@ -126,8 +125,7 @@ Status DecodeFrame(PassesDecoderState* dec_state, ThreadPool* JXL_RESTRICT pool,
 }
 
 Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
-                               bool is_preview, bool output_needed) {
-  PROFILER_FUNC;
+                               bool is_preview) {
   decoded_ = decoded;
   JXL_ASSERT(is_finalized_);
 
@@ -196,18 +194,20 @@ Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
         "Non-444 chroma subsampling is not allowed when adaptive DC "
         "smoothing is enabled");
   }
+  return true;
+}
 
-  if (!output_needed) return true;
+Status FrameDecoder::InitFrameOutput() {
   JXL_RETURN_IF_ERROR(
       InitializePassesSharedState(frame_header_, &dec_state_->shared_storage));
   JXL_RETURN_IF_ERROR(dec_state_->Init());
   modular_frame_decoder_.Init(frame_dim_);
 
-  if (decoded->IsJPEG()) {
+  if (decoded_->IsJPEG()) {
     if (frame_header_.encoding == FrameEncoding::kModular) {
       return JXL_FAILURE("Cannot output JPEG from Modular");
     }
-    jpeg::JPEGData* jpeg_data = decoded->jpeg_data.get();
+    jpeg::JPEGData* jpeg_data = decoded_->jpeg_data.get();
     size_t num_components = jpeg_data->components.size();
     if (num_components != 1 && num_components != 3) {
       return JXL_FAILURE("Invalid number of components");
@@ -216,8 +216,8 @@ Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
       return JXL_FAILURE("Cannot decode to JPEG an XYB image");
     }
     auto jpeg_c_map = JpegOrder(ColorTransform::kYCbCr, num_components == 1);
-    decoded->jpeg_data->width = frame_dim_.xsize;
-    decoded->jpeg_data->height = frame_dim_.ysize;
+    decoded_->jpeg_data->width = frame_dim_.xsize;
+    decoded_->jpeg_data->height = frame_dim_.ysize;
     for (size_t c = 0; c < num_components; c++) {
       auto& component = jpeg_data->components[jpeg_c_map[c]];
       component.width_in_blocks =
@@ -250,7 +250,6 @@ Status FrameDecoder::InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
 }
 
 Status FrameDecoder::ProcessDCGlobal(BitReader* br) {
-  PROFILER_FUNC;
   PassesSharedState& shared = dec_state_->shared_storage;
   if (shared.frame_header.flags & FrameHeader::kPatches) {
     bool uses_extra_channels = false;
@@ -299,7 +298,6 @@ Status FrameDecoder::ProcessDCGlobal(BitReader* br) {
 }
 
 Status FrameDecoder::ProcessDCGroup(size_t dc_group_id, BitReader* br) {
-  PROFILER_FUNC;
   const size_t gx = dc_group_id % frame_dim_.xsize_dc_groups;
   const size_t gy = dc_group_id / frame_dim_.xsize_dc_groups;
   const LoopFilter& lf = dec_state_->shared->frame_header.loop_filter;
@@ -449,7 +447,6 @@ Status FrameDecoder::ProcessACGroup(size_t ac_group_id,
                                     BitReader* JXL_RESTRICT* br,
                                     size_t num_passes, size_t thread,
                                     bool force_draw, bool dc_only) {
-  PROFILER_ZONE("process_group");
   size_t group_dim = frame_dim_.group_dim;
   const size_t gx = ac_group_id % frame_dim_.xsize_groups;
   const size_t gy = ac_group_id / frame_dim_.xsize_groups;
@@ -504,7 +501,6 @@ Status FrameDecoder::ProcessACGroup(size_t ac_group_id,
   decoded_passes_per_ac_group_[ac_group_id] += num_passes;
 
   if ((frame_header_.flags & FrameHeader::kNoise) != 0) {
-    PROFILER_ZONE("GenerateNoise");
     size_t noise_c_start =
         3 + frame_header_.nonserialized_metadata->m.num_extra_channels;
     // When the color channels are downsampled, we need to generate more noise
@@ -649,6 +645,7 @@ Status FrameDecoder::ProcessSections(const SectionInfo* sections, size_t num,
     pipeline_options.use_slow_render_pipeline = use_slow_rendering_pipeline_;
     pipeline_options.coalescing = coalescing_;
     pipeline_options.render_spotcolors = render_spotcolors_;
+    pipeline_options.render_noise = true;
     JXL_RETURN_IF_ERROR(
         dec_state_->PreparePipeline(decoded_, pipeline_options));
     FinalizeDC();
@@ -857,20 +854,17 @@ Status FrameDecoder::FinalizeFrame() {
     // Nothing to do.
     return true;
   }
-  if (!finalized_dc_) {
-    // We don't have all of DC, and render pipeline is not created yet, so we
-    // can not call Flush() yet.
-    return JXL_FAILURE("FinalizeFrame called before DC was fully decoded");
-  }
 
-  JXL_RETURN_IF_ERROR(Flush());
+  // undo global modular transforms and copy int pixel buffers to float ones
+  JXL_RETURN_IF_ERROR(
+      modular_frame_decoder_.FinalizeDecoding(dec_state_, pool_,
+                                              /*inplace=*/true));
 
   if (frame_header_.CanBeReferenced()) {
     auto& info = dec_state_->shared_storage
                      .reference_frames[frame_header_.save_as_reference];
-    info.storage = std::move(dec_state_->frame_storage_for_referencing);
+    info.frame = std::move(dec_state_->frame_storage_for_referencing);
     info.ib_is_in_xyb = frame_header_.save_before_color_transform;
-    info.frame = &info.storage;
   }
   return true;
 }
index 62c61c0..9b55459 100644 (file)
@@ -6,17 +6,16 @@
 #ifndef LIB_JXL_DEC_FRAME_H_
 #define LIB_JXL_DEC_FRAME_H_
 
+#include <jxl/decode.h>
+#include <jxl/types.h>
 #include <stdint.h>
 
-#include "jxl/decode.h"
-#include "jxl/types.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/blending.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/dec_modular.h"
@@ -50,14 +49,20 @@ class FrameDecoder {
   void SetCoalescing(bool c) { coalescing_ = c; }
 
   // Read FrameHeader and table of contents from the given BitReader.
-  // Also checks frame dimensions for their limits, and sets the output
-  // image buffer.
   Status InitFrame(BitReader* JXL_RESTRICT br, ImageBundle* decoded,
-                   bool is_preview, bool output_needed);
+                   bool is_preview);
+
+  // Checks frame dimensions for their limits, and sets the output
+  // image buffer.
+  Status InitFrameOutput();
 
   struct SectionInfo {
     BitReader* JXL_RESTRICT br;
+    // Logical index of the section, regardless of any permutation that may be
+    // applied in the table of contents or of the physical position in the file.
     size_t id;
+    // Index of the section in the order of the bytes inside the frame.
+    size_t index;
   };
 
   struct TocEntry {
@@ -118,7 +123,7 @@ class FrameDecoder {
   size_t NumCompletePasses() const {
     return *std::min_element(decoded_passes_per_ac_group_.begin(),
                              decoded_passes_per_ac_group_.end());
-  };
+  }
 
   // If enabled, ProcessSections will stop and return true when the DC
   // sections have been processed, instead of starting the AC sections. This
@@ -139,7 +144,7 @@ class FrameDecoder {
         // but the implementation may not yet correctly support this for Flush.
         // Therefore, can't correctly pause for a progressive step if there is
         // an extra channel (including alpha channel)
-        // TOOD(firsching): Check if this is still the case.
+        // TODO(firsching): Check if this is still the case.
         decoded_->metadata()->extra_channel_info.empty() &&
         // DC is not guaranteed to be available in modular mode and may be a
         // black image. If squeeze is used, it may be available depending on the
@@ -172,71 +177,60 @@ class FrameDecoder {
                                          : std::numeric_limits<size_t>::max());
   }
 
-  void MaybeSetUnpremultiplyAlpha(bool unpremul_alpha) {
+  // Sets the pixel callback or image buffer where the pixels will be decoded.
+  //
+  // @param undo_orientation: if true, indicates the frame decoder should apply
+  // the exif orientation to bring the image to the intended display
+  // orientation.
+  void SetImageOutput(const PixelCallback& pixel_callback, void* image_buffer,
+                      size_t image_buffer_size, size_t xsize, size_t ysize,
+                      JxlPixelFormat format, size_t bits_per_sample,
+                      bool unpremul_alpha, bool undo_orientation) const {
+    dec_state_->width = xsize;
+    dec_state_->height = ysize;
+    dec_state_->main_output.format = format;
+    dec_state_->main_output.bits_per_sample = bits_per_sample;
+    dec_state_->main_output.callback = pixel_callback;
+    dec_state_->main_output.buffer = image_buffer;
+    dec_state_->main_output.buffer_size = image_buffer_size;
+    dec_state_->main_output.stride = GetStride(xsize, format);
     const jxl::ExtraChannelInfo* alpha =
         decoded_->metadata()->Find(jxl::ExtraChannel::kAlpha);
     if (alpha && alpha->alpha_associated && unpremul_alpha) {
       dec_state_->unpremul_alpha = true;
     }
-  }
-
-  // Sets the buffer to which uint8 sRGB pixels will be decoded. This is not
-  // supported for all images. If it succeeds, HasRGBBuffer() will return true.
-  // If it does not succeed, the image is decoded to the ImageBundle passed to
-  // InitFrame instead.
-  // If an output callback is set, this function *may not* be called.
-  //
-  // @param undo_orientation: if true, indicates the frame decoder should apply
-  // the exif orientation to bring the image to the intended display
-  // orientation. Performing this operation is not yet supported, so this
-  // results in not setting the buffer if the image has a non-identity EXIF
-  // orientation. When outputting to the ImageBundle, no orientation is undone.
-  void MaybeSetRGB8OutputBuffer(uint8_t* rgb_output, size_t stride,
-                                bool is_rgba, bool undo_orientation) const {
-    if (!CanDoLowMemoryPath(undo_orientation) || dec_state_->unpremul_alpha) {
-      return;
+    if (undo_orientation) {
+      dec_state_->undo_orientation = decoded_->metadata()->GetOrientation();
+      if (static_cast<int>(dec_state_->undo_orientation) > 4) {
+        std::swap(dec_state_->width, dec_state_->height);
+      }
     }
-    dec_state_->rgb_output = rgb_output;
-    dec_state_->rgb_output_is_rgba = is_rgba;
-    dec_state_->rgb_stride = stride;
-    JXL_ASSERT(!dec_state_->pixel_callback.IsPresent());
+    dec_state_->extra_output.clear();
 #if !JXL_HIGH_PRECISION
-    if (decoded_->metadata()->xyb_encoded &&
+    if (dec_state_->main_output.buffer &&
+        (format.data_type == JXL_TYPE_UINT8) && (format.num_channels >= 3) &&
+        !dec_state_->unpremul_alpha &&
+        (dec_state_->undo_orientation == Orientation::kIdentity) &&
+        decoded_->metadata()->xyb_encoded &&
         dec_state_->output_encoding_info.color_encoding.IsSRGB() &&
         dec_state_->output_encoding_info.all_default_opsin &&
-        dec_state_->output_encoding_info.desired_intensity_target ==
-            dec_state_->output_encoding_info.orig_intensity_target &&
+        (dec_state_->output_encoding_info.desired_intensity_target ==
+         dec_state_->output_encoding_info.orig_intensity_target) &&
         HasFastXYBTosRGB8() && frame_header_.needs_color_transform()) {
       dec_state_->fast_xyb_srgb8_conversion = true;
     }
 #endif
   }
 
-  // Same as MaybeSetRGB8OutputBuffer, but with a float callback. This is not
-  // supported for all images. If it succeeds, HasRGBBuffer() will return true.
-  // If it does not succeed, the image is decoded to the ImageBundle passed to
-  // InitFrame instead.
-  // If a RGB8 output buffer is set, this function *may not* be called.
-  //
-  // @param undo_orientation: if true, indicates the frame decoder should apply
-  // the exif orientation to bring the image to the intended display
-  // orientation. Performing this operation is not yet supported, so this
-  // results in not setting the buffer if the image has a non-identity EXIF
-  // orientation. When outputting to the ImageBundle, no orientation is undone.
-  void MaybeSetFloatCallback(const PixelCallback& pixel_callback, bool is_rgba,
-                             bool unpremul_alpha, bool undo_orientation) const {
-    if (!CanDoLowMemoryPath(undo_orientation)) return;
-    dec_state_->pixel_callback = pixel_callback;
-    dec_state_->rgb_output_is_rgba = is_rgba;
-    JXL_ASSERT(dec_state_->rgb_output == nullptr);
-  }
-
-  // Returns true if the rgb output buffer passed by MaybeSetRGB8OutputBuffer
-  // has been/will be populated by Flush() / FinalizeFrame(), or if a pixel
-  // callback has been used.
-  bool HasRGBBuffer() const {
-    return dec_state_->rgb_output != nullptr ||
-           dec_state_->pixel_callback.IsPresent();
+  void AddExtraChannelOutput(void* buffer, size_t buffer_size, size_t xsize,
+                             JxlPixelFormat format, size_t bits_per_sample) {
+    ImageOutput out;
+    out.format = format;
+    out.bits_per_sample = bits_per_sample;
+    out.buffer = buffer;
+    out.buffer_size = buffer_size;
+    out.stride = GetStride(xsize, format);
+    dec_state_->extra_output.push_back(out);
   }
 
  private:
@@ -277,15 +271,19 @@ class FrameDecoder {
     return thread;
   }
 
-  // If the image has default exif orientation (or has an orientation but should
-  // not be undone) and no blending, the current frame cannot be referenced by
-  // future frames, there are no spot colors to be rendered, and alpha is not
-  // premultiplied, then low memory options can be used
-  // (uint8 output buffer or float pixel callback).
-  // TODO(veluca): reduce this set of restrictions.
-  bool CanDoLowMemoryPath(bool undo_orientation) const {
-    return !(undo_orientation &&
-             decoded_->metadata()->GetOrientation() != Orientation::kIdentity);
+  static size_t BytesPerChannel(JxlDataType data_type) {
+    return (data_type == JXL_TYPE_UINT8   ? 1u
+            : data_type == JXL_TYPE_FLOAT ? 4u
+                                          : 2u);
+  }
+
+  static size_t GetStride(const size_t xsize, JxlPixelFormat format) {
+    size_t stride =
+        (xsize * BytesPerChannel(format.data_type) * format.num_channels);
+    if (format.align > 1) {
+      stride = (jxl::DivCeil(stride, format.align) * format.align);
+    }
+    return stride;
   }
 
   PassesDecoderState* dec_state_;
index 689bc81..430a010 100644 (file)
 
 #include "lib/jxl/ac_context.h"
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/convolve.h"
 #include "lib/jxl/dct_scales.h"
 #include "lib/jxl/dec_cache.h"
@@ -35,7 +34,6 @@
 #include "lib/jxl/dec_xyb.h"
 #include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/epf.h"
-#include "lib/jxl/opsin_params.h"
 #include "lib/jxl/quant_weights.h"
 #include "lib/jxl/quantizer-inl.h"
 #include "lib/jxl/quantizer.h"
@@ -44,6 +42,8 @@
 #define LIB_JXL_DEC_GROUP_CC
 namespace jxl {
 
+struct AuxOut;
+
 // Interface for reading groups for DecodeGroupImpl.
 class GetBlock {
  public:
@@ -139,9 +139,8 @@ void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
                   const size_t* sbx,
                   const float* JXL_RESTRICT* JXL_RESTRICT dc_row,
                   size_t dc_stride, const float* JXL_RESTRICT biases,
-                  ACPtr qblock[3], float* JXL_RESTRICT block) {
-  PROFILER_FUNC;
-
+                  ACPtr qblock[3], float* JXL_RESTRICT block,
+                  float* JXL_RESTRICT scratch) {
   const auto scaled_dequant_s = inv_global_scale / quant;
 
   const auto scaled_dequant_x = Set(d, scaled_dequant_s * x_dm_multiplier);
@@ -157,7 +156,7 @@ void DequantBlock(const AcStrategy& acs, float inv_global_scale, int quant,
   }
   for (size_t c = 0; c < 3; c++) {
     LowestFrequenciesFromDC(acs.Strategy(), dc_row[c] + sbx[c], dc_stride,
-                            block + c * size);
+                            block + c * size, scratch);
   }
 }
 
@@ -168,7 +167,6 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
                        RenderPipelineInput& render_pipeline_input,
                        ImageBundle* decoded, DrawMode draw) {
   // TODO(veluca): investigate cache usage in this function.
-  PROFILER_FUNC;
   const Rect block_rect = dec_state->shared->BlockGroupRect(group_idx);
   const AcStrategyImage& ac_strategy = dec_state->shared->ac_strategy;
 
@@ -304,7 +302,6 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
           bx += llf_x;
           continue;
         }
-        PROFILER_ZONE("DecodeGroupImpl inner");
         const size_t log2_covered_blocks = acs.log2_covered_blocks();
 
         const size_t covered_blocks = 1 << log2_covered_blocks;
@@ -406,7 +403,7 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
               acs.covered_blocks_y() * acs.covered_blocks_x(), sbx, dc_rows,
               dc_stride,
               dec_state->output_encoding_info.opsin_params.quant_biases, qblock,
-              block);
+              block, group_dec_cache->scratch_space);
 
           for (size_t c : {1, 0, 2}) {
             if ((sbx[c] << hshift[c] != bx) || (sby[c] << vshift[c] != by)) {
@@ -422,9 +419,6 @@ Status DecodeGroupImpl(GetBlock* JXL_RESTRICT get_block,
       }
     }
   }
-  if (draw == kDontDraw) {
-    return true;
-  }
   return true;
 }
 
@@ -438,7 +432,7 @@ namespace jxl {
 namespace {
 // Decode quantized AC coefficients of DCT blocks.
 // LLF components in the output block will not be modified.
-template <ACType ac_type>
+template <ACType ac_type, bool uses_lz77>
 Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
                         int32_t* JXL_RESTRICT row_nzeros,
                         const int32_t* JXL_RESTRICT row_nzeros_top,
@@ -451,7 +445,6 @@ Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
                         const uint8_t* qdc_row, const int32_t* qf_row,
                         const BlockCtxMap& block_ctx_map, ACPtr block,
                         size_t shift = 0) {
-  PROFILER_FUNC;
   // Equal to number of LLF coefficients.
   const size_t covered_blocks = 1 << log2_covered_blocks;
   const size_t size = covered_blocks * kDCTBlockSize;
@@ -466,7 +459,8 @@ Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
   const int32_t nzero_ctx =
       block_ctx_map.NonZeroContext(predicted_nzeros, block_ctx) + ctx_offset;
 
-  size_t nzeros = decoder->ReadHybridUint(nzero_ctx, br, context_map);
+  size_t nzeros =
+      decoder->ReadHybridUintInlined<uses_lz77>(nzero_ctx, br, context_map);
   if (nzeros + covered_blocks > size) {
     return JXL_FAILURE("Invalid AC: nzeros too large");
   }
@@ -480,36 +474,33 @@ Status DecodeACVarBlock(size_t ctx_offset, size_t log2_covered_blocks,
   const size_t histo_offset =
       ctx_offset + block_ctx_map.ZeroDensityContextsOffset(block_ctx);
 
-  // Skip LLF
-  {
-    PROFILER_ZONE("AcDecSkipLLF, reader");
-    size_t prev = (nzeros > size / 16 ? 0 : 1);
-    for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
-      const size_t ctx =
-          histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
-                                            log2_covered_blocks, prev);
-      const size_t u_coeff = decoder->ReadHybridUint(ctx, br, context_map);
-      // Hand-rolled version of UnpackSigned, shifting before the conversion to
-      // signed integer to avoid undefined behavior of shifting negative
-      // numbers.
-      const size_t magnitude = u_coeff >> 1;
-      const size_t neg_sign = (~u_coeff) & 1;
-      const intptr_t coeff =
-          static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
-      if (ac_type == ACType::k16) {
-        block.ptr16[order[k]] += coeff;
-      } else {
-        block.ptr32[order[k]] += coeff;
-      }
-      prev = static_cast<size_t>(u_coeff != 0);
-      nzeros -= prev;
-    }
-    if (JXL_UNLIKELY(nzeros != 0)) {
-      return JXL_FAILURE("Invalid AC: nzeros not 0. Block (%" PRIuS ", %" PRIuS
-                         "), channel %" PRIuS,
-                         bx, by, c);
+  size_t prev = (nzeros > size / 16 ? 0 : 1);
+  for (size_t k = covered_blocks; k < size && nzeros != 0; ++k) {
+    const size_t ctx =
+        histo_offset + ZeroDensityContext(nzeros, k, covered_blocks,
+                                          log2_covered_blocks, prev);
+    const size_t u_coeff =
+        decoder->ReadHybridUintInlined<uses_lz77>(ctx, br, context_map);
+    // Hand-rolled version of UnpackSigned, shifting before the conversion to
+    // signed integer to avoid undefined behavior of shifting negative numbers.
+    const size_t magnitude = u_coeff >> 1;
+    const size_t neg_sign = (~u_coeff) & 1;
+    const intptr_t coeff =
+        static_cast<intptr_t>((magnitude ^ (neg_sign - 1)) << shift);
+    if (ac_type == ACType::k16) {
+      block.ptr16[order[k]] += coeff;
+    } else {
+      block.ptr32[order[k]] += coeff;
     }
+    prev = static_cast<size_t>(u_coeff != 0);
+    nzeros -= prev;
+  }
+  if (JXL_UNLIKELY(nzeros != 0)) {
+    return JXL_FAILURE("Invalid AC: nzeros not 0. Block (%" PRIuS ", %" PRIuS
+                       "), channel %" PRIuS,
+                       bx, by, c);
   }
+
   return true;
 }
 
@@ -537,9 +528,7 @@ struct GetBlockFromBitstream : public GetBlock {
   Status LoadBlock(size_t bx, size_t by, const AcStrategy& acs, size_t size,
                    size_t log2_covered_blocks, ACPtr block[3],
                    ACType ac_type) override {
-    auto decode_ac_varblock = ac_type == ACType::k16
-                                  ? DecodeACVarBlock<ACType::k16>
-                                  : DecodeACVarBlock<ACType::k32>;
+    ;
     for (size_t c : {1, 0, 2}) {
       size_t sbx = bx >> hshift[c];
       size_t sby = by >> vshift[c];
@@ -548,6 +537,12 @@ struct GetBlockFromBitstream : public GetBlock {
       }
 
       for (size_t pass = 0; JXL_UNLIKELY(pass < num_passes); pass++) {
+        auto decode_ac_varblock =
+            decoders[pass].UsesLZ77()
+                ? (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 1>
+                                          : DecodeACVarBlock<ACType::k32, 1>)
+                : (ac_type == ACType::k16 ? DecodeACVarBlock<ACType::k16, 0>
+                                          : DecodeACVarBlock<ACType::k32, 0>);
         JXL_RETURN_IF_ERROR(decode_ac_varblock(
             ctx_offset[pass], log2_covered_blocks, row_nzeros[pass][c],
             row_nzeros_top[pass][c], nzeros_stride, c, sbx, sby, bx, acs,
@@ -675,8 +670,6 @@ Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
                    RenderPipelineInput& render_pipeline_input,
                    ImageBundle* JXL_RESTRICT decoded, size_t first_pass,
                    bool force_draw, bool dc_only, bool* should_run_pipeline) {
-  PROFILER_FUNC;
-
   DrawMode draw = (num_passes + first_pass ==
                    dec_state->shared->frame_header.passes.num_passes) ||
                           force_draw
@@ -757,18 +750,18 @@ Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
     histo_selector_bits = CeilLog2Nonzero(dec_state->shared->num_histograms);
   }
 
-  GetBlockFromBitstream get_block;
+  auto get_block = jxl::make_unique<GetBlockFromBitstream>();
   JXL_RETURN_IF_ERROR(
-      get_block.Init(readers, num_passes, group_idx, histo_selector_bits,
-                     dec_state->shared->BlockGroupRect(group_idx),
-                     group_dec_cache, dec_state, first_pass));
+      get_block->Init(readers, num_passes, group_idx, histo_selector_bits,
+                      dec_state->shared->BlockGroupRect(group_idx),
+                      group_dec_cache, dec_state, first_pass));
 
   JXL_RETURN_IF_ERROR(HWY_DYNAMIC_DISPATCH(DecodeGroupImpl)(
-      &get_block, group_dec_cache, dec_state, thread, group_idx,
+      get_block.get(), group_dec_cache, dec_state, thread, group_idx,
       render_pipeline_input, decoded, draw));
 
   for (size_t pass = 0; pass < num_passes; pass++) {
-    if (!get_block.decoders[pass].CheckANSFinalState()) {
+    if (!get_block->decoders[pass].CheckANSFinalState()) {
       return JXL_FAILURE("ANS checksum failure.");
     }
   }
@@ -783,8 +776,6 @@ Status DecodeGroupForRoundtrip(const std::vector<std::unique_ptr<ACImage>>& ac,
                                RenderPipelineInput& render_pipeline_input,
                                ImageBundle* JXL_RESTRICT decoded,
                                AuxOut* aux_out) {
-  PROFILER_FUNC;
-
   GetBlockFromEncoder get_block(ac, group_idx,
                                 dec_state->shared->frame_header.passes.shift);
   group_dec_cache->InitOnce(
index e50d22d..b55d5ab 100644 (file)
@@ -6,27 +6,22 @@
 #ifndef LIB_JXL_DEC_GROUP_H_
 #define LIB_JXL_DEC_GROUP_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
+#include <cstddef>
+#include <memory>
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/dct_util.h"
-#include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_cache.h"
-#include "lib/jxl/frame_header.h"
-#include "lib/jxl/image.h"
-#include "lib/jxl/quantizer.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 Status DecodeGroup(BitReader* JXL_RESTRICT* JXL_RESTRICT readers,
                    size_t num_passes, size_t group_idx,
                    PassesDecoderState* JXL_RESTRICT dec_state,
index 2d974c9..cb3ecbe 100644 (file)
@@ -12,7 +12,7 @@
 
 #include "lib/jxl/base/arch_macros.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/image.h"
 
 namespace jxl {
index bf85eaa..0509b32 100644 (file)
@@ -153,6 +153,7 @@ void int_to_float(const pixel_type* const JXL_RESTRICT row_in,
   }
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
 std::string ModularStreamId::DebugString() const {
   std::ostringstream os;
   os << (kind == kGlobalData   ? "ModularGlobal"
@@ -174,6 +175,7 @@ std::string ModularStreamId::DebugString() const {
   }
   return os.str();
 }
+#endif
 
 Status ModularFrameDecoder::DecodeGlobalInfo(BitReader* reader,
                                              const FrameHeader& frame_header,
index ec94b46..aae643c 100644 (file)
@@ -10,7 +10,6 @@
 
 #include <string>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_bit_reader.h"
index f48398b..ae46b10 100644 (file)
@@ -6,7 +6,6 @@
 #include "lib/jxl/dec_noise.h"
 
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 
 #include <algorithm>
@@ -21,7 +20,6 @@
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/image_ops.h"
-#include "lib/jxl/opsin_params.h"
 #include "lib/jxl/sanitizers.h"
 #include "lib/jxl/xorshift128plus-inl.h"
 HWY_BEFORE_NAMESPACE();
@@ -69,7 +67,7 @@ void RandomImage(Xorshift128Plus* rng, const Rect& rect,
 
     size_t x = 0;
     // Only entire batches (avoids exceeding the image padding).
-    for (; x + kFloatsPerBatch <= xsize; x += kFloatsPerBatch) {
+    for (; x + kFloatsPerBatch < xsize; x += kFloatsPerBatch) {
       rng->Fill(batch);
       for (size_t i = 0; i < kFloatsPerBatch; i += Lanes(df)) {
         BitsToFloat(reinterpret_cast<const uint32_t*>(batch) + i, row + x + i);
index f8c0286..ac05866 100644 (file)
@@ -11,7 +11,6 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/dec_bit_reader.h"
index 4f87209..a95e26b 100644 (file)
@@ -22,8 +22,7 @@
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/blending.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumReferenceFrames
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_frame.h"
 #include "lib/jxl/entropy_coder.h"
@@ -31,6 +30,7 @@
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
+#include "lib/jxl/pack_signed.h"
 #include "lib/jxl/patch_dictionary_internal.h"
 
 namespace jxl {
@@ -68,14 +68,14 @@ Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
     PatchReferencePosition ref_pos;
     ref_pos.ref = read_num(kReferenceFrameContext);
     if (ref_pos.ref >= kMaxNumReferenceFrames ||
-        shared_->reference_frames[ref_pos.ref].frame->xsize() == 0) {
+        shared_->reference_frames[ref_pos.ref].frame.xsize() == 0) {
       return JXL_FAILURE("Invalid reference frame ID");
     }
     if (!shared_->reference_frames[ref_pos.ref].ib_is_in_xyb) {
       return JXL_FAILURE(
           "Patches cannot use frames saved post color transforms");
     }
-    const ImageBundle& ib = *shared_->reference_frames[ref_pos.ref].frame;
+    const ImageBundle& ib = shared_->reference_frames[ref_pos.ref].frame;
     ref_pos.x0 = read_num(kPatchReferencePositionContext);
     ref_pos.y0 = read_num(kPatchReferencePositionContext);
     ref_pos.xsize = read_num(kPatchSizeContext) + 1;
@@ -107,10 +107,20 @@ Status PatchDictionary::Decode(BitReader* br, size_t xsize, size_t ysize,
         pos.x = read_num(kPatchPositionContext);
         pos.y = read_num(kPatchPositionContext);
       } else {
-        pos.x =
-            positions_.back().x + UnpackSigned(read_num(kPatchOffsetContext));
-        pos.y =
-            positions_.back().y + UnpackSigned(read_num(kPatchOffsetContext));
+        ssize_t deltax = UnpackSigned(read_num(kPatchOffsetContext));
+        if (deltax < 0 && static_cast<size_t>(-deltax) > positions_.back().x) {
+          return JXL_FAILURE("Invalid patch: negative x coordinate (%" PRIuS
+                             " base x %" PRIdS " delta x)",
+                             positions_.back().x, deltax);
+        }
+        pos.x = positions_.back().x + deltax;
+        ssize_t deltay = UnpackSigned(read_num(kPatchOffsetContext));
+        if (deltay < 0 && static_cast<size_t>(-deltay) > positions_.back().y) {
+          return JXL_FAILURE("Invalid patch: negative y coordinate (%" PRIuS
+                             " base y %" PRIdS " delta y)",
+                             positions_.back().y, deltay);
+        }
+        pos.y = positions_.back().y + deltay;
       }
       if (pos.x + ref_pos.xsize > xsize) {
         return JXL_FAILURE("Invalid patch x: at %" PRIuS " + %" PRIuS
@@ -328,13 +338,13 @@ void PatchDictionary::AddOneRow(float* const* inout, size_t y, size_t x0,
     size_t patch_x0 = std::max(bx, x0);
     size_t patch_x1 = std::min(bx + patch_xsize, x0 + xsize);
     for (size_t c = 0; c < 3; c++) {
-      fg_ptrs[c] = shared_->reference_frames[ref].frame->color()->ConstPlaneRow(
+      fg_ptrs[c] = shared_->reference_frames[ref].frame.color().ConstPlaneRow(
                        c, ref_pos.y0 + iy) +
                    ref_pos.x0 + x0 - bx;
     }
     for (size_t i = 0; i < num_ec; i++) {
       fg_ptrs[3 + i] =
-          shared_->reference_frames[ref].frame->extra_channels()[i].ConstRow(
+          shared_->reference_frames[ref].frame.extra_channels()[i].ConstRow(
               ref_pos.y0 + iy) +
           ref_pos.x0 + x0 - bx;
     }
index a950e83..aac6111 100644 (file)
 #include <vector>
 
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/opsin_params.h"
 
 namespace jxl {
 
index 075619b..9c90550 100644 (file)
@@ -32,15 +32,13 @@ using hwy::HWY_NAMESPACE::MulAdd;
 template <size_t DCT_ROWS, size_t DCT_COLS, size_t LF_ROWS, size_t LF_COLS,
           size_t ROWS, size_t COLS>
 JXL_INLINE void ReinterpretingDCT(const float* input, const size_t input_stride,
-                                  float* output, const size_t output_stride) {
+                                  float* output, const size_t output_stride,
+                                  float* JXL_RESTRICT block,
+                                  float* JXL_RESTRICT scratch_space) {
   static_assert(LF_ROWS == ROWS,
                 "ReinterpretingDCT should only be called with LF == N");
   static_assert(LF_COLS == COLS,
                 "ReinterpretingDCT should only be called with LF == N");
-  HWY_ALIGN float block[ROWS * COLS];
-
-  // ROWS, COLS <= 8, so we can put scratch space on the stack.
-  HWY_ALIGN float scratch_space[ROWS * COLS];
   ComputeScaledDCT<ROWS, COLS>()(DCTFrom(input, input_stride), block,
                                  scratch_space);
   if (ROWS < COLS) {
@@ -398,7 +396,7 @@ void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels) {
 template <size_t afv_kind>
 void AFVTransformToPixels(const float* JXL_RESTRICT coefficients,
                           float* JXL_RESTRICT pixels, size_t pixels_stride) {
-  HWY_ALIGN float scratch_space[4 * 8];
+  HWY_ALIGN float scratch_space[4 * 8 * 4];
   size_t afv_x = afv_kind & 1;
   size_t afv_y = afv_kind / 2;
   float dcs[3] = {};
@@ -460,7 +458,6 @@ HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
   using Type = AcStrategy::Type;
   switch (strategy) {
     case Type::IDENTITY: {
-      PROFILER_ZONE("IDCT Identity");
       float dcs[4] = {};
       float block00 = coefficients[0];
       float block01 = coefficients[1];
@@ -498,7 +495,6 @@ HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT8X4: {
-      PROFILER_ZONE("IDCT 8x4");
       float dcs[2] = {};
       float block0 = coefficients[0];
       float block1 = coefficients[8];
@@ -519,7 +515,6 @@ HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT4X8: {
-      PROFILER_ZONE("IDCT 4x8");
       float dcs[2] = {};
       float block0 = coefficients[0];
       float block1 = coefficients[8];
@@ -541,7 +536,6 @@ HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT4X4: {
-      PROFILER_ZONE("IDCT 4");
       float dcs[4] = {};
       float block00 = coefficients[0];
       float block01 = coefficients[1];
@@ -570,7 +564,6 @@ HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT2X2: {
-      PROFILER_ZONE("IDCT 2");
       HWY_ALIGN float coeffs[kDCTBlockSize];
       memcpy(coeffs, coefficients, sizeof(float) * kDCTBlockSize);
       IDCT2TopBlock<2>(coeffs, kBlockDim, coeffs);
@@ -584,247 +577,228 @@ HWY_MAYBE_UNUSED void TransformToPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT16X16: {
-      PROFILER_ZONE("IDCT 16");
       ComputeScaledIDCT<16, 16>()(coefficients, DCTTo(pixels, pixels_stride),
                                   scratch_space);
       break;
     }
     case Type::DCT16X8: {
-      PROFILER_ZONE("IDCT 16x8");
       ComputeScaledIDCT<16, 8>()(coefficients, DCTTo(pixels, pixels_stride),
                                  scratch_space);
       break;
     }
     case Type::DCT8X16: {
-      PROFILER_ZONE("IDCT 8x16");
       ComputeScaledIDCT<8, 16>()(coefficients, DCTTo(pixels, pixels_stride),
                                  scratch_space);
       break;
     }
     case Type::DCT32X8: {
-      PROFILER_ZONE("IDCT 32x8");
       ComputeScaledIDCT<32, 8>()(coefficients, DCTTo(pixels, pixels_stride),
                                  scratch_space);
       break;
     }
     case Type::DCT8X32: {
-      PROFILER_ZONE("IDCT 8x32");
       ComputeScaledIDCT<8, 32>()(coefficients, DCTTo(pixels, pixels_stride),
                                  scratch_space);
       break;
     }
     case Type::DCT32X16: {
-      PROFILER_ZONE("IDCT 32x16");
       ComputeScaledIDCT<32, 16>()(coefficients, DCTTo(pixels, pixels_stride),
                                   scratch_space);
       break;
     }
     case Type::DCT16X32: {
-      PROFILER_ZONE("IDCT 16x32");
       ComputeScaledIDCT<16, 32>()(coefficients, DCTTo(pixels, pixels_stride),
                                   scratch_space);
       break;
     }
     case Type::DCT32X32: {
-      PROFILER_ZONE("IDCT 32");
       ComputeScaledIDCT<32, 32>()(coefficients, DCTTo(pixels, pixels_stride),
                                   scratch_space);
       break;
     }
     case Type::DCT: {
-      PROFILER_ZONE("IDCT 8");
       ComputeScaledIDCT<8, 8>()(coefficients, DCTTo(pixels, pixels_stride),
                                 scratch_space);
       break;
     }
     case Type::AFV0: {
-      PROFILER_ZONE("IAFV0");
       AFVTransformToPixels<0>(coefficients, pixels, pixels_stride);
       break;
     }
     case Type::AFV1: {
-      PROFILER_ZONE("IAFV1");
       AFVTransformToPixels<1>(coefficients, pixels, pixels_stride);
       break;
     }
     case Type::AFV2: {
-      PROFILER_ZONE("IAFV2");
       AFVTransformToPixels<2>(coefficients, pixels, pixels_stride);
       break;
     }
     case Type::AFV3: {
-      PROFILER_ZONE("IAFV3");
       AFVTransformToPixels<3>(coefficients, pixels, pixels_stride);
       break;
     }
     case Type::DCT64X32: {
-      PROFILER_ZONE("IDCT 64x32");
       ComputeScaledIDCT<64, 32>()(coefficients, DCTTo(pixels, pixels_stride),
                                   scratch_space);
       break;
     }
     case Type::DCT32X64: {
-      PROFILER_ZONE("IDCT 32x64");
       ComputeScaledIDCT<32, 64>()(coefficients, DCTTo(pixels, pixels_stride),
                                   scratch_space);
       break;
     }
     case Type::DCT64X64: {
-      PROFILER_ZONE("IDCT 64");
       ComputeScaledIDCT<64, 64>()(coefficients, DCTTo(pixels, pixels_stride),
                                   scratch_space);
       break;
     }
     case Type::DCT128X64: {
-      PROFILER_ZONE("IDCT 128x64");
       ComputeScaledIDCT<128, 64>()(coefficients, DCTTo(pixels, pixels_stride),
                                    scratch_space);
       break;
     }
     case Type::DCT64X128: {
-      PROFILER_ZONE("IDCT 64x128");
       ComputeScaledIDCT<64, 128>()(coefficients, DCTTo(pixels, pixels_stride),
                                    scratch_space);
       break;
     }
     case Type::DCT128X128: {
-      PROFILER_ZONE("IDCT 128");
       ComputeScaledIDCT<128, 128>()(coefficients, DCTTo(pixels, pixels_stride),
                                     scratch_space);
       break;
     }
     case Type::DCT256X128: {
-      PROFILER_ZONE("IDCT 256x128");
       ComputeScaledIDCT<256, 128>()(coefficients, DCTTo(pixels, pixels_stride),
                                     scratch_space);
       break;
     }
     case Type::DCT128X256: {
-      PROFILER_ZONE("IDCT 128x256");
       ComputeScaledIDCT<128, 256>()(coefficients, DCTTo(pixels, pixels_stride),
                                     scratch_space);
       break;
     }
     case Type::DCT256X256: {
-      PROFILER_ZONE("IDCT 256");
       ComputeScaledIDCT<256, 256>()(coefficients, DCTTo(pixels, pixels_stride),
                                     scratch_space);
       break;
     }
     case Type::kNumValidStrategies:
-      JXL_ABORT("Invalid strategy");
+      JXL_UNREACHABLE("Invalid strategy");
   }
 }
 
 HWY_MAYBE_UNUSED void LowestFrequenciesFromDC(const AcStrategy::Type strategy,
                                               const float* dc, size_t dc_stride,
-                                              float* llf) {
+                                              float* llf,
+                                              float* JXL_RESTRICT scratch) {
   using Type = AcStrategy::Type;
+  HWY_ALIGN float warm_block[4 * 4];
+  HWY_ALIGN float warm_scratch_space[4 * 4 * 4];
   switch (strategy) {
     case Type::DCT16X8: {
       ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/kBlockDim,
                         /*LF_ROWS=*/2, /*LF_COLS=*/1, /*ROWS=*/2, /*COLS=*/1>(
-          dc, dc_stride, llf, 2 * kBlockDim);
+          dc, dc_stride, llf, 2 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT8X16: {
       ReinterpretingDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
                         /*LF_ROWS=*/1, /*LF_COLS=*/2, /*ROWS=*/1, /*COLS=*/2>(
-          dc, dc_stride, llf, 2 * kBlockDim);
+          dc, dc_stride, llf, 2 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT16X16: {
       ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
                         /*LF_ROWS=*/2, /*LF_COLS=*/2, /*ROWS=*/2, /*COLS=*/2>(
-          dc, dc_stride, llf, 2 * kBlockDim);
+          dc, dc_stride, llf, 2 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT32X8: {
       ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/kBlockDim,
                         /*LF_ROWS=*/4, /*LF_COLS=*/1, /*ROWS=*/4, /*COLS=*/1>(
-          dc, dc_stride, llf, 4 * kBlockDim);
+          dc, dc_stride, llf, 4 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT8X32: {
       ReinterpretingDCT</*DCT_ROWS=*/kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
                         /*LF_ROWS=*/1, /*LF_COLS=*/4, /*ROWS=*/1, /*COLS=*/4>(
-          dc, dc_stride, llf, 4 * kBlockDim);
+          dc, dc_stride, llf, 4 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT32X16: {
       ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/2 * kBlockDim,
                         /*LF_ROWS=*/4, /*LF_COLS=*/2, /*ROWS=*/4, /*COLS=*/2>(
-          dc, dc_stride, llf, 4 * kBlockDim);
+          dc, dc_stride, llf, 4 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT16X32: {
       ReinterpretingDCT</*DCT_ROWS=*/2 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
                         /*LF_ROWS=*/2, /*LF_COLS=*/4, /*ROWS=*/2, /*COLS=*/4>(
-          dc, dc_stride, llf, 4 * kBlockDim);
+          dc, dc_stride, llf, 4 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT32X32: {
       ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
                         /*LF_ROWS=*/4, /*LF_COLS=*/4, /*ROWS=*/4, /*COLS=*/4>(
-          dc, dc_stride, llf, 4 * kBlockDim);
+          dc, dc_stride, llf, 4 * kBlockDim, warm_block, warm_scratch_space);
       break;
     }
     case Type::DCT64X32: {
       ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/4 * kBlockDim,
                         /*LF_ROWS=*/8, /*LF_COLS=*/4, /*ROWS=*/8, /*COLS=*/4>(
-          dc, dc_stride, llf, 8 * kBlockDim);
+          dc, dc_stride, llf, 8 * kBlockDim, scratch, scratch + 8 * 4);
       break;
     }
     case Type::DCT32X64: {
       ReinterpretingDCT</*DCT_ROWS=*/4 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
                         /*LF_ROWS=*/4, /*LF_COLS=*/8, /*ROWS=*/4, /*COLS=*/8>(
-          dc, dc_stride, llf, 8 * kBlockDim);
+          dc, dc_stride, llf, 8 * kBlockDim, scratch, scratch + 4 * 8);
       break;
     }
     case Type::DCT64X64: {
       ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
                         /*LF_ROWS=*/8, /*LF_COLS=*/8, /*ROWS=*/8, /*COLS=*/8>(
-          dc, dc_stride, llf, 8 * kBlockDim);
+          dc, dc_stride, llf, 8 * kBlockDim, scratch, scratch + 8 * 8);
       break;
     }
     case Type::DCT128X64: {
       ReinterpretingDCT</*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/8 * kBlockDim,
                         /*LF_ROWS=*/16, /*LF_COLS=*/8, /*ROWS=*/16, /*COLS=*/8>(
-          dc, dc_stride, llf, 16 * kBlockDim);
+          dc, dc_stride, llf, 16 * kBlockDim, scratch, scratch + 16 * 8);
       break;
     }
     case Type::DCT64X128: {
       ReinterpretingDCT</*DCT_ROWS=*/8 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
                         /*LF_ROWS=*/8, /*LF_COLS=*/16, /*ROWS=*/8, /*COLS=*/16>(
-          dc, dc_stride, llf, 16 * kBlockDim);
+          dc, dc_stride, llf, 16 * kBlockDim, scratch, scratch + 8 * 16);
       break;
     }
     case Type::DCT128X128: {
       ReinterpretingDCT<
           /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
           /*LF_ROWS=*/16, /*LF_COLS=*/16, /*ROWS=*/16, /*COLS=*/16>(
-          dc, dc_stride, llf, 16 * kBlockDim);
+          dc, dc_stride, llf, 16 * kBlockDim, scratch, scratch + 16 * 16);
       break;
     }
     case Type::DCT256X128: {
       ReinterpretingDCT<
           /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/16 * kBlockDim,
           /*LF_ROWS=*/32, /*LF_COLS=*/16, /*ROWS=*/32, /*COLS=*/16>(
-          dc, dc_stride, llf, 32 * kBlockDim);
+          dc, dc_stride, llf, 32 * kBlockDim, scratch, scratch + 32 * 16);
       break;
     }
     case Type::DCT128X256: {
       ReinterpretingDCT<
           /*DCT_ROWS=*/16 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
           /*LF_ROWS=*/16, /*LF_COLS=*/32, /*ROWS=*/16, /*COLS=*/32>(
-          dc, dc_stride, llf, 32 * kBlockDim);
+          dc, dc_stride, llf, 32 * kBlockDim, scratch, scratch + 16 * 32);
       break;
     }
     case Type::DCT256X256: {
       ReinterpretingDCT<
           /*DCT_ROWS=*/32 * kBlockDim, /*DCT_COLS=*/32 * kBlockDim,
           /*LF_ROWS=*/32, /*LF_COLS=*/32, /*ROWS=*/32, /*COLS=*/32>(
-          dc, dc_stride, llf, 32 * kBlockDim);
+          dc, dc_stride, llf, 32 * kBlockDim, scratch, scratch + 32 * 32);
       break;
     }
     case Type::DCT:
@@ -840,7 +814,7 @@ HWY_MAYBE_UNUSED void LowestFrequenciesFromDC(const AcStrategy::Type strategy,
       llf[0] = dc[0];
       break;
     case Type::kNumValidStrategies:
-      JXL_ABORT("Invalid strategy");
+      JXL_UNREACHABLE("Invalid strategy");
   };
 }
 
index 9ee80c5..2d40740 100644 (file)
@@ -27,9 +27,10 @@ void TransformToPixels(AcStrategy::Type strategy,
 
 HWY_EXPORT(LowestFrequenciesFromDC);
 void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy,
-                             const float* dc, size_t dc_stride, float* llf) {
+                             const float* dc, size_t dc_stride, float* llf,
+                             float* JXL_RESTRICT scratch) {
   return HWY_DYNAMIC_DISPATCH(LowestFrequenciesFromDC)(strategy, dc, dc_stride,
-                                                       llf);
+                                                       llf, scratch);
 }
 
 HWY_EXPORT(AFVIDCT4x4);
index 97c4ca5..f68481f 100644 (file)
@@ -23,7 +23,8 @@ void TransformToPixels(AcStrategy::Type strategy,
 
 // Equivalent of the above for DC image.
 void LowestFrequenciesFromDC(const jxl::AcStrategy::Type strategy,
-                             const float* dc, size_t dc_stride, float* llf);
+                             const float* dc, size_t dc_stride, float* llf,
+                             float* JXL_RESTRICT scratch);
 
 void AFVIDCT4x4(const float* JXL_RESTRICT coeffs, float* JXL_RESTRICT pixels);
 
index a4f24cd..495693b 100644 (file)
@@ -333,7 +333,7 @@ static inline HWY_MAYBE_UNUSED void FastXYBTosRGB8(const float* input[4],
   (void)output;
   (void)is_rgba;
   (void)xsize;
-  JXL_ABORT("Unreachable");
+  JXL_UNREACHABLE("Unreachable");
 #endif
 }
 
index ef4088f..7010f0d 100644 (file)
 #include <hwy/highway.h>
 
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/matrix_ops.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/cms/jxl_cms_internal.h"
+#include "lib/jxl/cms/opsin_params.h"
+#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/dec_group_border.h"
 #include "lib/jxl/dec_xyb-inl.h"
 #include "lib/jxl/fields.h"
@@ -27,12 +30,10 @@ namespace jxl {
 namespace HWY_NAMESPACE {
 
 // These templates are not found via ADL.
-using hwy::HWY_NAMESPACE::Broadcast;
 using hwy::HWY_NAMESPACE::MulAdd;
 
 void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
                           const OpsinParams& opsin_params) {
-  PROFILER_FUNC;
   JXL_CHECK_IMAGE_INITIALIZED(*inout, Rect(*inout));
 
   const size_t xsize = inout->xsize();  // not padded
@@ -70,8 +71,6 @@ void OpsinToLinearInplace(Image3F* JXL_RESTRICT inout, ThreadPool* pool,
 void OpsinToLinear(const Image3F& opsin, const Rect& rect, ThreadPool* pool,
                    Image3F* JXL_RESTRICT linear,
                    const OpsinParams& opsin_params) {
-  PROFILER_FUNC;
-
   JXL_ASSERT(SameSize(rect, *linear));
   JXL_CHECK_IMAGE_INITIALIZED(opsin, rect);
 
@@ -189,8 +188,8 @@ void FastXYBTosRGB8(const float* input[4], uint8_t* output, bool is_rgba,
 void OpsinParams::Init(float intensity_target) {
   InitSIMDInverseMatrix(GetOpsinAbsorbanceInverseMatrix(), inverse_opsin_matrix,
                         intensity_target);
-  memcpy(opsin_biases, kNegOpsinAbsorbanceBiasRGB,
-         sizeof(kNegOpsinAbsorbanceBiasRGB));
+  memcpy(opsin_biases, jxl::cms::kNegOpsinAbsorbanceBiasRGB.data(),
+         sizeof(jxl::cms::kNegOpsinAbsorbanceBiasRGB));
   memcpy(quant_biases, kDefaultQuantBias, sizeof(kDefaultQuantBias));
   for (size_t c = 0; c < 4; c++) {
     opsin_biases_cbrt[c] = cbrtf(opsin_biases[c]);
@@ -202,12 +201,12 @@ bool CanOutputToColorEncoding(const ColorEncoding& c_desired) {
     return false;
   }
   // TODO(veluca): keep in sync with dec_reconstruct.cc
-  if (!c_desired.tf.IsPQ() && !c_desired.tf.IsSRGB() &&
-      !c_desired.tf.IsGamma() && !c_desired.tf.IsLinear() &&
-      !c_desired.tf.IsHLG() && !c_desired.tf.IsDCI() && !c_desired.tf.Is709()) {
+  const auto& tf = c_desired.Tf();
+  if (!tf.IsPQ() && !tf.IsSRGB() && !tf.have_gamma && !tf.IsLinear() &&
+      !tf.IsHLG() && !tf.IsDCI() && !tf.Is709()) {
     return false;
   }
-  if (c_desired.IsGray() && c_desired.white_point != WhitePoint::kD65) {
+  if (c_desired.IsGray() && c_desired.GetWhitePointType() != WhitePoint::kD65) {
     // TODO(veluca): figure out what should happen here.
     return false;
   }
@@ -239,7 +238,13 @@ Status OutputEncodingInfo::SetFromMetadata(const CodecMetadata& metadata) {
 
 Status OutputEncodingInfo::MaybeSetColorEncoding(
     const ColorEncoding& c_desired) {
-  if (!xyb_encoded || !CanOutputToColorEncoding(c_desired)) {
+  if (c_desired.GetColorSpace() == ColorSpace::kXYB &&
+      ((color_encoding.GetColorSpace() == ColorSpace::kRGB &&
+        color_encoding.GetPrimariesType() != Primaries::kSRGB) ||
+       color_encoding.Tf().IsPQ())) {
+    return false;
+  }
+  if (!xyb_encoded && !CanOutputToColorEncoding(c_desired)) {
     return false;
   }
   return SetColorEncoding(c_desired);
@@ -247,6 +252,8 @@ Status OutputEncodingInfo::MaybeSetColorEncoding(
 
 Status OutputEncodingInfo::SetColorEncoding(const ColorEncoding& c_desired) {
   color_encoding = c_desired;
+  linear_color_encoding = color_encoding;
+  linear_color_encoding.Tf().SetTransferFunction(TransferFunction::kLinear);
   color_encoding_is_original = orig_color_encoding.SameColorEncoding(c_desired);
 
   // Compute the opsin inverse matrix and luminances based on primaries and
@@ -256,35 +263,35 @@ Status OutputEncodingInfo::SetColorEncoding(const ColorEncoding& c_desired) {
   memcpy(inverse_matrix, orig_inverse_matrix, sizeof(inverse_matrix));
   constexpr float kSRGBLuminances[3] = {0.2126, 0.7152, 0.0722};
   memcpy(luminances, kSRGBLuminances, sizeof(luminances));
-  if ((c_desired.primaries != Primaries::kSRGB ||
-       c_desired.white_point != WhitePoint::kD65) &&
+  if ((c_desired.GetPrimariesType() != Primaries::kSRGB ||
+       c_desired.GetWhitePointType() != WhitePoint::kD65) &&
       !c_desired.IsGray()) {
     float srgb_to_xyzd50[9];
     const auto& srgb = ColorEncoding::SRGB(/*is_gray=*/false);
-    JXL_CHECK(PrimariesToXYZD50(
-        srgb.GetPrimaries().r.x, srgb.GetPrimaries().r.y,
-        srgb.GetPrimaries().g.x, srgb.GetPrimaries().g.y,
-        srgb.GetPrimaries().b.x, srgb.GetPrimaries().b.y,
-        srgb.GetWhitePoint().x, srgb.GetWhitePoint().y, srgb_to_xyzd50));
+    PrimariesCIExy p = srgb.GetPrimaries();
+    CIExy w = srgb.GetWhitePoint();
+    JXL_CHECK(PrimariesToXYZD50(p.r.x, p.r.y, p.g.x, p.g.y, p.b.x, p.b.y, w.x,
+                                w.y, srgb_to_xyzd50));
     float original_to_xyz[3][3];
-    JXL_RETURN_IF_ERROR(PrimariesToXYZ(
-        c_desired.GetPrimaries().r.x, c_desired.GetPrimaries().r.y,
-        c_desired.GetPrimaries().g.x, c_desired.GetPrimaries().g.y,
-        c_desired.GetPrimaries().b.x, c_desired.GetPrimaries().b.y,
-        c_desired.GetWhitePoint().x, c_desired.GetWhitePoint().y,
-        &original_to_xyz[0][0]));
+    p = c_desired.GetPrimaries();
+    w = c_desired.GetWhitePoint();
+    if (!PrimariesToXYZ(p.r.x, p.r.y, p.g.x, p.g.y, p.b.x, p.b.y, w.x, w.y,
+                        &original_to_xyz[0][0])) {
+      return JXL_FAILURE("PrimariesToXYZ failed");
+    }
     memcpy(luminances, original_to_xyz[1], sizeof luminances);
     if (xyb_encoded) {
       float adapt_to_d50[9];
-      JXL_RETURN_IF_ERROR(AdaptToXYZD50(c_desired.GetWhitePoint().x,
-                                        c_desired.GetWhitePoint().y,
-                                        adapt_to_d50));
+      if (!AdaptToXYZD50(c_desired.GetWhitePoint().x,
+                         c_desired.GetWhitePoint().y, adapt_to_d50)) {
+        return JXL_FAILURE("AdaptToXYZD50 failed");
+      }
       float xyzd50_to_original[9];
-      MatMul(adapt_to_d50, &original_to_xyz[0][0], 3, 3, 3, xyzd50_to_original);
+      Mul3x3Matrix(adapt_to_d50, &original_to_xyz[0][0], xyzd50_to_original);
       JXL_RETURN_IF_ERROR(Inv3x3Matrix(xyzd50_to_original));
       float srgb_to_original[9];
-      MatMul(xyzd50_to_original, srgb_to_xyzd50, 3, 3, 3, srgb_to_original);
-      MatMul(srgb_to_original, orig_inverse_matrix, 3, 3, 3, inverse_matrix);
+      Mul3x3Matrix(xyzd50_to_original, srgb_to_xyzd50, srgb_to_original);
+      Mul3x3Matrix(srgb_to_original, orig_inverse_matrix, inverse_matrix);
       inverse_matrix_is_default = false;
     }
   }
@@ -296,26 +303,24 @@ Status OutputEncodingInfo::SetColorEncoding(const ColorEncoding& c_desired) {
     memcpy(&srgb_to_luma[0], luminances, sizeof(luminances));
     memcpy(&srgb_to_luma[3], luminances, sizeof(luminances));
     memcpy(&srgb_to_luma[6], luminances, sizeof(luminances));
-    MatMul(srgb_to_luma, tmp_inv_matrix, 3, 3, 3, inverse_matrix);
+    Mul3x3Matrix(srgb_to_luma, tmp_inv_matrix, inverse_matrix);
   }
 
   // The internal XYB color space uses absolute luminance, so we scale back the
   // opsin inverse matrix to relative luminance where 1.0 corresponds to the
-  // original intensity target, or to absolute luminance for PQ, where 1.0
-  // corresponds to 10000 nits.
+  // original intensity target.
   if (xyb_encoded) {
-    float intensity_target =
-        (c_desired.tf.IsPQ() ? 10000 : orig_intensity_target);
     InitSIMDInverseMatrix(inverse_matrix, opsin_params.inverse_opsin_matrix,
-                          intensity_target);
-    all_default_opsin = (std::abs(intensity_target - 255.0) <= 0.1f &&
+                          orig_intensity_target);
+    all_default_opsin = (std::abs(orig_intensity_target - 255.0) <= 0.1f &&
                          inverse_matrix_is_default);
   }
 
   // Set the inverse gamma based on color space transfer function.
-  inverse_gamma = (c_desired.tf.IsGamma() ? c_desired.tf.GetGamma()
-                   : c_desired.tf.IsDCI() ? 1.0f / 2.6f
-                                          : 1.0);
+  const auto& tf = c_desired.Tf();
+  inverse_gamma = (tf.have_gamma ? tf.GetGamma()
+                   : tf.IsDCI()  ? 1.0f / 2.6f
+                                 : 1.0);
   return true;
 }
 
index ebaae9a..29378f5 100644 (file)
@@ -8,6 +8,8 @@
 
 // XYB -> linear sRGB.
 
+#include <jxl/cms_interface.h>
+
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
@@ -15,7 +17,6 @@
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_metadata.h"
-#include "lib/jxl/opsin_params.h"
 
 namespace jxl {
 
@@ -42,7 +43,11 @@ struct OutputEncodingInfo {
   //
   // Fields depending on output color encoding
   //
+  // The requested color encoding.
   ColorEncoding color_encoding;
+  // This is expected as the output of the conversion from XYB.
+  // It is equal to `color_encoding`, but with a linear tone response curve.
+  ColorEncoding linear_color_encoding;
   bool color_encoding_is_original;
   // Contains an opsin matrix that converts to the primaries of the output
   // encoding.
@@ -56,6 +61,8 @@ struct OutputEncodingInfo {
   float luminances[3];
   // Used for the HLG inverse OOTF and PQ tone mapping.
   float desired_intensity_target;
+  bool cms_set = false;
+  JxlCmsInterface color_management_system;
 
   Status SetFromMetadata(const CodecMetadata& metadata);
   Status MaybeSetColorEncoding(const ColorEncoding& c_desired);
index 1a0facc..615ba20 100644 (file)
@@ -3,18 +3,36 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "jxl/decode.h"
+#include <jxl/decode.h>
+#include <jxl/types.h>
+
+#include <algorithm>
+#include <array>
+#include <functional>
+#include <memory>
+#include <utility>
+#include <vector>
 
-#include "jxl/types.h"
 #include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/padded_bytes.h"
+
+// JPEGXL_ENABLE_BOXES, JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/common.h"
+
+#if JPEGXL_ENABLE_BOXES || JPEGXL_ENABLE_TRANSCODE_JPEG
 #include "lib/jxl/box_content_decoder.h"
+#endif
 #include "lib/jxl/dec_external_image.h"
 #include "lib/jxl/dec_frame.h"
 #include "lib/jxl/dec_modular.h"
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
 #include "lib/jxl/decode_to_jpeg.h"
+#endif
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/headers.h"
 #include "lib/jxl/icc_codec.h"
@@ -34,14 +52,6 @@ bool OutOfBounds(size_t a, size_t b, size_t size) {
   return false;
 }
 
-bool SumOverflows(size_t a, size_t b, size_t c) {
-  size_t sum = a + b;
-  if (sum < b) return true;
-  sum += c;
-  if (sum < c) return true;
-  return false;
-}
-
 JXL_INLINE size_t InitialBasicInfoSizeHint() {
   // Amount of bytes before the start of the codestream in the container format,
   // assuming that the codestream is the first box after the signature and
@@ -73,12 +83,22 @@ JXL_INLINE size_t InitialBasicInfoSizeHint() {
    JXL_DEC_ERROR)
 #endif  // JXL_CRASH_ON_ERROR
 
+// Error caused by bad input (invalid file) rather than incorrect API usage.
+// For now there is no way to distinguish these two types of errors yet.
+#define JXL_INPUT_ERROR(format, ...) JXL_API_ERROR(format, ##__VA_ARGS__)
+
 JxlDecoderStatus ConvertStatus(JxlDecoderStatus status) { return status; }
 
 JxlDecoderStatus ConvertStatus(jxl::Status status) {
   return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR;
 }
 
+#define JXL_API_RETURN_IF_ERROR(expr)               \
+  {                                                 \
+    JxlDecoderStatus status_ = ConvertStatus(expr); \
+    if (status_ != JXL_DEC_SUCCESS) return status_; \
+  }
+
 JxlSignature ReadSignature(const uint8_t* buf, size_t len, size_t* pos) {
   if (*pos >= len) return JXL_SIG_NOT_ENOUGH_BYTES;
 
@@ -144,6 +164,19 @@ size_t BitsPerChannel(JxlDataType data_type) {
   }
 }
 
+template <typename T>
+uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                     JxlPixelFormat format) {
+  if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return BitsPerChannel(format.data_type);
+  } else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    return metadata.bit_depth.bits_per_sample;
+  } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
+    return bit_depth.bits_per_sample;
+  }
+  return 0;
+}
+
 enum class DecoderStage : uint32_t {
   kInited,              // Decoder created, no JxlDecoderProcessInput called yet
   kStarted,             // Running JxlDecoderProcessInput calls
@@ -157,10 +190,9 @@ enum class DecoderStage : uint32_t {
 };
 
 enum class FrameStage : uint32_t {
-  kHeader,      // Must parse frame header.
-  kTOC,         // Must parse TOC
-  kFull,        // Must parse full pixels
-  kFullOutput,  // Must output full pixels
+  kHeader,  // Must parse frame header.
+  kTOC,     // Must parse TOC
+  kFull,    // Must parse full pixels
 };
 
 enum class BoxStage : uint32_t {
@@ -177,7 +209,6 @@ enum class JpegReconStage : uint32_t {
   kNone,             // Not outputting
   kSettingMetadata,  // Ready to output, must set metadata to the jpeg_data
   kOutputting,       // Currently outputting the JPEG bytes
-  kFinished,         // JPEG reconstruction fully handled
 };
 
 /*
@@ -329,9 +360,9 @@ struct JxlDecoderStruct {
   bool last_codestream_seen;
   bool got_codestream_signature;
   bool got_basic_info;
-  bool got_transform_data;            // To skip everything before ICC.
-  bool got_all_headers;               // Codestream metadata headers.
-  bool post_headers;                  // Already decoding pixels.
+  bool got_transform_data;  // To skip everything before ICC.
+  bool got_all_headers;     // Codestream metadata headers.
+  bool post_headers;        // Already decoding pixels.
   jxl::ICCReader icc_reader;
   jxl::JxlDecoderFrameIndexBox frame_index_box;
   // This means either we actually got the preview image, or determined we
@@ -398,16 +429,10 @@ struct JxlDecoderStruct {
   // The intended downsampling ratio for the current progression step.
   size_t downsampling_target;
 
-  // Whether the preview out buffer was set. It is possible for the buffer to
-  // be nullptr and buffer_set to be true, indicating it was deliberately
-  // set to nullptr.
-  bool preview_out_buffer_set;
-  // Idem for the image buffer.
   // Set to true if either an image out buffer or an image out callback was set.
   bool image_out_buffer_set;
 
-  // Owned by the caller, buffers for DC image and full resolution images
-  void* preview_out_buffer;
+  // Owned by the caller, buffer for preview or full resolution image.
   void* image_out_buffer;
   JxlImageOutInitCallback image_out_init_callback;
   JxlImageOutRunCallback image_out_run_callback;
@@ -419,11 +444,10 @@ struct JxlDecoderStruct {
   };
   SimpleImageOutCallback simple_image_out_callback;
 
-  size_t preview_out_size;
   size_t image_out_size;
 
-  JxlPixelFormat preview_out_format;
   JxlPixelFormat image_out_format;
+  JxlBitDepth image_out_bit_depth;
 
   // For extra channels. Empty if no extra channels are requested, and they are
   // reset each frame
@@ -439,8 +463,6 @@ struct JxlDecoderStruct {
   std::unique_ptr<jxl::FrameDecoder> frame_dec;
   size_t next_section;
   std::vector<char> section_processed;
-  // The FrameDecoder is initialized, and not yet finalized
-  bool frame_dec_in_progress;
 
   // headers and TOC for the current frame. When got_toc is true, this is
   // always the frame header of the last frame of the current still series,
@@ -503,8 +525,11 @@ struct JxlDecoderStruct {
 
   BoxStage box_stage;
 
-  jxl::JxlToJpegDecoder jpeg_decoder;
+#if JPEGXL_ENABLE_BOXES
   jxl::JxlBoxContentDecoder box_content_decoder;
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+  jxl::JxlToJpegDecoder jpeg_decoder;
   // Decodes Exif or XMP metadata for JPEG reconstruction
   jxl::JxlBoxContentDecoder metadata_decoder;
   std::vector<uint8_t> exif_metadata;
@@ -525,9 +550,7 @@ struct JxlDecoderStruct {
     if (store_xmp < 2 && recon_xmp_size > 0) return true;
     return false;
   }
-
-  // Statistics which CodecInOut can keep
-  uint64_t dec_pixels;
+#endif
 
   const uint8_t* next_in;
   size_t avail_in;
@@ -602,15 +625,15 @@ struct JxlDecoderStruct {
       if (avail_codestream == 0) {
         return RequestMoreInput();
       }
-      *span = jxl::Span<const uint8_t>(next_in, avail_codestream);
+      *span = jxl::Bytes(next_in, avail_codestream);
       return JXL_DEC_SUCCESS;
     } else {
       codestream_copy.insert(codestream_copy.end(),
                              next_in + codestream_unconsumed,
                              next_in + avail_codestream);
       codestream_unconsumed = avail_codestream;
-      *span = jxl::Span<const uint8_t>(codestream_copy.data() + codestream_pos,
-                                       codestream_copy.size() - codestream_pos);
+      *span = jxl::Bytes(codestream_copy.data() + codestream_pos,
+                         codestream_copy.size() - codestream_pos);
       return JXL_DEC_SUCCESS;
     }
   }
@@ -619,7 +642,7 @@ struct JxlDecoderStruct {
   // This returns false if the user didn't subscribe to any events that
   // require the codestream (e.g. only subscribed to metadata boxes), or all
   // parts of the codestream that are subscribed to (e.g. only basic info) have
-  // already occured.
+  // already occurred.
   bool CanUseMoreCodestreamInput() const {
     // The decoder can set this to finished early if all relevant events were
     // processed, so this check works.
@@ -653,14 +676,6 @@ bool CheckSizeLimit(JxlDecoder* dec, size_t xsize, size_t ysize) {
 
 }  // namespace
 
-// TODO(zond): Make this depend on the data loaded into the decoder.
-JxlDecoderStatus JxlDecoderDefaultPixelFormat(const JxlDecoder* dec,
-                                              JxlPixelFormat* format) {
-  if (!dec->got_basic_info) return JXL_DEC_NEED_MORE_INPUT;
-  *format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
-  return JXL_DEC_SUCCESS;
-}
-
 // Resets the state that must be reset for both Rewind and Reset
 void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
   dec->stage = DecoderStage::kInited;
@@ -691,6 +706,8 @@ void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
   dec->box_out_buffer_size = 0;
   dec->box_out_buffer_begin = 0;
   dec->box_out_buffer_pos = 0;
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
   dec->exif_metadata.clear();
   dec->xmp_metadata.clear();
   dec->store_exif = 0;
@@ -699,24 +716,22 @@ void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
   dec->recon_exif_size = 0;
   dec->recon_xmp_size = 0;
   dec->recon_output_jpeg = JpegReconStage::kNone;
+#endif
 
-  dec->events_wanted = 0;
+  dec->events_wanted = dec->orig_events_wanted;
   dec->basic_info_size_hint = InitialBasicInfoSizeHint();
   dec->have_container = 0;
   dec->box_count = 0;
   dec->downsampling_target = 8;
-  dec->preview_out_buffer_set = false;
   dec->image_out_buffer_set = false;
-  dec->preview_out_buffer = nullptr;
   dec->image_out_buffer = nullptr;
   dec->image_out_init_callback = nullptr;
   dec->image_out_run_callback = nullptr;
   dec->image_out_destroy_callback = nullptr;
   dec->image_out_init_opaque = nullptr;
-  dec->preview_out_size = 0;
   dec->image_out_size = 0;
+  dec->image_out_bit_depth.type = JXL_BIT_DEPTH_FROM_PIXEL_FORMAT;
   dec->extra_channel_output.clear();
-  dec->dec_pixels = 0;
   dec->next_in = 0;
   dec->avail_in = 0;
   dec->input_closed = false;
@@ -725,7 +740,6 @@ void JxlDecoderRewindDecodingState(JxlDecoder* dec) {
   dec->frame_dec.reset(nullptr);
   dec->next_section = 0;
   dec->section_processed.clear();
-  dec->frame_dec_in_progress = false;
 
   dec->ib.reset();
   dec->metadata = jxl::CodecMetadata();
@@ -757,6 +771,7 @@ void JxlDecoderReset(JxlDecoder* dec) {
   dec->coalescing = true;
   dec->desired_intensity_target = 0;
   dec->orig_events_wanted = 0;
+  dec->events_wanted = 0;
   dec->frame_references.clear();
   dec->frame_saved_as.clear();
   dec->frame_external_to_internal.clear();
@@ -832,12 +847,12 @@ void JxlDecoderSkipFrames(JxlDecoder* dec, size_t amount) {
 }
 
 JxlDecoderStatus JxlDecoderSkipCurrentFrame(JxlDecoder* dec) {
-  if (!dec->frame_dec || !dec->frame_dec_in_progress) {
-    return JXL_DEC_ERROR;
+  if (dec->frame_stage != FrameStage::kFull) {
+    return JXL_API_ERROR("JxlDecoderSkipCurrentFrame called at the wrong time");
   }
+  JXL_DASSERT(dec->frame_dec);
   dec->frame_stage = FrameStage::kHeader;
   dec->AdvanceCodestream(dec->remaining_frame_size);
-  dec->frame_dec_in_progress = false;
   if (dec->is_last_of_still) {
     dec->image_out_buffer_set = false;
   }
@@ -848,7 +863,8 @@ JXL_EXPORT JxlDecoderStatus
 JxlDecoderSetParallelRunner(JxlDecoder* dec, JxlParallelRunner parallel_runner,
                             void* parallel_runner_opaque) {
   if (dec->stage != DecoderStage::kInited) {
-    return JXL_API_ERROR("parallel_runner must be set before starting");
+    return JXL_API_ERROR(
+        "JxlDecoderSetParallelRunner must be called before starting");
   }
   dec->thread_pool.reset(
       new jxl::ThreadPool(parallel_runner, parallel_runner_opaque));
@@ -873,11 +889,11 @@ JxlDecoderStatus JxlDecoderSubscribeEvents(JxlDecoder* dec, int events_wanted) {
 }
 
 JxlDecoderStatus JxlDecoderSetKeepOrientation(JxlDecoder* dec,
-                                              JXL_BOOL keep_orientation) {
+                                              JXL_BOOL skip_reorientation) {
   if (dec->stage != DecoderStage::kInited) {
     return JXL_API_ERROR("Must set keep_orientation option before starting");
   }
-  dec->keep_orientation = !!keep_orientation;
+  dec->keep_orientation = !!skip_reorientation;
   return JXL_DEC_SUCCESS;
 }
 
@@ -909,20 +925,19 @@ JxlDecoderStatus JxlDecoderSetCoalescing(JxlDecoder* dec, JXL_BOOL coalescing) {
 
 namespace {
 // helper function to get the dimensions of the current image buffer
-void GetCurrentDimensions(const JxlDecoder* dec, size_t& xsize, size_t& ysize,
-                          bool oriented) {
+void GetCurrentDimensions(const JxlDecoder* dec, size_t& xsize, size_t& ysize) {
   if (dec->frame_header->nonserialized_is_preview) {
     xsize = dec->metadata.oriented_preview_xsize(dec->keep_orientation);
     ysize = dec->metadata.oriented_preview_ysize(dec->keep_orientation);
     return;
   }
-  xsize = dec->metadata.oriented_xsize(dec->keep_orientation || !oriented);
-  ysize = dec->metadata.oriented_ysize(dec->keep_orientation || !oriented);
+  xsize = dec->metadata.oriented_xsize(dec->keep_orientation);
+  ysize = dec->metadata.oriented_ysize(dec->keep_orientation);
   if (!dec->coalescing) {
     const auto frame_dim = dec->frame_header->ToFrameDimensions();
     xsize = frame_dim.xsize_upsampled;
     ysize = frame_dim.ysize_upsampled;
-    if (!dec->keep_orientation && oriented &&
+    if (!dec->keep_orientation &&
         static_cast<int>(dec->metadata.m.GetOrientation()) > 4) {
       std::swap(xsize, ysize);
     }
@@ -957,12 +972,6 @@ JxlDecoderStatus ReadBundle(JxlDecoder* dec, Span<const uint8_t> data,
   return JXL_DEC_SUCCESS;
 }
 
-#define JXL_API_RETURN_IF_ERROR(expr)               \
-  {                                                 \
-    JxlDecoderStatus status_ = ConvertStatus(expr); \
-    if (status_ != JXL_DEC_SUCCESS) return status_; \
-  }
-
 std::unique_ptr<BitReader, std::function<void(BitReader*)>> GetBitReader(
     Span<const uint8_t> span) {
   BitReader* reader = new BitReader(span);
@@ -987,7 +996,7 @@ JxlDecoderStatus JxlDecoderReadBasicInfo(JxlDecoder* dec) {
       return dec->RequestMoreInput();
     }
     if (span.data()[0] != 0xff || span.data()[1] != jxl::kCodestreamMarker) {
-      return JXL_API_ERROR("invalid signature");
+      return JXL_INPUT_ERROR("invalid signature");
     }
     dec->got_codestream_signature = true;
     dec->AdvanceCodestream(2);
@@ -1010,7 +1019,7 @@ JxlDecoderStatus JxlDecoderReadBasicInfo(JxlDecoder* dec) {
 
   if (!CheckSizeLimit(dec, dec->metadata.size.xsize(),
                       dec->metadata.size.ysize())) {
-    return JXL_API_ERROR("image is too large");
+    return JXL_INPUT_ERROR("image is too large");
   }
 
   return JXL_DEC_SUCCESS;
@@ -1053,8 +1062,8 @@ JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) {
       // Other non-successful status is an error
       return JXL_DEC_ERROR;
     }
-    PaddedBytes icc;
-    status = dec->icc_reader.Process(reader.get(), &icc);
+    PaddedBytes decoded_icc;
+    status = dec->icc_reader.Process(reader.get(), &decoded_icc);
     if (status.code() == StatusCode::kNotEnoughBytes) {
       return dec->RequestMoreInput();
     }
@@ -1062,9 +1071,12 @@ JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) {
       // Other non-successful status is an error
       return JXL_DEC_ERROR;
     }
-    if (!dec->metadata.m.color_encoding.SetICCRaw(std::move(icc))) {
+    if (decoded_icc.empty()) {
       return JXL_DEC_ERROR;
     }
+    IccBytes icc;
+    Bytes(decoded_icc).AppendTo(&icc);
+    dec->metadata.m.color_encoding.SetICCRaw(std::move(icc));
   }
 
   dec->got_all_headers = true;
@@ -1088,58 +1100,6 @@ JxlDecoderStatus JxlDecoderReadAllHeaders(JxlDecoder* dec) {
   return JXL_DEC_SUCCESS;
 }
 
-static size_t GetStride(const JxlDecoder* dec, const JxlPixelFormat& format) {
-  size_t xsize, ysize;
-  GetCurrentDimensions(dec, xsize, ysize, true);
-  size_t stride = xsize * (BitsPerChannel(format.data_type) *
-                           format.num_channels / jxl::kBitsPerByte);
-  if (format.align > 1) {
-    stride = jxl::DivCeil(stride, format.align) * format.align;
-  }
-  return stride;
-}
-
-// Internal wrapper around jxl::ConvertToExternal which converts the stride,
-// format and orientation and allows to choose whether to get all RGB(A)
-// channels or alternatively get a single extra channel.
-// If want_extra_channel, a valid index to a single extra channel must be
-// given, the output must be single-channel, and format.num_channels is ignored
-// and treated as if it is 1.
-static JxlDecoderStatus ConvertImageInternal(
-    const JxlDecoder* dec, const jxl::ImageBundle& frame,
-    const JxlPixelFormat& format, bool want_extra_channel,
-    size_t extra_channel_index, void* out_image, size_t out_size,
-    const PixelCallback& out_callback) {
-  // TODO(lode): handle mismatch of RGB/grayscale color profiles and pixel data
-  // color/grayscale format
-  const size_t stride = GetStride(dec, format);
-
-  bool float_format = format.data_type == JXL_TYPE_FLOAT ||
-                      format.data_type == JXL_TYPE_FLOAT16;
-
-  jxl::Orientation undo_orientation = dec->keep_orientation
-                                          ? jxl::Orientation::kIdentity
-                                          : dec->metadata.m.GetOrientation();
-
-  jxl::Status status(true);
-  if (want_extra_channel) {
-    JXL_ASSERT(extra_channel_index < frame.extra_channels().size());
-    status = jxl::ConvertToExternal(frame.extra_channels()[extra_channel_index],
-                                    BitsPerChannel(format.data_type),
-                                    float_format, format.endianness, stride,
-                                    dec->thread_pool.get(), out_image, out_size,
-                                    out_callback, undo_orientation);
-  } else {
-    status = jxl::ConvertToExternal(
-        frame, BitsPerChannel(format.data_type), float_format,
-        format.num_channels, format.endianness, stride, dec->thread_pool.get(),
-        out_image, out_size, out_callback, undo_orientation,
-        dec->unpremul_alpha);
-  }
-
-  return status ? JXL_DEC_SUCCESS : JXL_DEC_ERROR;
-}
-
 JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) {
   Span<const uint8_t> span;
   JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
@@ -1148,15 +1108,17 @@ JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) {
   std::vector<jxl::FrameDecoder::SectionInfo> section_info;
   std::vector<jxl::FrameDecoder::SectionStatus> section_status;
   for (size_t i = dec->next_section; i < toc.size(); ++i) {
-    if (dec->section_processed[i]) continue;
+    if (dec->section_processed[i]) {
+      pos += toc[i].size;
+      continue;
+    }
     size_t id = toc[i].id;
     size_t size = toc[i].size;
     if (OutOfBounds(pos, size, span.size())) {
       break;
     }
-    auto br =
-        new jxl::BitReader(jxl::Span<const uint8_t>(span.data() + pos, size));
-    section_info.emplace_back(jxl::FrameDecoder::SectionInfo{br, id});
+    auto br = new jxl::BitReader(jxl::Bytes(span.data() + pos, size));
+    section_info.emplace_back(jxl::FrameDecoder::SectionInfo{br, id, i});
     section_status.emplace_back();
     pos += size;
   }
@@ -1176,31 +1138,27 @@ JxlDecoderStatus JxlDecoderProcessSections(JxlDecoder* dec) {
     // If any bit reader indicates out of bounds, it's an error, not just
     // needing more input, since we ensure only bit readers containing
     // a complete section are provided to the FrameDecoder.
-    return JXL_API_ERROR("frame out of bounds");
+    return JXL_INPUT_ERROR("frame out of bounds");
   }
   if (!status) {
-    return JXL_API_ERROR("frame processing failed");
+    return JXL_INPUT_ERROR("frame processing failed");
   }
-  bool found_skipped_section = false;
-  size_t num_done = 0;
-  size_t processed_bytes = 0;
   for (size_t i = 0; i < section_status.size(); ++i) {
     auto status = section_status[i];
     if (status == jxl::FrameDecoder::kDone) {
-      if (!found_skipped_section) {
-        processed_bytes += toc[dec->next_section + i].size;
-        ++num_done;
-      }
-      dec->section_processed[dec->next_section + i] = 1;
-    } else if (status == jxl::FrameDecoder::kSkipped) {
-      found_skipped_section = true;
-    } else {
-      return JXL_API_ERROR("unexpected section status");
+      dec->section_processed[section_info[i].index] = 1;
+    } else if (status != jxl::FrameDecoder::kSkipped) {
+      return JXL_INPUT_ERROR("unexpected section status");
     }
   }
-  dec->next_section += num_done;
-  dec->remaining_frame_size -= processed_bytes;
-  dec->AdvanceCodestream(processed_bytes);
+  size_t completed_prefix_bytes = 0;
+  while (dec->next_section < dec->section_processed.size() &&
+         dec->section_processed[dec->next_section] == 1) {
+    completed_prefix_bytes += toc[dec->next_section].size;
+    ++dec->next_section;
+  }
+  dec->remaining_frame_size -= completed_prefix_bytes;
+  dec->AdvanceCodestream(completed_prefix_bytes);
   return JXL_DEC_SUCCESS;
 }
 
@@ -1262,6 +1220,7 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
       break;
     }
     if (dec->frame_stage == FrameStage::kHeader) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
       if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata ||
           dec->recon_output_jpeg == JpegReconStage::kOutputting) {
         // The image bundle contains the JPEG reconstruction frame, but the
@@ -1271,14 +1230,16 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
         return JXL_API_ERROR(
             "cannot decode a next frame after JPEG reconstruction frame");
       }
+#endif
       if (!dec->ib) {
         dec->ib.reset(new jxl::ImageBundle(&dec->image_metadata));
       }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
       // If JPEG reconstruction is wanted and possible, set the jpeg_data of
       // the ImageBundle.
       if (!dec->jpeg_decoder.SetImageBundleJpegData(dec->ib.get()))
         return JXL_DEC_ERROR;
-
+#endif
       dec->frame_dec.reset(new FrameDecoder(
           dec->passes_state.get(), dec->metadata, dec->thread_pool.get(),
           /*use_slow_rendering_pipeline=*/false));
@@ -1286,33 +1247,36 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
       Span<const uint8_t> span;
       JXL_API_RETURN_IF_ERROR(dec->GetCodestreamInput(&span));
       auto reader = GetBitReader(span);
-      bool output_needed =
-          (dec->preview_frame ? (dec->events_wanted & JXL_DEC_PREVIEW_IMAGE)
-                              : (dec->events_wanted & JXL_DEC_FULL_IMAGE));
       jxl::Status status = dec->frame_dec->InitFrame(
-          reader.get(), dec->ib.get(), dec->preview_frame, output_needed);
+          reader.get(), dec->ib.get(), dec->preview_frame);
       if (!reader->AllReadsWithinBounds() ||
           status.code() == StatusCode::kNotEnoughBytes) {
         return dec->RequestMoreInput();
       } else if (!status) {
-        return JXL_API_ERROR("invalid frame header");
+        return JXL_INPUT_ERROR("invalid frame header");
       }
       dec->AdvanceCodestream(reader->TotalBitsConsumed() / kBitsPerByte);
       *dec->frame_header = dec->frame_dec->GetFrameHeader();
       jxl::FrameDimensions frame_dim = dec->frame_header->ToFrameDimensions();
       if (!CheckSizeLimit(dec, frame_dim.xsize_upsampled_padded,
                           frame_dim.ysize_upsampled_padded)) {
-        return JXL_API_ERROR("frame is too large");
+        return JXL_INPUT_ERROR("frame is too large");
+      }
+      bool output_needed =
+          (dec->preview_frame ? (dec->events_wanted & JXL_DEC_PREVIEW_IMAGE)
+                              : (dec->events_wanted & JXL_DEC_FULL_IMAGE));
+      if (output_needed) {
+        JXL_API_RETURN_IF_ERROR(dec->frame_dec->InitFrameOutput());
       }
       if (dec->cpu_limit_base != 0) {
         // No overflow, checked in CheckSizeLimit.
         size_t num_pixels = frame_dim.xsize * frame_dim.ysize;
         if (dec->used_cpu_base + num_pixels < dec->used_cpu_base) {
-          return JXL_API_ERROR("used too much CPU");
+          return JXL_INPUT_ERROR("image too large");
         }
         dec->used_cpu_base += num_pixels;
         if (dec->used_cpu_base > dec->cpu_limit_base) {
-          return JXL_API_ERROR("used too much CPU");
+          return JXL_INPUT_ERROR("image too large");
         }
       }
       dec->remaining_frame_size = dec->frame_dec->SumSectionSizes();
@@ -1417,10 +1381,8 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
       dec->section_processed.clear();
       dec->section_processed.resize(dec->frame_dec->Toc().size(), 0);
 
-      // If we don't need pixels, we can skip actually decoding the frames
-      // (kFull / kFullOut).
+      // If we don't need pixels, we can skip actually decoding the frames.
       if (dec->preview_frame || (dec->events_wanted & JXL_DEC_FULL_IMAGE)) {
-        dec->frame_dec_in_progress = true;
         dec->frame_stage = FrameStage::kFull;
       } else if (!dec->is_last_total) {
         dec->frame_stage = FrameStage::kHeader;
@@ -1431,61 +1393,45 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
       }
     }
 
-    bool return_full_image = false;
-
     if (dec->frame_stage == FrameStage::kFull) {
-      if (dec->preview_frame) {
-        if (!dec->preview_out_buffer_set) {
+      if (!dec->image_out_buffer_set) {
+        if (dec->preview_frame) {
           return JXL_DEC_NEED_PREVIEW_OUT_BUFFER;
         }
-      } else if (dec->events_wanted & JXL_DEC_FULL_IMAGE) {
-        if (!dec->image_out_buffer_set &&
+        if (
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
             (!dec->jpeg_decoder.IsOutputSet() ||
              dec->ib->jpeg_data == nullptr) &&
-            dec->is_last_of_still) {
+#endif
+            dec->is_last_of_still && !dec->skipping_frame) {
           // TODO(lode): remove the dec->is_last_of_still condition if the
           // frame decoder needs the image buffer as working space for decoding
           // non-visible or blending frames too
-          if (!dec->skipping_frame) {
-            return JXL_DEC_NEED_IMAGE_OUT_BUFFER;
-          }
+          return JXL_DEC_NEED_IMAGE_OUT_BUFFER;
         }
       }
 
-      dec->frame_dec->MaybeSetUnpremultiplyAlpha(dec->unpremul_alpha);
-
-      if (!dec->preview_frame && dec->image_out_buffer_set &&
-          !!dec->image_out_buffer &&
-          dec->image_out_format.data_type == JXL_TYPE_UINT8 &&
-          dec->image_out_format.num_channels >= 3 &&
-          dec->extra_channel_output.empty()) {
-        bool is_rgba = dec->image_out_format.num_channels == 4;
-        dec->frame_dec->MaybeSetRGB8OutputBuffer(
-            reinterpret_cast<uint8_t*>(dec->image_out_buffer),
-            GetStride(dec, dec->image_out_format), is_rgba,
-            !dec->keep_orientation);
-      }
-
-      const bool little_endian =
-          dec->image_out_format.endianness == JXL_LITTLE_ENDIAN ||
-          (dec->image_out_format.endianness == JXL_NATIVE_ENDIAN &&
-           IsLittleEndian());
-      bool swap_endianness = little_endian != IsLittleEndian();
-
-      // TODO(lode): Support more formats than just native endian float32 for
-      // the low-memory callback path
-      if (!dec->preview_frame && dec->image_out_buffer_set &&
-          !!dec->image_out_init_callback && !!dec->image_out_run_callback &&
-          dec->image_out_format.data_type == JXL_TYPE_FLOAT &&
-          dec->image_out_format.num_channels >= 3 &&
-          dec->extra_channel_output.empty() && !swap_endianness &&
-          dec->frame_dec_in_progress) {
-        bool is_rgba = dec->image_out_format.num_channels == 4;
-        dec->frame_dec->MaybeSetFloatCallback(
+      if (dec->image_out_buffer_set) {
+        size_t xsize, ysize;
+        GetCurrentDimensions(dec, xsize, ysize);
+        size_t bits_per_sample = GetBitDepth(
+            dec->image_out_bit_depth, dec->metadata.m, dec->image_out_format);
+        dec->frame_dec->SetImageOutput(
             PixelCallback{
                 dec->image_out_init_callback, dec->image_out_run_callback,
                 dec->image_out_destroy_callback, dec->image_out_init_opaque},
-            is_rgba, dec->unpremul_alpha, !dec->keep_orientation);
+            reinterpret_cast<uint8_t*>(dec->image_out_buffer),
+            dec->image_out_size, xsize, ysize, dec->image_out_format,
+            bits_per_sample, dec->unpremul_alpha, !dec->keep_orientation);
+        for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
+          const auto& extra = dec->extra_channel_output[i];
+          size_t ec_bits_per_sample =
+              GetBitDepth(dec->image_out_bit_depth,
+                          dec->metadata.m.extra_channel_info[i], extra.format);
+          dec->frame_dec->AddExtraChannelOutput(extra.buffer, extra.buffer_size,
+                                                xsize, extra.format,
+                                                ec_bits_per_sample);
+        }
       }
 
       size_t next_num_passes_to_pause = dec->frame_dec->NextNumPassesToPause();
@@ -1526,103 +1472,40 @@ JxlDecoderStatus JxlDecoderProcessCodestream(JxlDecoder* dec) {
         // this frame was skipped before and set to 255, while only now we know
         // the true value.
         dec->frame_references[internal_index] = dec->frame_dec->References();
-        // Copy exif/xmp metadata from their boxes into the jpeg_data, if
-        // JPEG reconstruction is requested.
-        if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
-        }
       }
 
       if (!dec->frame_dec->FinalizeFrame()) {
-        return JXL_API_ERROR("decoding frame failed");
+        return JXL_INPUT_ERROR("decoding frame failed");
       }
-
-      dec->frame_dec_in_progress = false;
-      dec->frame_stage = FrameStage::kFullOutput;
-    }
-
-    bool output_jpeg_reconstruction = false;
-
-    if (dec->frame_stage == FrameStage::kFullOutput) {
-      if (dec->preview_frame) {
-        JxlDecoderStatus status =
-            ConvertImageInternal(dec, *dec->ib, dec->preview_out_format,
-                                 /*want_extra_channel=*/false,
-                                 /*extra_channel_index=*/0,
-                                 dec->preview_out_buffer, dec->preview_out_size,
-                                 /*out_callback=*/{});
-        if (status != JXL_DEC_SUCCESS) return status;
-      } else if (dec->is_last_of_still) {
-        if (dec->events_wanted & JXL_DEC_FULL_IMAGE) {
-          dec->events_wanted &= ~JXL_DEC_FULL_IMAGE;
-          return_full_image = true;
-        }
-
-        // Frame finished, restore the events_wanted with the per-frame events
-        // from orig_events_wanted, in case there is a next frame.
-        dec->events_wanted |=
-            (dec->orig_events_wanted &
-             (JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME | JXL_DEC_FRAME_PROGRESSION));
-
-        // If no output buffer was set, we merely return the JXL_DEC_FULL_IMAGE
-        // status without outputting pixels.
-        if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
-          output_jpeg_reconstruction = true;
-        } else if (return_full_image && dec->image_out_buffer_set) {
-          if (!dec->frame_dec->HasRGBBuffer()) {
-            // Copy pixels if desired.
-            JxlDecoderStatus status = ConvertImageInternal(
-                dec, *dec->ib, dec->image_out_format,
-                /*want_extra_channel=*/false,
-                /*extra_channel_index=*/0, dec->image_out_buffer,
-                dec->image_out_size,
-                PixelCallback{dec->image_out_init_callback,
-                              dec->image_out_run_callback,
-                              dec->image_out_destroy_callback,
-                              dec->image_out_init_opaque});
-            if (status != JXL_DEC_SUCCESS) return status;
-          }
-          dec->image_out_buffer_set = false;
-
-          bool has_ec = !dec->ib->extra_channels().empty();
-          for (size_t i = 0; i < dec->extra_channel_output.size(); ++i) {
-            void* buffer = dec->extra_channel_output[i].buffer;
-            // buffer nullptr indicates this extra channel is not requested
-            if (!buffer) continue;
-            if (!has_ec) {
-              JXL_WARNING(
-                  "Extra channels are not supported when callback is used");
-              return JXL_DEC_ERROR;
-            }
-            const JxlPixelFormat* format = &dec->extra_channel_output[i].format;
-            JxlDecoderStatus status = ConvertImageInternal(
-                dec, *dec->ib, *format,
-                /*want_extra_channel=*/true, /*extra_channel_index=*/i, buffer,
-                dec->extra_channel_output[i].buffer_size, /*out_callback=*/{});
-            if (status != JXL_DEC_SUCCESS) return status;
-          }
-
-          dec->extra_channel_output.clear();
-        }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+      // If jpeg output was requested, we merely return the JXL_DEC_FULL_IMAGE
+      // status without outputting pixels.
+      if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
+        dec->frame_stage = FrameStage::kHeader;
+        dec->recon_output_jpeg = JpegReconStage::kSettingMetadata;
+        return JXL_DEC_FULL_IMAGE;
+      }
+#endif
+      if (dec->preview_frame || dec->is_last_of_still) {
+        dec->image_out_buffer_set = false;
+        dec->extra_channel_output.clear();
       }
     }
 
     dec->frame_stage = FrameStage::kHeader;
 
-    if (output_jpeg_reconstruction) {
-      dec->recon_output_jpeg = JpegReconStage::kSettingMetadata;
+    // The pixels have been output or are not needed, do not keep them in
+    // memory here.
+    dec->ib.reset();
+    if (dec->preview_frame) {
+      dec->got_preview_image = true;
+      dec->preview_frame = false;
+      dec->events_wanted &= ~JXL_DEC_PREVIEW_IMAGE;
+      return JXL_DEC_PREVIEW_IMAGE;
+    } else if (dec->is_last_of_still &&
+               (dec->events_wanted & JXL_DEC_FULL_IMAGE) &&
+               !dec->skipping_frame) {
       return JXL_DEC_FULL_IMAGE;
-    } else {
-      // The pixels have been output or are not needed, do not keep them in
-      // memory here.
-      dec->ib.reset();
-      if (dec->preview_frame) {
-        dec->got_preview_image = true;
-        dec->preview_frame = false;
-        dec->events_wanted &= ~JXL_DEC_PREVIEW_IMAGE;
-        return JXL_DEC_PREVIEW_IMAGE;
-      } else if (return_full_image && !dec->skipping_frame) {
-        return JXL_DEC_FULL_IMAGE;
-      }
     }
   }
 
@@ -1659,6 +1542,7 @@ void JxlDecoderCloseInput(JxlDecoder* dec) { dec->input_closed = true; }
 
 JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, uint8_t* data,
                                          size_t size) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
   // JPEG reconstruction buffer can only set and updated before or during the
   // first frame, the reconstruction box refers to the first frame and in
   // theory multi-frame images should not be used with a jbrd box.
@@ -1669,10 +1553,17 @@ JxlDecoderStatus JxlDecoderSetJPEGBuffer(JxlDecoder* dec, uint8_t* data,
     return JXL_API_ERROR("Already set JPEG buffer");
   }
   return dec->jpeg_decoder.SetOutputBuffer(data, size);
+#else
+  return JXL_API_ERROR("JPEG reconstruction is not supported.");
+#endif
 }
 
 size_t JxlDecoderReleaseJPEGBuffer(JxlDecoder* dec) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
   return dec->jpeg_decoder.ReleaseOutputBuffer();
+#else
+  return JXL_API_ERROR("JPEG reconstruction is not supported.");
+#endif
 }
 
 // Parses the header of the box, outputting the 4-character type and the box
@@ -1703,20 +1594,20 @@ static JxlDecoderStatus ParseBoxHeader(const uint8_t* in, size_t size,
   // Box size, including this header itself.
   *box_size = LoadBE32(in + pos);
   pos += 4;
+  memcpy(type, in + pos, 4);
+  pos += 4;
   if (*box_size == 1) {
     *header_size = 16;
-    if (OutOfBounds(pos, 12, size)) return JXL_DEC_NEED_MORE_INPUT;
+    if (OutOfBounds(pos, 8, size)) return JXL_DEC_NEED_MORE_INPUT;
     *box_size = LoadBE64(in + pos);
     pos += 8;
   }
-  memcpy(type, in + pos, 4);
-  pos += 4;
   *header_size = pos - box_start;
   if (*box_size > 0 && *box_size < *header_size) {
-    return JXL_API_ERROR("invalid box size");
+    return JXL_INPUT_ERROR("invalid box size");
   }
-  if (SumOverflows(file_pos, pos, *box_size)) {
-    return JXL_API_ERROR("Box size overflow");
+  if (file_pos + *box_size < file_pos) {
+    return JXL_INPUT_ERROR("Box size overflow");
   }
   return JXL_DEC_SUCCESS;
 }
@@ -1728,6 +1619,7 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
     if (dec->box_stage != BoxStage::kHeader) {
       dec->AdvanceInput(dec->header_size);
       dec->header_size = 0;
+#if JPEGXL_ENABLE_BOXES
       if ((dec->events_wanted & JXL_DEC_BOX) &&
           dec->box_out_buffer_set_current_box) {
         uint8_t* next_out = dec->box_out_buffer + dec->box_out_buffer_pos;
@@ -1748,7 +1640,8 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
           return box_result;
         }
       }
-
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
       if (dec->store_exif == 1 || dec->store_xmp == 1) {
         std::vector<uint8_t>& metadata =
             (dec->store_exif == 1) ? dec->exif_metadata : dec->xmp_metadata;
@@ -1786,8 +1679,9 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
           }
         }
       }
+#endif
     }
-
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
     if (dec->recon_output_jpeg == JpegReconStage::kSettingMetadata &&
         !dec->JbrdNeedMoreBoxes()) {
       jxl::jpeg::JPEGData* jpeg_data = dec->ib->jpeg_data.get();
@@ -1809,7 +1703,7 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
       JxlDecoderStatus status =
           dec->jpeg_decoder.WriteOutput(*dec->ib->jpeg_data);
       if (status != JXL_DEC_SUCCESS) return status;
-      dec->recon_output_jpeg = JpegReconStage::kFinished;
+      dec->recon_output_jpeg = JpegReconStage::kNone;
       dec->ib.reset();
       if (dec->events_wanted & JXL_DEC_FULL_IMAGE) {
         // Return the full image event here now, this may be delayed if this
@@ -1818,6 +1712,7 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
         return JXL_DEC_FULL_IMAGE;
       }
     }
+#endif
 
     if (dec->box_stage == BoxStage::kHeader) {
       if (!dec->have_container) {
@@ -1832,9 +1727,11 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
           // Not yet seen (all) codestream boxes.
           return JXL_DEC_NEED_MORE_INPUT;
         }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
         if (dec->JbrdNeedMoreBoxes()) {
           return JXL_DEC_NEED_MORE_INPUT;
         }
+#endif
         if (dec->input_closed) {
           return JXL_DEC_SUCCESS;
         }
@@ -1855,7 +1752,10 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
       bool boxed_codestream_done =
           ((dec->events_wanted & JXL_DEC_BOX) &&
            dec->stage == DecoderStage::kCodestreamFinished &&
-           dec->last_codestream_seen && !dec->JbrdNeedMoreBoxes());
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+           !dec->JbrdNeedMoreBoxes() &&
+#endif
+           dec->last_codestream_seen);
       if (boxed_codestream_done && dec->avail_in >= 2 &&
           dec->next_in[0] == 0xff &&
           dec->next_in[1] == jxl::kCodestreamMarker) {
@@ -1896,10 +1796,10 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
         return JXL_DEC_SUCCESS;
       }
       if (dec->box_count == 2 && memcmp(dec->box_type, "ftyp", 4) != 0) {
-        return JXL_API_ERROR("the second box must be the ftyp box");
+        return JXL_INPUT_ERROR("the second box must be the ftyp box");
       }
       if (memcmp(dec->box_type, "ftyp", 4) == 0 && dec->box_count != 2) {
-        return JXL_API_ERROR("the ftyp box must come second");
+        return JXL_INPUT_ERROR("the ftyp box must come second");
       }
 
       dec->box_contents_unbounded = (box_size == 0);
@@ -1910,7 +1810,7 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
           dec->box_contents_unbounded ? 0 : (box_size - header_size);
       dec->box_size = box_size;
       dec->header_size = header_size;
-
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
       if (dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) {
         // Initiate storing of Exif or XMP data for JPEG reconstruction
         if (dec->store_exif == 0 &&
@@ -1924,36 +1824,41 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
           dec->recon_out_buffer_pos = 0;
         }
       }
-
+#endif
+#if JPEGXL_ENABLE_BOXES
       if (dec->events_wanted & JXL_DEC_BOX) {
         bool decompress =
             dec->decompress_boxes && memcmp(dec->box_type, "brob", 4) == 0;
         dec->box_content_decoder.StartBox(
             decompress, dec->box_contents_unbounded, dec->box_contents_size);
       }
+#endif
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
       if (dec->store_exif == 1 || dec->store_xmp == 1) {
         bool brob = memcmp(dec->box_type, "brob", 4) == 0;
         dec->metadata_decoder.StartBox(brob, dec->box_contents_unbounded,
                                        dec->box_contents_size);
       }
-
+#endif
       if (memcmp(dec->box_type, "ftyp", 4) == 0) {
         dec->box_stage = BoxStage::kFtyp;
       } else if (memcmp(dec->box_type, "jxlc", 4) == 0) {
         if (dec->last_codestream_seen) {
-          return JXL_API_ERROR("there can only be one jxlc box");
+          return JXL_INPUT_ERROR("there can only be one jxlc box");
         }
         dec->last_codestream_seen = true;
         dec->box_stage = BoxStage::kCodestream;
       } else if (memcmp(dec->box_type, "jxlp", 4) == 0) {
         dec->box_stage = BoxStage::kPartialCodestream;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
       } else if ((dec->orig_events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) &&
                  memcmp(dec->box_type, "jbrd", 4) == 0) {
         if (!(dec->events_wanted & JXL_DEC_JPEG_RECONSTRUCTION)) {
-          return JXL_API_ERROR(
+          return JXL_INPUT_ERROR(
               "multiple JPEG reconstruction boxes not supported");
         }
         dec->box_stage = BoxStage::kJpegRecon;
+#endif
       } else {
         dec->box_stage = BoxStage::kSkip;
       }
@@ -1965,22 +1870,22 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
       }
     } else if (dec->box_stage == BoxStage::kFtyp) {
       if (dec->box_contents_size < 12) {
-        return JXL_API_ERROR("file type box too small");
+        return JXL_INPUT_ERROR("file type box too small");
       }
       if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
       if (memcmp(dec->next_in, "jxl ", 4) != 0) {
-        return JXL_API_ERROR("file type box major brand must be \"jxl \"");
+        return JXL_INPUT_ERROR("file type box major brand must be \"jxl \"");
       }
       dec->AdvanceInput(4);
       dec->box_stage = BoxStage::kSkip;
     } else if (dec->box_stage == BoxStage::kPartialCodestream) {
       if (dec->last_codestream_seen) {
-        return JXL_API_ERROR("cannot have jxlp box after last jxlp box");
+        return JXL_INPUT_ERROR("cannot have jxlp box after last jxlp box");
       }
       // TODO(lode): error if box is unbounded but last bit not set
       if (dec->avail_in < 4) return JXL_DEC_NEED_MORE_INPUT;
       if (!dec->box_contents_unbounded && dec->box_contents_size < 4) {
-        return JXL_API_ERROR("jxlp box too small to contain index");
+        return JXL_INPUT_ERROR("jxlp box too small to contain index");
       }
       size_t jxlp_index = LoadBE32(dec->next_in);
       // The high bit of jxlp_index indicates whether this is the last
@@ -1992,11 +1897,13 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
       dec->box_stage = BoxStage::kCodestream;
     } else if (dec->box_stage == BoxStage::kCodestream) {
       JxlDecoderStatus status = jxl::JxlDecoderProcessCodestream(dec);
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
       if (status == JXL_DEC_FULL_IMAGE) {
         if (dec->recon_output_jpeg != JpegReconStage::kNone) {
           continue;
         }
       }
+#endif
       if (status == JXL_DEC_NEED_MORE_INPUT) {
         if (dec->file_pos == dec->box_contents_end &&
             !dec->box_contents_unbounded) {
@@ -2006,10 +1913,12 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
       }
 
       if (status == JXL_DEC_SUCCESS) {
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
         if (dec->JbrdNeedMoreBoxes()) {
           dec->box_stage = BoxStage::kSkip;
           continue;
         }
+#endif
         if (dec->box_contents_unbounded) {
           // Last box reached and codestream done, nothing more to do.
           break;
@@ -2021,6 +1930,7 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
         }
       }
       return status;
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
     } else if (dec->box_stage == BoxStage::kJpegRecon) {
       if (!dec->jpeg_decoder.IsParsingBox()) {
         // This is a new JPEG reconstruction metadata box.
@@ -2039,36 +1949,37 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
         size_t num_xmp = jxl::JxlToJpegDecoder::NumXmpMarkers(*jpeg_data);
         if (num_exif) {
           if (num_exif > 1) {
-            return JXL_API_ERROR(
+            return JXL_INPUT_ERROR(
                 "multiple exif markers for JPEG reconstruction not supported");
           }
           if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::ExifBoxContentSize(
                                      *jpeg_data, &dec->recon_exif_size)) {
-            return JXL_API_ERROR("invalid jbrd exif size");
+            return JXL_INPUT_ERROR("invalid jbrd exif size");
           }
         }
         if (num_xmp) {
           if (num_xmp > 1) {
-            return JXL_API_ERROR(
+            return JXL_INPUT_ERROR(
                 "multiple XMP markers for JPEG reconstruction not supported");
           }
           if (JXL_DEC_SUCCESS != jxl::JxlToJpegDecoder::XmlBoxContentSize(
                                      *jpeg_data, &dec->recon_xmp_size)) {
-            return JXL_API_ERROR("invalid jbrd XMP size");
+            return JXL_INPUT_ERROR("invalid jbrd XMP size");
           }
         }
 
         dec->box_stage = BoxStage::kHeader;
         // If successful JPEG reconstruction, return the success if the user
         // cares about it, otherwise continue.
-        if (dec->events_wanted & recon_result) {
-          dec->events_wanted &= ~recon_result;
-          return recon_result;
+        if (dec->events_wanted & JXL_DEC_JPEG_RECONSTRUCTION) {
+          dec->events_wanted &= ~JXL_DEC_JPEG_RECONSTRUCTION;
+          return JXL_DEC_JPEG_RECONSTRUCTION;
         }
       } else {
         // If anything else, return the result.
         return recon_result;
       }
+#endif
     } else if (dec->box_stage == BoxStage::kSkip) {
       if (dec->box_contents_unbounded) {
         if (dec->input_closed) {
@@ -2104,7 +2015,6 @@ static JxlDecoderStatus HandleBoxes(JxlDecoder* dec) {
       JXL_DASSERT(false);  // unknown box stage
     }
   }
-
   return JXL_DEC_SUCCESS;
 }
 
@@ -2120,10 +2030,10 @@ JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {
 
   if (!dec->got_signature) {
     JxlSignature sig = JxlSignatureCheck(dec->next_in, dec->avail_in);
-    if (sig == JXL_SIG_INVALID) return JXL_API_ERROR("invalid signature");
+    if (sig == JXL_SIG_INVALID) return JXL_INPUT_ERROR("invalid signature");
     if (sig == JXL_SIG_NOT_ENOUGH_BYTES) {
       if (dec->input_closed) {
-        return JXL_API_ERROR("file too small for signature");
+        return JXL_INPUT_ERROR("file too small for signature");
       }
       return JXL_DEC_NEED_MORE_INPUT;
     }
@@ -2140,18 +2050,20 @@ JxlDecoderStatus JxlDecoderProcessInput(JxlDecoder* dec) {
   JxlDecoderStatus status = HandleBoxes(dec);
 
   if (status == JXL_DEC_NEED_MORE_INPUT && dec->input_closed) {
-    return JXL_API_ERROR("missing input");
+    return JXL_INPUT_ERROR("premature end of input");
   }
 
   // Even if the box handling returns success, certain types of
   // data may be missing.
   if (status == JXL_DEC_SUCCESS) {
     if (dec->CanUseMoreCodestreamInput()) {
-      return JXL_API_ERROR("codestream never finished");
+      return JXL_INPUT_ERROR("codestream never finished");
     }
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
     if (dec->JbrdNeedMoreBoxes()) {
-      return JXL_API_ERROR("missing metadata boxes for jpeg reconstruction");
+      return JXL_INPUT_ERROR("missing metadata boxes for jpeg reconstruction");
     }
+#endif
   }
 
   return status;
@@ -2307,8 +2219,8 @@ JxlDecoderStatus GetColorEncodingForTarget(
 }  // namespace
 
 JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
-    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
-    JxlColorProfileTarget target, JxlColorEncoding* color_encoding) {
+    const JxlDecoder* dec, JxlColorProfileTarget target,
+    JxlColorEncoding* color_encoding) {
   const jxl::ColorEncoding* jxl_color_encoding = nullptr;
   JxlDecoderStatus status =
       GetColorEncodingForTarget(dec, target, &jxl_color_encoding);
@@ -2318,15 +2230,15 @@ JxlDecoderStatus JxlDecoderGetColorAsEncodedProfile(
     return JXL_DEC_ERROR;  // Indicate no encoded profile available.
 
   if (color_encoding) {
-    ConvertInternalToExternalColorEncoding(*jxl_color_encoding, color_encoding);
+    *color_encoding = jxl_color_encoding->ToExternal();
   }
 
   return JXL_DEC_SUCCESS;
 }
 
-JxlDecoderStatus JxlDecoderGetICCProfileSize(
-    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
-    JxlColorProfileTarget target, size_t* size) {
+JxlDecoderStatus JxlDecoderGetICCProfileSize(const JxlDecoder* dec,
+                                             JxlColorProfileTarget target,
+                                             size_t* size) {
   const jxl::ColorEncoding* jxl_color_encoding = nullptr;
   JxlDecoderStatus status =
       GetColorEncodingForTarget(dec, target, &jxl_color_encoding);
@@ -2352,13 +2264,14 @@ JxlDecoderStatus JxlDecoderGetICCProfileSize(
   return JXL_DEC_SUCCESS;
 }
 
-JxlDecoderStatus JxlDecoderGetColorAsICCProfile(
-    const JxlDecoder* dec, const JxlPixelFormat* unused_format,
-    JxlColorProfileTarget target, uint8_t* icc_profile, size_t size) {
+JxlDecoderStatus JxlDecoderGetColorAsICCProfile(const JxlDecoder* dec,
+                                                JxlColorProfileTarget target,
+                                                uint8_t* icc_profile,
+                                                size_t size) {
   size_t wanted_size;
   // This also checks the NEED_MORE_INPUT and the unknown/xyb cases
   JxlDecoderStatus status =
-      JxlDecoderGetICCProfileSize(dec, nullptr, target, &wanted_size);
+      JxlDecoderGetICCProfileSize(dec, target, &wanted_size);
   if (status != JXL_DEC_SUCCESS) return status;
   if (size < wanted_size) return JXL_API_ERROR("ICC profile output too small");
 
@@ -2407,9 +2320,10 @@ size_t JxlDecoderGetIntendedDownsamplingRatio(JxlDecoder* dec) {
 
 JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
   if (!dec->image_out_buffer_set) return JXL_DEC_ERROR;
-  if (!dec->frame_dec || !dec->frame_dec_in_progress) {
+  if (dec->frame_stage != FrameStage::kFull) {
     return JXL_DEC_ERROR;
   }
+  JXL_DASSERT(dec->frame_dec);
   if (!dec->frame_dec->HasDecodedDC()) {
     // FrameDecoder::Flush currently requires DC to have been decoded already
     // to work correctly.
@@ -2420,30 +2334,16 @@ JxlDecoderStatus JxlDecoderFlushImage(JxlDecoder* dec) {
     return JXL_DEC_ERROR;
   }
 
-  if (dec->jpeg_decoder.IsOutputSet() && dec->ib->jpeg_data != nullptr) {
-    return JXL_DEC_SUCCESS;
-  }
+  return JXL_DEC_SUCCESS;
+}
 
-  if (dec->frame_dec->HasRGBBuffer()) {
-    return JXL_DEC_SUCCESS;
+JXL_EXPORT JxlDecoderStatus JxlDecoderSetCms(JxlDecoder* dec,
+                                             const JxlCmsInterface cms) {
+  if (!dec->passes_state) {
+    dec->passes_state.reset(new jxl::PassesDecoderState());
   }
-
-  // Temporarily shrink `dec->ib` to the actual size of the full image to call
-  // ConvertImageInternal.
-  size_t xsize = dec->ib->xsize();
-  size_t ysize = dec->ib->ysize();
-  size_t xsize_nopadding, ysize_nopadding;
-  GetCurrentDimensions(dec, xsize_nopadding, ysize_nopadding, false);
-  dec->ib->ShrinkTo(xsize_nopadding, ysize_nopadding);
-  JxlDecoderStatus status = jxl::ConvertImageInternal(
-      dec, *dec->ib, dec->image_out_format,
-      /*want_extra_channel=*/false,
-      /*extra_channel_index=*/0, dec->image_out_buffer, dec->image_out_size,
-      jxl::PixelCallback{
-          dec->image_out_init_callback, dec->image_out_run_callback,
-          dec->image_out_destroy_callback, dec->image_out_init_opaque});
-  dec->ib->ShrinkTo(xsize, ysize);
-  if (status != JXL_DEC_SUCCESS) return status;
+  dec->passes_state->output_encoding_info.color_management_system = cms;
+  dec->passes_state->output_encoding_info.cms_set = true;
   return JXL_DEC_SUCCESS;
 }
 
@@ -2490,38 +2390,11 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderSetPreviewOutBuffer(
 
   if (size < min_size) return JXL_DEC_ERROR;
 
-  dec->preview_out_buffer_set = true;
-  dec->preview_out_buffer = buffer;
-  dec->preview_out_size = size;
-  dec->preview_out_format = *format;
-
-  return JXL_DEC_SUCCESS;
-}
-
-JXL_EXPORT JxlDecoderStatus JxlDecoderDCOutBufferSize(
-    const JxlDecoder* dec, const JxlPixelFormat* format, size_t* size) {
-  size_t bits;
-  JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits);
-  if (status != JXL_DEC_SUCCESS) return status;
-
-  size_t xsize = jxl::DivCeil(
-      dec->metadata.oriented_xsize(dec->keep_orientation), jxl::kBlockDim);
-  size_t ysize = jxl::DivCeil(
-      dec->metadata.oriented_ysize(dec->keep_orientation), jxl::kBlockDim);
-
-  size_t row_size =
-      jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte);
-  size_t last_row_size = row_size;
-  if (format->align > 1) {
-    row_size = jxl::DivCeil(row_size, format->align) * format->align;
-  }
-  *size = row_size * (ysize - 1) + last_row_size;
-  return JXL_DEC_SUCCESS;
-}
+  dec->image_out_buffer_set = true;
+  dec->image_out_buffer = buffer;
+  dec->image_out_size = size;
+  dec->image_out_format = *format;
 
-JXL_EXPORT JxlDecoderStatus JxlDecoderSetDCOutBuffer(
-    JxlDecoder* dec, const JxlPixelFormat* format, void* buffer, size_t size) {
-  // No buffer set: this feature is deprecated
   return JXL_DEC_SUCCESS;
 }
 
@@ -2535,7 +2408,7 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderImageOutBufferSize(
     return JXL_API_ERROR("Number of channels is too low for color output");
   }
   size_t xsize, ysize;
-  GetCurrentDimensions(dec, xsize, ysize, true);
+  GetCurrentDimensions(dec, xsize, ysize);
   size_t row_size =
       jxl::DivCeil(xsize * format->num_channels * bits, jxl::kBitsPerByte);
   if (format->align > 1) {
@@ -2596,7 +2469,7 @@ JxlDecoderStatus JxlDecoderExtraChannelBufferSize(const JxlDecoder* dec,
   if (status != JXL_DEC_SUCCESS) return status;
 
   size_t xsize, ysize;
-  GetCurrentDimensions(dec, xsize, ysize, true);
+  GetCurrentDimensions(dec, xsize, ysize);
   size_t row_size =
       jxl::DivCeil(xsize * num_channels * bits, jxl::kBitsPerByte);
   if (format->align > 1) {
@@ -2675,8 +2548,8 @@ JxlDecoderStatus JxlDecoderSetMultithreadedImageOutCallback(
   }
 
   // Perform error checking for invalid format.
-  size_t bits_dummy;
-  JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits_dummy);
+  size_t bits_sink;
+  JxlDecoderStatus status = PrepareSizeCheck(dec, format, &bits_sink);
   if (status != JXL_DEC_SUCCESS) return status;
 
   dec->image_out_buffer_set = true;
@@ -2705,7 +2578,7 @@ JxlDecoderStatus JxlDecoderGetFrameHeader(const JxlDecoder* dec,
   header->name_length = dec->frame_header->name.size();
   header->is_last = dec->frame_header->is_last;
   size_t xsize, ysize;
-  GetCurrentDimensions(dec, xsize, ysize, true);
+  GetCurrentDimensions(dec, xsize, ysize);
   header->layer_info.xsize = xsize;
   header->layer_info.ysize = ysize;
   if (!dec->coalescing && dec->frame_header->custom_size_or_origin) {
@@ -2789,32 +2662,62 @@ JxlDecoderStatus JxlDecoderGetFrameName(const JxlDecoder* dec, char* name,
 
 JxlDecoderStatus JxlDecoderSetPreferredColorProfile(
     JxlDecoder* dec, const JxlColorEncoding* color_encoding) {
+  return JxlDecoderSetOutputColorProfile(dec, color_encoding,
+                                         /*icc_data=*/nullptr, /*icc_size=*/0);
+}
+
+JxlDecoderStatus JxlDecoderSetOutputColorProfile(
+    JxlDecoder* dec, const JxlColorEncoding* color_encoding,
+    const uint8_t* icc_data, size_t icc_size) {
+  if ((color_encoding != nullptr) && (icc_data != nullptr)) {
+    return JXL_API_ERROR("cannot set both color_encoding and icc_data");
+  }
+  if ((color_encoding == nullptr) && (icc_data == nullptr)) {
+    return JXL_API_ERROR("one of color_encoding and icc_data must be set");
+  }
   if (!dec->got_all_headers) {
     return JXL_API_ERROR("color info not yet available");
   }
   if (dec->post_headers) {
     return JXL_API_ERROR("too late to set the color encoding");
   }
-  if (dec->image_metadata.color_encoding.IsGray() &&
-      color_encoding->color_space != JXL_COLOR_SPACE_GRAY &&
-      ((dec->preview_out_buffer_set &&
-        dec->preview_out_format.num_channels < 3) ||
-       (dec->image_out_buffer_set && dec->image_out_format.num_channels < 3))) {
-    return JXL_API_ERROR("Number of channels is too low for color output");
-  }
-  if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN ||
-      color_encoding->color_space == JXL_COLOR_SPACE_XYB) {
-    return JXL_API_ERROR("only RGB or grayscale output supported");
+  if ((!dec->passes_state->output_encoding_info.cms_set) &&
+      (icc_data != nullptr)) {
+    return JXL_API_ERROR(
+        "must set color management system via JxlDecoderSetCms");
   }
-
-  jxl::ColorEncoding c_out;
-  JXL_API_RETURN_IF_ERROR(
-      ConvertExternalToInternalColorEncoding(*color_encoding, &c_out));
   auto& output_encoding = dec->passes_state->output_encoding_info;
-  if (!c_out.SameColorEncoding(output_encoding.color_encoding)) {
-    JXL_API_RETURN_IF_ERROR(output_encoding.MaybeSetColorEncoding(c_out));
-    dec->image_metadata.color_encoding = output_encoding.color_encoding;
+  if (color_encoding) {
+    if (dec->image_metadata.color_encoding.IsGray() &&
+        color_encoding->color_space != JXL_COLOR_SPACE_GRAY &&
+        dec->image_out_buffer_set && dec->image_out_format.num_channels < 3) {
+      return JXL_API_ERROR("Number of channels is too low for color output");
+    }
+    if (color_encoding->color_space == JXL_COLOR_SPACE_UNKNOWN) {
+      return JXL_API_ERROR("Unknown output colorspace");
+    }
+    jxl::ColorEncoding c_out;
+    JXL_API_RETURN_IF_ERROR(c_out.FromExternal(*color_encoding));
+    JXL_API_RETURN_IF_ERROR(!c_out.ICC().empty());
+    if (!c_out.SameColorEncoding(output_encoding.color_encoding)) {
+      JXL_API_RETURN_IF_ERROR(output_encoding.MaybeSetColorEncoding(c_out));
+      dec->image_metadata.color_encoding = output_encoding.color_encoding;
+    }
+    return JXL_DEC_SUCCESS;
   }
+  // icc_data != nullptr
+  // TODO(firsching): implement setting output color profile from icc_data.
+  jxl::ColorEncoding c_dst;
+  std::vector<uint8_t> padded_icc;
+  padded_icc.assign(icc_data, icc_data + icc_size);
+  if (!c_dst.SetICC(std::move(padded_icc),
+                    &output_encoding.color_management_system)) {
+    return JXL_API_ERROR(
+        "setting output color profile from icc_data not yet implemented.");
+  }
+  JXL_API_RETURN_IF_ERROR(
+      (int)output_encoding.MaybeSetColorEncoding(std::move(c_dst)));
+
   return JXL_DEC_SUCCESS;
 }
 
@@ -2905,3 +2808,33 @@ JxlDecoderStatus JxlDecoderSetProgressiveDetail(JxlDecoder* dec,
   dec->prog_detail = detail;
   return JXL_DEC_SUCCESS;
 }
+
+namespace {
+
+template <typename T>
+JxlDecoderStatus VerifyOutputBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                                      JxlPixelFormat format) {
+  uint32_t bits_per_sample = GetBitDepth(bit_depth, metadata, format);
+  if (bits_per_sample == 0) return JXL_API_ERROR("Invalid output bit depth");
+  if (format.data_type == JXL_TYPE_UINT8 && bits_per_sample > 8) {
+    return JXL_API_ERROR("Invalid bit depth %u for uint8 output",
+                         bits_per_sample);
+  } else if (format.data_type == JXL_TYPE_UINT16 && bits_per_sample > 16) {
+    return JXL_API_ERROR("Invalid bit depth %u for uint16 output",
+                         bits_per_sample);
+  }
+  return JXL_DEC_SUCCESS;
+}
+
+}  // namespace
+
+JxlDecoderStatus JxlDecoderSetImageOutBitDepth(JxlDecoder* dec,
+                                               const JxlBitDepth* bit_depth) {
+  if (!dec->image_out_buffer_set) {
+    return JXL_API_ERROR("No image out buffer was set.");
+  }
+  JXL_API_RETURN_IF_ERROR(
+      VerifyOutputBitDepth(*bit_depth, dec->metadata.m, dec->image_out_format));
+  dec->image_out_bit_depth = *bit_depth;
+  return JXL_DEC_SUCCESS;
+}
index 5b9b735..c763608 100644 (file)
@@ -3,51 +3,56 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "jxl/decode.h"
-
-#include <stdint.h>
-#include <stdlib.h>
-
+#include <jxl/cms.h>
+#include <jxl/color_encoding.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <jxl/types.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
 #include <sstream>
 #include <string>
+#include <tuple>
 #include <utility>
 #include <vector>
 
-#include "gtest/gtest.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/resizable_parallel_runner_cxx.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-#include "jxl/types.h"
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/color_description.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/cms/color_encoding_cms.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_external_image.h"
-#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_progressive_split.h"
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/headers.h"
 #include "lib/jxl/icc_codec.h"
 #include "lib/jxl/image_metadata.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
-#include "lib/jxl/progressive_split.h"
+#include "lib/jxl/padded_bytes.h"
+#include "lib/jxl/test_image.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 #include "lib/jxl/toc.h"
 
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace {
-void AppendU32BE(uint32_t u32, jxl::PaddedBytes* bytes) {
+void AppendU32BE(uint32_t u32, std::vector<uint8_t>* bytes) {
   bytes->push_back(u32 >> 24);
   bytes->push_back(u32 >> 16);
   bytes->push_back(u32 >> 8);
@@ -115,7 +120,7 @@ size_t exif_uncompressed_size = 94;
 // but with, on purpose, rXYZ, bXYZ and gXYZ (the RGB primaries) switched to a
 // different order to ensure the profile does not match any known profile, so
 // the encoder cannot encode it in a compact struct instead.
-jxl::PaddedBytes GetIccTestProfile() {
+jxl::IccBytes GetIccTestProfile() {
   const uint8_t* profile = reinterpret_cast<const uint8_t*>(
       "\0\0\3\200lcms\0040\0\0mntrRGB XYZ "
       "\a\344\0\a\0\27\0\21\0$"
@@ -149,7 +154,7 @@ jxl::PaddedBytes GetIccTestProfile() {
       "\0l\0emluc\0\0\0\0\0\0\0\1\0\0\0\fenUS\0\0\0\26\0\0\0\34\0I\0m\0a\0g\0e"
       "\0 \0c\0o\0d\0e\0c\0\0");
   size_t profile_size = 896;
-  jxl::PaddedBytes icc_profile;
+  jxl::IccBytes icc_profile;
   icc_profile.assign(profile, profile + profile_size);
   return icc_profile;
 }
@@ -160,26 +165,61 @@ namespace jxl {
 namespace {
 
 void AppendTestBox(const char* type, const char* contents, size_t contents_size,
-                   bool unbounded, PaddedBytes* bytes) {
+                   bool unbounded, std::vector<uint8_t>* bytes) {
   AppendU32BE(contents_size + 8, bytes);
   bytes->push_back(type[0]);
   bytes->push_back(type[1]);
   bytes->push_back(type[2]);
   bytes->push_back(type[3]);
   const uint8_t* contents_u = reinterpret_cast<const uint8_t*>(contents);
-  bytes->append(contents_u, contents_u + contents_size);
+  Bytes(contents_u, contents_size).AppendTo(bytes);
+}
+
+enum PreviewMode {
+  kNoPreview,
+  kSmallPreview,
+  kBigPreview,
+  kNumPreviewModes,
+};
+
+void GeneratePreview(PreviewMode preview_mode, ImageBundle* ib) {
+  if (preview_mode == kSmallPreview) {
+    ib->ShrinkTo(ib->xsize() / 7, ib->ysize() / 7);
+  } else if (preview_mode == kBigPreview) {
+    auto upsample7 = [&](const ImageF& in, ImageF* out) {
+      for (size_t y = 0; y < out->ysize(); ++y) {
+        for (size_t x = 0; x < out->xsize(); ++x) {
+          out->Row(y)[x] = in.ConstRow(y / 7)[x / 7];
+        }
+      }
+    };
+    Image3F preview(ib->xsize() * 7, ib->ysize() * 7);
+    for (size_t c = 0; c < 3; ++c) {
+      upsample7(ib->color()->Plane(c), &preview.Plane(c));
+    }
+    std::vector<ImageF> extra_channels;
+    for (size_t i = 0; i < ib->extra_channels().size(); ++i) {
+      ImageF ec(ib->xsize() * 7, ib->ysize() * 7);
+      upsample7(ib->extra_channels()[i], &ec);
+      extra_channels.emplace_back(std::move(ec));
+    }
+    ib->RemoveColor();
+    ib->ClearExtraChannels();
+    ib->SetFromImage(std::move(preview), ib->c_current());
+    ib->SetExtraChannels(std::move(extra_channels));
+  }
 }
 
 struct TestCodestreamParams {
   CompressParams cparams;
   CodeStreamBoxFormat box_format = kCSBF_None;
   JxlOrientation orientation = JXL_ORIENT_IDENTITY;
-  bool add_preview = false;
+  PreviewMode preview_mode = kNoPreview;
   bool add_intrinsic_size = false;
   bool add_icc_profile = false;
   float intensity_target = 0.0;
   std::string color_space;
-  PaddedBytes* jpeg_codestream = nullptr;
+  std::vector<uint8_t>* jpeg_codestream = nullptr;
   const ProgressiveMode* progressive_mode = nullptr;
 };
 
@@ -193,9 +233,9 @@ struct TestCodestreamParams {
 // Providing jpeg_codestream will populate the jpeg_codestream with compressed
 // JPEG bytes, and make it possible to reconstruct those exact JPEG bytes using
 // the return value _if_ add_container indicates a box format.
-PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
-                                    size_t ysize, size_t num_channels,
-                                    const TestCodestreamParams& params) {
+std::vector<uint8_t> CreateTestJXLCodestream(
+    Span<const uint8_t> pixels, size_t xsize, size_t ysize, size_t num_channels,
+    const TestCodestreamParams& params) {
   // Compress the pixels with JPEG XL.
   bool grayscale = (num_channels <= 2);
   bool include_alpha = !(num_channels & 1) && params.jpeg_codestream == nullptr;
@@ -207,16 +247,15 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
     // the hardcoded ICC profile we attach requires RGB.
     EXPECT_EQ(false, grayscale);
     EXPECT_TRUE(params.color_space.empty());
-    EXPECT_TRUE(color_encoding.SetICC(GetIccTestProfile()));
+    EXPECT_TRUE(color_encoding.SetICC(GetIccTestProfile(), JxlGetDefaultCms()));
   } else if (!params.color_space.empty()) {
     JxlColorEncoding c;
     EXPECT_TRUE(jxl::ParseDescription(params.color_space, &c));
-    EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &color_encoding));
+    EXPECT_TRUE(color_encoding.FromExternal(c));
     EXPECT_EQ(color_encoding.IsGray(), grayscale);
   } else {
     color_encoding = jxl::ColorEncoding::SRGB(/*is_gray=*/grayscale);
   }
-  ThreadPool pool(nullptr, nullptr);
   io.metadata.m.SetUintSamples(bitdepth);
   if (include_alpha) {
     io.metadata.m.SetAlphaBits(bitdepth);
@@ -224,35 +263,35 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
   if (params.intensity_target != 0) {
     io.metadata.m.SetIntensityTarget(params.intensity_target);
   }
+  JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
+                           JXL_BIG_ENDIAN, 0};
   // Make the grayscale-ness of the io metadata color_encoding and the packed
   // image match.
   io.metadata.m.color_encoding = color_encoding;
-  EXPECT_TRUE(ConvertFromExternal(
-      pixels, xsize, ysize, color_encoding, num_channels,
-      /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
-      &pool, &io.Main(), /*float_in=*/false, /*align=*/0));
-  jxl::PaddedBytes jpeg_data;
+  EXPECT_TRUE(ConvertFromExternal(pixels, xsize, ysize, color_encoding,
+                                  /*bits_per_sample=*/16, format,
+                                  /* pool */ nullptr, &io.Main()));
+  std::vector<uint8_t> jpeg_data;
   if (params.jpeg_codestream != nullptr) {
-#if JPEGXL_ENABLE_JPEG
-    std::vector<uint8_t> jpeg_bytes;
-    io.jpeg_quality = 70;
-    EXPECT_TRUE(Encode(io, extras::Codec::kJPG, io.metadata.m.color_encoding,
-                       /*bits_per_sample=*/8, &jpeg_bytes, &pool));
-    params.jpeg_codestream->append(jpeg_bytes.data(),
-                                   jpeg_bytes.data() + jpeg_bytes.size());
-    EXPECT_TRUE(jxl::jpeg::DecodeImageJPG(
-        jxl::Span<const uint8_t>(jpeg_bytes.data(), jpeg_bytes.size()), &io));
-    EXPECT_TRUE(
-        EncodeJPEGData(*io.Main().jpeg_data, &jpeg_data, params.cparams));
-    io.metadata.m.xyb_encoded = false;
-#else   // JPEGXL_ENABLE_JPEG
-    JXL_ABORT(
-        "unable to create reconstructible JPEG without JPEG support enabled");
-#endif  // JPEGXL_ENABLE_JPEG
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kJPG)) {
+      std::vector<uint8_t> jpeg_bytes;
+      io.jpeg_quality = 70;
+      EXPECT_TRUE(Encode(io, extras::Codec::kJPG, io.metadata.m.color_encoding,
+                         /*bits_per_sample=*/8, &jpeg_bytes));
+      Bytes(jpeg_bytes).AppendTo(params.jpeg_codestream);
+      EXPECT_TRUE(jxl::jpeg::DecodeImageJPG(
+          jxl::Bytes(jpeg_bytes.data(), jpeg_bytes.size()), &io));
+      EXPECT_TRUE(
+          EncodeJPEGData(*io.Main().jpeg_data, &jpeg_data, params.cparams));
+      io.metadata.m.xyb_encoded = false;
+    } else {
+      JXL_ABORT(
+          "unable to create reconstructible JPEG without JPEG support enabled");
+    }
   }
-  if (params.add_preview) {
+  if (params.preview_mode) {
     io.preview_frame = io.Main().Copy();
-    io.preview_frame.ShrinkTo(xsize / 7, ysize / 7);
+    GeneratePreview(params.preview_mode, &io.preview_frame);
     io.metadata.m.have_preview = true;
     EXPECT_TRUE(io.metadata.m.preview_size.Set(io.preview_frame.xsize(),
                                                io.preview_frame.ysize()));
@@ -261,14 +300,12 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
     EXPECT_TRUE(io.metadata.m.intrinsic_size.Set(xsize / 3, ysize / 3));
   }
   io.metadata.m.orientation = params.orientation;
-  AuxOut aux_out;
-  PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   PassesEncoderState enc_state;
   if (params.progressive_mode) {
     enc_state.progressive_splitter.SetProgressiveMode(*params.progressive_mode);
   }
-  EXPECT_TRUE(EncodeFile(params.cparams, &io, &enc_state, &compressed,
-                         GetJxlCms(), &aux_out, &pool));
+  EXPECT_TRUE(test::EncodeFile(params.cparams, &io, &enc_state, &compressed));
   CodeStreamBoxFormat add_container = params.box_format;
   if (add_container != kCSBF_None) {
     // Header with signature box and ftyp box.
@@ -293,16 +330,16 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
       std::vector<uint8_t> compressed2(compressed.data() + 2 * third,
                                        compressed.data() + compressed.size());
 
-      PaddedBytes c;
-      c.append(header, header + sizeof(header));
+      std::vector<uint8_t> c;
+      Bytes(header).AppendTo(&c);
       if (params.jpeg_codestream != nullptr) {
         jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
                              &c);
-        c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+        Bytes(jpeg_data).AppendTo(&c);
       }
       uint32_t jxlp_index = 0;
       if (add_container == kCSBF_Multi_First_Empty) {
-        // Dummy (empty) codestream part
+        // Empty placeholder codestream part
         AppendU32BE(12, &c);
         c.push_back('j');
         c.push_back('x');
@@ -317,11 +354,11 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
       c.push_back('l');
       c.push_back('p');
       AppendU32BE(jxlp_index++, &c);
-      c.append(compressed0.data(), compressed0.data() + compressed0.size());
+      Bytes(compressed0).AppendTo(&c);
       // A few non-codestream boxes in between
       AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false, &c);
       AppendTestBox(unk2_box_type, unk2_box_contents, unk2_box_size, false, &c);
-      // Dummy (empty) codestream part
+      // Empty placeholder codestream part
       AppendU32BE(12, &c);
       c.push_back('j');
       c.push_back('x');
@@ -335,7 +372,7 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
       c.push_back('l');
       c.push_back('p');
       AppendU32BE(jxlp_index++, &c);
-      c.append(compressed1.data(), compressed1.data() + compressed1.size());
+      Bytes(compressed1).AppendTo(&c);
       // Third (last) codestream part
       AppendU32BE(add_container == kCSBF_Multi_Zero_Terminated
                       ? 0
@@ -350,9 +387,9 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
       } else {
         AppendU32BE(jxlp_index++, &c);
       }
-      c.append(compressed2.data(), compressed2.data() + compressed2.size());
+      Bytes(compressed2).AppendTo(&c);
       if (add_container == kCSBF_Multi_Last_Empty_Other) {
-        // Dummy (empty) codestream part
+        // Empty placeholder codestream part
         AppendU32BE(12, &c);
         c.push_back('j');
         c.push_back('x');
@@ -372,15 +409,15 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
       }
       compressed.swap(c);
     } else {
-      PaddedBytes c;
-      c.append(header, header + sizeof(header));
+      std::vector<uint8_t> c;
+      Bytes(header).AppendTo(&c);
       if (params.jpeg_codestream != nullptr) {
         jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
                              &c);
-        c.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+        Bytes(jpeg_data).AppendTo(&c);
       }
       if (add_container == kCSBF_Brob_Exif) {
-        c.append(box_brob_exif, box_brob_exif + box_brob_exif_size);
+        Bytes(box_brob_exif, box_brob_exif_size).AppendTo(&c);
       }
       AppendU32BE(add_container == kCSBF_Single_Zero_Terminated
                       ? 0
@@ -390,7 +427,7 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
       c.push_back('x');
       c.push_back('l');
       c.push_back('c');
-      c.append(compressed.data(), compressed.data() + compressed.size());
+      Bytes(compressed).AppendTo(&c);
       if (add_container == kCSBF_Single_Other) {
         AppendTestBox(unk1_box_type, unk1_box_contents, unk1_box_size, false,
                       &c);
@@ -403,9 +440,9 @@ PaddedBytes CreateTestJXLCodestream(Span<const uint8_t> pixels, size_t xsize,
 }
 
 JxlDecoderStatus ProcessInputIgnoreBoxes(JxlDecoder* dec) {
-  JxlDecoderStatus status;
-  while ((status = JxlDecoderProcessInput(dec)) == JXL_DEC_BOX) {
-    continue;
+  JxlDecoderStatus status = JXL_DEC_BOX;
+  while (status == JXL_DEC_BOX) {
+    status = JxlDecoderProcessInput(dec);
   }
   return status;
 }
@@ -417,7 +454,7 @@ std::vector<uint8_t> DecodeWithAPI(JxlDecoder* dec,
                                    bool use_callback, bool set_buffer_early,
                                    bool use_resizable_runner,
                                    bool require_boxes, bool expect_success,
-                                   PaddedBytes* icc = nullptr) {
+                                   std::vector<uint8_t>* icc = nullptr) {
   JxlThreadParallelRunnerPtr runner_fixed;
   JxlResizableParallelRunnerPtr runner_resizable;
   JxlParallelRunner runner_fn;
@@ -481,12 +518,12 @@ std::vector<uint8_t> DecodeWithAPI(JxlDecoder* dec,
   if (status == JXL_DEC_COLOR_ENCODING) {
     size_t icc_size = 0;
     EXPECT_EQ(JXL_DEC_SUCCESS,
-              JxlDecoderGetICCProfileSize(
-                  dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size));
+              JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                          &icc_size));
     icc->resize(icc_size);
-    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
-                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
-                                   icc->data(), icc_size));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetColorAsICCProfile(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                             icc->data(), icc_size));
 
     status = process_input(dec);
   }
@@ -647,7 +684,7 @@ std::vector<uint8_t> GetTestHeader(size_t xsize, size_t ysize,
                                    size_t alpha_bits, bool xyb_encoded,
                                    bool have_container, bool metadata_default,
                                    bool insert_extra_box,
-                                   const jxl::PaddedBytes& icc_profile) {
+                                   const jxl::IccBytes& icc_profile) {
   jxl::BitWriter writer;
   jxl::BitWriter::Allotment allotment(&writer, 65536);  // Large enough
 
@@ -703,8 +740,9 @@ std::vector<uint8_t> GetTestHeader(size_t xsize, size_t ysize,
   }
 
   if (!icc_profile.empty()) {
-    jxl::PaddedBytes copy = icc_profile;
-    EXPECT_TRUE(metadata.m.color_encoding.SetICC(std::move(copy)));
+    jxl::IccBytes copy = icc_profile;
+    EXPECT_TRUE(
+        metadata.m.color_encoding.SetICC(std::move(copy), JxlGetDefaultCms()));
   }
 
   EXPECT_TRUE(jxl::Bundle::Write(metadata.m, &writer, 0, nullptr));
@@ -717,7 +755,7 @@ std::vector<uint8_t> GetTestHeader(size_t xsize, size_t ysize,
   }
 
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
   return std::vector<uint8_t>(
       writer.GetSpan().data(),
       writer.GetSpan().data() + writer.GetSpan().size());
@@ -922,7 +960,7 @@ TEST(DecodeTest, BasicInfoSizeHintTest) {
   JxlDecoderDestroy(dec);
 }
 
-std::vector<uint8_t> GetIccTestHeader(const jxl::PaddedBytes& icc_profile,
+std::vector<uint8_t> GetIccTestHeader(const jxl::IccBytes& icc_profile,
                                       bool xyb_encoded) {
   size_t xsize = 50;
   size_t ysize = 50;
@@ -937,10 +975,9 @@ std::vector<uint8_t> GetIccTestHeader(const jxl::PaddedBytes& icc_profile,
 
 // Tests the case where pixels and metadata ICC profile are the same
 TEST(DecodeTest, IccProfileTestOriginal) {
-  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  jxl::IccBytes icc_profile = GetIccTestProfile();
   bool xyb_encoded = false;
   std::vector<uint8_t> data = GetIccTestHeader(icc_profile, xyb_encoded);
-  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
 
   JxlDecoder* dec = JxlDecoderCreate(nullptr);
   EXPECT_EQ(JXL_DEC_SUCCESS,
@@ -961,40 +998,37 @@ TEST(DecodeTest, IccProfileTestOriginal) {
   // has an ICC profile instead
   EXPECT_EQ(JXL_DEC_ERROR,
             JxlDecoderGetColorAsEncodedProfile(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+                dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
 
   size_t dec_profile_size;
-  EXPECT_EQ(
-      JXL_DEC_SUCCESS,
-      JxlDecoderGetICCProfileSize(
-          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                        &dec_profile_size));
 
   // Check that can get return status with NULL size
   EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetICCProfileSize(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                        nullptr));
 
   // The profiles must be equal. This requires they have equal size, and if
   // they do, we can get the profile and compare the contents.
   EXPECT_EQ(icc_profile.size(), dec_profile_size);
   if (icc_profile.size() == dec_profile_size) {
-    jxl::PaddedBytes icc_profile2(icc_profile.size());
-    EXPECT_EQ(JXL_DEC_SUCCESS,
-              JxlDecoderGetColorAsICCProfile(
-                  dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
-                  icc_profile2.data(), icc_profile2.size()));
+    jxl::IccBytes icc_profile2(icc_profile.size());
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                   icc_profile2.data(), icc_profile2.size()));
     EXPECT_EQ(icc_profile, icc_profile2);
   }
 
   // the data is not xyb_encoded, so same result expected for the pixel data
   // color profile
-  EXPECT_EQ(JXL_DEC_ERROR,
-            JxlDecoderGetColorAsEncodedProfile(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
+  EXPECT_EQ(JXL_DEC_ERROR, JxlDecoderGetColorAsEncodedProfile(
+                               dec, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
 
-  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
-                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
-                                 &dec_profile_size));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &dec_profile_size));
   EXPECT_EQ(icc_profile.size(), dec_profile_size);
 
   JxlDecoderDestroy(dec);
@@ -1002,11 +1036,9 @@ TEST(DecodeTest, IccProfileTestOriginal) {
 
 // Tests the case where pixels and metadata ICC profile are different
 TEST(DecodeTest, IccProfileTestXybEncoded) {
-  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  jxl::IccBytes icc_profile = GetIccTestProfile();
   bool xyb_encoded = true;
   std::vector<uint8_t> data = GetIccTestHeader(icc_profile, xyb_encoded);
-  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
-  JxlPixelFormat format_int = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
 
   JxlDecoder* dec = JxlDecoderCreate(nullptr);
   EXPECT_EQ(JXL_DEC_SUCCESS,
@@ -1027,80 +1059,75 @@ TEST(DecodeTest, IccProfileTestXybEncoded) {
   // has an ICC profile instead
   EXPECT_EQ(JXL_DEC_ERROR,
             JxlDecoderGetColorAsEncodedProfile(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+                dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
 
   // Check that can get return status with NULL size
   EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetICCProfileSize(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr));
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                        nullptr));
 
   size_t dec_profile_size;
-  EXPECT_EQ(
-      JXL_DEC_SUCCESS,
-      JxlDecoderGetICCProfileSize(
-          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                        &dec_profile_size));
 
   // The profiles must be equal. This requires they have equal size, and if
   // they do, we can get the profile and compare the contents.
   EXPECT_EQ(icc_profile.size(), dec_profile_size);
   if (icc_profile.size() == dec_profile_size) {
-    jxl::PaddedBytes icc_profile2(icc_profile.size());
-    EXPECT_EQ(JXL_DEC_SUCCESS,
-              JxlDecoderGetColorAsICCProfile(
-                  dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
-                  icc_profile2.data(), icc_profile2.size()));
+    jxl::IccBytes icc_profile2(icc_profile.size());
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                   icc_profile2.data(), icc_profile2.size()));
     EXPECT_EQ(icc_profile, icc_profile2);
   }
 
   // Data is xyb_encoded, so the data profile is a different profile, encoded
   // as structured profile.
-  EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetColorAsEncodedProfile(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsEncodedProfile(
+                                 dec, JXL_COLOR_PROFILE_TARGET_DATA, nullptr));
   JxlColorEncoding pixel_encoding;
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderGetColorAsEncodedProfile(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+                dec, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
   EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries);
   // The API returns LINEAR by default when the colorspace cannot be represented
   // by enum values.
   EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
 
   // Test the same but with integer format.
-  EXPECT_EQ(
-      JXL_DEC_SUCCESS,
-      JxlDecoderGetColorAsEncodedProfile(
-          dec, &format_int, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetColorAsEncodedProfile(
+                dec, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
   EXPECT_EQ(JXL_PRIMARIES_SRGB, pixel_encoding.primaries);
   EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
 
   // Test after setting the preferred color profile to non-linear sRGB:
   // for XYB images with ICC profile, this setting is expected to take effect.
   jxl::ColorEncoding temp_jxl_srgb = jxl::ColorEncoding::SRGB(false);
-  JxlColorEncoding pixel_encoding_srgb;
-  ConvertInternalToExternalColorEncoding(temp_jxl_srgb, &pixel_encoding_srgb);
+  JxlColorEncoding pixel_encoding_srgb = temp_jxl_srgb.ToExternal();
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_srgb));
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderGetColorAsEncodedProfile(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+                dec, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
   EXPECT_EQ(JXL_TRANSFER_FUNCTION_SRGB, pixel_encoding.transfer_function);
 
   // The decoder can also output this as a generated ICC profile anyway, and
   // we're certain that it will differ from the above defined profile since
   // the sRGB data should not have swapped R/G/B primaries.
 
-  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
-                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
-                                 &dec_profile_size));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &dec_profile_size));
   // We don't need to dictate exactly what size the generated ICC profile
   // must be (since there are many ways to represent the same color space),
   // but it should not be zero.
   EXPECT_NE(0u, dec_profile_size);
-  jxl::PaddedBytes icc_profile2(dec_profile_size);
+  jxl::IccBytes icc_profile2(dec_profile_size);
   if (0 != dec_profile_size) {
     EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
-                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   dec, JXL_COLOR_PROFILE_TARGET_DATA,
                                    icc_profile2.data(), icc_profile2.size()));
     // expected not equal
     EXPECT_NE(icc_profile, icc_profile2);
@@ -1111,24 +1138,22 @@ TEST(DecodeTest, IccProfileTestXybEncoded) {
   // updated.
 
   jxl::ColorEncoding temp_jxl_linear = jxl::ColorEncoding::LinearSRGB(false);
-  JxlColorEncoding pixel_encoding_linear;
-  ConvertInternalToExternalColorEncoding(temp_jxl_linear,
-                                         &pixel_encoding_linear);
+  JxlColorEncoding pixel_encoding_linear = temp_jxl_linear.ToExternal();
 
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderSetPreferredColorProfile(dec, &pixel_encoding_linear));
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderGetColorAsEncodedProfile(
-                dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
+                dec, JXL_COLOR_PROFILE_TARGET_DATA, &pixel_encoding));
   EXPECT_EQ(JXL_TRANSFER_FUNCTION_LINEAR, pixel_encoding.transfer_function);
-  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetICCProfileSize(
-                                 dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
-                                 &dec_profile_size));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &dec_profile_size));
   EXPECT_NE(0u, dec_profile_size);
-  jxl::PaddedBytes icc_profile3(dec_profile_size);
+  jxl::IccBytes icc_profile3(dec_profile_size);
   if (0 != dec_profile_size) {
     EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
-                                   dec, &format, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   dec, JXL_COLOR_PROFILE_TARGET_DATA,
                                    icc_profile3.data(), icc_profile3.size()));
     // expected not equal to the previously set preferred profile.
     EXPECT_NE(icc_profile2, icc_profile3);
@@ -1143,9 +1168,8 @@ TEST(DecodeTest, IccProfileTestXybEncoded) {
 // handle the case of not enough input bytes with StatusCode::kNotEnoughBytes
 // rather than fatal error status codes.
 TEST(DecodeTest, ICCPartialTest) {
-  jxl::PaddedBytes icc_profile = GetIccTestProfile();
+  jxl::IccBytes icc_profile = GetIccTestProfile();
   std::vector<uint8_t> data = GetIccTestHeader(icc_profile, false);
-  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
 
   const uint8_t* next_in = data.data();
   size_t avail_in = 0;
@@ -1192,9 +1216,8 @@ TEST(DecodeTest, ICCPartialTest) {
       // Sanity check that the ICC profile was decoded correctly
       size_t dec_profile_size;
       EXPECT_EQ(JXL_DEC_SUCCESS,
-                JxlDecoderGetICCProfileSize(dec, &format,
-                                            JXL_COLOR_PROFILE_TARGET_ORIGINAL,
-                                            &dec_profile_size));
+                JxlDecoderGetICCProfileSize(
+                    dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_profile_size));
       EXPECT_EQ(icc_profile.size(), dec_profile_size);
 
     } else if (status == JXL_DEC_SUCCESS) {
@@ -1219,7 +1242,7 @@ struct PixelTestConfig {
   bool include_alpha;
   size_t xsize;
   size_t ysize;
-  bool add_preview;
+  jxl::PreviewMode preview_mode;
   bool add_intrinsic_size;
   // Output format.
   JxlEndianness endianness;
@@ -1262,11 +1285,11 @@ TEST_P(DecodeTestParam, PixelTest) {
   params.cparams.ec_resampling = config.upsampling;
   params.box_format = config.add_container;
   params.orientation = config.orientation;
-  params.add_preview = config.add_preview;
+  params.preview_mode = config.preview_mode;
   params.add_intrinsic_size = config.add_intrinsic_size;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), config.xsize,
-      config.ysize, orig_channels, params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), config.xsize, config.ysize,
+      orig_channels, params);
 
   JxlPixelFormat format = {config.output_channels, config.data_type,
                            config.endianness, 0};
@@ -1275,11 +1298,11 @@ TEST_P(DecodeTestParam, PixelTest) {
   size_t xsize = swap_xy ? config.ysize : config.xsize;
   size_t ysize = swap_xy ? config.xsize : config.ysize;
 
-  std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-      dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-      format, config.use_callback, config.set_buffer_early,
-      config.use_resizable_runner, /*require_boxes=*/false,
-      /*expect_success=*/true);
+  std::vector<uint8_t> pixels2 =
+      jxl::DecodeWithAPI(dec, jxl::Bytes(compressed.data(), compressed.size()),
+                         format, config.use_callback, config.set_buffer_early,
+                         config.use_resizable_runner, /*require_boxes=*/false,
+                         /*expect_success=*/true);
   JxlDecoderReset(dec);
   EXPECT_EQ(num_pixels * config.output_channels *
                 jxl::test::GetDataBits(config.data_type) / jxl::kBitsPerByte,
@@ -1295,14 +1318,12 @@ TEST_P(DecodeTestParam, PixelTest) {
 
     jxl::CodecInOut io;
     if (config.include_alpha) io.metadata.m.SetAlphaBits(16);
+    io.metadata.m.color_encoding = color_encoding;
     io.SetSize(config.xsize, config.ysize);
 
     EXPECT_TRUE(ConvertFromExternal(bytes, config.xsize, config.ysize,
-                                    color_encoding, config.output_channels,
-                                    /*alpha_is_premultiplied=*/false, 16,
-                                    JXL_BIG_ENDIAN, nullptr, &io.Main(),
-                                    /*float_in=*/false,
-                                    /*align=*/0));
+                                    color_encoding, 16, format_orig, nullptr,
+                                    &io.Main()));
 
     for (size_t i = 0; i < pixels.size(); i++) pixels[i] = 0;
     EXPECT_TRUE(ConvertToExternal(
@@ -1359,16 +1380,16 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
       {JXL_BIG_ENDIAN, JXL_TYPE_FLOAT},
   };
 
-  auto make_test = [&](ChannelInfo ch, size_t xsize, size_t ysize, bool preview,
-                       bool intrinsic_size, CodeStreamBoxFormat box,
-                       JxlOrientation orientation, bool keep_orientation,
-                       OutputFormat format, bool use_callback,
-                       bool set_buffer_early, bool resizable_runner,
-                       size_t upsampling) {
+  auto make_test = [&](ChannelInfo ch, size_t xsize, size_t ysize,
+                       jxl::PreviewMode preview_mode, bool intrinsic_size,
+                       CodeStreamBoxFormat box, JxlOrientation orientation,
+                       bool keep_orientation, OutputFormat format,
+                       bool use_callback, bool set_buffer_early,
+                       bool resizable_runner, size_t upsampling) {
     PixelTestConfig c;
     c.grayscale = ch.grayscale;
     c.include_alpha = ch.include_alpha;
-    c.add_preview = preview;
+    c.preview_mode = preview_mode;
     c.add_intrinsic_size = intrinsic_size;
     c.xsize = xsize;
     c.ysize = ysize;
@@ -1390,7 +1411,7 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
     for (int use_callback = 0; use_callback <= 1; use_callback++) {
       for (size_t upsampling : {1, 2, 4, 8}) {
         for (OutputFormat fmt : out_formats) {
-          make_test(ch, 301, 33, /*add_preview=*/false,
+          make_test(ch, 301, 33, jxl::kNoPreview,
                     /*add_intrinsic_size=*/false,
                     CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY,
                     /*keep_orientation=*/false, fmt, use_callback,
@@ -1402,7 +1423,7 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
   }
   // Test codestream formats.
   for (size_t box = 1; box < kCSBF_NUM_ENTRIES; ++box) {
-    make_test(ch_info[0], 77, 33, /*add_preview=*/false,
+    make_test(ch_info[0], 77, 33, jxl::kNoPreview,
               /*add_intrinsic_size=*/false, (CodeStreamBoxFormat)box,
               JXL_ORIENT_IDENTITY,
               /*keep_orientation=*/false, out_formats[0],
@@ -1410,9 +1431,11 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
               /*set_buffer_early=*/false, /*resizable_runner=*/false, 1);
   }
   // Test previews.
-  for (int add_preview = 0; add_preview <= 1; add_preview++) {
-    make_test(ch_info[0], 77, 33, add_preview, /*add_intrinsic_size=*/false,
-              CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY,
+  for (int preview_mode = 0; preview_mode < jxl::kNumPreviewModes;
+       preview_mode++) {
+    make_test(ch_info[0], 77, 33, (jxl::PreviewMode)preview_mode,
+              /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
+              JXL_ORIENT_IDENTITY,
               /*keep_orientation=*/false, out_formats[0],
               /*use_callback=*/false, /*set_buffer_early=*/false,
               /*resizable_runner=*/false, 1);
@@ -1420,14 +1443,14 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
   // Test intrinsic sizes.
   for (int add_intrinsic_size = 0; add_intrinsic_size <= 1;
        add_intrinsic_size++) {
-    make_test(ch_info[0], 55, 34, /*add_preview=*/false, add_intrinsic_size,
+    make_test(ch_info[0], 55, 34, jxl::kNoPreview, add_intrinsic_size,
               CodeStreamBoxFormat::kCSBF_None, JXL_ORIENT_IDENTITY,
               /*keep_orientation=*/false, out_formats[0],
               /*use_callback=*/false, /*set_buffer_early=*/false,
               /*resizable_runner=*/false, 1);
   }
   // Test setting buffers early.
-  make_test(ch_info[0], 300, 33, /*add_preview=*/false,
+  make_test(ch_info[0], 300, 33, jxl::kNoPreview,
             /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
             JXL_ORIENT_IDENTITY,
             /*keep_orientation=*/false, out_formats[0],
@@ -1436,7 +1459,7 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
 
   // Test using the resizable runner
   for (size_t i = 0; i < 4; i++) {
-    make_test(ch_info[0], 300 << i, 33 << i, /*add_preview=*/false,
+    make_test(ch_info[0], 300 << i, 33 << i, jxl::kNoPreview,
               /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
               JXL_ORIENT_IDENTITY,
               /*keep_orientation=*/false, out_formats[0],
@@ -1445,19 +1468,22 @@ std::vector<PixelTestConfig> GeneratePixelTests() {
   }
 
   // Test orientations.
-  for (int orientation = 1; orientation <= 8; ++orientation) {
-    make_test(ch_info[0], 280, 12, /*add_preview=*/false,
-              /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
-              static_cast<JxlOrientation>(orientation),
-              /*keep_orientation=*/false, out_formats[0],
-              /*use_callback=*/false, /*set_buffer_early=*/true,
-              /*resizable_runner=*/false, 1);
-    make_test(ch_info[0], 280, 12, /*add_preview=*/false,
-              /*add_intrinsic_size=*/false, CodeStreamBoxFormat::kCSBF_None,
-              static_cast<JxlOrientation>(orientation),
-              /*keep_orientation=*/true, out_formats[0],
-              /*use_callback=*/false, /*set_buffer_early=*/true,
-              /*resizable_runner=*/false, 1);
+  for (int orientation = 2; orientation <= 8; ++orientation) {
+    for (int keep_orientation = 0; keep_orientation <= 1; keep_orientation++) {
+      for (int use_callback = 0; use_callback <= 1; use_callback++) {
+        for (ChannelInfo ch : ch_info) {
+          for (OutputFormat fmt : out_formats) {
+            make_test(ch, 280, 12, jxl::kNoPreview,
+                      /*add_intrinsic_size=*/false,
+                      CodeStreamBoxFormat::kCSBF_None,
+                      static_cast<JxlOrientation>(orientation),
+                      /*keep_orientation=*/keep_orientation, fmt,
+                      /*use_callback=*/use_callback, /*set_buffer_early=*/true,
+                      /*resizable_runner=*/false, 1);
+          }
+        }
+      }
+    }
   }
 
   return all_tests;
@@ -1498,7 +1524,8 @@ std::ostream& operator<<(std::ostream& os, const PixelTestConfig& c) {
     os << "Box";
     os << (size_t)c.add_container;
   }
-  if (c.add_preview) os << "Preview";
+  if (c.preview_mode == jxl::kSmallPreview) os << "Preview";
+  if (c.preview_mode == jxl::kBigPreview) os << "BigPreview";
   if (c.add_intrinsic_size) os << "IntrinicSize";
   if (c.use_callback) os << "Callback";
   if (c.set_buffer_early) os << "EarlyBuffer";
@@ -1534,17 +1561,16 @@ TEST(DecodeTest, PixelTestWithICCProfileLossless) {
   params.add_icc_profile = true;
   // For variation: some have container and no preview, others have preview
   // and no container.
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 4, params);
 
   for (uint32_t channels = 3; channels <= 4; ++channels) {
     {
       JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
 
       std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-          format, /*use_callback=*/false, /*set_buffer_early=*/false,
+          dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+          /*use_callback=*/false, /*set_buffer_early=*/false,
           /*use_resizable_runner=*/false, /*require_boxes=*/false,
           /*expect_success=*/true);
       JxlDecoderReset(dec);
@@ -1558,8 +1584,8 @@ TEST(DecodeTest, PixelTestWithICCProfileLossless) {
 
       // Test with the container for one of the pixel formats.
       std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-          format, /*use_callback=*/true, /*set_buffer_early=*/true,
+          dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+          /*use_callback=*/true, /*set_buffer_early=*/true,
           /*use_resizable_runner=*/false, /*require_boxes=*/false,
           /*expect_success=*/true);
       JxlDecoderReset(dec);
@@ -1573,9 +1599,9 @@ TEST(DecodeTest, PixelTestWithICCProfileLossless) {
       JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
 
       std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-          dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-          format, /*use_callback=*/false, /*set_buffer_early=*/false,
-          /*use_resizable_runner=*/false, /*reuqire_boxes=*/false,
+          dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+          /*use_callback=*/false, /*set_buffer_early=*/false,
+          /*use_resizable_runner=*/false, /*require_boxes=*/false,
           /*expect_success=*/true);
       JxlDecoderReset(dec);
       EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
@@ -1597,58 +1623,55 @@ TEST(DecodeTest, PixelTestWithICCProfileLossy) {
   JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
   jxl::TestCodestreamParams params;
   params.add_icc_profile = true;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params);
   uint32_t channels = 3;
 
   JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
 
-  jxl::PaddedBytes icc;
+  std::vector<uint8_t> icc_data;
   std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-      dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-      format, /*use_callback=*/false, /*set_buffer_early=*/true,
+      dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+      /*use_callback=*/false, /*set_buffer_early=*/true,
       /*use_resizable_runner=*/false, /*require_boxes=*/false,
-      /*expect_success=*/true, /*icc=*/&icc);
+      /*expect_success=*/true, /*icc=*/&icc_data);
   JxlDecoderReset(dec);
   EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
 
   // The input pixels use the profile matching GetIccTestProfile, since we set
   // add_icc_profile for CreateTestJXLCodestream to true.
   jxl::ColorEncoding color_encoding0;
-  EXPECT_TRUE(color_encoding0.SetICC(GetIccTestProfile()));
+  EXPECT_TRUE(color_encoding0.SetICC(GetIccTestProfile(), JxlGetDefaultCms()));
   jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
   jxl::CodecInOut io0;
   io0.SetSize(xsize, ysize);
-  EXPECT_TRUE(
-      ConvertFromExternal(span0, xsize, ysize, color_encoding0, /*channels=*/3,
-                          /*alpha_is_premultiplied=*/false,
-                          /*bits_per_sample=*/16, format_orig.endianness,
-                          /*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
-                          /*align=*/0));
+  EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                  /*bits_per_sample=*/16, format_orig,
+                                  /*pool=*/nullptr, &io0.Main()));
 
   jxl::ColorEncoding color_encoding1;
-  EXPECT_TRUE(color_encoding1.SetICC(std::move(icc)));
+  jxl::IccBytes icc;
+  jxl::Bytes(icc_data).AppendTo(&icc);
+  EXPECT_TRUE(color_encoding1.SetICC(std::move(icc), JxlGetDefaultCms()));
   jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
   jxl::CodecInOut io1;
   io1.SetSize(xsize, ysize);
   EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
-                                  channels, /*alpha_is_premultiplied=*/false,
-                                  /*bits_per_sample=*/32, format.endianness,
-                                  /*pool=*/nullptr, &io1.Main(),
-                                  /*float_in=*/true, /*align=*/0));
+                                  /*bits_per_sample=*/32, format,
+                                  /*pool=*/nullptr, &io1.Main()));
 
   jxl::ButteraugliParams ba;
-  EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
-                                  /*distmap=*/nullptr, nullptr),
-              IsSlightlyBelow(0.785f));
+  EXPECT_THAT(
+      ButteraugliDistance(io0.frames, io1.frames, ba, *JxlGetDefaultCms(),
+                          /*distmap=*/nullptr, nullptr),
+      IsSlightlyBelow(0.55f));
 
   JxlDecoderDestroy(dec);
 }
 
 std::string ColorDescription(JxlColorEncoding c) {
   jxl::ColorEncoding color_encoding;
-  EXPECT_TRUE(ConvertExternalToInternalColorEncoding(c, &color_encoding));
+  EXPECT_TRUE(color_encoding.FromExternal(c));
   return Description(color_encoding);
 }
 
@@ -1656,7 +1679,7 @@ std::string GetOrigProfile(JxlDecoder* dec) {
   JxlColorEncoding c;
   JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_ORIGINAL;
   EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c));
+            JxlDecoderGetColorAsEncodedProfile(dec, target, &c));
   return ColorDescription(c);
 }
 
@@ -1664,7 +1687,7 @@ std::string GetDataProfile(JxlDecoder* dec) {
   JxlColorEncoding c;
   JxlColorProfileTarget target = JXL_COLOR_PROFILE_TARGET_DATA;
   EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetColorAsEncodedProfile(dec, nullptr, target, &c));
+            JxlDecoderGetColorAsEncodedProfile(dec, target, &c));
   return ColorDescription(c);
 }
 
@@ -1678,23 +1701,23 @@ double ButteraugliDistance(size_t xsize, size_t ysize,
   jxl::CodecInOut in;
   in.metadata.m.color_encoding = color_in;
   in.metadata.m.SetIntensityTarget(intensity_in);
+  JxlPixelFormat format_in = {static_cast<uint32_t>(color_in.Channels()),
+                              JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
   EXPECT_TRUE(jxl::ConvertFromExternal(
-      jxl::Span<const uint8_t>(pixels_in.data(), pixels_in.size()), xsize,
-      ysize, color_in, color_in.Channels(),
-      /*alpha_is_premultiplied=*/false,
-      /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
-      /*pool=*/nullptr, &in.Main(), /*float_in=*/false, /*align=*/0));
+      jxl::Bytes(pixels_in.data(), pixels_in.size()), xsize, ysize, color_in,
+      /*bits_per_sample=*/16, format_in,
+      /*pool=*/nullptr, &in.Main()));
   jxl::CodecInOut out;
   out.metadata.m.color_encoding = color_out;
   out.metadata.m.SetIntensityTarget(intensity_out);
+  JxlPixelFormat format_out = {static_cast<uint32_t>(color_out.Channels()),
+                               JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
   EXPECT_TRUE(jxl::ConvertFromExternal(
-      jxl::Span<const uint8_t>(pixels_out.data(), pixels_out.size()), xsize,
-      ysize, color_out, color_out.Channels(),
-      /*alpha_is_premultiplied=*/false,
-      /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
-      /*pool=*/nullptr, &out.Main(), /*float_in=*/false, /*align=*/0));
-  return ButteraugliDistance(in, out, jxl::ButteraugliParams(),
-                             jxl::GetJxlCms(), nullptr, nullptr);
+      jxl::Bytes(pixels_out.data(), pixels_out.size()), xsize, ysize, color_out,
+      /*bits_per_sample=*/16, format_out,
+      /*pool=*/nullptr, &out.Main()));
+  return ButteraugliDistance(in.frames, out.frames, jxl::ButteraugliParams(),
+                             *JxlGetDefaultCms(), nullptr, nullptr);
 }
 
 class DecodeAllEncodingsTest
@@ -1709,16 +1732,15 @@ TEST_P(DecodeAllEncodingsTest, PreserveOriginalProfileTest) {
   int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE;
   const auto& cdesc = GetParam();
   jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(cdesc);
-  if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return;
+  if (c_in.GetRenderingIntent() != jxl::RenderingIntent::kRelative) return;
   std::string color_space_in = Description(c_in);
-  float intensity_in = c_in.tf.IsPQ() ? 10000 : 255;
+  float intensity_in = c_in.Tf().IsPQ() ? 10000 : 255;
   printf("Testing input color space %s\n", color_space_in.c_str());
   jxl::TestCodestreamParams params;
   params.color_space = color_space_in;
   params.intensity_target = intensity_in;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-      params);
+  std::vector<uint8_t> data = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params);
   JxlDecoder* dec = JxlDecoderCreate(nullptr);
   EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
   EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, data.data(), data.size()));
@@ -1738,43 +1760,56 @@ TEST_P(DecodeAllEncodingsTest, PreserveOriginalProfileTest) {
   EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
   double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in,
                                     out, c_in, intensity_in);
-  EXPECT_LT(dist, 1.2);
+  EXPECT_LT(dist, 1.29);
   EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderProcessInput(dec));
   JxlDecoderDestroy(dec);
 }
 
 namespace {
 void SetPreferredColorProfileTest(
-    const jxl::test::ColorEncodingDescriptor& from) {
+    const jxl::test::ColorEncodingDescriptor& from, bool icc_dst,
+    bool use_cms) {
   size_t xsize = 123, ysize = 77;
   int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE;
   jxl::ColorEncoding c_in = jxl::test::ColorEncodingFromDescriptor(from);
-  if (c_in.rendering_intent != jxl::RenderingIntent::kRelative) return;
-  if (c_in.white_point != jxl::WhitePoint::kD65) return;
+  if (c_in.GetRenderingIntent() != jxl::RenderingIntent::kRelative) return;
+  if (c_in.GetWhitePointType() != jxl::WhitePoint::kD65) return;
   uint32_t num_channels = c_in.Channels();
   std::vector<uint8_t> pixels =
       jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+
   JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
   std::string color_space_in = Description(c_in);
-  float intensity_in = c_in.tf.IsPQ() ? 10000 : 255;
+  float intensity_in = c_in.Tf().IsPQ() ? 10000 : 255;
   jxl::TestCodestreamParams params;
   params.color_space = color_space_in;
   params.intensity_target = intensity_in;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
-  for (const auto& c1 : jxl::test::AllEncodings()) {
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
+  auto all_encodings = jxl::test::AllEncodings();
+  // TODO(firsching): understand why XYB does not work together with icc_dst.
+  if (!icc_dst) {
+    all_encodings.push_back(
+        {jxl::ColorSpace::kXYB, jxl::WhitePoint::kD65, jxl::Primaries::kCustom,
+         jxl::TransferFunction::kUnknown, jxl::RenderingIntent::kPerceptual});
+  }
+  for (const auto& c1 : all_encodings) {
     jxl::ColorEncoding c_out = jxl::test::ColorEncodingFromDescriptor(c1);
     float intensity_out = intensity_in;
-    if (c_out.rendering_intent != jxl::RenderingIntent::kRelative) continue;
-    if ((c_in.primaries == jxl::Primaries::k2100 &&
-         c_out.primaries != jxl::Primaries::k2100) ||
-        (c_in.primaries == jxl::Primaries::kP3 &&
-         c_out.primaries == jxl::Primaries::kSRGB)) {
-      // Converting to a narrower gamut does not work without gammut mapping.
-      continue;
+    if (c_out.GetColorSpace() != jxl::ColorSpace::kXYB) {
+      if (c_out.GetRenderingIntent() != jxl::RenderingIntent::kRelative) {
+        continue;
+      }
+      if ((c_in.GetPrimariesType() == jxl::Primaries::k2100 &&
+           c_out.GetPrimariesType() != jxl::Primaries::k2100) ||
+          (c_in.GetPrimariesType() == jxl::Primaries::kP3 &&
+           c_out.GetPrimariesType() == jxl::Primaries::kSRGB)) {
+        // Converting to a narrower gamut does not work without gamut mapping.
+        continue;
+      }
     }
-    if (c_out.tf.IsHLG() && intensity_out > 300) {
+    if (c_out.Tf().IsHLG() && intensity_out > 300) {
       // The Linear->HLG OOTF function at this intensity level can push
       // saturated colors out of gamut, so we would need gamut mapping in
       // this case too.
@@ -1796,13 +1831,42 @@ void SetPreferredColorProfileTest(
     EXPECT_FALSE(info.uses_original_profile);
     EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
     EXPECT_EQ(GetOrigProfile(dec), color_space_in);
-    EXPECT_EQ(GetDataProfile(dec), color_space_in);
     JxlColorEncoding encoding_out;
     EXPECT_TRUE(jxl::ParseDescription(color_space_out, &encoding_out));
-    EXPECT_EQ(JXL_DEC_SUCCESS,
-              JxlDecoderSetPreferredColorProfile(dec, &encoding_out));
+    if (c_out.GetColorSpace() == jxl::ColorSpace::kXYB &&
+        (c_in.GetPrimariesType() != jxl::Primaries::kSRGB ||
+         c_in.Tf().IsPQ())) {
+      EXPECT_EQ(JXL_DEC_ERROR,
+                JxlDecoderSetPreferredColorProfile(dec, &encoding_out));
+      JxlDecoderDestroy(dec);
+      continue;
+    }
+    if (use_cms) {
+      JxlDecoderSetCms(dec, *JxlGetDefaultCms());
+    }
+    if (icc_dst) {
+      jxl::ColorEncoding internal_encoding_out;
+      EXPECT_TRUE(internal_encoding_out.FromExternal(encoding_out));
+      EXPECT_TRUE(internal_encoding_out.CreateICC());
+      std::vector<uint8_t> rewritten_icc = internal_encoding_out.ICC();
+
+      EXPECT_EQ(use_cms ? JXL_DEC_SUCCESS : JXL_DEC_ERROR,
+                JxlDecoderSetOutputColorProfile(
+                    dec, nullptr, rewritten_icc.data(), rewritten_icc.size()));
+      if (!use_cms) {
+        // continue if we don't have a cms here
+        JxlDecoderDestroy(dec);
+        continue;
+      }
+    } else {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetPreferredColorProfile(dec, &encoding_out));
+    }
     EXPECT_EQ(GetOrigProfile(dec), color_space_in);
-    EXPECT_EQ(GetDataProfile(dec), color_space_out);
+    if (icc_dst) {
+    } else {
+      EXPECT_EQ(GetDataProfile(dec), color_space_out);
+    }
     EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
     size_t buffer_size;
     JxlPixelFormat out_format = format;
@@ -1815,8 +1879,9 @@ void SetPreferredColorProfileTest(
     EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
     double dist = ButteraugliDistance(xsize, ysize, pixels, c_in, intensity_in,
                                       out, c_out, intensity_out);
-    if (c_in.white_point == c_out.white_point) {
-      EXPECT_LT(dist, 1.2);
+
+    if (c_in.GetWhitePointType() == c_out.GetWhitePointType()) {
+      EXPECT_LT(dist, 1.29);
     } else {
       EXPECT_LT(dist, 4.0);
     }
@@ -1830,12 +1895,123 @@ TEST(DecodeTest, SetPreferredColorProfileTestFromGray) {
   jxl::test::ColorEncodingDescriptor gray = {
       jxl::ColorSpace::kGray, jxl::WhitePoint::kD65, jxl::Primaries::kSRGB,
       jxl::TransferFunction::kSRGB, jxl::RenderingIntent::kRelative};
-  SetPreferredColorProfileTest(gray);
+  SetPreferredColorProfileTest(gray, true, true);
+  SetPreferredColorProfileTest(gray, false, true);
+  SetPreferredColorProfileTest(gray, true, false);
+  SetPreferredColorProfileTest(gray, false, false);
+}
+
+static std::string DecodeAllEncodingsVariantsTestName(
+    const ::testing::TestParamInfo<
+        std::tuple<jxl::test::ColorEncodingDescriptor, bool, bool>>& info) {
+  const auto& encoding = std::get<0>(info.param);
+  bool icc_dst = std::get<1>(info.param);
+  bool use_cms = std::get<2>(info.param);
+
+  std::string encoding_name =
+      Description(ColorEncodingFromDescriptor(encoding));
+
+  return "From_" + encoding_name +
+         (icc_dst ? "_with_icc_dst" : "_without_icc_dst") +
+         (use_cms ? "_with_cms" : "_without_cms");
+}
+
+class DecodeAllEncodingsVariantsTest
+    : public ::testing::TestWithParam<
+          std::tuple<jxl::test::ColorEncodingDescriptor, bool, bool>> {};
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    DecodeAllEncodingsVariantsTestInstantiation, DecodeAllEncodingsVariantsTest,
+    ::testing::Combine(::testing::ValuesIn(jxl::test::AllEncodings()),
+                       ::testing::Bool(), ::testing::Bool()),
+    DecodeAllEncodingsVariantsTestName);
+TEST_P(DecodeAllEncodingsVariantsTest, SetPreferredColorProfileTest) {
+  const auto& from = std::get<0>(GetParam());
+  bool icc_dst = std::get<1>(GetParam());
+  bool use_cms = std::get<2>(GetParam());
+  SetPreferredColorProfileTest(from, icc_dst, use_cms);
+}
+
+void DecodeImageWithColorEncoding(const std::vector<uint8_t>& compressed,
+                                  jxl::ColorEncoding& color_encoding,
+                                  bool with_cms, std::vector<uint8_t>& out,
+                                  JxlBasicInfo& info) {
+  JxlDecoder* dec = JxlDecoderCreate(nullptr);
+  int events = JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING | JXL_DEC_FULL_IMAGE;
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSetInput(dec, compressed.data(), compressed.size()));
+  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+  EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+  std::string color_space_in = GetOrigProfile(dec);
+  if (with_cms) {
+    JxlDecoderSetCms(dec, *JxlGetDefaultCms());
+    EXPECT_TRUE(color_encoding.CreateICC());
+    std::vector<uint8_t> rewritten_icc = color_encoding.ICC();
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetOutputColorProfile(
+                  dec, nullptr, rewritten_icc.data(), rewritten_icc.size()));
+  } else {
+    JxlColorEncoding external_color_encoding = color_encoding.ToExternal();
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetOutputColorProfile(
+                                   dec, &external_color_encoding, nullptr, 0));
+  }
+  EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+  size_t buffer_size;
+  JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+
+  JxlPixelFormat out_format = format;
+  out_format.num_channels = color_encoding.Channels();
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderImageOutBufferSize(dec, &out_format, &buffer_size));
+  out.resize(buffer_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetImageOutBuffer(
+                                 dec, &out_format, out.data(), out.size()));
+  EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+  JxlDecoderDestroy(dec);
 }
 
-TEST_P(DecodeAllEncodingsTest, SetPreferredColorProfileTest) {
-  const auto& from = GetParam();
-  SetPreferredColorProfileTest(from);
+class DecodeAllEncodingsWithCMSTest
+    : public ::testing::TestWithParam<jxl::test::ColorEncodingDescriptor> {};
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    AllEncodings, DecodeAllEncodingsWithCMSTest,
+    testing::ValuesIn(jxl::test::AllEncodings()));
+
+TEST_P(DecodeAllEncodingsWithCMSTest, DecodeWithCMS) {
+  auto all_encodings = jxl::test::AllEncodings();
+  uint32_t num_channels = 3;
+  size_t xsize = 177, ysize = 123;
+  std::vector<uint8_t> pixels =
+      jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+  jxl::TestCodestreamParams params;
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
+
+  jxl::ColorEncoding color_encoding =
+      jxl::test::ColorEncodingFromDescriptor(GetParam());
+  fprintf(stderr, "color_description: %s\n",
+          Description(color_encoding).c_str());
+
+  std::vector<uint8_t> out_with_cms;
+  JxlBasicInfo info_with_cms;
+  DecodeImageWithColorEncoding(data, color_encoding, true, out_with_cms,
+                               info_with_cms);
+
+  std::vector<uint8_t> out_without_cms;
+  JxlBasicInfo info_without_cms;
+  DecodeImageWithColorEncoding(data, color_encoding, false, out_without_cms,
+                               info_without_cms);
+
+  EXPECT_EQ(info_with_cms.xsize, info_without_cms.xsize);
+  EXPECT_EQ(info_with_cms.ysize, info_without_cms.ysize);
+  EXPECT_EQ(out_with_cms.size(), out_without_cms.size());
+  double dist = ButteraugliDistance(xsize, ysize, out_with_cms, color_encoding,
+                                    255, out_without_cms, color_encoding, 255);
+
+  EXPECT_LT(dist, .1);
 }
 
 // Tests the case of lossy sRGB image without alpha channel, decoded to RGB8
@@ -1849,15 +2025,15 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossy) {
     std::vector<uint8_t> pixels =
         jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
     JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
-    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
+    std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3,
         jxl::TestCodestreamParams());
 
     JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
 
     std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-        format, /*use_callback=*/true, /*set_buffer_early=*/false,
+        dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+        /*use_callback=*/true, /*set_buffer_early=*/false,
         /*use_resizable_runner=*/false, /*require_boxes=*/false,
         /*expect_success*/ true);
     JxlDecoderReset(dec);
@@ -1867,27 +2043,22 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossy) {
     jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
     jxl::CodecInOut io0;
     io0.SetSize(xsize, ysize);
-    EXPECT_TRUE(ConvertFromExternal(
-        span0, xsize, ysize, color_encoding0, /*channels=*/3,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        format_orig.endianness,
-        /*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
-        /*align=*/0));
+    EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                    /*bits_per_sample=*/16, format_orig,
+                                    /*pool=*/nullptr, &io0.Main()));
 
     jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
     jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
     jxl::CodecInOut io1;
     EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
-                                    channels, /*alpha_is_premultiplied=*/false,
-                                    /*bits_per_sample=*/8, format.endianness,
-                                    /*pool=*/nullptr, &io1.Main(),
-                                    /*float_in=*/false,
-                                    /*align=*/0));
+                                    /*bits_per_sample=*/8, format,
+                                    /*pool=*/nullptr, &io1.Main()));
 
     jxl::ButteraugliParams ba;
-    EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
-                                    /*distmap=*/nullptr, nullptr),
-                IsSlightlyBelow(0.8f));
+    EXPECT_THAT(
+        ButteraugliDistance(io0.frames, io1.frames, ba, *JxlGetDefaultCms(),
+                            /*distmap=*/nullptr, nullptr),
+        IsSlightlyBelow(0.65f));
 
     JxlDecoderDestroy(dec);
   }
@@ -1905,15 +2076,14 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) {
     JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
     jxl::TestCodestreamParams params;
     params.cparams.noise = jxl::Override::kOn;
-    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-        params);
+    std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params);
 
     JxlPixelFormat format = {channels, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
 
     std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+        /*use_callback=*/false, /*set_buffer_early=*/true,
         /*use_resizable_runner=*/false, /*require_boxes=*/false,
         /*expect_success=*/true);
     JxlDecoderReset(dec);
@@ -1923,27 +2093,22 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) {
     jxl::Span<const uint8_t> span0(pixels.data(), pixels.size());
     jxl::CodecInOut io0;
     io0.SetSize(xsize, ysize);
-    EXPECT_TRUE(ConvertFromExternal(
-        span0, xsize, ysize, color_encoding0, /*channels=*/3,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        format_orig.endianness,
-        /*pool=*/nullptr, &io0.Main(), /*float_in=*/false,
-        /*align=*/0));
+    EXPECT_TRUE(ConvertFromExternal(span0, xsize, ysize, color_encoding0,
+                                    /*bits_per_sample=*/16, format_orig,
+                                    /*pool=*/nullptr, &io0.Main()));
 
     jxl::ColorEncoding color_encoding1 = jxl::ColorEncoding::SRGB(false);
     jxl::Span<const uint8_t> span1(pixels2.data(), pixels2.size());
     jxl::CodecInOut io1;
     EXPECT_TRUE(ConvertFromExternal(span1, xsize, ysize, color_encoding1,
-                                    channels, /*alpha_is_premultiplied=*/false,
-                                    /*bits_per_sample=*/8, format.endianness,
-                                    /*pool=*/nullptr, &io1.Main(),
-                                    /*float_in=*/false,
-                                    /*align=*/0));
+                                    /*bits_per_sample=*/8, format,
+                                    /*pool=*/nullptr, &io1.Main()));
 
     jxl::ButteraugliParams ba;
-    EXPECT_THAT(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
-                                    /*distmap=*/nullptr, nullptr),
-                IsSlightlyBelow(2.6f));
+    EXPECT_THAT(
+        ButteraugliDistance(io0.frames, io1.frames, ba, *JxlGetDefaultCms(),
+                            /*distmap=*/nullptr, nullptr),
+        IsSlightlyBelow(1.2222f));
 
     JxlDecoderDestroy(dec);
   }
@@ -1960,9 +2125,8 @@ TEST(DecodeTest, ProcessEmptyInputWithBoxes) {
     jxl::TestCodestreamParams params;
     params.box_format = (CodeStreamBoxFormat)i;
     printf("Testing empty input with box format %d\n", (int)params.box_format);
-    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-        params);
+    std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params);
     const int events =
         JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_COLOR_ENCODING;
     EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, events));
@@ -1998,15 +2162,14 @@ TEST(DecodeTest, ExtraBytesAfterCompressedStream) {
     } else if (box_format == kCSBF_Multi_Other_Terminated) {
       last_unknown_box_size = unk3_box_size + 8;
     } else if (box_format == kCSBF_Multi_Last_Empty_Other) {
-      // If boxes are not required, the decoder wont consume the last empty
+      // If boxes are not required, the decoder won't consume the last empty
       // jxlp box.
       last_unknown_box_size = 12 + unk3_box_size + 8;
     }
     jxl::TestCodestreamParams params;
     params.box_format = box_format;
-    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-        params);
+    std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params);
     // Add some more bytes after compressed data.
     compressed.push_back(0);
     compressed.push_back(1);
@@ -2015,8 +2178,8 @@ TEST(DecodeTest, ExtraBytesAfterCompressedStream) {
     uint32_t channels = 3;
     JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
     std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+        /*use_callback=*/false, /*set_buffer_early=*/true,
         /*use_resizable_runner=*/false, /*require_boxes=*/false,
         /*expect_success=*/true);
     size_t unconsumed_bytes = JxlDecoderReleaseInput(dec);
@@ -2040,9 +2203,8 @@ TEST(DecodeTest, ExtraBytesAfterCompressedStreamRequireBoxes) {
                            box_format == kCSBF_Multi_Zero_Terminated);
     jxl::TestCodestreamParams params;
     params.box_format = box_format;
-    jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-        params);
+    std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params);
     // Add some more bytes after compressed data.
     compressed.push_back(0);
     compressed.push_back(1);
@@ -2051,8 +2213,8 @@ TEST(DecodeTest, ExtraBytesAfterCompressedStreamRequireBoxes) {
     uint32_t channels = 3;
     JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
     std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-        dec, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-        format, /*use_callback=*/false, /*set_buffer_early=*/true,
+        dec, jxl::Bytes(compressed.data(), compressed.size()), format,
+        /*use_callback=*/false, /*set_buffer_early=*/true,
         /*use_resizable_runner=*/false, /*require_boxes=*/true, expect_success);
     size_t unconsumed_bytes = JxlDecoderReleaseInput(dec);
     EXPECT_EQ(3, unconsumed_bytes);
@@ -2071,9 +2233,8 @@ TEST(DecodeTest, ConcatenatedCompressedStreams) {
     if (first_box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
     jxl::TestCodestreamParams params1;
     params1.box_format = first_box_format;
-    jxl::PaddedBytes compressed1 = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-        params1);
+    std::vector<uint8_t> compressed1 = jxl::CreateTestJXLCodestream(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params1);
     for (int j = 0; j < kCSBF_NUM_ENTRIES; ++j) {
       CodeStreamBoxFormat second_box_format = (CodeStreamBoxFormat)j;
       if (second_box_format == kCSBF_Multi_Other_Zero_Terminated) continue;
@@ -2081,12 +2242,11 @@ TEST(DecodeTest, ConcatenatedCompressedStreams) {
              (int)second_box_format);
       jxl::TestCodestreamParams params2;
       params2.box_format = second_box_format;
-      jxl::PaddedBytes compressed2 = jxl::CreateTestJXLCodestream(
-          jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-          3, params2);
-      jxl::PaddedBytes concat;
-      concat.append(compressed1);
-      concat.append(compressed2);
+      std::vector<uint8_t> compressed2 = jxl::CreateTestJXLCodestream(
+          jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params2);
+      std::vector<uint8_t> concat;
+      jxl::Bytes(compressed1).AppendTo(&concat);
+      jxl::Bytes(compressed2).AppendTo(&concat);
       uint32_t channels = 3;
       JxlPixelFormat format = {channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
       size_t remaining = concat.size();
@@ -2099,8 +2259,8 @@ TEST(DecodeTest, ConcatenatedCompressedStreams) {
              second_box_format == kCSBF_Single_Zero_Terminated ||
              second_box_format == kCSBF_Multi_Zero_Terminated);
         std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-            dec, jxl::Span<const uint8_t>(concat.data() + pos, remaining),
-            format, /*use_callback=*/false, /*set_buffer_early=*/true,
+            dec, jxl::Bytes(concat.data() + pos, remaining), format,
+            /*use_callback=*/false, /*set_buffer_early=*/true,
             /*use_resizable_runner=*/false, /*require_boxes=*/true,
             expect_success);
         EXPECT_EQ(num_pixels * channels * 4, pixels2.size());
@@ -2132,19 +2292,19 @@ void TestPartialStream(bool reconstructible_jpeg) {
   std::vector<uint8_t> pixels2;
   pixels2.resize(pixels.size());
 
-  jxl::PaddedBytes jpeg_output(64);
+  std::vector<uint8_t> jpeg_output(64);
   size_t used_jpeg_output = 0;
 
-  std::vector<jxl::PaddedBytes> codestreams(kCSBF_NUM_ENTRIES);
-  std::vector<jxl::PaddedBytes> jpeg_codestreams(kCSBF_NUM_ENTRIES);
+  std::vector<std::vector<uint8_t>> codestreams(kCSBF_NUM_ENTRIES);
+  std::vector<std::vector<uint8_t>> jpeg_codestreams(kCSBF_NUM_ENTRIES);
   for (size_t i = 0; i < kCSBF_NUM_ENTRIES; ++i) {
     params.box_format = (CodeStreamBoxFormat)i;
     if (reconstructible_jpeg) {
       params.jpeg_codestream = &jpeg_codestreams[i];
     }
-    codestreams[i] = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-        channels, params);
+    codestreams[i] =
+        jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                     xsize, ysize, channels, params);
   }
 
   // Test multiple step sizes, to test different combinations of the streaming
@@ -2157,7 +2317,7 @@ void TestPartialStream(bool reconstructible_jpeg) {
           (CodeStreamBoxFormat)i == CodeStreamBoxFormat::kCSBF_None) {
         continue;
       }
-      const jxl::PaddedBytes& data = codestreams[i];
+      const std::vector<uint8_t>& data = codestreams[i];
       const uint8_t* next_in = data.data();
       size_t avail_in = 0;
 
@@ -2264,12 +2424,11 @@ void TestPartialStream(bool reconstructible_jpeg) {
 // should return JXL_DEC_NEED_MORE_INPUT, not error.
 TEST(DecodeTest, PixelPartialTest) { TestPartialStream(false); }
 
-#if JPEGXL_ENABLE_JPEG
 // Tests the return status when trying to decode JPEG bytes on incomplete file.
 TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGPartialTest)) {
+  TEST_LIBJPEG_SUPPORT();
   TestPartialStream(true);
 }
-#endif  // JPEGXL_ENABLE_JPEG
 
 // The DC event still exists, but is no longer implemented, it is deprecated.
 TEST(DecodeTest, DCNotGettableTest) {
@@ -2283,8 +2442,8 @@ TEST(DecodeTest, DCNotGettableTest) {
 
   JxlDecoder* dec = JxlDecoderCreate(NULL);
 
-  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
-                                 dec, JXL_DEC_BASIC_INFO | JXL_DEC_DC_IMAGE));
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO));
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderSetInput(
                 dec, reinterpret_cast<const uint8_t*>(compressed.data()),
@@ -2303,86 +2462,75 @@ TEST(DecodeTest, DCNotGettableTest) {
 TEST(DecodeTest, PreviewTest) {
   size_t xsize = 77, ysize = 120;
   std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 3, 0);
+  JxlPixelFormat format_orig = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  for (jxl::PreviewMode mode : {jxl::kSmallPreview, jxl::kBigPreview}) {
+    jxl::TestCodestreamParams params;
+    params.preview_mode = mode;
 
-  jxl::TestCodestreamParams params;
-  params.add_preview = true;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 3,
-      params);
+    std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 3, params);
 
-  JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+    JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
 
-  JxlDecoder* dec = JxlDecoderCreate(NULL);
-  const uint8_t* next_in = compressed.data();
-  size_t avail_in = compressed.size();
+    JxlDecoder* dec = JxlDecoderCreate(NULL);
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
 
-  EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderSubscribeEvents(
-                dec, JXL_DEC_BASIC_INFO | JXL_DEC_PREVIEW_IMAGE));
-  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(
+                  dec, JXL_DEC_BASIC_INFO | JXL_DEC_PREVIEW_IMAGE));
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetInput(dec, next_in, avail_in));
 
-  EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
-  JxlBasicInfo info;
-  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
-  size_t buffer_size;
-  EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
-
-  // GetSomeTestImage is hardcoded to use a top-left cropped preview with
-  // floor of 1/7th of the size
-  size_t xsize_preview = (xsize / 7);
-  size_t ysize_preview = (ysize / 7);
-  EXPECT_EQ(xsize_preview, info.preview.xsize);
-  EXPECT_EQ(ysize_preview, info.preview.ysize);
-  EXPECT_EQ(xsize_preview * ysize_preview * 3, buffer_size);
-
-  EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
-
-  std::vector<uint8_t> preview(xsize_preview * ysize_preview * 3);
-  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSetPreviewOutBuffer(
-                                 dec, &format, preview.data(), preview.size()));
-
-  EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
-
-  jxl::Image3F preview0(xsize_preview, ysize_preview);
-  jxl::Image3F preview1(xsize_preview, ysize_preview);
-
-  // For preview0, the original: top-left crop the preview image the way
-  // GetSomeTestImage does.
-  for (size_t y = 0; y < ysize_preview; y++) {
-    for (size_t x = 0; x < xsize_preview; x++) {
-      preview0.PlaneRow(0, y)[x] =
-          (1.f / 255) * (pixels[(y * xsize + x) * 6 + 0]);
-      preview0.PlaneRow(1, y)[x] =
-          (1.f / 255) * (pixels[(y * xsize + x) * 6 + 2]);
-      preview0.PlaneRow(2, y)[x] =
-          (1.f / 255) * (pixels[(y * xsize + x) * 6 + 4]);
-      preview1.PlaneRow(0, y)[x] =
-          (1.f / 255) * (preview[(y * xsize_preview + x) * 3 + 0]);
-      preview1.PlaneRow(1, y)[x] =
-          (1.f / 255) * (preview[(y * xsize_preview + x) * 3 + 1]);
-      preview1.PlaneRow(2, y)[x] =
-          (1.f / 255) * (preview[(y * xsize_preview + x) * 3 + 2]);
-    }
-  }
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderPreviewOutBufferSize(dec, &format, &buffer_size));
 
-  jxl::CodecInOut io0;
-  io0.SetFromImage(std::move(preview0), jxl::ColorEncoding::SRGB(false));
-  jxl::CodecInOut io1;
-  io1.SetFromImage(std::move(preview1), jxl::ColorEncoding::SRGB(false));
+    jxl::ColorEncoding c_srgb = jxl::ColorEncoding::SRGB(false);
+    jxl::CodecInOut io0;
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, c_srgb,
+        /*bits_per_sample=*/16, format_orig, /*pool=*/nullptr, &io0.Main()));
+    GeneratePreview(params.preview_mode, &io0.Main());
 
-  jxl::ButteraugliParams ba;
-  // TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for
-  // tests) if xsize or ysize is < 8, no matter how different the images, a tiny
-  // size that could happen for a preview. ButteraugliDiffmap does support
-  // smaller than 8x8, but jxl's ButteraugliDistance does not. Perhaps move
-  // butteraugli's <8x8 handling from ButteraugliDiffmap to
-  // ButteraugliComparator::Diffmap in butteraugli.cc.
-  EXPECT_LE(ButteraugliDistance(io0, io1, ba, jxl::GetJxlCms(),
-                                /*distmap=*/nullptr, nullptr),
-            0.6f);
+    size_t xsize_preview = io0.Main().xsize();
+    size_t ysize_preview = io0.Main().ysize();
+    EXPECT_EQ(xsize_preview, info.preview.xsize);
+    EXPECT_EQ(ysize_preview, info.preview.ysize);
+    EXPECT_EQ(xsize_preview * ysize_preview * 3, buffer_size);
 
-  JxlDecoderDestroy(dec);
+    EXPECT_EQ(JXL_DEC_NEED_PREVIEW_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    std::vector<uint8_t> preview(buffer_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetPreviewOutBuffer(dec, &format, preview.data(),
+                                            preview.size()));
+
+    EXPECT_EQ(JXL_DEC_PREVIEW_IMAGE, JxlDecoderProcessInput(dec));
+
+    jxl::CodecInOut io1;
+    EXPECT_TRUE(
+        jxl::ConvertFromExternal(jxl::Bytes(preview.data(), preview.size()),
+                                 xsize_preview, ysize_preview, c_srgb,
+                                 /*bits_per_sample=*/8, format,
+                                 /*pool=*/nullptr, &io1.Main()));
+
+    jxl::ButteraugliParams ba;
+    // TODO(lode): this ButteraugliDistance silently returns 0 (dangerous for
+    // tests) if xsize or ysize is < 8, no matter how different the images, a
+    // tiny size that could happen for a preview. ButteraugliDiffmap does
+    // support smaller than 8x8, but jxl's ButteraugliDistance does not. Perhaps
+    // move butteraugli's <8x8 handling from ButteraugliDiffmap to
+    // ButteraugliComparator::Diffmap in butteraugli.cc.
+    EXPECT_LE(
+        ButteraugliDistance(io0.frames, io1.frames, ba, *JxlGetDefaultCms(),
+                            /*distmap=*/nullptr, nullptr),
+        mode == jxl::kSmallPreview ? 0.7f : 1.2f);
+
+    JxlDecoderDestroy(dec);
+  }
 }
 
 TEST(DecodeTest, AlignTest) {
@@ -2394,9 +2542,8 @@ TEST(DecodeTest, AlignTest) {
   // Lossless to verify pixels exactly after roundtrip.
   params.cparams.SetLossless();
   params.cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 4, params);
 
   size_t align = 17;
   JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align};
@@ -2405,10 +2552,10 @@ TEST(DecodeTest, AlignTest) {
 
   for (int use_callback = 0; use_callback <= 1; ++use_callback) {
     std::vector<uint8_t> pixels2 = jxl::DecodeWithAPI(
-        jxl::Span<const uint8_t>(compressed.data(), compressed.size()), format,
-        use_callback, /*set_buffer_early=*/false,
+        jxl::Bytes(compressed.data(), compressed.size()), format, use_callback,
+        /*set_buffer_early=*/false,
         /*use_resizable_runner=*/false, /*require_boxes=*/false,
-        /*expect_succes=*/true);
+        /*expect_success=*/true);
     EXPECT_EQ(expected_line_bytes * ysize, pixels2.size());
     EXPECT_EQ(0u, jxl::test::ComparePixels(pixels.data(), pixels2.data(), xsize,
                                            ysize, format_orig, format));
@@ -2441,11 +2588,10 @@ TEST(DecodeTest, AnimationTest) {
     jxl::ImageBundle bundle(&io.metadata.m);
 
     EXPECT_TRUE(ConvertFromExternal(
-        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
-        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
-        /*float_in=*/false, /*align=*/0));
+        jxl::Bytes(frames[i].data(), frames[i].size()), xsize, ysize,
+        jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
     bundle.duration = frame_durations[i];
     io.frames.push_back(std::move(bundle));
   }
@@ -2453,11 +2599,9 @@ TEST(DecodeTest, AnimationTest) {
   jxl::CompressParams cparams;
   cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
   cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::AuxOut aux_out;
-  jxl::PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   jxl::PassesEncoderState enc_state;
-  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
-                              jxl::GetJxlCms(), &aux_out, nullptr));
+  EXPECT_TRUE(jxl::test::EncodeFile(cparams, &io, &enc_state, &compressed));
 
   // Decode and test the animation frames
 
@@ -2545,11 +2689,10 @@ TEST(DecodeTest, AnimationTestStreaming) {
     jxl::ImageBundle bundle(&io.metadata.m);
 
     EXPECT_TRUE(ConvertFromExternal(
-        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
-        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
-        /*float_in=*/false, /*align=*/0));
+        jxl::Bytes(frames[i].data(), frames[i].size()), xsize, ysize,
+        jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
     bundle.duration = frame_durations[i];
     io.frames.push_back(std::move(bundle));
   }
@@ -2557,11 +2700,9 @@ TEST(DecodeTest, AnimationTestStreaming) {
   jxl::CompressParams cparams;
   cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
   cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::AuxOut aux_out;
-  jxl::PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   jxl::PassesEncoderState enc_state;
-  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
-                              jxl::GetJxlCms(), &aux_out, nullptr));
+  EXPECT_TRUE(jxl::test::EncodeFile(cparams, &io, &enc_state, &compressed));
 
   // Decode and test the animation frames
 
@@ -2664,9 +2805,8 @@ TEST(DecodeTest, ExtraChannelTest) {
   // Lossless to verify pixels exactly after roundtrip.
   params.cparams.SetLossless();
   params.cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 4, params);
 
   size_t align = 17;
   JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, align};
@@ -2764,26 +2904,22 @@ TEST(DecodeTest, SkipCurrentFrameTest) {
     }
 
     EXPECT_TRUE(ConvertFromExternal(
-        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
-        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
-        /*float_in=*/false, /*align=*/0));
+        jxl::Bytes(frames[i].data(), frames[i].size()), xsize, ysize,
+        jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
     bundle.duration = frame_durations[i];
     io.frames.push_back(std::move(bundle));
   }
 
   jxl::CompressParams cparams;
   cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::AuxOut aux_out;
-  jxl::PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   jxl::PassesEncoderState enc_state;
-  jxl::PassDefinition passes[] = {
-      {2, 0, false, 4}, {4, 0, false, 4}, {8, 2, false, 2}, {8, 0, false, 1}};
+  jxl::PassDefinition passes[] = {{2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 0, 1}};
   jxl::ProgressiveMode progressive_mode{passes};
   enc_state.progressive_splitter.SetProgressiveMode(progressive_mode);
-  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
-                              jxl::GetJxlCms(), &aux_out, nullptr));
+  EXPECT_TRUE(jxl::test::EncodeFile(cparams, &io, &enc_state, &compressed));
 
   JxlDecoder* dec = JxlDecoderCreate(NULL);
   const uint8_t* next_in = compressed.data();
@@ -2880,11 +3016,10 @@ TEST(DecodeTest, SkipFrameTest) {
     }
 
     EXPECT_TRUE(ConvertFromExternal(
-        jxl::Span<const uint8_t>(frames[i].data(), frames[i].size()), xsize,
-        ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
-        /*float_in=*/false, /*align=*/0));
+        jxl::Bytes(frames[i].data(), frames[i].size()), xsize, ysize,
+        jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &bundle));
     bundle.duration = frame_durations[i];
     io.frames.push_back(std::move(bundle));
   }
@@ -2892,11 +3027,9 @@ TEST(DecodeTest, SkipFrameTest) {
   jxl::CompressParams cparams;
   cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
   cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::AuxOut aux_out;
-  jxl::PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   jxl::PassesEncoderState enc_state;
-  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
-                              jxl::GetJxlCms(), &aux_out, nullptr));
+  EXPECT_TRUE(jxl::test::EncodeFile(cparams, &io, &enc_state, &compressed));
 
   // Decode and test the animation frames
 
@@ -3015,13 +3148,10 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
       // rendered frames depend
       jxl::ImageBundle bundle_internal(&io.metadata.m);
       EXPECT_TRUE(ConvertFromExternal(
-          jxl::Span<const uint8_t>(frame_internal.data(),
-                                   frame_internal.size()),
-          xsize, ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
-          /*channels=*/3,
-          /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-          JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle_internal,
-          /*float_in=*/false, /*align=*/0));
+          jxl::Bytes(frame_internal.data(), frame_internal.size()), xsize,
+          ysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle_internal));
       bundle_internal.duration = 0;
       bundle_internal.use_for_next_frame = true;
       io.frames.push_back(std::move(bundle_internal));
@@ -3032,12 +3162,11 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
     // Actual rendered frame
     frame_durations[i] = 5 + i;
     jxl::ImageBundle bundle(&io.metadata.m);
-    EXPECT_TRUE(ConvertFromExternal(
-        jxl::Span<const uint8_t>(frame.data(), frame.size()), xsize, ysize,
-        jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/3,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
-        /*float_in=*/false, /*align=*/0));
+    EXPECT_TRUE(ConvertFromExternal(jxl::Bytes(frame.data(), frame.size()),
+                                    xsize, ysize,
+                                    jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+                                    /*bits_per_sample=*/16, format,
+                                    /*pool=*/nullptr, &bundle));
     bundle.duration = frame_durations[i];
     // Create some variation in which frames depend on which.
     if (i != 3 && i != 9 && i != 10) {
@@ -3055,11 +3184,9 @@ TEST(DecodeTest, SkipFrameWithBlendingTest) {
   jxl::CompressParams cparams;
   cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
   cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::AuxOut aux_out;
-  jxl::PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   jxl::PassesEncoderState enc_state;
-  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
-                              jxl::GetJxlCms(), &aux_out, nullptr));
+  EXPECT_TRUE(jxl::test::EncodeFile(cparams, &io, &enc_state, &compressed));
 
   // Independently decode all frames without any skipping, to create the
   // expected blended frames, for the actual tests below to compare with.
@@ -3242,13 +3369,10 @@ TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
       // which the rendered frames depend
       jxl::ImageBundle bundle_internal(&io.metadata.m);
       EXPECT_TRUE(ConvertFromExternal(
-          jxl::Span<const uint8_t>(frame_internal.data(),
-                                   frame_internal.size()),
-          xsize / 2, ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
-          /*channels=*/4,
-          /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-          JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle_internal,
-          /*float_in=*/false, /*align=*/0));
+          jxl::Bytes(frame_internal.data(), frame_internal.size()), xsize / 2,
+          ysize / 2, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle_internal));
       bundle_internal.duration = 0;
       bundle_internal.use_for_next_frame = true;
       bundle_internal.origin = {13, 17};
@@ -3264,12 +3388,11 @@ TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
         jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
     // Actual rendered frame
     jxl::ImageBundle bundle(&io.metadata.m);
-    EXPECT_TRUE(ConvertFromExternal(
-        jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
-        cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false), /*channels=*/4,
-        /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-        JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
-        /*float_in=*/false, /*align=*/0));
+    EXPECT_TRUE(ConvertFromExternal(jxl::Bytes(frame.data(), frame.size()),
+                                    cropxsize, cropysize,
+                                    jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+                                    /*bits_per_sample=*/16, format,
+                                    /*pool=*/nullptr, &bundle));
     bundle.duration = 5 + i;
     frame_durations_nc.push_back(5 + i);
     frame_durations_c.push_back(5 + i);
@@ -3292,11 +3415,9 @@ TEST(DecodeTest, SkipFrameWithAlphaBlendingTest) {
   jxl::CompressParams cparams;
   cparams.SetLossless();  // Lossless to verify pixels exactly after roundtrip.
   cparams.speed_tier = jxl::SpeedTier::kThunder;
-  jxl::AuxOut aux_out;
-  jxl::PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   jxl::PassesEncoderState enc_state;
-  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
-                              jxl::GetJxlCms(), &aux_out, nullptr));
+  EXPECT_TRUE(jxl::test::EncodeFile(cparams, &io, &enc_state, &compressed));
   // try both with and without coalescing
   for (auto coalescing : {JXL_TRUE, JXL_FALSE}) {
     // Independently decode all frames without any skipping, to create the
@@ -3528,12 +3649,10 @@ TEST(DecodeTest, OrientedCroppedFrameTest) {
           jxl::test::GetSomeTestImage(cropxsize, cropysize, 4, i * 2);
       jxl::ImageBundle bundle(&io.metadata.m);
       EXPECT_TRUE(ConvertFromExternal(
-          jxl::Span<const uint8_t>(frame.data(), frame.size()), cropxsize,
-          cropysize, jxl::ColorEncoding::SRGB(/*is_gray=*/false),
-          /*channels=*/4,
-          /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-          JXL_BIG_ENDIAN, /*pool=*/nullptr, &bundle,
-          /*float_in=*/false, /*align=*/0));
+          jxl::Bytes(frame.data(), frame.size()), cropxsize, cropysize,
+          jxl::ColorEncoding::SRGB(/*is_gray=*/false),
+          /*bits_per_sample=*/16, format,
+          /*pool=*/nullptr, &bundle));
       bundle.origin = {cropx0, cropy0};
       bundle.use_for_next_frame = true;
       io.frames.push_back(std::move(bundle));
@@ -3544,11 +3663,9 @@ TEST(DecodeTest, OrientedCroppedFrameTest) {
         .SetLossless();  // Lossless to verify pixels exactly after roundtrip.
     cparams.speed_tier = jxl::SpeedTier::kThunder;
     cparams.resampling = resampling;
-    jxl::AuxOut aux_out;
-    jxl::PaddedBytes compressed;
+    std::vector<uint8_t> compressed;
     jxl::PassesEncoderState enc_state;
-    EXPECT_TRUE(jxl::EncodeFile(cparams, &io, &enc_state, &compressed,
-                                jxl::GetJxlCms(), &aux_out, nullptr));
+    EXPECT_TRUE(jxl::test::EncodeFile(cparams, &io, &enc_state, &compressed));
 
     // 0 is merged frame as decoded with coalescing enabled (default)
     // 1-3 are non-coalesced frames as decoded with coalescing disabled
@@ -3663,7 +3780,7 @@ struct StreamPositions {
   std::vector<FramePositions> frames;
 };
 
-void AnalyzeCodestream(const jxl::PaddedBytes& data,
+void AnalyzeCodestream(const std::vector<uint8_t>& data,
                        StreamPositions* streampos) {
   // Unbox data to codestream and mark where it is broken up by boxes.
   std::vector<uint8_t> codestream;
@@ -3715,17 +3832,22 @@ void AnalyzeCodestream(const jxl::PaddedBytes& data,
     return pos + offset;
   };
   // Analyze the unboxed codestream.
-  jxl::BitReader br(
-      jxl::Span<const uint8_t>(codestream.data(), codestream.size()));
+  jxl::BitReader br(jxl::Bytes(codestream.data(), codestream.size()));
   ASSERT_EQ(br.ReadFixedBits<16>(), 0x0AFF);
   jxl::CodecMetadata metadata;
-  EXPECT_TRUE(ReadSizeHeader(&br, &metadata.size));
-  EXPECT_TRUE(ReadImageMetadata(&br, &metadata.m));
+  ASSERT_TRUE(ReadSizeHeader(&br, &metadata.size));
+  ASSERT_TRUE(ReadImageMetadata(&br, &metadata.m));
   streampos->basic_info =
       add_offset(br.TotalBitsConsumed() / jxl::kBitsPerByte);
   metadata.transform_data.nonserialized_xyb_encoded = metadata.m.xyb_encoded;
-  EXPECT_TRUE(jxl::Bundle::Read(&br, &metadata.transform_data));
-  EXPECT_TRUE(br.JumpToByteBoundary());
+  ASSERT_TRUE(jxl::Bundle::Read(&br, &metadata.transform_data));
+  if (metadata.m.color_encoding.WantICC()) {
+    std::vector<uint8_t> icc;
+    ASSERT_TRUE(jxl::test::ReadICC(&br, &icc));
+    ASSERT_TRUE(!icc.empty());
+    metadata.m.color_encoding.SetICCRaw(std::move(icc));
+  }
+  ASSERT_TRUE(br.JumpToByteBoundary());
   bool has_preview = metadata.m.have_preview;
   while (br.TotalBitsConsumed() < br.TotalBytes() * jxl::kBitsPerByte) {
     FramePositions p;
@@ -3735,7 +3857,7 @@ void AnalyzeCodestream(const jxl::PaddedBytes& data,
       frame_header.nonserialized_is_preview = true;
       has_preview = false;
     }
-    EXPECT_TRUE(ReadFrameHeader(&br, &frame_header));
+    ASSERT_TRUE(ReadFrameHeader(&br, &frame_header));
     p.header_end =
         add_offset(jxl::DivCeil(br.TotalBitsConsumed(), jxl::kBitsPerByte));
     jxl::FrameDimensions frame_dim = frame_header.ToFrameDimensions();
@@ -3745,7 +3867,7 @@ void AnalyzeCodestream(const jxl::PaddedBytes& data,
         frame_header.passes.num_passes, /*has_ac_global=*/true);
     std::vector<uint64_t> section_offsets;
     std::vector<uint32_t> section_sizes;
-    EXPECT_TRUE(ReadGroupOffsets(toc_entries, &br, &section_offsets,
+    ASSERT_TRUE(ReadGroupOffsets(toc_entries, &br, &section_offsets,
                                  &section_sizes, &groups_total_size));
     EXPECT_EQ(br.TotalBitsConsumed() % jxl::kBitsPerByte, 0);
     size_t sections_start = br.TotalBitsConsumed() / jxl::kBitsPerByte;
@@ -3770,7 +3892,7 @@ struct Breakpoint {
 
 void VerifyProgression(size_t xsize, size_t ysize, uint32_t num_channels,
                        const std::vector<uint8_t>& pixels,
-                       const jxl::PaddedBytes& data,
+                       const std::vector<uint8_t>& data,
                        std::vector<Breakpoint> breakpoints) {
   // Size large enough for multiple groups, required to have progressive stages.
   ASSERT_LT(256, xsize);
@@ -3849,10 +3971,10 @@ TEST(DecodeTest, ProgressionTest) {
       jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
   jxl::TestCodestreamParams params;
   params.cparams.progressive_dc = 1;
-  params.add_preview = true;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
+  params.preview_mode = jxl::kSmallPreview;
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
   StreamPositions streampos;
   AnalyzeCodestream(data, &streampos);
   const std::vector<FramePositions>& fp = streampos.frames;
@@ -3886,9 +4008,9 @@ TEST(DecodeTest, ProgressionTestLosslessAlpha) {
   params.cparams.SetLossless();
   params.cparams.speed_tier = jxl::SpeedTier::kThunder;
   params.cparams.responsive = 1;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
   StreamPositions streampos;
   AnalyzeCodestream(data, &streampos);
   const std::vector<FramePositions>& fp = streampos.frames;
@@ -3911,7 +4033,7 @@ TEST(DecodeTest, ProgressionTestLosslessAlpha) {
   VerifyProgression(xsize, ysize, num_channels, pixels, data, breakpoints);
 }
 
-void VerifyFilePosition(size_t expected_pos, const jxl::PaddedBytes& data,
+void VerifyFilePosition(size_t expected_pos, const std::vector<uint8_t>& data,
                         JxlDecoder* dec) {
   size_t remaining = JxlDecoderReleaseInput(dec);
   size_t pos = data.size() - remaining;
@@ -3929,11 +4051,11 @@ TEST(DecodeTest, InputHandlingTestOneShot) {
     printf("Testing with box format %d\n", i);
     jxl::TestCodestreamParams params;
     params.cparams.progressive_dc = 1;
-    params.add_preview = true;
+    params.preview_mode = jxl::kSmallPreview;
     params.box_format = (CodeStreamBoxFormat)i;
-    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-        num_channels, params);
+    std::vector<uint8_t> data =
+        jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                     xsize, ysize, num_channels, params);
     JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
     StreamPositions streampos;
     AnalyzeCodestream(data, &streampos);
@@ -4011,8 +4133,8 @@ TEST(DecodeTest, InputHandlingTestOneShot) {
   }
 }
 
-#if JPEGXL_ENABLE_JPEG
 TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(InputHandlingTestJPEGOneshot)) {
+  TEST_LIBJPEG_SUPPORT();
   size_t xsize = 123;
   size_t ysize = 77;
   size_t channels = 3;
@@ -4020,15 +4142,15 @@ TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(InputHandlingTestJPEGOneshot)) {
       jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0);
   for (int i = 1; i < kCSBF_NUM_ENTRIES; ++i) {
     printf("Testing with box format %d\n", i);
-    jxl::PaddedBytes jpeg_codestream;
+    std::vector<uint8_t> jpeg_codestream;
     jxl::TestCodestreamParams params;
     params.cparams.color_transform = jxl::ColorTransform::kNone;
     params.jpeg_codestream = &jpeg_codestream;
-    params.add_preview = true;
+    params.preview_mode = jxl::kSmallPreview;
     params.box_format = (CodeStreamBoxFormat)i;
-    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-        channels, params);
+    std::vector<uint8_t> data =
+        jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                     xsize, ysize, channels, params);
     JxlPixelFormat format = {3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
     StreamPositions streampos;
     AnalyzeCodestream(data, &streampos);
@@ -4102,7 +4224,6 @@ TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(InputHandlingTestJPEGOneshot)) {
     }
   }
 }
-#endif  // JPEGXL_ENABLE_JPEG
 
 TEST(DecodeTest, InputHandlingTestStreaming) {
   size_t xsize = 508, ysize = 470;
@@ -4115,10 +4236,10 @@ TEST(DecodeTest, InputHandlingTestStreaming) {
     jxl::TestCodestreamParams params;
     params.cparams.progressive_dc = 1;
     params.box_format = (CodeStreamBoxFormat)i;
-    params.add_preview = true;
-    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-        num_channels, params);
+    params.preview_mode = jxl::kSmallPreview;
+    std::vector<uint8_t> data =
+        jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                     xsize, ysize, num_channels, params);
     JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
     StreamPositions streampos;
     AnalyzeCodestream(data, &streampos);
@@ -4210,10 +4331,10 @@ TEST(DecodeTest, FlushTest) {
   std::vector<uint8_t> pixels =
       jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
   jxl::TestCodestreamParams params;
-  params.add_preview = true;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
+  params.preview_mode = jxl::kSmallPreview;
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
   JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
 
   std::vector<uint8_t> pixels2;
@@ -4285,10 +4406,10 @@ TEST(DecodeTest, FlushTestImageOutCallback) {
   std::vector<uint8_t> pixels =
       jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
   jxl::TestCodestreamParams params;
-  params.add_preview = true;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
+  params.preview_mode = jxl::kSmallPreview;
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
   JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
 
   std::vector<uint8_t> pixels2;
@@ -4371,10 +4492,10 @@ TEST(DecodeTest, FlushTestLossyProgressiveAlpha) {
   std::vector<uint8_t> pixels =
       jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
   jxl::TestCodestreamParams params;
-  params.add_preview = true;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
+  params.preview_mode = jxl::kSmallPreview;
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
   JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
 
   std::vector<uint8_t> pixels2;
@@ -4443,10 +4564,10 @@ TEST(DecodeTest, FlushTestLossyProgressiveAlphaUpsampling) {
   jxl::TestCodestreamParams params;
   params.cparams.resampling = 2;
   params.cparams.ec_resampling = 4;
-  params.add_preview = true;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
+  params.preview_mode = jxl::kSmallPreview;
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
   JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
 
   std::vector<uint8_t> pixels2;
@@ -4518,10 +4639,11 @@ TEST(DecodeTest, FlushTestLosslessProgressiveAlpha) {
   params.cparams.SetLossless();
   params.cparams.speed_tier = jxl::SpeedTier::kThunder;
   params.cparams.responsive = 1;
-  params.add_preview = true;
-  jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      num_channels, params);
+  params.cparams.modular_group_size_shift = 1;
+  params.preview_mode = jxl::kSmallPreview;
+  std::vector<uint8_t> data =
+      jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                   xsize, ysize, num_channels, params);
   JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
 
   std::vector<uint8_t> pixels2;
@@ -4610,32 +4732,27 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
     }
     std::vector<uint8_t> pixels =
         jxl::test::GetSomeTestImage(xsize, ysize, num_channels, 0);
+    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
     jxl::ColorEncoding color_encoding = jxl::ColorEncoding::SRGB(false);
     jxl::CodecInOut io;
     EXPECT_TRUE(jxl::ConvertFromExternal(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-        color_encoding, num_channels,
-        /*alpha_is_premultiplied=*/false,
-        /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
-        /*pool=*/nullptr, &io.Main(), /*float_in=*/false, /*align=*/0));
+        jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, color_encoding,
+        /*bits_per_sample=*/16, format,
+        /*pool=*/nullptr, &io.Main()));
     jxl::TestCodestreamParams params;
     if (lossless) {
       params.cparams.SetLossless();
     } else {
       params.cparams.butteraugli_distance = 0.5f;
     }
-    jxl::PassDefinition passes[] = {{2, 0, false, 4},
-                                    {4, 0, false, 4},
-                                    {8, 2, false, 2},
-                                    {8, 1, false, 2},
-                                    {8, 0, false, 1}};
+    jxl::PassDefinition passes[] = {
+        {2, 0, 4}, {4, 0, 4}, {8, 2, 2}, {8, 1, 2}, {8, 0, 1}};
     const int kNumPasses = 5;
     jxl::ProgressiveMode progressive_mode{passes};
     params.progressive_mode = &progressive_mode;
-    jxl::PaddedBytes data = jxl::CreateTestJXLCodestream(
-        jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-        num_channels, params);
-    JxlPixelFormat format = {num_channels, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+    std::vector<uint8_t> data =
+        jxl::CreateTestJXLCodestream(jxl::Bytes(pixels.data(), pixels.size()),
+                                     xsize, ysize, num_channels, params);
 
     for (size_t increment : {(size_t)1, data.size()}) {
       printf(
@@ -4735,14 +4852,12 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
       for (int p = 0;; p = next_pass(p)) {
         jxl::CodecInOut io1;
         EXPECT_TRUE(jxl::ConvertFromExternal(
-            jxl::Span<const uint8_t>(passes[p].data(), passes[p].size()), xsize,
-            ysize, color_encoding, num_channels,
-            /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16,
-            JXL_BIG_ENDIAN,
-            /*pool=*/nullptr, &io1.Main(), /*float_in=*/false,
-            /*align=*/0));
-        distances[p] = ButteraugliDistance(io, io1, ba, jxl::GetJxlCms(),
-                                           nullptr, nullptr);
+            jxl::Bytes(passes[p].data(), passes[p].size()), xsize, ysize,
+            color_encoding,
+            /*bits_per_sample=*/16, format,
+            /*pool=*/nullptr, &io1.Main()));
+        distances[p] = ButteraugliDistance(
+            io.frames, io1.frames, ba, *JxlGetDefaultCms(), nullptr, nullptr);
         if (p == kNumPasses) break;
       }
       const float kMaxDistance[kNumPasses + 1] = {30.0f, 20.0f, 10.0f,
@@ -4754,15 +4869,15 @@ TEST_P(DecodeProgressiveTest, ProgressiveEventTest) {
         // Verify that the returned pass image is actually not the
         // same as the next pass image, by checking that it has a bit
         // worse butteraugli score.
-        EXPECT_LT(distances[next_p] * 1.2f, distances[p]);
+        EXPECT_LT(distances[next_p] * 1.1f, distances[p]);
         p = next_p;
       }
     }
   }
 }
 
-void VerifyJPEGReconstruction(const jxl::PaddedBytes& container,
-                              const jxl::PaddedBytes& jpeg_bytes) {
+void VerifyJPEGReconstruction(jxl::Span<const uint8_t> container,
+                              jxl::Span<const uint8_t> jpeg_bytes) {
   JxlDecoderPtr dec = JxlDecoderMake(nullptr);
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderSubscribeEvents(
@@ -4790,65 +4905,61 @@ void VerifyJPEGReconstruction(const jxl::PaddedBytes& container,
   EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), jpeg_bytes.data(), used));
 }
 
-#if JPEGXL_ENABLE_JPEG
 TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructTestCodestream)) {
+  TEST_LIBJPEG_SUPPORT();
   size_t xsize = 123;
   size_t ysize = 77;
   size_t channels = 3;
   std::vector<uint8_t> pixels =
       jxl::test::GetSomeTestImage(xsize, ysize, channels, /*seed=*/0);
-  jxl::PaddedBytes jpeg_codestream;
+  std::vector<uint8_t> jpeg_codestream;
   jxl::TestCodestreamParams params;
   params.cparams.color_transform = jxl::ColorTransform::kNone;
   params.box_format = kCSBF_Single;
   params.jpeg_codestream = &jpeg_codestream;
-  params.add_preview = true;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize,
-      channels, params);
-  VerifyJPEGReconstruction(compressed, jpeg_codestream);
+  params.preview_mode = jxl::kSmallPreview;
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, channels, params);
+  VerifyJPEGReconstruction(jxl::Bytes(compressed), jxl::Bytes(jpeg_codestream));
 }
-#endif  // JPEGXL_ENABLE_JPEG
 
 TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) {
   const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
-  const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path);
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(jpeg_path);
   jxl::CodecInOut orig_io;
-  ASSERT_TRUE(
-      jxl::jpeg::DecodeImageJPG(jxl::Span<const uint8_t>(orig), &orig_io));
+  ASSERT_TRUE(jxl::jpeg::DecodeImageJPG(jxl::Bytes(orig), &orig_io));
   orig_io.metadata.m.xyb_encoded = false;
   jxl::BitWriter writer;
-  ASSERT_TRUE(WriteHeaders(&orig_io.metadata, &writer, nullptr));
+  ASSERT_TRUE(WriteCodestreamHeaders(&orig_io.metadata, &writer, nullptr));
   writer.ZeroPadToByte();
   jxl::PassesEncoderState enc_state;
   jxl::CompressParams cparams;
   cparams.color_transform = jxl::ColorTransform::kNone;
   ASSERT_TRUE(jxl::EncodeFrame(cparams, jxl::FrameInfo{}, &orig_io.metadata,
-                               orig_io.Main(), &enc_state, jxl::GetJxlCms(),
+                               orig_io.Main(), &enc_state, *JxlGetDefaultCms(),
                                /*pool=*/nullptr, &writer,
                                /*aux_out=*/nullptr));
 
-  jxl::PaddedBytes jpeg_data;
+  std::vector<uint8_t> jpeg_data;
   ASSERT_TRUE(
       EncodeJPEGData(*orig_io.Main().jpeg_data.get(), &jpeg_data, cparams));
-  jxl::PaddedBytes container;
-  container.append(jxl::kContainerHeader,
-                   jxl::kContainerHeader + sizeof(jxl::kContainerHeader));
+  std::vector<uint8_t> container;
+  jxl::Bytes(jxl::kContainerHeader).AppendTo(&container);
   jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
                        &container);
-  container.append(jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+  jxl::Bytes(jpeg_data).AppendTo(&container);
   jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, true, &container);
   jxl::PaddedBytes codestream = std::move(writer).TakeBytes();
-  container.append(codestream.data(), codestream.data() + codestream.size());
-  VerifyJPEGReconstruction(container, orig);
+  jxl::Bytes(codestream).AppendTo(&container);
+  VerifyJPEGReconstruction(jxl::Bytes(container), jxl::Bytes(orig));
 }
 
 TEST(DecodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionMetadataTest)) {
   const std::string jpeg_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jpg";
   const std::string jxl_path = "jxl/jpeg_reconstruction/1x1_exif_xmp.jxl";
-  const jxl::PaddedBytes jpeg = jxl::ReadTestData(jpeg_path);
-  const jxl::PaddedBytes jxl = jxl::ReadTestData(jxl_path);
-  VerifyJPEGReconstruction(jxl, jpeg);
+  const std::vector<uint8_t> jpeg = jxl::test::ReadTestData(jpeg_path);
+  const std::vector<uint8_t> jxl = jxl::test::ReadTestData(jxl_path);
+  VerifyJPEGReconstruction(jxl::Bytes(jxl), jxl::Bytes(jpeg));
 }
 
 TEST(DecodeTest, ContinueFinalNonEssentialBoxTest) {
@@ -4857,9 +4968,8 @@ TEST(DecodeTest, ContinueFinalNonEssentialBoxTest) {
   jxl::TestCodestreamParams params;
   params.box_format = kCSBF_Multi_Other_Terminated;
   params.add_icc_profile = true;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 4, params);
   StreamPositions streampos;
   AnalyzeCodestream(compressed, &streampos);
 
@@ -4918,7 +5028,7 @@ bool BoxTypeEquals(const std::string& type_string, JxlBoxType type) {
 
 TEST(DecodeTest, ExtentedBoxSizeTest) {
   const std::string jxl_path = "jxl/boxes/square-extended-size-container.jxl";
-  const jxl::PaddedBytes orig = jxl::ReadTestData(jxl_path);
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(jxl_path);
   JxlDecoder* dec = JxlDecoderCreate(nullptr);
 
   EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(dec, JXL_DEC_BOX));
@@ -4945,15 +5055,14 @@ TEST(DecodeTest, ExtentedBoxSizeTest) {
   JxlDecoderDestroy(dec);
 }
 
-TEST(DecodeTest, BoxTest) {
+TEST(DecodeTest, JXL_BOXES_TEST(BoxTest)) {
   size_t xsize = 1, ysize = 1;
   std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
   jxl::TestCodestreamParams params;
   params.box_format = kCSBF_Multi_Other_Terminated;
   params.add_icc_profile = true;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 4, params);
 
   JxlDecoder* dec = JxlDecoderCreate(nullptr);
 
@@ -5018,7 +5127,7 @@ TEST(DecodeTest, BoxTest) {
   JxlDecoderDestroy(dec);
 }
 
-TEST(DecodeTest, ExifBrobBoxTest) {
+TEST(DecodeTest, JXL_BOXES_TEST(ExifBrobBoxTest)) {
   size_t xsize = 1, ysize = 1;
   std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
   jxl::TestCodestreamParams params;
@@ -5026,9 +5135,8 @@ TEST(DecodeTest, ExifBrobBoxTest) {
   params.cparams.SetLossless();
   params.box_format = kCSBF_Brob_Exif;
   params.add_icc_profile = true;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 4, params);
 
   // Test raw brob box, not brotli-decompressing
   for (int streaming = 0; streaming < 2; ++streaming) {
@@ -5200,7 +5308,7 @@ TEST(DecodeTest, ExifBrobBoxTest) {
   }
 }
 
-TEST(DecodeTest, PartialCodestreamBoxTest) {
+TEST(DecodeTest, JXL_BOXES_TEST(PartialCodestreamBoxTest)) {
   size_t xsize = 23, ysize = 81;
   std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
   JxlPixelFormat format_orig = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
@@ -5210,9 +5318,8 @@ TEST(DecodeTest, PartialCodestreamBoxTest) {
   params.cparams.speed_tier = jxl::SpeedTier::kThunder;
   params.box_format = kCSBF_Multi;
   params.add_icc_profile = true;
-  jxl::PaddedBytes compressed = jxl::CreateTestJXLCodestream(
-      jxl::Span<const uint8_t>(pixels.data(), pixels.size()), xsize, ysize, 4,
-      params);
+  std::vector<uint8_t> compressed = jxl::CreateTestJXLCodestream(
+      jxl::Bytes(pixels.data(), pixels.size()), xsize, ysize, 4, params);
 
   std::vector<uint8_t> extracted_codestream;
 
@@ -5355,7 +5462,6 @@ TEST(DecodeTest, PartialCodestreamBoxTest) {
 }
 
 TEST(DecodeTest, SpotColorTest) {
-  jxl::ThreadPool* pool = nullptr;
   jxl::CodecInOut io;
   size_t xsize = 55, ysize = 257;
   io.metadata.m.color_encoding = jxl::ColorEncoding::LinearSRGB();
@@ -5393,11 +5499,11 @@ TEST(DecodeTest, SpotColorTest) {
   cparams.color_transform = jxl::ColorTransform::kNone;
   cparams.butteraugli_distance = 0.f;
 
-  jxl::PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
   std::unique_ptr<jxl::PassesEncoderState> enc_state =
       jxl::make_unique<jxl::PassesEncoderState>();
-  EXPECT_TRUE(jxl::EncodeFile(cparams, &io, enc_state.get(), &compressed,
-                              jxl::GetJxlCms(), nullptr, pool));
+  EXPECT_TRUE(
+      jxl::test::EncodeFile(cparams, &io, enc_state.get(), &compressed));
 
   for (size_t render_spot = 0; render_spot < 2; render_spot++) {
     JxlPixelFormat format = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
index aa57b27..bfbe08b 100644 (file)
@@ -5,6 +5,8 @@
 
 #include "lib/jxl/decode_to_jpeg.h"
 
+#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
 namespace jxl {
 
 #if JPEGXL_ENABLE_TRANSCODE_JPEG
@@ -12,21 +14,21 @@ namespace jxl {
 JxlDecoderStatus JxlToJpegDecoder::Process(const uint8_t** next_in,
                                            size_t* avail_in) {
   if (!inside_box_) {
-    JXL_ABORT(
+    JXL_UNREACHABLE(
         "processing of JPEG reconstruction data outside JPEG reconstruction "
         "box");
   }
   Span<const uint8_t> to_decode;
   if (box_until_eof_) {
     // Until EOF means consume all data.
-    to_decode = Span<const uint8_t>(*next_in, *avail_in);
+    to_decode = Bytes(*next_in, *avail_in);
     *next_in += *avail_in;
     *avail_in = 0;
   } else {
     // Defined size means consume min(available, needed).
     size_t avail_recon_in =
         std::min<size_t>(*avail_in, box_size_ - buffer_.size());
-    to_decode = Span<const uint8_t>(*next_in, avail_recon_in);
+    to_decode = Bytes(*next_in, avail_recon_in);
     *next_in += avail_recon_in;
     *avail_in -= avail_recon_in;
   }
@@ -35,10 +37,10 @@ JxlDecoderStatus JxlToJpegDecoder::Process(const uint8_t** next_in,
     // Append incoming data to buffer if we already had data in the buffer.
     buffer_.insert(buffer_.end(), to_decode.data(),
                    to_decode.data() + to_decode.size());
-    to_decode = Span<const uint8_t>(buffer_.data(), buffer_.size());
+    to_decode = Bytes(buffer_.data(), buffer_.size());
   }
   if (!box_until_eof_ && to_decode.size() > box_size_) {
-    JXL_ABORT("JPEG reconstruction data to decode larger than expected");
+    JXL_UNREACHABLE("JPEG reconstruction data to decode larger than expected");
   }
   if (box_until_eof_ || to_decode.size() == box_size_) {
     // If undefined size, or the right size, try to decode.
index 68fd06e..a64ace2 100644 (file)
 // of the decoder state needed to parse the JPEG reconstruction box and provide
 // the reconstructed JPEG to the output buffer.
 
+#include <jxl/decode.h>
 #include <stdint.h>
 #include <stdlib.h>
 
 #include <memory>
 #include <vector>
 
-#include "jxl/decode.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
 #include "lib/jxl/image_bundle.h"
index a6de18f..a200b5d 100644 (file)
 #include "lib/jxl/ans_params.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/fast_math-inl.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/convolve.h"
 #include "lib/jxl/dct_scales.h"
+#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_debug_image.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_transforms-inl.h"
 #include "lib/jxl/entropy_coder.h"
-#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/simd_util.h"
 
 // Some of the floating point constants in this file and in other
 // files in the libjxl project have been obtained using the
 // sensitive to some kind of degradation. Unfortunately image quality
 // is still more of an art than science.
 
+// Set JXL_DEBUG_AC_STRATEGY to 1 to enable debugging.
+#ifndef JXL_DEBUG_AC_STRATEGY
+#define JXL_DEBUG_AC_STRATEGY 0
+#endif
+
 // This must come before the begin/end_target, but HWY_ONCE is only true
 // after that, so use an "include guard".
 #ifndef LIB_JXL_ENC_AC_STRATEGY_
 #define LIB_JXL_ENC_AC_STRATEGY_
 // Parameters of the heuristic are marked with a OPTIMIZE comment.
 namespace jxl {
+namespace {
 
 // Debugging utilities.
 
@@ -207,7 +216,8 @@ const uint8_t* TypeMask(const uint8_t& raw_strategy) {
 }
 
 void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
-                    size_t ysize, const char* tag, AuxOut* aux_out) {
+                    size_t ysize, const char* tag, AuxOut* aux_out,
+                    const CompressParams& cparams) {
   Image3F color_acs(xsize, ysize);
   for (size_t y = 0; y < ysize; y++) {
     float* JXL_RESTRICT rows[3] = {
@@ -259,9 +269,10 @@ void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
       }
     }
   }
-  aux_out->DumpImage(tag, color_acs);
+  DumpImage(cparams, tag, color_acs);
 }
 
+}  // namespace
 }  // namespace jxl
 #endif  // LIB_JXL_ENC_AC_STRATEGY_
 
@@ -339,8 +350,8 @@ bool MultiBlockTransformCrossesVerticalBoundary(
   return false;
 }
 
-float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
-                      const ACSConfig& config,
+float EstimateEntropy(const AcStrategy& acs, float entropy_mul, size_t x,
+                      size_t y, const ACSConfig& config,
                       const float* JXL_RESTRICT cmap_factors, float* block,
                       float* scratch_space, uint32_t* quantized) {
   const size_t size = (1 << acs.log2_covered_blocks()) * kDCTBlockSize;
@@ -351,34 +362,26 @@ float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
     TransformFromPixels(acs.Strategy(), &config.Pixel(c, x, y),
                         config.src_stride, block_c, scratch_space);
   }
-
   HWY_FULL(float) df;
 
   const size_t num_blocks = acs.covered_blocks_x() * acs.covered_blocks_y();
-  float quant_norm8 = 0;
-  float masking = 0;
+  // avoid large blocks when there is a lot going on in red-green.
+  float quant_norm16 = 0;
   if (num_blocks == 1) {
     // When it is only one 8x8, we don't need aggregation of values.
-    quant_norm8 = config.Quant(x / 8, y / 8);
-    masking = 2.0f * config.Masking(x / 8, y / 8);
+    quant_norm16 = config.Quant(x / 8, y / 8);
   } else if (num_blocks == 2) {
     // Taking max instead of 8th norm seems to work
     // better for smallest blocks up to 16x8. Jyrki couldn't get
     // improvements in trying the same for 16x16 blocks.
     if (acs.covered_blocks_y() == 2) {
-      quant_norm8 =
+      quant_norm16 =
           std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8, y / 8 + 1));
-      masking = 2.0f * std::max(config.Masking(x / 8, y / 8),
-                                config.Masking(x / 8, y / 8 + 1));
     } else {
-      quant_norm8 =
+      quant_norm16 =
           std::max(config.Quant(x / 8, y / 8), config.Quant(x / 8 + 1, y / 8));
-      masking = 2.0f * std::max(config.Masking(x / 8, y / 8),
-                                config.Masking(x / 8 + 1, y / 8));
     }
   } else {
-    float masking_norm2 = 0;
-    float masking_max = 0;
     // Load QF value, calculate empirical heuristic on masking field
     // for weighting the information loss. Information loss manifests
     // itself as ringing, and masking could hide it.
@@ -387,57 +390,80 @@ float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
         float qval = config.Quant(x / 8 + ix, y / 8 + iy);
         qval *= qval;
         qval *= qval;
-        quant_norm8 += qval * qval;
-        float maskval = config.Masking(x / 8 + ix, y / 8 + iy);
-        masking_max = std::max<float>(masking_max, maskval);
-        masking_norm2 += maskval * maskval;
+        qval *= qval;
+        quant_norm16 += qval * qval;
       }
     }
-    quant_norm8 /= num_blocks;
-    quant_norm8 = FastPowf(quant_norm8, 1.0f / 8.0f);
-    masking_norm2 = sqrt(masking_norm2 / num_blocks);
-    // This is a highly empirical formula.
-    masking = (masking_norm2 + masking_max);
+    quant_norm16 /= num_blocks;
+    quant_norm16 = FastPowf(quant_norm16, 1.0f / 16.0f);
   }
-  const auto q = Set(df, quant_norm8);
+  const auto quant = Set(df, quant_norm16);
 
   // Compute entropy.
-  float entropy = config.base_entropy;
-  auto info_loss = Zero(df);
-  auto info_loss2 = Zero(df);
+  float entropy = 0.0f;
+  const HWY_CAPPED(float, 8) df8;
 
+  auto mem_alloc = hwy::AllocateAligned<float>(AcStrategy::kMaxCoeffArea);
+  float* mem = mem_alloc.get();
+  auto loss = Zero(df8);
   for (size_t c = 0; c < 3; c++) {
     const float* inv_matrix = config.dequant->InvMatrix(acs.RawStrategy(), c);
+    const float* matrix = config.dequant->Matrix(acs.RawStrategy(), c);
     const auto cmap_factor = Set(df, cmap_factors[c]);
 
     auto entropy_v = Zero(df);
     auto nzeros_v = Zero(df);
-    auto cost1 = Set(df, config.cost1);
-    auto cost2 = Set(df, config.cost2);
-    auto cost_delta = Set(df, config.cost_delta);
     for (size_t i = 0; i < num_blocks * kDCTBlockSize; i += Lanes(df)) {
       const auto in = Load(df, block + c * size + i);
       const auto in_y = Mul(Load(df, block + size + i), cmap_factor);
       const auto im = Load(df, inv_matrix + i);
-      const auto val = Mul(Sub(in, in_y), Mul(im, q));
+      const auto val = Mul(Sub(in, in_y), Mul(im, quant));
       const auto rval = Round(val);
-      const auto diff = AbsDiff(val, rval);
-      info_loss = Add(info_loss, diff);
-      info_loss2 = MulAdd(diff, diff, info_loss2);
+      const auto diff = Sub(val, rval);
+      const auto m = Load(df, matrix + i);
+      Store(Mul(m, diff), df, &mem[i]);
       const auto q = Abs(rval);
       const auto q_is_zero = Eq(q, Zero(df));
-      entropy_v = Add(entropy_v, IfThenElseZero(Ge(q, Set(df, 1.5f)), cost2));
       // We used to have q * C here, but that cost model seems to
       // be punishing large values more than necessary. Sqrt tries
-      // to avoid large values less aggressively. Having high accuracy
-      // around zero is most important at low qualities, and there
-      // we have directly specified costs for 0, 1, and 2.
-      entropy_v = MulAdd(Sqrt(q), cost_delta, entropy_v);
+      // to avoid large values less aggressively.
+      entropy_v = Add(Sqrt(q), entropy_v);
       nzeros_v = Add(nzeros_v, IfThenZeroElse(q_is_zero, Set(df, 1.0f)));
     }
-    entropy_v = MulAdd(nzeros_v, cost1, entropy_v);
 
-    entropy += GetLane(SumOfLanes(df, entropy_v));
+    {
+      auto lossc = Zero(df8);
+      TransformToPixels(acs.Strategy(), &mem[0], block,
+                        acs.covered_blocks_x() * 8, scratch_space);
+
+      for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
+        for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          for (size_t dy = 0; dy < kBlockDim; ++dy) {
+            for (size_t dx = 0; dx < kBlockDim; dx += Lanes(df8)) {
+              auto in = Load(df8, block +
+                                      (iy * kBlockDim + dy) *
+                                          (acs.covered_blocks_x() * kBlockDim) +
+                                      ix * kBlockDim + dx);
+              auto masku = Abs(Load(
+                  df8, config.MaskingPtr1x1(x + ix * 8 + dx, y + iy * 8 + dy)));
+              in = Mul(masku, in);
+              in = Mul(in, in);
+              in = Mul(in, in);
+              in = Mul(in, in);
+              lossc = Add(lossc, in);
+            }
+          }
+        }
+      }
+      static const double kChannelMul[3] = {
+          10.2,
+          1.0,
+          1.03,
+      };
+      lossc = Mul(Set(df8, pow(kChannelMul[c], 8.0)), lossc);
+      loss = Add(loss, lossc);
+    }
+    entropy += config.cost_delta * GetLane(SumOfLanes(df, entropy_v));
     size_t num_nzeros = GetLane(SumOfLanes(df, nzeros_v));
     // Add #bit of num_nonzeros, as an estimate of the cost for encoding the
     // number of non-zeros of the block.
@@ -446,17 +472,17 @@ float EstimateEntropy(const AcStrategy& acs, size_t x, size_t y,
     // bias.
     entropy += config.zeros_mul * (CeilLog2Nonzero(nbits + 17) + nbits);
   }
-  float ret =
-      entropy +
-      masking *
-          ((config.info_loss_multiplier * GetLane(SumOfLanes(df, info_loss))) +
-           (config.info_loss_multiplier2 *
-            sqrt(num_blocks * GetLane(SumOfLanes(df, info_loss2)))));
+  float loss_scalar =
+      pow(GetLane(SumOfLanes(df8, loss)) / (num_blocks * kDCTBlockSize),
+          1.0 / 8.0) *
+      (num_blocks * kDCTBlockSize) / quant_norm16;
+  float ret = entropy * entropy_mul;
+  ret += config.info_loss_multiplier * loss_scalar;
   return ret;
 }
 
 uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
-                             const ACSConfig& config,
+                             float butteraugli_target, const ACSConfig& config,
                              const float* JXL_RESTRICT cmap_factors,
                              AcStrategyImage* JXL_RESTRICT ac_strategy,
                              float* block, float* scratch_space,
@@ -464,69 +490,58 @@ uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
   struct TransformTry8x8 {
     AcStrategy::Type type;
     int encoding_speed_tier_max_limit;
-    float entropy_add;
-    float entropy_mul;
+    double entropy_mul;
   };
   static const TransformTry8x8 kTransforms8x8[] = {
       {
           AcStrategy::Type::DCT,
           9,
-          3.0f,
-          0.745f,
+          0.8,
       },
       {
           AcStrategy::Type::DCT4X4,
           5,
-          4.0f,
-          1.0179946967008329f,
+          1.08,
       },
       {
           AcStrategy::Type::DCT2X2,
-          4,
-          4.0f,
-          0.76721119707580943f,
+          5,
+          0.95,
       },
       {
           AcStrategy::Type::DCT4X8,
-          5,
-          0.0f,
-          0.700754622182473063f,
+          4,
+          0.85931637428340035,
       },
       {
           AcStrategy::Type::DCT8X4,
-          5,
-          0.0f,
-          0.700754622182473063f,
+          4,
+          0.85931637428340035,
       },
       {
           AcStrategy::Type::IDENTITY,
           5,
-          8.0f,
-          0.81217614513585534f,
+          1.0427542510634957,
       },
       {
           AcStrategy::Type::AFV0,
           4,
-          3.0f,
-          0.70086131125719425f,
+          0.81779489591359944,
       },
       {
           AcStrategy::Type::AFV1,
           4,
-          3.0f,
-          0.70086131125719425f,
+          0.81779489591359944,
       },
       {
           AcStrategy::Type::AFV2,
           4,
-          3.0f,
-          0.70086131125719425f,
+          0.81779489591359944,
       },
       {
           AcStrategy::Type::AFV3,
           4,
-          3.0f,
-          0.70086131125719425f,
+          0.81779489591359944,
       },
   };
   double best = 1e30;
@@ -536,9 +551,28 @@ uint8_t FindBest8x8Transform(size_t x, size_t y, int encoding_speed_tier,
       continue;
     }
     AcStrategy acs = AcStrategy::FromRawStrategy(tx.type);
-    float entropy = EstimateEntropy(acs, x, y, config, cmap_factors, block,
-                                    scratch_space, quantized);
-    entropy = tx.entropy_add + tx.entropy_mul * entropy;
+    float entropy_mul = tx.entropy_mul / kTransforms8x8[0].entropy_mul;
+    if ((tx.type == AcStrategy::Type::DCT2X2 ||
+         tx.type == AcStrategy::Type::IDENTITY) &&
+        butteraugli_target < 5.0) {
+      static const float kFavor2X2AtHighQuality = 0.4;
+      float weight = pow((5.0f - butteraugli_target) / 5.0f, 2.0);
+      entropy_mul -= kFavor2X2AtHighQuality * weight;
+    }
+    if ((tx.type != AcStrategy::Type::DCT &&
+         tx.type != AcStrategy::Type::DCT2X2 &&
+         tx.type != AcStrategy::Type::IDENTITY) &&
+        butteraugli_target > 4.0) {
+      static const float kAvoidEntropyOfTransforms = 0.5;
+      float mul = 1.0;
+      if (butteraugli_target < 12.0) {
+        mul *= (12.0 - 4.0) / (butteraugli_target - 4.0);
+      }
+      entropy_mul += kAvoidEntropyOfTransforms * mul;
+    }
+    float entropy =
+        EstimateEntropy(acs, entropy_mul, x, y, config, cmap_factors, block,
+                        scratch_space, quantized);
     if (entropy < best) {
       best_tx = tx.type;
       best = entropy;
@@ -572,9 +606,8 @@ void TryMergeAcs(AcStrategy::Type acs_raw, size_t bx, size_t by, size_t cx,
     }
   }
   float entropy_candidate =
-      entropy_mul * EstimateEntropy(acs, (bx + cx) * 8, (by + cy) * 8, config,
-                                    cmap_factors, block, scratch_space,
-                                    quantized);
+      EstimateEntropy(acs, entropy_mul, (bx + cx) * 8, (by + cy) * 8, config,
+                      cmap_factors, block, scratch_space, quantized);
   if (entropy_candidate >= entropy_current) return;
   // Accept the candidate.
   for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
@@ -690,42 +723,37 @@ void FindBestFirstLevelDivisionForSquare(
   float entropy_JXJ = std::numeric_limits<float>::max();
   if (allow_JXK) {
     if (row0[bx + cx + 0].RawStrategy() != acs_rawJXK) {
-      entropy_JXK_left =
-          entropy_mul_JXK *
-          EstimateEntropy(acsJXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
-                          cmap_factors, block, scratch_space, quantized);
+      entropy_JXK_left = EstimateEntropy(
+          acsJXK, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+          cmap_factors, block, scratch_space, quantized);
     }
     if (row0[bx + cx + blocks_half].RawStrategy() != acs_rawJXK) {
       entropy_JXK_right =
-          entropy_mul_JXK * EstimateEntropy(acsJXK, (bx + cx + blocks_half) * 8,
-                                            (by + cy + 0) * 8, config,
-                                            cmap_factors, block, scratch_space,
-                                            quantized);
+          EstimateEntropy(acsJXK, entropy_mul_JXK, (bx + cx + blocks_half) * 8,
+                          (by + cy + 0) * 8, config, cmap_factors, block,
+                          scratch_space, quantized);
     }
   }
   if (allow_KXJ) {
     if (row0[bx + cx].RawStrategy() != acs_rawKXJ) {
-      entropy_KXJ_top =
-          entropy_mul_JXK *
-          EstimateEntropy(acsKXJ, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
-                          cmap_factors, block, scratch_space, quantized);
+      entropy_KXJ_top = EstimateEntropy(
+          acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8, (by + cy + 0) * 8, config,
+          cmap_factors, block, scratch_space, quantized);
     }
     if (row1[bx + cx].RawStrategy() != acs_rawKXJ) {
       entropy_KXJ_bottom =
-          entropy_mul_JXK * EstimateEntropy(acsKXJ, (bx + cx + 0) * 8,
-                                            (by + cy + blocks_half) * 8, config,
-                                            cmap_factors, block, scratch_space,
-                                            quantized);
+          EstimateEntropy(acsKXJ, entropy_mul_JXK, (bx + cx + 0) * 8,
+                          (by + cy + blocks_half) * 8, config, cmap_factors,
+                          block, scratch_space, quantized);
     }
   }
   if (allow_square_transform) {
     // We control the exploration of the square transform separately so that
     // we can turn it off at high decoding speeds for 32x32, but still allow
     // exploring 16x32 and 32x16.
-    entropy_JXJ = entropy_mul_JXJ * EstimateEntropy(acsJXJ, (bx + cx + 0) * 8,
-                                                    (by + cy + 0) * 8, config,
-                                                    cmap_factors, block,
-                                                    scratch_space, quantized);
+    entropy_JXJ = EstimateEntropy(acsJXJ, entropy_mul_JXJ, (bx + cx + 0) * 8,
+                                  (by + cy + 0) * 8, config, cmap_factors,
+                                  block, scratch_space, quantized);
   }
 
   // Test if this block should have JXK or KXJ transforms,
@@ -769,7 +797,7 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
   // 2. Merging them into larger transforms where possibly, but
   // starting from the smallest transforms (16x8 and 8x16).
   // Additional complication: 16x8 and 8x16 are considered
-  // simultanouesly and fairly against each other.
+  // simultaneously and fairly against each other.
   // We are looking at 64x64 squares since the YtoX and YtoB
   // maps happen to be at that resolution, and having
   // integral transforms cross these boundaries leads to
@@ -777,8 +805,11 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
   const CompressParams& cparams = enc_state->cparams;
   const float butteraugli_target = cparams.butteraugli_distance;
   AcStrategyImage* ac_strategy = &enc_state->shared.ac_strategy;
+  const size_t dct_scratch_size =
+      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
   // TODO(veluca): reuse allocations
-  auto mem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  auto mem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea +
+                                         dct_scratch_size);
   auto qmem = hwy::AllocateAligned<uint32_t>(AcStrategy::kMaxCoeffArea);
   uint32_t* JXL_RESTRICT quantized = qmem.get();
   float* JXL_RESTRICT block = mem.get();
@@ -804,8 +835,8 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
   float entropy_estimate[64] = {};
   // Favor all 8x8 transforms (against 16x8 and larger transforms)) at
   // low butteraugli_target distances.
-  static const float k8x8mul1 = -0.55;
-  static const float k8x8mul2 = 1.0735757687292623f;
+  static const float k8x8mul1 = -0.4;
+  static const float k8x8mul2 = 1.0;
   static const float k8x8base = 1.4;
   const float mul8x8 = k8x8mul2 + k8x8mul1 / (butteraugli_target + k8x8base);
   for (size_t iy = 0; iy < rect.ysize(); iy++) {
@@ -813,8 +844,8 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
       float entropy = 0.0;
       const uint8_t best_of_8x8s = FindBest8x8Transform(
           8 * (bx + ix), 8 * (by + iy), static_cast<int>(cparams.speed_tier),
-          config, cmap_factors, ac_strategy, block, scratch_space, quantized,
-          &entropy);
+          butteraugli_target, config, cmap_factors, ac_strategy, block,
+          scratch_space, quantized, &entropy);
       ac_strategy->Set(bx + ix, by + iy,
                        static_cast<AcStrategy::Type>(best_of_8x8s));
       entropy_estimate[iy * 8 + ix] = entropy * mul8x8;
@@ -829,28 +860,16 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
     uint8_t encoding_speed_tier_max_limit;
     float entropy_mul;
   };
-  static const float k8X16mul1 = -0.55;
-  static const float k8X16mul2 = 0.9019587899705066;
-  static const float k8X16base = 1.6;
-  const float entropy_mul16X8 =
-      k8X16mul2 + k8X16mul1 / (butteraugli_target + k8X16base);
-  //  const float entropy_mul16X8 = mul8X16 * 0.91195782912371126f;
-
-  static const float k16X16mul1 = -0.35;
-  static const float k16X16mul2 = 0.82;
-  static const float k16X16base = 2.0;
-  const float entropy_mul16X16 =
-      k16X16mul2 + k16X16mul1 / (butteraugli_target + k16X16base);
-  //  const float entropy_mul16X16 = mul16X16 * 0.83183417727960129f;
-
-  static const float k32X16mul1 = -0.1;
-  static const float k32X16mul2 = 0.84;
-  static const float k32X16base = 2.5;
-  const float entropy_mul16X32 =
-      k32X16mul2 + k32X16mul1 / (butteraugli_target + k32X16base);
-
-  const float entropy_mul32X32 = 0.9;
-  const float entropy_mul64X64 = 1.43f;
+  // These numbers need to be figured out manually and looking at
+  // ringing next to sky etc. Optimization will find larger numbers
+  // and produce more ringing than is ideal. Larger numbers will
+  // help stop ringing.
+  const float entropy_mul16X8 = 1.25;
+  const float entropy_mul16X16 = 1.35;
+  const float entropy_mul16X32 = 1.5;
+  const float entropy_mul32X32 = 1.5;
+  const float entropy_mul64X32 = 2.26;
+  const float entropy_mul64X64 = 2.26;
   // TODO(jyrki): Consider this feedback in further changes:
   // Also effectively when the multipliers for smaller blocks are
   // below 1, this raises the bar for the bigger blocks even higher
@@ -870,9 +889,8 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
       // FindBestFirstLevelDivisionForSquare looks for DCT32X32 and its
       // subdivisions. {AcStrategy::Type::DCT32X32, 5, 1, 5,
       // 0.9822994906548809f},
-      // TODO(jyrki): re-enable 64x32 and 64x64 if/when possible.
-      {AcStrategy::Type::DCT64X32, 6, 1, 3, 1.26f},
-      {AcStrategy::Type::DCT32X64, 6, 1, 3, 1.26f},
+      {AcStrategy::Type::DCT64X32, 6, 1, 3, entropy_mul64X32},
+      {AcStrategy::Type::DCT32X64, 6, 1, 3, entropy_mul64X32},
       // {AcStrategy::Type::DCT64X64, 8, 1, 3, 2.0846542128012948f},
   };
   /*
@@ -890,6 +908,7 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
   // Priority is a tricky kludge to avoid collisions so that transforms
   // don't overlap.
   uint8_t priority[64] = {};
+  bool enable_32x32 = cparams.decoding_speed_tier < 4;
   for (auto tx : kTransformsForMerge) {
     if (tx.decoding_speed_tier_max_limit < cparams.decoding_speed_tier) {
       continue;
@@ -928,7 +947,6 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
         if (cy + 3 < rect.ysize() && cx + 3 < rect.xsize()) {
           if (tx.type == AcStrategy::Type::DCT16X32) {
             // We handle both DCT8X16 and DCT16X8 at the same time.
-            bool enable_32x32 = cparams.decoding_speed_tier < 4;
             if ((cy | cx) % 4 == 0) {
               FindBestFirstLevelDivisionForSquare(
                   4, enable_32x32, bx, by, cx, cy, config, cmap_factors,
@@ -983,14 +1001,14 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
       }
     }
   }
-  // Here we still try to do some non-aligned matching, find a few more
-  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
   if (cparams.speed_tier >= SpeedTier::kHare) {
     return;
   }
-  for (int ii = 0; ii < 3; ++ii) {
-    for (size_t cy = 1 - (ii == 1); cy + 1 < rect.ysize(); cy += 2) {
-      for (size_t cx = 1 - (ii == 2); cx + 1 < rect.xsize(); cx += 2) {
+  // Here we still try to do some non-aligned matching, find a few more
+  // 16X8, 8X16 and 16X16s between the non-2-aligned blocks.
+  for (size_t cy = 0; cy + 1 < rect.ysize(); ++cy) {
+    for (size_t cx = 0; cx + 1 < rect.xsize(); ++cx) {
+      if ((cy | cx) % 2 != 0) {
         FindBestFirstLevelDivisionForSquare(
             2, true, bx, by, cx, cy, config, cmap_factors, ac_strategy,
             entropy_mul16X8, entropy_mul16X16, entropy_estimate, block,
@@ -998,6 +1016,19 @@ void ProcessRectACS(PassesEncoderState* JXL_RESTRICT enc_state,
       }
     }
   }
+  // Non-aligned matching for 32X32, 16X32 and 32X16.
+  size_t step = cparams.speed_tier >= SpeedTier::kTortoise ? 2 : 1;
+  for (size_t cy = 0; cy + 3 < rect.ysize(); cy += step) {
+    for (size_t cx = 0; cx + 3 < rect.xsize(); cx += step) {
+      if ((cy | cx) % 4 == 0) {
+        continue;  // Already tried with loop above (DCT16X32 case).
+      }
+      FindBestFirstLevelDivisionForSquare(
+          4, enable_32x32, bx, by, cx, cy, config, cmap_factors, ac_strategy,
+          entropy_mul16X32, entropy_mul32X32, entropy_estimate, block,
+          scratch_space, quantized);
+    }
+  }
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -1014,7 +1045,6 @@ void AcStrategyHeuristics::Init(const Image3F& src,
   this->enc_state = enc_state;
   config.dequant = &enc_state->shared.matrices;
   const CompressParams& cparams = enc_state->cparams;
-  const float butteraugli_target = cparams.butteraugli_distance;
 
   if (cparams.speed_tier >= SpeedTier::kCheetah) {
     JXL_CHECK(enc_state->shared.matrices.EnsureComputed(1));  // DCT8 only
@@ -1031,10 +1061,15 @@ void AcStrategyHeuristics::Init(const Image3F& src,
   config.quant_field_row = enc_state->initial_quant_field.Row(0);
   config.quant_field_stride = enc_state->initial_quant_field.PixelsPerRow();
   auto& mask = enc_state->initial_quant_masking;
+  auto& mask1x1 = enc_state->initial_quant_masking1x1;
   if (mask.xsize() > 0 && mask.ysize() > 0) {
     config.masking_field_row = mask.Row(0);
     config.masking_field_stride = mask.PixelsPerRow();
   }
+  if (mask1x1.xsize() > 0 && mask1x1.ysize() > 0) {
+    config.masking1x1_field_row = mask1x1.Row(0);
+    config.masking1x1_field_stride = mask1x1.PixelsPerRow();
+  }
 
   config.src_rows[0] = src.ConstPlaneRow(0, 0);
   config.src_rows[1] = src.ConstPlaneRow(1, 0);
@@ -1045,20 +1080,19 @@ void AcStrategyHeuristics::Init(const Image3F& src,
   //  - estimate of the number of bits that will be used by the block
   //  - information loss due to quantization
   // The following constant controls the relative weights of these components.
-  config.info_loss_multiplier = 138.0f;
-  config.info_loss_multiplier2 = 50.46839691767866;
-  // TODO(jyrki): explore base_entropy setting more.
-  // A small value (0?) works better at high distance, while a larger value
-  // may be more effective at low distance/high bpp.
-  config.base_entropy = 0.0;
-  config.zeros_mul = 7.565053364251793f;
-  // Lots of +1 and -1 coefficients at high quality, it is
-  // beneficial to favor them. At low qualities zeros matter more
-  // and +1 / -1 coefficients are already quite harmful.
-  float slope = std::min<float>(1.0f, butteraugli_target * (1.0f / 3));
-  config.cost1 = 1 + slope * 8.8703248061477744f;
-  config.cost2 = 4.4628149885273363f;
-  config.cost_delta = 5.3359184934516337f;
+  config.info_loss_multiplier = 1.2;
+  config.zeros_mul = 9.3089059022677905;
+  config.cost_delta = 10.833273317067883;
+
+  static const float kBias = 0.13731742964354549;
+  const float ratio = (cparams.butteraugli_distance + kBias) / (1.0f + kBias);
+
+  static const float kPow1 = 0.33677806662454718;
+  static const float kPow2 = 0.50990926717963703;
+  static const float kPow3 = 0.36702940662370243;
+  config.info_loss_multiplier *= pow(ratio, kPow1);
+  config.zeros_mul *= pow(ratio, kPow2);
+  config.cost_delta *= pow(ratio, kPow3);
   JXL_ASSERT(enc_state->shared.ac_strategy.xsize() ==
              enc_state->shared.frame_dim.xsize_blocks);
   JXL_ASSERT(enc_state->shared.ac_strategy.ysize() ==
@@ -1066,7 +1100,6 @@ void AcStrategyHeuristics::Init(const Image3F& src,
 }
 
 void AcStrategyHeuristics::ProcessRect(const Rect& rect) {
-  PROFILER_FUNC;
   const CompressParams& cparams = enc_state->cparams;
   // In Falcon mode, use DCT8 everywhere and uniform quantization.
   if (cparams.speed_tier >= SpeedTier::kCheetah) {
@@ -1113,9 +1146,11 @@ void AcStrategyHeuristics::Finalize(AuxOut* aux_out) {
         ac_strategy.CountBlocks(AcStrategy::Type::DCT64X64);
   }
 
-  if (WantDebugOutput(aux_out)) {
+  // if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(aux_out)) {
+  if (JXL_DEBUG_AC_STRATEGY && WantDebugOutput(enc_state->cparams)) {
     DumpAcStrategy(ac_strategy, enc_state->shared.frame_dim.xsize,
-                   enc_state->shared.frame_dim.ysize, "ac_strategy", aux_out);
+                   enc_state->shared.frame_dim.ysize, "ac_strategy", aux_out,
+                   enc_state->cparams);
   }
 }
 
index 409f18b..c89b39c 100644 (file)
@@ -6,18 +6,11 @@
 #ifndef LIB_JXL_ENC_AC_STRATEGY_H_
 #define LIB_JXL_ENC_AC_STRATEGY_H_
 
-#include <stdint.h>
+#include <cstddef>
 
-#include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/dec_ans.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_params.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/quant_weights.h"
 
 
 namespace jxl {
 
+struct AuxOut;
+
 // AC strategy selection: utility struct.
 
 struct ACSConfig {
   const DequantMatrices* JXL_RESTRICT dequant;
-  float info_loss_multiplier;
-  float info_loss_multiplier2;
   float* JXL_RESTRICT quant_field_row;
   size_t quant_field_stride;
   float* JXL_RESTRICT masking_field_row;
   size_t masking_field_stride;
+  float* JXL_RESTRICT masking1x1_field_row;
+  size_t masking1x1_field_stride;
   const float* JXL_RESTRICT src_rows[3];
   size_t src_stride;
-  // Cost for 1 (-1), 2 (-2) explicitly, cost for others computed with cost1 +
-  // cost2 + sqrt(q) * cost_delta.
-  float cost1;
-  float cost2;
+  float info_loss_multiplier;
   float cost_delta;
-  float base_entropy;
   float zeros_mul;
   const float& Pixel(size_t c, size_t x, size_t y) const {
     return src_rows[c][y * src_stride + x];
@@ -52,6 +43,10 @@ struct ACSConfig {
     JXL_DASSERT(masking_field_row[by * masking_field_stride + bx] > 0);
     return masking_field_row[by * masking_field_stride + bx];
   }
+  float* MaskingPtr1x1(size_t bx, size_t by) const {
+    JXL_DASSERT(masking1x1_field_row[by * masking1x1_field_stride + bx] > 0);
+    return &masking1x1_field_row[by * masking1x1_field_stride + bx];
+  }
   float Quant(size_t bx, size_t by) const {
     JXL_DASSERT(quant_field_row[by * quant_field_stride + bx] > 0);
     return quant_field_row[by * quant_field_stride + bx];
@@ -66,10 +61,6 @@ struct AcStrategyHeuristics {
   PassesEncoderState* enc_state;
 };
 
-// Debug.
-void DumpAcStrategy(const AcStrategyImage& ac_strategy, size_t xsize,
-                    size_t ysize, const char* tag, AuxOut* aux_out);
-
 }  // namespace jxl
 
 #endif  // LIB_JXL_ENC_AC_STRATEGY_H_
index 4d245b4..4d1b73c 100644 (file)
@@ -6,7 +6,6 @@
 #include "lib/jxl/enc_adaptive_quantization.h"
 
 #include <stddef.h>
-#include <stdio.h>
 #include <stdlib.h>
 
 #include <algorithm>
 #include <hwy/highway.h>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/fast_math-inl.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/convolve.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/dec_group.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
 #include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_debug_image.h"
 #include "lib/jxl/enc_group.h"
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_transforms-inl.h"
 #include "lib/jxl/epf.h"
-#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/gauss_blur.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
-#include "lib/jxl/opsin_params.h"
 #include "lib/jxl/quant_weights.h"
+
+// Set JXL_DEBUG_ADAPTIVE_QUANTIZATION to 1 to enable debugging.
+#ifndef JXL_DEBUG_ADAPTIVE_QUANTIZATION
+#define JXL_DEBUG_ADAPTIVE_QUANTIZATION 0
+#endif
+
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -74,14 +79,14 @@ float ComputeMaskForAcStrategyUse(const float out_val) {
 
 template <class D, class V>
 V ComputeMask(const D d, const V out_val) {
-  const auto kBase = Set(d, -0.74174993f);
-  const auto kMul4 = Set(d, 3.2353257320940401f);
-  const auto kMul2 = Set(d, 12.906028311180409f);
-  const auto kOffset2 = Set(d, 305.04035728311436f);
-  const auto kMul3 = Set(d, 5.0220313103171232f);
-  const auto kOffset3 = Set(d, 2.1925739705298404f);
+  const auto kBase = Set(d, -0.7647f);
+  const auto kMul4 = Set(d, 9.4708735624378946f);
+  const auto kMul2 = Set(d, 17.35036561631863f);
+  const auto kOffset2 = Set(d, 302.59587815579727f);
+  const auto kMul3 = Set(d, 6.7943250517376494f);
+  const auto kOffset3 = Set(d, 3.7179635626140772f);
   const auto kOffset4 = Mul(Set(d, 0.25f), kOffset3);
-  const auto kMul0 = Set(d, 0.74760422233706747f);
+  const auto kMul0 = Set(d, 0.80061762862741759f);
   const auto k1 = Set(d, 1.0f);
 
   // Avoid division by zero.
@@ -97,23 +102,13 @@ V ComputeMask(const D d, const V out_val) {
   return Add(kBase, MulAdd(kMul4, v4, MulAdd(kMul2, v2, Mul(kMul3, v3))));
 }
 
-// For converting full vectors to a subset. Assumes `vfull` lanes are identical.
-template <class D, class VFull>
-Vec<D> CapTo(const D d, VFull vfull) {
-  using T = typename D::T;
-  const HWY_FULL(T) dfull;
-  HWY_ALIGN T lanes[MaxLanes(dfull)];
-  Store(vfull, dfull, lanes);
-  return Load(d, lanes);
-}
-
 // mul and mul2 represent a scaling difference between jxl and butteraugli.
-static const float kSGmul = 226.0480446705883f;
+static const float kSGmul = 226.77216153508914f;
 static const float kSGmul2 = 1.0f / 73.377132366608819f;
 static const float kLog2 = 0.693147181f;
 // Includes correction factor for std::log -> log2.
 static const float kSGRetMul = kSGmul2 * 18.6580932135f * kLog2;
-static const float kSGVOffset = 7.14672470003f;
+static const float kSGVOffset = 7.7825991679894591f;
 
 template <bool invert, typename D, typename V>
 V RatioOfDerivativesOfCubicRootToSimpleGamma(const D d, V v) {
@@ -171,9 +166,9 @@ template <class D, class V>
 V GammaModulation(const D d, const size_t x, const size_t y,
                   const ImageF& xyb_x, const ImageF& xyb_y, const V out_val) {
   const float kBias = 0.16f;
-  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[0]);
-  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[1]);
-  JXL_DASSERT(kBias > kOpsinAbsorbanceBias[2]);
+  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[0]);
+  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[1]);
+  JXL_DASSERT(kBias > jxl::cms::kOpsinAbsorbanceBias[2]);
   auto overall_ratio = Zero(d);
   auto bias = Set(d, kBias);
   auto half = Set(d, 0.5f);
@@ -198,73 +193,11 @@ V GammaModulation(const D d, const size_t x, const size_t y,
   // ideally -1.0, but likely optimal correction adds some entropy, so slightly
   // less than that.
   // ln(2) constant folded in because we want std::log but have FastLog2f.
-  const auto kGam = Set(d, -0.15526878023684174f * 0.693147180559945f);
+  static const float v = 0.14507933746197058f;
+  const auto kGam = Set(d, v * 0.693147180559945f);
   return MulAdd(kGam, FastLog2f(d, overall_ratio), out_val);
 }
 
-template <class D, class V>
-V ColorModulation(const D d, const size_t x, const size_t y,
-                  const ImageF& xyb_x, const ImageF& xyb_y, const ImageF& xyb_b,
-                  const double butteraugli_target, V out_val) {
-  static const float kStrengthMul = 2.177823400325309;
-  static const float kRedRampStart = 0.0073200141118951231;
-  static const float kRedRampLength = 0.019421555948474039;
-  static const float kBlueRampLength = 0.086890611400405895;
-  static const float kBlueRampStart = 0.26973418507870539;
-  const float strength = kStrengthMul * (1.0f - 0.25f * butteraugli_target);
-  if (strength < 0) {
-    return out_val;
-  }
-  // x values are smaller than y and b values, need to take the difference into
-  // account.
-  const float red_strength = strength * 5.992297772961519f;
-  const float blue_strength = strength;
-  {
-    // Reduce some bits from areas not blue or red.
-    const float offset = strength * -0.009174542291185913f;
-    out_val = Add(out_val, Set(d, offset));
-  }
-  // Calculate how much of the 8x8 block is covered with blue or red.
-  auto blue_coverage = Zero(d);
-  auto red_coverage = Zero(d);
-  for (size_t dy = 0; dy < 8; ++dy) {
-    const float* const JXL_RESTRICT row_in_x = xyb_x.Row(y + dy);
-    const float* const JXL_RESTRICT row_in_y = xyb_y.Row(y + dy);
-    const float* const JXL_RESTRICT row_in_b = xyb_b.Row(y + dy);
-    for (size_t dx = 0; dx < 8; dx += Lanes(d)) {
-      const auto pixel_x = Max(
-          Set(d, 0.0f), Sub(Load(d, row_in_x + x + dx), Set(d, kRedRampStart)));
-      const auto pixel_y = Load(d, row_in_y + x + dx);
-      const auto pixel_b =
-          Max(Set(d, 0.0f), Sub(Load(d, row_in_b + x + dx),
-                                Add(pixel_y, Set(d, kBlueRampStart))));
-      const auto blue_slope = Min(pixel_b, Set(d, kBlueRampLength));
-      const auto red_slope = Min(pixel_x, Set(d, kRedRampLength));
-      red_coverage = Add(red_coverage, red_slope);
-      blue_coverage = Add(blue_coverage, blue_slope);
-    }
-  }
-
-  // Saturate when the high red or high blue coverage is above a level.
-  // The idea here is that if a certain fraction of the block is red or
-  // blue we consider as if it was fully red or blue.
-  static const float ratio = 30.610615782142737f;  // out of 64 pixels.
-
-  auto overall_red_coverage = SumOfLanes(d, red_coverage);
-  overall_red_coverage =
-      Min(overall_red_coverage, Set(d, ratio * kRedRampLength));
-  overall_red_coverage =
-      Mul(overall_red_coverage, Set(d, red_strength / ratio));
-
-  auto overall_blue_coverage = SumOfLanes(d, blue_coverage);
-  overall_blue_coverage =
-      Min(overall_blue_coverage, Set(d, ratio * kBlueRampLength));
-  overall_blue_coverage =
-      Mul(overall_blue_coverage, Set(d, blue_strength / ratio));
-
-  return Add(overall_red_coverage, Add(overall_blue_coverage, out_val));
-}
-
 // Change precision in 8x8 blocks that have high frequency content.
 template <class D, class V>
 V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
@@ -276,6 +209,8 @@ V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
 
   auto sum = Zero(d);  // sum of absolute differences with right and below
 
+  static const float valmin = 0.020602694503245016f;
+  auto valminv = Set(d, valmin);
   for (size_t dy = 0; dy < 8; ++dy) {
     const float* JXL_RESTRICT row_in = xyb.Row(y + dy) + x;
     const float* JXL_RESTRICT row_in_next =
@@ -294,15 +229,25 @@ V HfModulation(const D d, const size_t x, const size_t y, const ImageF& xyb,
       const auto p = Load(d, row_in + dx);
       const auto pr = LoadU(d, row_in + dx + 1);
       const auto mask = BitCast(d, Load(du, kMaskRight + dx));
-      sum = Add(sum, And(mask, AbsDiff(p, pr)));
+      sum = Add(sum, And(mask, Min(valminv, AbsDiff(p, pr))));
 
       const auto pd = Load(d, row_in_next + dx);
-      sum = Add(sum, AbsDiff(p, pd));
+      sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
     }
+#if HWY_TARGET == HWY_SCALAR
+    const auto p = Load(d, row_in + 7);
+    const auto pd = Load(d, row_in_next + 7);
+    sum = Add(sum, Min(valminv, AbsDiff(p, pd)));
+#endif
   }
-
+  // more negative value gives more bpp
+  static const float kOffset = -1.110929106987477;
+  static const float kMul = -0.38078920620238305;
   sum = SumOfLanes(d, sum);
-  return MulAdd(sum, Set(d, -2.0052193233688884f / 112), out_val);
+  float scalar_sum = GetLane(sum);
+  scalar_sum += kOffset;
+  scalar_sum *= kMul;
+  return Add(Set(d, scalar_sum), out_val);
 }
 
 void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
@@ -312,8 +257,8 @@ void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
   JXL_ASSERT(DivCeil(xyb_x.xsize(), kBlockDim) == out->xsize());
   JXL_ASSERT(DivCeil(xyb_x.ysize(), kBlockDim) == out->ysize());
 
-  float base_level = 0.5f * scale;
-  float kDampenRampStart = 7.0f;
+  float base_level = 0.48f * scale;
+  float kDampenRampStart = 2.0f;
   float kDampenRampEnd = 14.0f;
   float dampen = 1.0f;
   if (butteraugli_target >= kDampenRampStart) {
@@ -334,8 +279,6 @@ void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
       auto out_val = Set(df, row_out[ix]);
       out_val = ComputeMask(df, out_val);
       out_val = HfModulation(df, x, y, xyb_y, out_val);
-      out_val = ColorModulation(df, x, y, xyb_x, xyb_y, xyb_b,
-                                butteraugli_target, out_val);
       out_val = GammaModulation(df, x, y, xyb_x, xyb_y, out_val);
       // We want multiplicative quantization field, so everything
       // until this point has been modulating the exponent.
@@ -346,8 +289,8 @@ void PerBlockModulations(const float butteraugli_target, const ImageF& xyb_x,
 
 template <typename D, typename V>
 V MaskingSqrt(const D d, V v) {
-  static const float kLogOffset = 26.481471032459346f;
-  static const float kMul = 211.50759899638012f;
+  static const float kLogOffset = 27.97044946785558f;
+  static const float kMul = 211.53333281566171f;
   const auto mul_v = Set(d, kMul * 1e8);
   const auto offset_v = Set(d, kLogOffset);
   return Mul(Set(d, 0.25f), Sqrt(MulAdd(v, Sqrt(mul_v), offset_v)));
@@ -383,14 +326,38 @@ void StoreMin4(const float v, float& min0, float& min1, float& min2,
 // Look for smooth areas near the area of degradation.
 // If the areas are generally smooth, don't do masking.
 // Output is downsampled 2x.
-void FuzzyErosion(const Rect& from_rect, const ImageF& from,
-                  const Rect& to_rect, ImageF* to) {
+void FuzzyErosion(const float butteraugli_target, const Rect& from_rect,
+                  const ImageF& from, const Rect& to_rect, ImageF* to) {
   const size_t xsize = from.xsize();
   const size_t ysize = from.ysize();
   constexpr int kStep = 1;
   static_assert(kStep == 1, "Step must be 1");
   JXL_ASSERT(to_rect.xsize() * 2 == from_rect.xsize());
   JXL_ASSERT(to_rect.ysize() * 2 == from_rect.ysize());
+  static const float kMulBase0 = 0.125;
+  static const float kMulBase1 = 0.10;
+  static const float kMulBase2 = 0.09;
+  static const float kMulBase3 = 0.06;
+  static const float kMulAdd0 = 0.0;
+  static const float kMulAdd1 = -0.10;
+  static const float kMulAdd2 = -0.09;
+  static const float kMulAdd3 = -0.06;
+
+  float mul = 0.0;
+  if (butteraugli_target < 2.0f) {
+    mul = (2.0f - butteraugli_target) * (1.0f / 2.0f);
+  }
+  float kMul0 = kMulBase0 + mul * kMulAdd0;
+  float kMul1 = kMulBase1 + mul * kMulAdd1;
+  float kMul2 = kMulBase2 + mul * kMulAdd2;
+  float kMul3 = kMulBase3 + mul * kMulAdd3;
+  static const float kTotal = 0.29959705784054957;
+  float norm = kTotal / (kMul0 + kMul1 + kMul2 + kMul3);
+  kMul0 *= norm;
+  kMul1 *= norm;
+  kMul2 *= norm;
+  kMul3 *= norm;
+
   for (size_t fy = 0; fy < from_rect.ysize(); ++fy) {
     size_t y = fy + from_rect.y0();
     size_t ym1 = y >= kStep ? y - kStep : y;
@@ -420,13 +387,8 @@ void FuzzyErosion(const Rect& from_rect, const ImageF& from,
       StoreMin4(rowb[xm1], min0, min1, min2, min3);
       StoreMin4(rowb[x], min0, min1, min2, min3);
       StoreMin4(rowb[xp1], min0, min1, min2, min3);
-      static const float kMulC = 0.05f;
-      static const float kMul0 = 0.05f;
-      static const float kMul1 = 0.05f;
-      static const float kMul2 = 0.05f;
-      static const float kMul3 = 0.05f;
-      float v = kMulC * row[x] + kMul0 * min0 + kMul1 * min1 + kMul2 * min2 +
-                kMul3 * min3;
+
+      float v = kMul0 * min0 + kMul1 * min1 + kMul2 * min2 + kMul3 * min3;
       if (fx % 2 == 0 && fy % 2 == 0) {
         row_out[fx / 2] = v;
       } else {
@@ -453,8 +415,8 @@ struct AdaptiveQuantizationImpl {
   }
 
   void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
-                   const Rect& rect, const int thread, ImageF* mask) {
-    PROFILER_ZONE("aq DiffPrecompute");
+                   const Rect& rect, const int thread, ImageF* mask,
+                   ImageF* mask1x1) {
     const size_t xsize = xyb.xsize();
     const size_t ysize = xyb.ysize();
 
@@ -466,22 +428,50 @@ struct AdaptiveQuantizationImpl {
     const float match_gamma_offset = 0.019;
 
     const HWY_FULL(float) df;
-    const float kXMul = 23.426802998210313f;
-    const auto kXMulv = Set(df, kXMul);
 
     size_t y_start = rect.y0() * 8;
     size_t y_end = y_start + rect.ysize() * 8;
 
-    size_t x0 = rect.x0() * 8;
-    size_t x1 = x0 + rect.xsize() * 8;
-    if (x0 != 0) x0 -= 4;
-    if (x1 != xyb.xsize()) x1 += 4;
-    if (y_start != 0) y_start -= 4;
-    if (y_end != xyb.ysize()) y_end += 4;
-    pre_erosion[thread].ShrinkTo((x1 - x0) / 4, (y_end - y_start) / 4);
+    size_t x_start = rect.x0() * 8;
+    size_t x_end = x_start + rect.xsize() * 8;
 
     // Computes image (padded to multiple of 8x8) of local pixel differences.
     // Subsample both directions by 4.
+    // 1x1 Laplacian of intensity.
+    for (size_t y = y_start; y < y_end; ++y) {
+      const size_t y2 = y + 1 < ysize ? y + 1 : y;
+      const size_t y1 = y > 0 ? y - 1 : y;
+      const float* row_in = xyb.PlaneRow(1, y);
+      const float* row_in1 = xyb.PlaneRow(1, y1);
+      const float* row_in2 = xyb.PlaneRow(1, y2);
+      float* mask1x1_out = mask1x1->Row(y);
+      auto scalar_pixel1x1 = [&](size_t x) {
+        const size_t x2 = x + 1 < xsize ? x + 1 : x;
+        const size_t x1 = x > 0 ? x - 1 : x;
+        const float base =
+            0.25f * (row_in2[x] + row_in1[x] + row_in[x1] + row_in[x2]);
+        const float gammac = RatioOfDerivativesOfCubicRootToSimpleGamma(
+            row_in[x] + match_gamma_offset);
+        float diff = fabs(gammac * (row_in[x] - base));
+        static const double kScaler = 1.0;
+        diff *= kScaler;
+        diff = log1p(diff);
+        static const float kMul = 1.0;
+        static const float kOffset = 0.01;
+        mask1x1_out[x] = kMul / (diff + kOffset);
+      };
+      for (size_t x = x_start; x < x_end; ++x) {
+        scalar_pixel1x1(x);
+      }
+    }
+
+    if (x_start != 0) x_start -= 4;
+    if (x_end != xyb.xsize()) x_end += 4;
+    if (y_start != 0) y_start -= 4;
+    if (y_end != xyb.ysize()) y_end += 4;
+    pre_erosion[thread].ShrinkTo((x_end - x_start) / 4, (y_end - y_start) / 4);
+
+    static const float limit = 0.2f;
     for (size_t y = y_start; y < y_end; ++y) {
       size_t y2 = y + 1 < ysize ? y + 1 : y;
       size_t y1 = y > 0 ? y - 1 : y;
@@ -489,9 +479,6 @@ struct AdaptiveQuantizationImpl {
       const float* row_in = xyb.PlaneRow(1, y);
       const float* row_in1 = xyb.PlaneRow(1, y1);
       const float* row_in2 = xyb.PlaneRow(1, y2);
-      const float* row_x_in = xyb.PlaneRow(0, y);
-      const float* row_x_in1 = xyb.PlaneRow(0, y1);
-      const float* row_x_in2 = xyb.PlaneRow(0, y2);
       float* JXL_RESTRICT row_out = diff_buffer.Row(thread);
 
       auto scalar_pixel = [&](size_t x) {
@@ -503,29 +490,27 @@ struct AdaptiveQuantizationImpl {
             row_in[x] + match_gamma_offset);
         float diff = gammac * (row_in[x] - base);
         diff *= diff;
-        const float base_x =
-            0.25f * (row_x_in2[x] + row_x_in1[x] + row_x_in[x1] + row_x_in[x2]);
-        float diff_x = gammac * (row_x_in[x] - base_x);
-        diff_x *= diff_x;
-        diff += kXMul * diff_x;
+        if (diff >= limit) {
+          diff = limit;
+        }
         diff = MaskingSqrt(diff);
         if ((y % 4) != 0) {
-          row_out[x - x0] += diff;
+          row_out[x - x_start] += diff;
         } else {
-          row_out[x - x0] = diff;
+          row_out[x - x_start] = diff;
         }
       };
 
-      size_t x = x0;
+      size_t x = x_start;
       // First pixel of the row.
-      if (x0 == 0) {
-        scalar_pixel(x0);
+      if (x_start == 0) {
+        scalar_pixel(x_start);
         ++x;
       }
       // SIMD
       const auto match_gamma_offset_v = Set(df, match_gamma_offset);
       const auto quarter = Set(df, 0.25f);
-      for (; x + 1 + Lanes(df) < x1; x += Lanes(df)) {
+      for (; x + 1 + Lanes(df) < x_end; x += Lanes(df)) {
         const auto in = LoadU(df, row_in + x);
         const auto in_r = LoadU(df, row_in + x + 1);
         const auto in_l = LoadU(df, row_in + x - 1);
@@ -537,39 +522,30 @@ struct AdaptiveQuantizationImpl {
                 df, Add(in, match_gamma_offset_v));
         auto diff = Mul(gammacv, Sub(in, base));
         diff = Mul(diff, diff);
-
-        const auto in_x = LoadU(df, row_x_in + x);
-        const auto in_x_r = LoadU(df, row_x_in + x + 1);
-        const auto in_x_l = LoadU(df, row_x_in + x - 1);
-        const auto in_x_t = LoadU(df, row_x_in2 + x);
-        const auto in_x_b = LoadU(df, row_x_in1 + x);
-        auto base_x =
-            Mul(quarter, Add(Add(in_x_r, in_x_l), Add(in_x_t, in_x_b)));
-        auto diff_x = Mul(gammacv, Sub(in_x, base_x));
-        diff_x = Mul(diff_x, diff_x);
-        diff = MulAdd(kXMulv, diff_x, diff);
+        diff = Min(diff, Set(df, limit));
         diff = MaskingSqrt(df, diff);
         if ((y & 3) != 0) {
-          diff = Add(diff, LoadU(df, row_out + x - x0));
+          diff = Add(diff, LoadU(df, row_out + x - x_start));
         }
-        StoreU(diff, df, row_out + x - x0);
+        StoreU(diff, df, row_out + x - x_start);
       }
       // Scalar
-      for (; x < x1; ++x) {
+      for (; x < x_end; ++x) {
         scalar_pixel(x);
       }
       if (y % 4 == 3) {
         float* row_dout = pre_erosion[thread].Row((y - y_start) / 4);
-        for (size_t x = 0; x < (x1 - x0) / 4; x++) {
+        for (size_t x = 0; x < (x_end - x_start) / 4; x++) {
           row_dout[x] = (row_out[x * 4] + row_out[x * 4 + 1] +
                          row_out[x * 4 + 2] + row_out[x * 4 + 3]) *
                         0.25f;
         }
       }
     }
-    Rect from_rect(x0 % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
+    Rect from_rect(x_start % 8 == 0 ? 0 : 1, y_start % 8 == 0 ? 0 : 1,
                    rect.xsize() * 2, rect.ysize() * 2);
-    FuzzyErosion(from_rect, pre_erosion[thread], rect, &aq_map);
+    FuzzyErosion(butteraugli_target, from_rect, pre_erosion[thread], rect,
+                 &aq_map);
     for (size_t y = 0; y < rect.ysize(); ++y) {
       const float* aq_map_row = rect.ConstRow(aq_map, y);
       float* mask_row = rect.Row(mask, y);
@@ -585,15 +561,48 @@ struct AdaptiveQuantizationImpl {
   ImageF diff_buffer;
 };
 
+static void Blur1x1Masking(const FrameDimensions& frame_dim, ThreadPool* pool,
+                           ImageF* mask1x1) {
+  // Blur the mask1x1 to obtain the masking image.
+  // Before blurring it contains an image of absolute value of the
+  // Laplacian of the intensity channel.
+  static const float kFilterMask1x1[5] = {
+      static_cast<float>(0.25647067633737227),
+      static_cast<float>(0.2050056912354399075),
+      static_cast<float>(0.154082048668497307),
+      static_cast<float>(0.08149576591362004441),
+      static_cast<float>(0.0512750104812308467),
+  };
+  double sum =
+      1.0 + 4 * (kFilterMask1x1[0] + kFilterMask1x1[1] + kFilterMask1x1[2] +
+                 kFilterMask1x1[4] + 2 * kFilterMask1x1[3]);
+  if (sum < 1e-5) {
+    sum = 1e-5;
+  }
+  const float normalize = static_cast<float>(1.0 / sum);
+  const float normalize_mul = normalize;
+  WeightsSymmetric5 weights =
+      WeightsSymmetric5{{HWY_REP4(normalize)},
+                        {HWY_REP4(normalize_mul * kFilterMask1x1[0])},
+                        {HWY_REP4(normalize_mul * kFilterMask1x1[2])},
+                        {HWY_REP4(normalize_mul * kFilterMask1x1[1])},
+                        {HWY_REP4(normalize_mul * kFilterMask1x1[4])},
+                        {HWY_REP4(normalize_mul * kFilterMask1x1[3])}};
+  Rect from_rect(0, 0, 8 * frame_dim.xsize_blocks, 8 * frame_dim.ysize_blocks);
+  ImageF temp(mask1x1->xsize(), mask1x1->ysize());
+  Symmetric5(*mask1x1, from_rect, weights, pool, &temp);
+  CopyImageTo(temp, mask1x1);  // TODO: make it a swap
+}
+
 ImageF AdaptiveQuantizationMap(const float butteraugli_target,
                                const Image3F& xyb,
                                const FrameDimensions& frame_dim, float scale,
-                               ThreadPool* pool, ImageF* mask) {
-  PROFILER_ZONE("aq AdaptiveQuantMap");
-
+                               ThreadPool* pool, ImageF* mask,
+                               ImageF* mask1x1) {
   AdaptiveQuantizationImpl impl;
   impl.Init(xyb);
   *mask = ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
+  *mask1x1 = ImageF(8 * frame_dim.xsize_blocks, 8 * frame_dim.ysize_blocks);
   JXL_CHECK(RunOnPool(
       pool, 0,
       DivCeil(frame_dim.xsize_blocks, kEncTileDimInBlocks) *
@@ -614,10 +623,12 @@ ImageF AdaptiveQuantizationMap(const float butteraugli_target,
         size_t bx1 =
             std::min((tx + 1) * kEncTileDimInBlocks, frame_dim.xsize_blocks);
         Rect r(bx0, by0, bx1 - bx0, by1 - by0);
-        impl.ComputeTile(butteraugli_target, scale, xyb, r, thread, mask);
+        impl.ComputeTile(butteraugli_target, scale, xyb, r, thread, mask,
+                         mask1x1);
       },
       "AQ DiffPrecompute"));
 
+  Blur1x1Masking(frame_dim, pool, mask1x1);
   return std::move(impl).aq_map;
 }
 
@@ -633,43 +644,47 @@ namespace jxl {
 HWY_EXPORT(AdaptiveQuantizationMap);
 
 namespace {
+
 // If true, prints the quantization maps at each iteration.
-bool FLAGS_dump_quant_state = false;
-
-void DumpHeatmap(const AuxOut* aux_out, const std::string& label,
-                 const ImageF& image, float good_threshold,
-                 float bad_threshold) {
-  Image3F heatmap = CreateHeatMapImage(image, good_threshold, bad_threshold);
-  char filename[200];
-  snprintf(filename, sizeof(filename), "%s%05d", label.c_str(),
-           aux_out->num_butteraugli_iters);
-  aux_out->DumpImage(filename, heatmap);
+constexpr bool FLAGS_dump_quant_state = false;
+
+void DumpHeatmap(const CompressParams& cparams, const AuxOut* aux_out,
+                 const std::string& label, const ImageF& image,
+                 float good_threshold, float bad_threshold) {
+  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
+    Image3F heatmap = CreateHeatMapImage(image, good_threshold, bad_threshold);
+    char filename[200];
+    snprintf(filename, sizeof(filename), "%s%05d", label.c_str(),
+             aux_out->num_butteraugli_iters);
+    DumpImage(cparams, filename, heatmap);
+  }
 }
 
-void DumpHeatmaps(const AuxOut* aux_out, float ba_target,
-                  const ImageF& quant_field, const ImageF& tile_heatmap,
-                  const ImageF& bt_diffmap) {
-  if (!WantDebugOutput(aux_out)) return;
-  ImageF inv_qmap(quant_field.xsize(), quant_field.ysize());
-  for (size_t y = 0; y < quant_field.ysize(); ++y) {
-    const float* JXL_RESTRICT row_q = quant_field.ConstRow(y);
-    float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y);
-    for (size_t x = 0; x < quant_field.xsize(); ++x) {
-      row_inv_q[x] = 1.0f / row_q[x];  // never zero
+void DumpHeatmaps(const CompressParams& cparams, const AuxOut* aux_out,
+                  float ba_target, const ImageF& quant_field,
+                  const ImageF& tile_heatmap, const ImageF& bt_diffmap) {
+  if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
+    if (!WantDebugOutput(cparams)) return;
+    ImageF inv_qmap(quant_field.xsize(), quant_field.ysize());
+    for (size_t y = 0; y < quant_field.ysize(); ++y) {
+      const float* JXL_RESTRICT row_q = quant_field.ConstRow(y);
+      float* JXL_RESTRICT row_inv_q = inv_qmap.Row(y);
+      for (size_t x = 0; x < quant_field.xsize(); ++x) {
+        row_inv_q[x] = 1.0f / row_q[x];  // never zero
+      }
     }
+    DumpHeatmap(cparams, aux_out, "quant_heatmap", inv_qmap, 4.0f * ba_target,
+                6.0f * ba_target);
+    DumpHeatmap(cparams, aux_out, "tile_heatmap", tile_heatmap, ba_target,
+                1.5f * ba_target);
+    // matches heat maps produced by the command line tool.
+    DumpHeatmap(cparams, aux_out, "bt_diffmap", bt_diffmap,
+                ButteraugliFuzzyInverse(1.5), ButteraugliFuzzyInverse(0.5));
   }
-  DumpHeatmap(aux_out, "quant_heatmap", inv_qmap, 4.0f * ba_target,
-              6.0f * ba_target);
-  DumpHeatmap(aux_out, "tile_heatmap", tile_heatmap, ba_target,
-              1.5f * ba_target);
-  // matches heat maps produced by the command line tool.
-  DumpHeatmap(aux_out, "bt_diffmap", bt_diffmap, ButteraugliFuzzyInverse(1.5),
-              ButteraugliFuzzyInverse(0.5));
 }
 
 ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin,
                    const AcStrategyImage& ac_strategy) {
-  PROFILER_FUNC;
   const int tile_xsize = (distmap.xsize() + tile_size - 1) / tile_size;
   const int tile_ysize = (distmap.ysize() + tile_size - 1) / tile_size;
   ImageF tile_distmap(tile_xsize, tile_ysize);
@@ -733,9 +748,85 @@ ImageF TileDistMap(const ImageF& distmap, int tile_size, int margin,
   return tile_distmap;
 }
 
-constexpr float kDcQuantPow = 0.57f;
-static const float kDcQuant = 1.12f;
-static const float kAcQuant = 0.8294f;
+static const float kDcQuantPow = 0.83f;
+static const float kDcQuant = 1.095924047623553f;
+static const float kAcQuant = 0.7381485255235064f;
+
+// Computes the decoded image for a given set of compression parameters.
+ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
+                           const JxlCmsInterface& cms, ThreadPool* pool) {
+  std::unique_ptr<PassesDecoderState> dec_state =
+      jxl::make_unique<PassesDecoderState>();
+  JXL_CHECK(dec_state->output_encoding_info.SetFromMetadata(
+      *enc_state->shared.metadata));
+  dec_state->shared = &enc_state->shared;
+  JXL_ASSERT(opsin.ysize() % kBlockDim == 0);
+
+  const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim);
+  const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim);
+  const size_t num_groups = xsize_groups * ysize_groups;
+
+  size_t num_special_frames = enc_state->special_frames.size();
+
+  std::unique_ptr<ModularFrameEncoder> modular_frame_encoder =
+      jxl::make_unique<ModularFrameEncoder>(enc_state->shared.frame_header,
+                                            enc_state->cparams);
+  JXL_CHECK(InitializePassesEncoder(opsin, cms, pool, enc_state,
+                                    modular_frame_encoder.get(), nullptr));
+  JXL_CHECK(dec_state->Init());
+  JXL_CHECK(dec_state->InitForAC(pool));
+
+  ImageBundle decoded(&enc_state->shared.metadata->m);
+  decoded.origin = enc_state->shared.frame_header.frame_origin;
+  decoded.SetFromImage(Image3F(opsin.xsize(), opsin.ysize()),
+                       dec_state->output_encoding_info.color_encoding);
+
+  PassesDecoderState::PipelineOptions options;
+  options.use_slow_render_pipeline = false;
+  options.coalescing = false;
+  options.render_spotcolors = false;
+  options.render_noise = false;
+
+  // Same as dec_state->shared->frame_header.nonserialized_metadata->m
+  const ImageMetadata& metadata = *decoded.metadata();
+
+  JXL_CHECK(dec_state->PreparePipeline(&decoded, options));
+
+  hwy::AlignedUniquePtr<GroupDecCache[]> group_dec_caches;
+  const auto allocate_storage = [&](const size_t num_threads) -> Status {
+    JXL_RETURN_IF_ERROR(
+        dec_state->render_pipeline->PrepareForThreads(num_threads,
+                                                      /*use_group_ids=*/false));
+    group_dec_caches = hwy::MakeUniqueAlignedArray<GroupDecCache>(num_threads);
+    return true;
+  };
+  const auto process_group = [&](const uint32_t group_index,
+                                 const size_t thread) {
+    if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) {
+      ComputeSigma(dec_state->shared->BlockGroupRect(group_index),
+                   dec_state.get());
+    }
+    RenderPipelineInput input =
+        dec_state->render_pipeline->GetInputBuffers(group_index, thread);
+    JXL_CHECK(DecodeGroupForRoundtrip(
+        enc_state->coeffs, group_index, dec_state.get(),
+        &group_dec_caches[thread], thread, input, &decoded, nullptr));
+    for (size_t c = 0; c < metadata.num_extra_channels; c++) {
+      std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c);
+      FillPlane(0.0f, ri.first, ri.second);
+    }
+    input.Done();
+  };
+  JXL_CHECK(RunOnPool(pool, 0, num_groups, allocate_storage, process_group,
+                      "AQ loop"));
+
+  // Ensure we don't create any new special frames.
+  enc_state->special_frames.resize(num_special_frames);
+
+  return decoded;
+}
+
+constexpr int kMaxButteraugliIters = 4;
 
 void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
                           PassesEncoderState* enc_state,
@@ -771,12 +862,12 @@ void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
     size_t orig_xsize;
     size_t orig_ysize;
   } t(const_cast<ImageBundle&>(linear),
-      enc_state->shared.frame_header.nonserialized_metadata->xsize(),
-      enc_state->shared.frame_header.nonserialized_metadata->ysize());
+      enc_state->shared.frame_header.frame_size.xsize,
+      enc_state->shared.frame_header.frame_size.ysize);
 
   const float butteraugli_target = cparams.butteraugli_distance;
   const float original_butteraugli = cparams.original_butteraugli_distance;
-  ButteraugliParams params = cparams.ba_params;
+  ButteraugliParams params;
   params.intensity_target = linear.metadata()->IntensityTarget();
   // Hack the default intensity target value to be 80.0, the intensity
   // target of sRGB images and a more reasonable viewing default than
@@ -790,9 +881,10 @@ void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
       (comparator.GoodQualityScore() < comparator.BadQualityScore());
   const float initial_quant_dc = InitialQuantDC(butteraugli_target);
   AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
-                   &quant_field);
+                   original_butteraugli, &quant_field);
   ImageF tile_distmap;
-  ImageF initial_quant_field = CopyImage(quant_field);
+  ImageF initial_quant_field(quant_field.xsize(), quant_field.ysize());
+  CopyImageTo(quant_field, &initial_quant_field);
 
   float initial_qf_min, initial_qf_max;
   ImageMinMax(initial_quant_field, &initial_qf_min, &initial_qf_max);
@@ -806,15 +898,12 @@ void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
   JXL_ASSERT(qf_higher / qf_lower < 253);
 
   constexpr int kOriginalComparisonRound = 1;
-  int iters = cparams.max_butteraugli_iters;
-  if (iters > 7) {
-    iters = 7;
-  }
+  int iters = kMaxButteraugliIters;
   if (cparams.speed_tier != SpeedTier::kTortoise) {
     iters = 2;
   }
   for (int i = 0; i < iters + 1; ++i) {
-    if (FLAGS_dump_quant_state) {
+    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
       printf("\nQuantization field:\n");
       for (size_t y = 0; y < quant_field.ysize(); ++y) {
         for (size_t x = 0; x < quant_field.xsize(); ++x) {
@@ -825,26 +914,25 @@ void FindBestQuantization(const ImageBundle& linear, const Image3F& opsin,
     }
     quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
     ImageBundle dec_linear = RoundtripImage(opsin, enc_state, cms, pool);
-    PROFILER_ZONE("enc Butteraugli");
     float score;
     ImageF diffmap;
     JXL_CHECK(comparator.CompareWith(dec_linear, &diffmap, &score));
     if (!lower_is_better) {
       score = -score;
-      diffmap = ScaleImage(-1.0f, diffmap);
+      ScaleImage(-1.0f, &diffmap);
     }
     tile_distmap = TileDistMap(diffmap, 8 * cparams.resampling, 0,
                                enc_state->shared.ac_strategy);
-    if (WantDebugOutput(aux_out)) {
-      aux_out->DumpImage(("dec" + ToString(i)).c_str(), *dec_linear.color());
-      DumpHeatmaps(aux_out, butteraugli_target, quant_field, tile_distmap,
-                   diffmap);
+    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && WantDebugOutput(cparams)) {
+      DumpImage(cparams, ("dec" + ToString(i)).c_str(), *dec_linear.color());
+      DumpHeatmaps(cparams, aux_out, butteraugli_target, quant_field,
+                   tile_distmap, diffmap);
     }
     if (aux_out != nullptr) ++aux_out->num_butteraugli_iters;
-    if (cparams.log_search_state) {
+    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION) {
       float minval, maxval;
       ImageMinMax(quant_field, &minval, &maxval);
-      printf("\nButteraugli iter: %d/%d\n", i, cparams.max_butteraugli_iters);
+      printf("\nButteraugli iter: %d/%d\n", i, kMaxButteraugliIters);
       printf("Butteraugli distance: %f  (target = %f)\n", score,
              original_butteraugli);
       printf("quant range: %f ... %f  DC quant: %f\n", minval, maxval,
@@ -948,22 +1036,21 @@ void FindBestQuantizationMaxError(const Image3F& opsin,
   const float initial_quant_dc =
       16 * std::sqrt(0.1f / cparams.butteraugli_distance);
   AdjustQuantField(enc_state->shared.ac_strategy, Rect(quant_field),
-                   &quant_field);
+                   cparams.original_butteraugli_distance, &quant_field);
 
   const float inv_max_err[3] = {1.0f / enc_state->cparams.max_error[0],
                                 1.0f / enc_state->cparams.max_error[1],
                                 1.0f / enc_state->cparams.max_error[2]};
 
-  for (int i = 0; i < cparams.max_butteraugli_iters + 1; ++i) {
+  for (int i = 0; i < kMaxButteraugliIters + 1; ++i) {
     quantizer.SetQuantField(initial_quant_dc, quant_field, &raw_quant_field);
-    if (aux_out) {
-      aux_out->DumpXybImage(("ops" + ToString(i)).c_str(), opsin);
+    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
+      DumpXybImage(cparams, ("ops" + ToString(i)).c_str(), opsin);
     }
     ImageBundle decoded = RoundtripImage(opsin, enc_state, cms, pool);
-    if (aux_out) {
-      aux_out->DumpXybImage(("dec" + ToString(i)).c_str(), *decoded.color());
+    if (JXL_DEBUG_ADAPTIVE_QUANTIZATION && aux_out) {
+      DumpXybImage(cparams, ("dec" + ToString(i)).c_str(), *decoded.color());
     }
-
     for (size_t by = 0; by < enc_state->shared.frame_dim.ysize_blocks; by++) {
       AcStrategyRow ac_strategy_row =
           enc_state->shared.ac_strategy.ConstRow(by);
@@ -1009,10 +1096,26 @@ void FindBestQuantizationMaxError(const Image3F& opsin,
 }  // namespace
 
 void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
-                      ImageF* quant_field) {
+                      float butteraugli_target, ImageF* quant_field) {
   // Replace the whole quant_field in non-8x8 blocks with the maximum of each
   // 8x8 block.
   size_t stride = quant_field->PixelsPerRow();
+
+  // At low distances it is great to use max, but mean works better
+  // at high distances. We interpolate between them for a distance
+  // range.
+  float mean_max_mixer = 1.0f;
+  {
+    static const float kLimit = 1.54138f;
+    static const float kMul = 0.56391f;
+    static const float kMin = 0.0f;
+    if (butteraugli_target > kLimit) {
+      mean_max_mixer -= (butteraugli_target - kLimit) * kMul;
+      if (mean_max_mixer < kMin) {
+        mean_max_mixer = kMin;
+      }
+    }
+  }
   for (size_t y = 0; y < rect.ysize(); ++y) {
     AcStrategyRow ac_strategy_row = ac_strategy.ConstRow(rect, y);
     float* JXL_RESTRICT quant_row = rect.Row(quant_field, y);
@@ -1022,11 +1125,18 @@ void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
       JXL_ASSERT(x + acs.covered_blocks_x() <= quant_field->xsize());
       JXL_ASSERT(y + acs.covered_blocks_y() <= quant_field->ysize());
       float max = quant_row[x];
+      float mean = 0.0;
       for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
         for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
+          mean += quant_row[x + ix + iy * stride];
           max = std::max(quant_row[x + ix + iy * stride], max);
         }
       }
+      mean /= acs.covered_blocks_y() * acs.covered_blocks_x();
+      if (acs.covered_blocks_y() * acs.covered_blocks_x() >= 4) {
+        max *= mean_max_mixer;
+        max += (1.0f - mean_max_mixer) * mean;
+      }
       for (size_t iy = 0; iy < acs.covered_blocks_y(); iy++) {
         for (size_t ix = 0; ix < acs.covered_blocks_x(); ix++) {
           quant_row[x + ix + iy * stride] = max;
@@ -1037,7 +1147,7 @@ void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
 }
 
 float InitialQuantDC(float butteraugli_target) {
-  const float kDcMul = 2.9;  // Butteraugli target where non-linearity kicks in.
+  const float kDcMul = 0.3;  // Butteraugli target where non-linearity kicks in.
   const float butteraugli_target_dc = std::max<float>(
       0.5f * butteraugli_target,
       std::min<float>(butteraugli_target,
@@ -1052,11 +1162,11 @@ float InitialQuantDC(float butteraugli_target) {
 
 ImageF InitialQuantField(const float butteraugli_target, const Image3F& opsin,
                          const FrameDimensions& frame_dim, ThreadPool* pool,
-                         float rescale, ImageF* mask) {
-  PROFILER_FUNC;
+                         float rescale, ImageF* mask, ImageF* mask1x1) {
   const float quant_ac = kAcQuant / butteraugli_target;
   return HWY_DYNAMIC_DISPATCH(AdaptiveQuantizationMap)(
-      butteraugli_target, opsin, frame_dim, quant_ac * rescale, pool, mask);
+      butteraugli_target, opsin, frame_dim, quant_ac * rescale, pool, mask,
+      mask1x1);
 }
 
 void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin,
@@ -1065,87 +1175,12 @@ void FindBestQuantizer(const ImageBundle* linear, const Image3F& opsin,
                        AuxOut* aux_out, double rescale) {
   const CompressParams& cparams = enc_state->cparams;
   if (cparams.max_error_mode) {
-    PROFILER_ZONE("enc find best maxerr");
     FindBestQuantizationMaxError(opsin, enc_state, cms, pool, aux_out);
   } else if (cparams.speed_tier <= SpeedTier::kKitten) {
     // Normal encoding to a butteraugli score.
-    PROFILER_ZONE("enc find best2");
     FindBestQuantization(*linear, opsin, enc_state, cms, pool, aux_out);
   }
 }
 
-ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
-                           const JxlCmsInterface& cms, ThreadPool* pool) {
-  PROFILER_ZONE("enc roundtrip");
-  std::unique_ptr<PassesDecoderState> dec_state =
-      jxl::make_unique<PassesDecoderState>();
-  JXL_CHECK(dec_state->output_encoding_info.SetFromMetadata(
-      *enc_state->shared.metadata));
-  dec_state->shared = &enc_state->shared;
-  JXL_ASSERT(opsin.ysize() % kBlockDim == 0);
-
-  const size_t xsize_groups = DivCeil(opsin.xsize(), kGroupDim);
-  const size_t ysize_groups = DivCeil(opsin.ysize(), kGroupDim);
-  const size_t num_groups = xsize_groups * ysize_groups;
-
-  size_t num_special_frames = enc_state->special_frames.size();
-
-  std::unique_ptr<ModularFrameEncoder> modular_frame_encoder =
-      jxl::make_unique<ModularFrameEncoder>(enc_state->shared.frame_header,
-                                            enc_state->cparams);
-  JXL_CHECK(InitializePassesEncoder(opsin, cms, pool, enc_state,
-                                    modular_frame_encoder.get(), nullptr));
-  JXL_CHECK(dec_state->Init());
-  JXL_CHECK(dec_state->InitForAC(pool));
-
-  ImageBundle decoded(&enc_state->shared.metadata->m);
-  decoded.origin = enc_state->shared.frame_header.frame_origin;
-  decoded.SetFromImage(Image3F(opsin.xsize(), opsin.ysize()),
-                       dec_state->output_encoding_info.color_encoding);
-
-  PassesDecoderState::PipelineOptions options;
-  options.use_slow_render_pipeline = false;
-  options.coalescing = true;
-  options.render_spotcolors = false;
-
-  // Same as dec_state->shared->frame_header.nonserialized_metadata->m
-  const ImageMetadata& metadata = *decoded.metadata();
-
-  JXL_CHECK(dec_state->PreparePipeline(&decoded, options));
-
-  hwy::AlignedUniquePtr<GroupDecCache[]> group_dec_caches;
-  const auto allocate_storage = [&](const size_t num_threads) -> Status {
-    JXL_RETURN_IF_ERROR(
-        dec_state->render_pipeline->PrepareForThreads(num_threads,
-                                                      /*use_group_ids=*/false));
-    group_dec_caches = hwy::MakeUniqueAlignedArray<GroupDecCache>(num_threads);
-    return true;
-  };
-  const auto process_group = [&](const uint32_t group_index,
-                                 const size_t thread) {
-    if (dec_state->shared->frame_header.loop_filter.epf_iters > 0) {
-      ComputeSigma(dec_state->shared->BlockGroupRect(group_index),
-                   dec_state.get());
-    }
-    RenderPipelineInput input =
-        dec_state->render_pipeline->GetInputBuffers(group_index, thread);
-    JXL_CHECK(DecodeGroupForRoundtrip(
-        enc_state->coeffs, group_index, dec_state.get(),
-        &group_dec_caches[thread], thread, input, &decoded, nullptr));
-    for (size_t c = 0; c < metadata.num_extra_channels; c++) {
-      std::pair<ImageF*, Rect> ri = input.GetBuffer(3 + c);
-      FillPlane(0.0f, ri.first, ri.second);
-    }
-    input.Done();
-  };
-  JXL_CHECK(RunOnPool(pool, 0, num_groups, allocate_storage, process_group,
-                      "AQ loop"));
-
-  // Ensure we don't create any new special frames.
-  enc_state->special_frames.resize(num_special_frames);
-
-  return decoded;
-}
-
 }  // namespace jxl
 #endif  // HWY_ONCE
index 724353b..6b5fc32 100644 (file)
@@ -9,10 +9,7 @@
 #include <stddef.h>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/frame_header.h"
 
 namespace jxl {
 
-// Computes the decoded image for a given set of compression parameters. Mainly
-// used in the FindBestQuantization loops and in some tests.
-// TODO(veluca): this doesn't seem the best possible file for this function.
-ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
-                           const JxlCmsInterface& cms, ThreadPool* pool);
+struct AuxOut;
 
 // Returns an image subsampled by kBlockDim in each direction. If the value
 // at pixel (x,y) in the returned image is greater than 1.0, it means that
@@ -45,12 +38,13 @@ ImageBundle RoundtripImage(const Image3F& opsin, PassesEncoderState* enc_state,
 // can later be used to make better decisions about ac strategy.
 ImageF InitialQuantField(float butteraugli_target, const Image3F& opsin,
                          const FrameDimensions& frame_dim, ThreadPool* pool,
-                         float rescale, ImageF* initial_quant_mask);
+                         float rescale, ImageF* initial_quant_mask,
+                         ImageF* initial_quant_mask1x1);
 
 float InitialQuantDC(float butteraugli_target);
 
 void AdjustQuantField(const AcStrategyImage& ac_strategy, const Rect& rect,
-                      ImageF* quant_field);
+                      float butteraugli_target, ImageF* quant_field);
 
 // Returns a quantizer that uses an adjusted version of the provided
 // quant_field. Also computes the dequant_map corresponding to the given
index 81ff836..6b2f8b3 100644 (file)
 #include <vector>
 
 #include "lib/jxl/ans_common.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/fast_math-inl.h"
 #include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_cluster.h"
 #include "lib/jxl/enc_context_map.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/enc_huffman.h"
-#include "lib/jxl/fast_math-inl.h"
 #include "lib/jxl/fields.h"
 
 namespace jxl {
 
 namespace {
 
-bool ans_fuzzer_friendly_ = false;
+#if !JXL_IS_DEBUG_BUILD
+constexpr
+#endif
+    bool ans_fuzzer_friendly_ = false;
 
 static const int kMaxNumSymbolsForSmallCode = 4;
 
@@ -78,6 +81,8 @@ float EstimateDataBits(const ANSHistBin* histogram, const ANSHistBin* counts,
     }
   }
   if (total_histogram > 0) {
+    // Used only in assert.
+    (void)total_counts;
     JXL_ASSERT(total_counts == ANS_TAB_SIZE);
   }
   return sum;
@@ -448,7 +453,7 @@ size_t BuildAndStoreANSEncodingData(
             &tmp_writer, 8 * alphabet_size + 8);  // safe upper bound
         BuildAndStoreHuffmanTree(histo.data(), alphabet_size, depths.data(),
                                  bits.data(), &tmp_writer);
-        ReclaimAndCharge(&tmp_writer, &allotment, 0, /*aux_out=*/nullptr);
+        allotment.ReclaimAndCharge(&tmp_writer, 0, /*aux_out=*/nullptr);
         cost = tmp_writer.BitsWritten();
       } else {
         size_t start = writer->BitsWritten();
@@ -785,7 +790,7 @@ class HistogramBuilder {
           num_symbol, log_alpha_size, use_prefix_code,
           codes->encoding_info.back().data(), writer);
       allotment.FinishedHistogram(writer);
-      ReclaimAndCharge(writer, &allotment, layer, aux_out);
+      allotment.ReclaimAndCharge(writer, layer, aux_out);
     }
     return cost;
   }
@@ -1461,7 +1466,7 @@ void ApplyLZ77(const HistogramParams& params, size_t num_contexts,
   } else if (params.lz77_method == HistogramParams::LZ77Method::kOptimal) {
     ApplyLZ77_Optimal(params, num_contexts, tokens, lz77, tokens_lz77);
   } else {
-    JXL_ABORT("Not implemented");
+    JXL_UNREACHABLE("Not implemented");
   }
 }
 }  // namespace
@@ -1572,7 +1577,7 @@ size_t BuildAndEncodeHistograms(const HistogramParams& params,
                                                   context_map, use_prefix_code,
                                                   writer, layer, aux_out);
   allotment.FinishedHistogram(writer);
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
 
   if (aux_out != nullptr) {
     aux_out->layers[layer].num_clustered_histograms +=
@@ -1672,7 +1677,7 @@ void WriteTokens(const std::vector<Token>& tokens,
                  size_t layer, AuxOut* aux_out) {
   BitWriter::Allotment allotment(writer, 32 * tokens.size() + 32 * 1024 * 4);
   size_t num_extra_bits = WriteTokens(tokens, codes, context_map, writer);
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
   if (aux_out != nullptr) {
     aux_out->layers[layer].extra_bits += num_extra_bits;
   }
index 2f720f5..bb4bdd9 100644 (file)
@@ -9,28 +9,19 @@
 // Library to encode the ANS population counts to the bit-stream and encode
 // symbols based on the respective distributions.
 
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <algorithm>
+#include <cstddef>
+#include <cstdint>
 #include <vector>
 
-#include "lib/jxl/ans_common.h"
 #include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/enc_ans_params.h"
 #include "lib/jxl/enc_bit_writer.h"
-#include "lib/jxl/huffman_table.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 #define USE_MULT_BY_RECIPROCAL
 
 // precision must be equal to:  #bits(state_) + #bits(freq)
index 9030430..a83d2dc 100644 (file)
@@ -20,7 +20,6 @@
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/enc_adaptive_quantization.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image.h"
index ae9d399..aabe71f 100644 (file)
@@ -6,17 +6,17 @@
 #ifndef LIB_JXL_ENC_AR_CONTROL_FIELD_H_
 #define LIB_JXL_ENC_AR_CONTROL_FIELD_H_
 
-#include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/enc_cache.h"
+#include <stddef.h>
+
+#include <vector>
+
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/quant_weights.h"
 
 namespace jxl {
 
+struct PassesEncoderState;
+
 struct ArControlFieldHeuristics {
   struct TempImages {
     void InitOnce() {
diff --git a/lib/jxl/enc_aux_out.cc b/lib/jxl/enc_aux_out.cc
new file mode 100644 (file)
index 0000000..3320106
--- /dev/null
@@ -0,0 +1,126 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_aux_out.h"
+
+#include <inttypes.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <numeric>  // accumulate
+#include <sstream>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+const char* LayerName(size_t layer) {
+  switch (layer) {
+    case kLayerHeader:
+      return "Headers";
+    case kLayerTOC:
+      return "TOC";
+    case kLayerDictionary:
+      return "Patches";
+    case kLayerSplines:
+      return "Splines";
+    case kLayerNoise:
+      return "Noise";
+    case kLayerQuant:
+      return "Quantizer";
+    case kLayerModularTree:
+      return "ModularTree";
+    case kLayerModularGlobal:
+      return "ModularGlobal";
+    case kLayerDC:
+      return "DC";
+    case kLayerModularDcGroup:
+      return "ModularDcGroup";
+    case kLayerControlFields:
+      return "ControlFields";
+    case kLayerOrder:
+      return "CoeffOrder";
+    case kLayerAC:
+      return "ACHistograms";
+    case kLayerACTokens:
+      return "ACTokens";
+    case kLayerModularAcGroup:
+      return "ModularAcGroup";
+    default:
+      JXL_UNREACHABLE("Invalid layer %d\n", static_cast<int>(layer));
+  }
+}
+
+void AuxOut::LayerTotals::Print(size_t num_inputs) const {
+  if (JXL_DEBUG_V_LEVEL > 0) {
+    printf("%10" PRId64, static_cast<int64_t>(total_bits));
+    if (histogram_bits != 0) {
+      printf("   [c/i:%6.2f | hst:%8" PRId64 " | ex:%8" PRId64
+             " | h+c+e:%12.3f",
+             num_clustered_histograms * 1.0 / num_inputs,
+             static_cast<int64_t>(histogram_bits >> 3),
+             static_cast<int64_t>(extra_bits >> 3),
+             (histogram_bits + clustered_entropy + extra_bits) / 8.0);
+      printf("]");
+    }
+    printf("\n");
+  }
+}
+
+void AuxOut::Assimilate(const AuxOut& victim) {
+  for (size_t i = 0; i < layers.size(); ++i) {
+    layers[i].Assimilate(victim.layers[i]);
+  }
+  num_blocks += victim.num_blocks;
+  num_small_blocks += victim.num_small_blocks;
+  num_dct4x8_blocks += victim.num_dct4x8_blocks;
+  num_afv_blocks += victim.num_afv_blocks;
+  num_dct8_blocks += victim.num_dct8_blocks;
+  num_dct8x16_blocks += victim.num_dct8x16_blocks;
+  num_dct8x32_blocks += victim.num_dct8x32_blocks;
+  num_dct16_blocks += victim.num_dct16_blocks;
+  num_dct16x32_blocks += victim.num_dct16x32_blocks;
+  num_dct32_blocks += victim.num_dct32_blocks;
+  num_dct32x64_blocks += victim.num_dct32x64_blocks;
+  num_dct64_blocks += victim.num_dct64_blocks;
+  num_butteraugli_iters += victim.num_butteraugli_iters;
+}
+
+void AuxOut::Print(size_t num_inputs) const {
+  if (JXL_DEBUG_V_LEVEL > 0) {
+    if (num_inputs == 0) return;
+
+    LayerTotals all_layers;
+    for (size_t i = 0; i < layers.size(); ++i) {
+      all_layers.Assimilate(layers[i]);
+    }
+
+    printf("Average butteraugli iters: %10.2f\n",
+           num_butteraugli_iters * 1.0 / num_inputs);
+
+    for (size_t i = 0; i < layers.size(); ++i) {
+      if (layers[i].total_bits != 0) {
+        printf("Total layer bits %-10s\t", LayerName(i));
+        printf("%10f%%", 100.0 * layers[i].total_bits / all_layers.total_bits);
+        layers[i].Print(num_inputs);
+      }
+    }
+    printf("Total image size           ");
+    all_layers.Print(num_inputs);
+
+    size_t total_blocks = 0;
+    size_t total_positions = 0;
+    if (total_blocks != 0 && total_positions != 0) {
+      printf("\n\t\t  Blocks\t\tPositions\t\t\tBlocks/Position\n");
+      printf(" Total:\t\t    %7" PRIuS "\t\t     %7" PRIuS " \t\t\t%10f%%\n\n",
+             total_blocks, total_positions,
+             100.0 * total_blocks / total_positions);
+    }
+  }
+}
+
+}  // namespace jxl
diff --git a/lib/jxl/enc_aux_out.h b/lib/jxl/enc_aux_out.h
new file mode 100644 (file)
index 0000000..545711a
--- /dev/null
@@ -0,0 +1,102 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_AUX_OUT_H_
+#define LIB_JXL_AUX_OUT_H_
+
+// Optional output information for debugging and analyzing size usage.
+
+#include <stddef.h>
+
+#include <array>
+#include <functional>
+#include <string>
+
+namespace jxl {
+
+struct ColorEncoding;
+
+// For LayerName and AuxOut::layers[] index. Order does not matter.
+enum {
+  kLayerHeader = 0,
+  kLayerTOC,
+  kLayerDictionary,
+  kLayerSplines,
+  kLayerNoise,
+  kLayerQuant,
+  kLayerModularTree,
+  kLayerModularGlobal,
+  kLayerDC,
+  kLayerModularDcGroup,
+  kLayerControlFields,
+  kLayerOrder,
+  kLayerAC,
+  kLayerACTokens,
+  kLayerModularAcGroup,
+  kNumImageLayers
+};
+
+const char* LayerName(size_t layer);
+
+// Statistics gathered during compression or decompression.
+struct AuxOut {
+ private:
+  struct LayerTotals {
+    void Assimilate(const LayerTotals& victim) {
+      num_clustered_histograms += victim.num_clustered_histograms;
+      histogram_bits += victim.histogram_bits;
+      extra_bits += victim.extra_bits;
+      total_bits += victim.total_bits;
+      clustered_entropy += victim.clustered_entropy;
+    }
+    void Print(size_t num_inputs) const;
+
+    size_t num_clustered_histograms = 0;
+    size_t extra_bits = 0;
+
+    // Set via BitsWritten below
+    size_t histogram_bits = 0;
+    size_t total_bits = 0;
+
+    double clustered_entropy = 0.0;
+  };
+
+ public:
+  AuxOut() = default;
+  AuxOut(const AuxOut&) = default;
+
+  void Assimilate(const AuxOut& victim);
+
+  void Print(size_t num_inputs) const;
+
+  size_t TotalBits() const {
+    size_t total = 0;
+    for (const auto& layer : layers) {
+      total += layer.total_bits;
+    }
+    return total;
+  }
+
+  std::array<LayerTotals, kNumImageLayers> layers;
+  size_t num_blocks = 0;
+
+  // Number of blocks that use larger DCT (set by ac_strategy).
+  size_t num_small_blocks = 0;
+  size_t num_dct4x8_blocks = 0;
+  size_t num_afv_blocks = 0;
+  size_t num_dct8_blocks = 0;
+  size_t num_dct8x16_blocks = 0;
+  size_t num_dct8x32_blocks = 0;
+  size_t num_dct16_blocks = 0;
+  size_t num_dct16x32_blocks = 0;
+  size_t num_dct32_blocks = 0;
+  size_t num_dct32x64_blocks = 0;
+  size_t num_dct64_blocks = 0;
+
+  int num_butteraugli_iters = 0;
+};
+}  // namespace jxl
+
+#endif  // LIB_JXL_AUX_OUT_H_
index 7bac7b9..6e8b658 100644 (file)
@@ -8,7 +8,9 @@
 #include <string.h>  // memcpy
 
 #include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_aux_out.h"
 
 namespace jxl {
 
@@ -26,7 +28,7 @@ BitWriter::Allotment::Allotment(BitWriter* JXL_RESTRICT writer, size_t max_bits)
 BitWriter::Allotment::~Allotment() {
   if (!called_) {
     // Not calling is a bug - unused storage will not be reclaimed.
-    JXL_ABORT("Did not call Allotment::ReclaimUnused");
+    JXL_UNREACHABLE("Did not call Allotment::ReclaimUnused");
   }
 }
 
@@ -38,6 +40,25 @@ void BitWriter::Allotment::FinishedHistogram(BitWriter* JXL_RESTRICT writer) {
   histogram_bits_ = writer->BitsWritten() - prev_bits_written_;
 }
 
+void BitWriter::Allotment::ReclaimAndCharge(BitWriter* JXL_RESTRICT writer,
+                                            size_t layer,
+                                            AuxOut* JXL_RESTRICT aux_out) {
+  size_t used_bits = 0, unused_bits = 0;
+  PrivateReclaim(writer, &used_bits, &unused_bits);
+
+#if 0
+  printf("Layer %s bits: max %" PRIuS " used %" PRIuS " unused %" PRIuS "\n",
+         LayerName(layer), MaxBits(), used_bits, unused_bits);
+#endif
+
+  // This may be a nested call with aux_out == null. Whenever we know that
+  // aux_out is null, we can call ReclaimUnused directly.
+  if (aux_out != nullptr) {
+    aux_out->layers[layer].total_bits += used_bits;
+    aux_out->layers[layer].histogram_bits += HistogramBits();
+  }
+}
+
 void BitWriter::Allotment::PrivateReclaim(BitWriter* JXL_RESTRICT writer,
                                           size_t* JXL_RESTRICT used_bits,
                                           size_t* JXL_RESTRICT unused_bits) {
@@ -64,7 +85,7 @@ void BitWriter::Allotment::PrivateReclaim(BitWriter* JXL_RESTRICT writer,
 }
 
 void BitWriter::AppendByteAligned(const Span<const uint8_t>& span) {
-  if (!span.size()) return;
+  if (span.empty()) return;
   storage_.resize(storage_.size() + span.size() + 1);  // extra zero padding
 
   // Concatenate by copying bytes because both source and destination are bytes.
index 4cac8df..7ed2db5 100644 (file)
 #include <utility>
 #include <vector>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/padded_bytes.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 struct BitWriter {
   // Upper bound on `n_bits` in each call to Write. We shift a 64-bit word by
   // 7 bits (max already valid bits in the last byte) and at least 1 bit is
@@ -43,7 +45,7 @@ struct BitWriter {
   Span<const uint8_t> GetSpan() const {
     // Callers must ensure byte alignment to avoid uninitialized bits.
     JXL_ASSERT(bits_written_ % kBitsPerByte == 0);
-    return Span<const uint8_t>(storage_.data(), bits_written_ / kBitsPerByte);
+    return Bytes(storage_.data(), bits_written_ / kBitsPerByte);
   }
 
   // Example usage: bytes = std::move(writer).TakeBytes(); Useful for the
@@ -84,13 +86,14 @@ struct BitWriter {
       return histogram_bits_;
     }
 
-    // Do not call directly - use ::ReclaimAndCharge instead, which ensures
-    // the bits are charged to a layer.
+    void ReclaimAndCharge(BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* JXL_RESTRICT aux_out);
+
+   private:
     void PrivateReclaim(BitWriter* JXL_RESTRICT writer,
                         size_t* JXL_RESTRICT used_bits,
                         size_t* JXL_RESTRICT unused_bits);
 
-   private:
     size_t prev_bits_written_;
     const size_t max_bits_;
     size_t histogram_bits_ = 0;
index e79c4b5..3086e9e 100644 (file)
@@ -8,7 +8,6 @@
 #include <algorithm>
 #include <vector>
 
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/enc_image_bundle.h"
 
 namespace jxl {
@@ -75,22 +74,24 @@ float JxlButteraugliComparator::BadQualityScore() const {
 float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1,
                           const ButteraugliParams& params,
                           const JxlCmsInterface& cms, ImageF* distmap,
-                          ThreadPool* pool) {
+                          ThreadPool* pool, bool ignore_alpha) {
   JxlButteraugliComparator comparator(params, cms);
-  return ComputeScore(rgb0, rgb1, &comparator, cms, distmap, pool);
+  return ComputeScore(rgb0, rgb1, &comparator, cms, distmap, pool,
+                      ignore_alpha);
 }
 
-float ButteraugliDistance(const CodecInOut& rgb0, const CodecInOut& rgb1,
+float ButteraugliDistance(const std::vector<ImageBundle>& frames0,
+                          const std::vector<ImageBundle>& frames1,
                           const ButteraugliParams& params,
                           const JxlCmsInterface& cms, ImageF* distmap,
                           ThreadPool* pool) {
   JxlButteraugliComparator comparator(params, cms);
-  JXL_ASSERT(rgb0.frames.size() == rgb1.frames.size());
+  JXL_ASSERT(frames0.size() == frames1.size());
   float max_dist = 0.0f;
-  for (size_t i = 0; i < rgb0.frames.size(); ++i) {
-    max_dist =
-        std::max(max_dist, ComputeScore(rgb0.frames[i], rgb1.frames[i],
-                                        &comparator, cms, distmap, pool));
+  for (size_t i = 0; i < frames0.size(); ++i) {
+    max_dist = std::max(
+        max_dist,
+        ComputeScore(frames0[i], frames1[i], &comparator, cms, distmap, pool));
   }
   return max_dist;
 }
index 6d0751c..28d9faa 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
 #define LIB_JXL_ENC_BUTTERAUGLI_COMPARATOR_H_
 
+#include <jxl/cms_interface.h>
 #include <stddef.h>
 
 #include <memory>
@@ -13,7 +14,6 @@
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/butteraugli/butteraugli.h"
-#include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/enc_comparator.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
@@ -46,9 +46,11 @@ class JxlButteraugliComparator : public Comparator {
 float ButteraugliDistance(const ImageBundle& rgb0, const ImageBundle& rgb1,
                           const ButteraugliParams& params,
                           const JxlCmsInterface& cms, ImageF* distmap = nullptr,
-                          ThreadPool* pool = nullptr);
+                          ThreadPool* pool = nullptr,
+                          bool ignore_alpha = false);
 
-float ButteraugliDistance(const CodecInOut& rgb0, const CodecInOut& rgb1,
+float ButteraugliDistance(const std::vector<ImageBundle>& frames0,
+                          const std::vector<ImageBundle>& frames1,
                           const ButteraugliParams& params,
                           const JxlCmsInterface& cms, ImageF* distmap = nullptr,
                           ThreadPool* pool = nullptr);
index a1f2a08..635df5a 100644 (file)
 #include <type_traits>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/compressed_dc.h"
 #include "lib/jxl/dct_scales.h"
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_frame.h"
 #include "lib/jxl/enc_group.h"
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/enc_quant_weights.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
@@ -39,8 +38,6 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
                                ThreadPool* pool, PassesEncoderState* enc_state,
                                ModularFrameEncoder* modular_frame_encoder,
                                AuxOut* aux_out) {
-  PROFILER_FUNC;
-
   PassesSharedState& JXL_RESTRICT shared = enc_state->shared;
 
   enc_state->histogram_idx.resize(shared.frame_dim.num_groups);
@@ -93,10 +90,12 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
     // and kModular for the smallest DC (first in the bitstream)
     if (cparams.progressive_dc == 0) {
       cparams.modular_mode = true;
-      // TODO(jon): tweak mapping from image dist to dist for modular DC
+      cparams.speed_tier =
+          SpeedTier(std::max(static_cast<int>(SpeedTier::kTortoise),
+                             static_cast<int>(cparams.speed_tier) - 1));
       cparams.butteraugli_distance =
           std::max(kMinButteraugliDistance,
-                   enc_state->cparams.butteraugli_distance * 0.03f);
+                   enc_state->cparams.butteraugli_distance * 0.02f);
     } else {
       cparams.max_error_mode = true;
       for (size_t c = 0; c < 3; c++) {
@@ -114,9 +113,10 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
         std::move(dc),
         ColorEncoding::LinearSRGB(shared.metadata->m.color_encoding.IsGray()));
     if (!ib.metadata()->extra_channel_info.empty()) {
-      // Add dummy extra channels to the patch image: dc_level frames do not yet
-      // support extra channels, but the codec expects that the amount of extra
-      // channels in frames matches that in the metadata of the codestream.
+      // Add placeholder extra channels to the patch image: dc_level frames do
+      // not yet support extra channels, but the codec expects that the amount
+      // of extra channels in frames matches that in the metadata of the
+      // codestream.
       std::vector<ImageF> extra_channels;
       extra_channels.reserve(ib.metadata()->extra_channel_info.size());
       for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) {
@@ -139,9 +139,6 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
     dc_frame_info.ib_needs_color_transform = false;
     dc_frame_info.save_before_color_transform = true;  // Implicitly true
     AuxOut dc_aux_out;
-    if (aux_out) {
-      dc_aux_out.debug_prefix = aux_out->debug_prefix;
-    }
     JXL_CHECK(EncodeFrame(cparams, dc_frame_info, shared.metadata, ib,
                           state.get(), cms, pool, special_frame.get(),
                           aux_out ? &dc_aux_out : nullptr));
@@ -171,17 +168,17 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
     // dc_frame_info.dc_level = shared.frame_header.dc_level + 1, and
     // dc_frame_info.dc_level is used by EncodeFrame. However, if EncodeFrame
     // outputs multiple frames, this assumption could be wrong.
-    shared.dc_storage =
-        CopyImage(dec_state->shared->dc_frames[shared.frame_header.dc_level]);
+    const Image3F& dc_frame =
+        dec_state->shared->dc_frames[shared.frame_header.dc_level];
+    shared.dc_storage = Image3F(dc_frame.xsize(), dc_frame.ysize());
+    CopyImageTo(dc_frame, &shared.dc_storage);
     ZeroFillImage(&shared.quant_dc);
     shared.dc = &shared.dc_storage;
     JXL_CHECK(encoded_size == 0);
   } else {
     auto compute_dc_coeffs = [&](int group_index, int /* thread */) {
       modular_frame_encoder->AddVarDCTDC(
-          dc, group_index,
-          enc_state->cparams.butteraugli_distance >= 2.0f &&
-              enc_state->cparams.speed_tier < SpeedTier::kFalcon,
+          dc, group_index, enc_state->cparams.speed_tier < SpeedTier::kFalcon,
           enc_state, /*jpeg_transcode=*/false);
     };
     JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, shared.frame_dim.num_dc_groups,
@@ -200,16 +197,10 @@ Status InitializePassesEncoder(const Image3F& opsin, const JxlCmsInterface& cms,
                                 ThreadPool::NoInit, compute_ac_meta,
                                 "Compute AC Metadata"));
 
-  if (aux_out != nullptr) {
-    aux_out->InspectImage3F("compressed_image:InitializeFrameEncCache:dc_dec",
-                            shared.dc_storage);
-  }
   return true;
 }
 
 void EncCache::InitOnce() {
-  PROFILER_FUNC;
-
   if (num_nzeroes.xsize() == 0) {
     num_nzeroes = Image3I(kGroupDimInBlocks, kGroupDimInBlocks);
   }
index 04dff0b..52114f2 100644 (file)
 #include <vector>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/enc_ans.h"
 #include "lib/jxl/enc_heuristics.h"
 #include "lib/jxl/enc_params.h"
+#include "lib/jxl/enc_progressive_split.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/passes_state.h"
-#include "lib/jxl/progressive_split.h"
 #include "lib/jxl/quant_weights.h"
 #include "lib/jxl/quantizer.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 // Contains encoder state.
 struct PassesEncoderState {
   PassesSharedState shared;
 
   ImageF initial_quant_field;    // Invalid in Falcon mode.
   ImageF initial_quant_masking;  // Invalid in Falcon mode.
+  ImageF initial_quant_masking1x1;  // Invalid in Falcon mode.
 
   // Per-pass DCT coefficients for the image. One row per group.
   std::vector<std::unique_ptr<ACImage>> coeffs;
index 4f0798e..fa0d234 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/bits.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_transforms-inl.h"
 #include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/modular/encoding/encoding.h"
 #include "lib/jxl/quantizer.h"
+#include "lib/jxl/simd_util.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -114,6 +115,7 @@ struct CFLFunction {
   float distance_mul;
 };
 
+// Chroma-from-luma search, values_m will have luma -- and values_s chroma.
 int32_t FindBestMultiplier(const float* values_m, const float* values_s,
                            size_t num, float base, float distance_mul,
                            bool fast) {
@@ -139,7 +141,7 @@ int32_t FindBestMultiplier(const float* values_m, const float* values_s,
     x = -GetLane(SumOfLanes(df, cb)) /
         (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
   } else {
-    constexpr float eps = 1;
+    constexpr float eps = 100;
     constexpr float kClamp = 20.0f;
     CFLFunction fn(values_m, values_s, num, base, distance_mul);
     x = 0;
@@ -150,11 +152,26 @@ int32_t FindBestMultiplier(const float* values_m, const float* values_s,
       float dfpeps, dfmeps;
       float df = fn.Compute(x, eps, &dfpeps, &dfmeps);
       float ddf = (dfpeps - dfmeps) / (2 * eps);
-      float step = df / ddf;
+      float kExperimentalInsignificantStabilizer = 0.85;
+      float step = df / (ddf + kExperimentalInsignificantStabilizer);
       x -= std::min(kClamp, std::max(-kClamp, step));
       if (std::abs(step) < 3e-3) break;
     }
   }
+  // CFL seems to be tricky for larger transforms for HF components
+  // close to zero. This heuristic brings the solutions closer to zero
+  // and reduces red-green oscillations. A better approach would
+  // look into variance of the multiplier within separate (e.g. 8x8)
+  // areas and only apply this heuristic where there is a high variance.
+  // This would give about 1 % more compression density.
+  float towards_zero = 2.6;
+  if (x >= towards_zero) {
+    x -= towards_zero;
+  } else if (x <= -towards_zero) {
+    x += towards_zero;
+  } else {
+    x = 0;
+  }
   return std::max(-128.0f, std::min(127.0f, roundf(x)));
 }
 
@@ -185,17 +202,20 @@ void ComputeDC(const ImageF& dc_values, bool fast, int32_t* dc_x,
   *dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f,
                              kDistanceMultiplierDC, fast);
   *dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(),
-                             kYToBRatio, kDistanceMultiplierDC, fast);
+                             jxl::cms::kYToBRatio, kDistanceMultiplierDC, fast);
 }
 
 void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
-                 const AcStrategyImage* ac_strategy, const Quantizer* quantizer,
+                 const AcStrategyImage* ac_strategy,
+                 const ImageI* raw_quant_field, const Quantizer* quantizer,
                  const Rect& r, bool fast, bool use_dct8, ImageSB* map_x,
                  ImageSB* map_b, ImageF* dc_values, float* mem) {
   static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
                 "Invalid color tile dim");
   size_t xsize_blocks = opsin.xsize() / kBlockDim;
-  constexpr float kDistanceMultiplierAC = 1e-3f;
+  constexpr float kDistanceMultiplierAC = 1e-9f;
+  const size_t dct_scratch_size =
+      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
 
   const size_t y0 = r.y0();
   const size_t x0 = r.x0();
@@ -222,8 +242,10 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
   float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
   float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
   float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim;
-  JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea ==
-              block_y + CfLHeuristics::kItemsPerThread);
+  float* scratch_space_end =
+      scratch_space + 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size;
+  JXL_DASSERT(scratch_space_end == block_y + CfLHeuristics::ItemsPerThread());
+  (void)scratch_space_end;
 
   // Small (~256 bytes each)
   HWY_ALIGN_MAX float
@@ -259,9 +281,6 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
           dequant.InvMatrix(acs.Strategy(), 0);
       const float* const JXL_RESTRICT qm_b =
           dequant.InvMatrix(acs.Strategy(), 2);
-      // Why does a constant seem to work better than
-      // raw_quant_field->Row(y)[x] ?
-      float q = use_dct8 ? 1 : quantizer->Scale() * 400.0f;
       float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
       float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
 
@@ -300,6 +319,14 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
           block_b[cx * kBlockDim * iy + ix] = 0;
         }
       }
+      // Unclear why this is like it is. (This works slightly better
+      // than the previous approach which was also a hack.)
+      const float qq =
+          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
+      // Experimentally values 128-130 seem best -- I don't know why we
+      // need this multiplier.
+      const float kStrangeMultiplier = 128;
+      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
       const auto qv = Set(df, q);
       for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
         const auto b_y = Load(df, block_y + i);
@@ -318,8 +345,9 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
   JXL_CHECK(num_ac % Lanes(df) == 0);
   row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
                                      kDistanceMultiplierAC, fast);
-  row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio,
-                                     kDistanceMultiplierAC, fast);
+  row_out_b[tx] =
+      FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio,
+                         kDistanceMultiplierAC, fast);
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -344,12 +372,14 @@ void CfLHeuristics::Init(const Image3F& opsin) {
 void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
                                 const DequantMatrices& dequant,
                                 const AcStrategyImage* ac_strategy,
+                                const ImageI* raw_quant_field,
                                 const Quantizer* quantizer, bool fast,
                                 size_t thread, ColorCorrelationMap* cmap) {
   bool use_dct8 = ac_strategy == nullptr;
   HWY_DYNAMIC_DISPATCH(ComputeTile)
-  (opsin, dequant, ac_strategy, quantizer, r, fast, use_dct8, &cmap->ytox_map,
-   &cmap->ytob_map, &dc_values, mem.get() + thread * kItemsPerThread);
+  (opsin, dequant, ac_strategy, raw_quant_field, quantizer, r, fast, use_dct8,
+   &cmap->ytox_map, &cmap->ytob_map, &dc_values,
+   mem.get() + thread * ItemsPerThread());
 }
 
 void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) {
@@ -370,9 +400,10 @@ void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
 
   BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32);
   if (ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor &&
-      base_correlation_x == 0.0f && base_correlation_b == kYToBRatio) {
+      base_correlation_x == 0.0f &&
+      base_correlation_b == jxl::cms::kYToBRatio) {
     writer->Write(1, 1);
-    ReclaimAndCharge(writer, &allotment, layer, aux_out);
+    allotment.ReclaimAndCharge(writer, layer, aux_out);
     return;
   }
   writer->Write(1, 0);
@@ -381,7 +412,7 @@ void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
   JXL_CHECK(F16Coder::Write(base_correlation_b, writer));
   writer->Write(kBitsPerByte, ytox_dc - std::numeric_limits<int8_t>::min());
   writer->Write(kBitsPerByte, ytob_dc - std::numeric_limits<int8_t>::min());
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
 }
 
 }  // namespace jxl
index a097774..30487c6 100644 (file)
@@ -9,31 +9,21 @@
 // Chroma-from-luma, computed using heuristics to determine the best linear
 // model for the X and B channels from the Y channel.
 
-#include <stddef.h>
-#include <stdint.h>
+#include <cstddef>
+#include <hwy/aligned_allocator.h>
 
-#include <vector>
-
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/status.h"
+#include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/dec_ans.h"
-#include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_ans.h"
 #include "lib/jxl/enc_bit_writer.h"
-#include "lib/jxl/entropy_coder.h"
-#include "lib/jxl/field_encodings.h"
-#include "lib/jxl/fields.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/opsin_params.h"
 #include "lib/jxl/quant_weights.h"
+#include "lib/jxl/simd_util.h"
 
 namespace jxl {
 
+struct AuxOut;
+class Quantizer;
+
 void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
                                  size_t layer, AuxOut* aux_out);
 
@@ -41,14 +31,14 @@ struct CfLHeuristics {
   void Init(const Image3F& opsin);
 
   void PrepareForThreads(size_t num_threads) {
-    mem = hwy::AllocateAligned<float>(num_threads * kItemsPerThread);
+    mem = hwy::AllocateAligned<float>(num_threads * ItemsPerThread());
   }
 
   void ComputeTile(const Rect& r, const Image3F& opsin,
                    const DequantMatrices& dequant,
                    const AcStrategyImage* ac_strategy,
-                   const Quantizer* quantizer, bool fast, size_t thread,
-                   ColorCorrelationMap* cmap);
+                   const ImageI* raw_quant_field, const Quantizer* quantizer,
+                   bool fast, size_t thread, ColorCorrelationMap* cmap);
 
   void ComputeDC(bool fast, ColorCorrelationMap* cmap);
 
@@ -56,10 +46,14 @@ struct CfLHeuristics {
   hwy::AlignedFreeUniquePtr<float[]> mem;
 
   // Working set is too large for stack; allocate dynamically.
-  constexpr static size_t kItemsPerThread =
-      AcStrategy::kMaxCoeffArea * 3        // Blocks
-      + kColorTileDim * kColorTileDim * 4  // AC coeff storage
-      + AcStrategy::kMaxCoeffArea * 2;     // Scratch space
+  static size_t ItemsPerThread() {
+    const size_t dct_scratch_size =
+        3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
+    return AcStrategy::kMaxCoeffArea * 3        // Blocks
+           + kColorTileDim * kColorTileDim * 4  // AC coeff storage
+           + AcStrategy::kMaxCoeffArea * 2      // Scratch space
+           + dct_scratch_size;
+  }
 };
 
 }  // namespace jxl
index c79b3ac..a69b207 100644 (file)
@@ -20,8 +20,7 @@
 #include <hwy/highway.h>
 
 #include "lib/jxl/ac_context.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/base/fast_math-inl.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -86,7 +85,6 @@ float HistogramDistance(const Histogram& a, const Histogram& b) {
 void FastClusterHistograms(const std::vector<Histogram>& in,
                            size_t max_histograms, std::vector<Histogram>* out,
                            std::vector<uint32_t>* histogram_symbols) {
-  PROFILER_FUNC;
   out->clear();
   out->reserve(max_histograms);
   histogram_symbols->clear();
index a06783f..4b062e8 100644 (file)
 namespace jxl {
 
 struct Histogram {
-  Histogram() { total_count_ = 0; }
+  Histogram() {
+    total_count_ = 0;
+    entropy_ = 0.0;
+  }
   void Clear() {
     data_.clear();
     total_count_ = 0;
index 8d75cc0..e8931a1 100644 (file)
@@ -6,13 +6,10 @@
 #include <stdint.h>
 
 #include <algorithm>
+#include <hwy/aligned_allocator.h>
 #include <vector>
 
 #include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
@@ -27,6 +24,8 @@
 
 namespace jxl {
 
+struct AuxOut;
+
 std::pair<uint32_t, uint32_t> ComputeUsedOrders(
     const SpeedTier speed, const AcStrategyImage& ac_strategy,
     const Rect& rect) {
index 7a237f2..eaa8d26 100644 (file)
 #include <stdint.h>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/enc_bit_writer.h"
@@ -23,6 +21,8 @@
 
 namespace jxl {
 
+struct AuxOut;
+
 // Orders that are actually used in part of image. `rect` is in block units.
 // Returns {orders that are used, orders that might be made non-default}.
 std::pair<uint32_t, uint32_t> ComputeUsedOrders(
diff --git a/lib/jxl/enc_color_management.h b/lib/jxl/enc_color_management.h
deleted file mode 100644 (file)
index 0d701d7..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_ENC_COLOR_MANAGEMENT_H_
-#define LIB_JXL_ENC_COLOR_MANAGEMENT_H_
-
-// ICC profiles and color space conversions.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <vector>
-
-#include "jxl/cms_interface.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/image.h"
-
-namespace jxl {
-
-// Internal C++ wrapper for a JxlCmsInterface.
-class ColorSpaceTransform {
- public:
-  explicit ColorSpaceTransform(const JxlCmsInterface& cms) : cms_(cms) {}
-  ~ColorSpaceTransform() {
-    if (cms_data_ != nullptr) {
-      cms_.destroy(cms_data_);
-    }
-  }
-
-  // Cannot copy.
-  ColorSpaceTransform(const ColorSpaceTransform&) = delete;
-  ColorSpaceTransform& operator=(const ColorSpaceTransform&) = delete;
-
-  Status Init(const ColorEncoding& c_src, const ColorEncoding& c_dst,
-              float intensity_target, size_t xsize, size_t num_threads) {
-    xsize_ = xsize;
-    JxlColorProfile input_profile;
-    icc_src_ = c_src.ICC();
-    input_profile.icc.data = icc_src_.data();
-    input_profile.icc.size = icc_src_.size();
-    ConvertInternalToExternalColorEncoding(c_src,
-                                           &input_profile.color_encoding);
-    input_profile.num_channels = c_src.IsCMYK() ? 4 : c_src.Channels();
-    JxlColorProfile output_profile;
-    icc_dst_ = c_dst.ICC();
-    output_profile.icc.data = icc_dst_.data();
-    output_profile.icc.size = icc_dst_.size();
-    ConvertInternalToExternalColorEncoding(c_dst,
-                                           &output_profile.color_encoding);
-    if (c_dst.IsCMYK())
-      return JXL_FAILURE("Conversion to CMYK is not supported");
-    output_profile.num_channels = c_dst.Channels();
-    cms_data_ = cms_.init(cms_.init_data, num_threads, xsize, &input_profile,
-                          &output_profile, intensity_target);
-    JXL_RETURN_IF_ERROR(cms_data_ != nullptr);
-    return true;
-  }
-
-  float* BufSrc(const size_t thread) const {
-    return cms_.get_src_buf(cms_data_, thread);
-  }
-
-  float* BufDst(const size_t thread) const {
-    return cms_.get_dst_buf(cms_data_, thread);
-  }
-
-  Status Run(const size_t thread, const float* buf_src, float* buf_dst) {
-    return cms_.run(cms_data_, thread, buf_src, buf_dst, xsize_);
-  }
-
- private:
-  JxlCmsInterface cms_;
-  void* cms_data_ = nullptr;
-  // The interface may retain pointers into these.
-  PaddedBytes icc_src_;
-  PaddedBytes icc_dst_;
-  size_t xsize_;
-};
-
-const JxlCmsInterface& GetJxlCms();
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_ENC_COLOR_MANAGEMENT_H_
index a2d170d..268122a 100644 (file)
@@ -11,8 +11,6 @@
 #include <algorithm>
 
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/enc_gamma_correct.h"
 #include "lib/jxl/enc_image_bundle.h"
 
@@ -48,18 +46,6 @@ void AlphaBlend(const Image3F& in, const size_t c, float background_linear,
   }
 }
 
-const Image3F* AlphaBlend(const ImageBundle& ib, const Image3F& linear,
-                          float background_linear, Image3F* copy) {
-  // No alpha => all opaque.
-  if (!ib.HasAlpha()) return &linear;
-
-  *copy = Image3F(linear.xsize(), linear.ysize());
-  for (size_t c = 0; c < 3; ++c) {
-    AlphaBlend(linear, c, background_linear, ib.alpha(), copy);
-  }
-  return copy;
-}
-
 void AlphaBlend(float background_linear, ImageBundle* io_linear_srgb) {
   // No alpha => all opaque.
   if (!io_linear_srgb->HasAlpha()) return;
@@ -82,8 +68,7 @@ float ComputeScoreImpl(const ImageBundle& rgb0, const ImageBundle& rgb1,
 
 float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1,
                    Comparator* comparator, const JxlCmsInterface& cms,
-                   ImageF* diffmap, ThreadPool* pool) {
-  PROFILER_FUNC;
+                   ImageF* diffmap, ThreadPool* pool, bool ignore_alpha) {
   // Convert to linear sRGB (unless already in that space)
   ImageMetadata metadata0 = *rgb0.metadata();
   ImageBundle store0(&metadata0);
@@ -97,7 +82,7 @@ float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1,
                               cms, pool, &store1, &linear_srgb1));
 
   // No alpha: skip blending, only need a single call to Butteraugli.
-  if (!rgb0.HasAlpha() && !rgb1.HasAlpha()) {
+  if (ignore_alpha || (!rgb0.HasAlpha() && !rgb1.HasAlpha())) {
     return ComputeScoreImpl(*linear_srgb0, *linear_srgb1, comparator, diffmap);
   }
 
index 0ac4df8..c545ea6 100644 (file)
@@ -45,7 +45,8 @@ class Comparator {
 // alpha channel.
 float ComputeScore(const ImageBundle& rgb0, const ImageBundle& rgb1,
                    Comparator* comparator, const JxlCmsInterface& cms,
-                   ImageF* diffmap = nullptr, ThreadPool* pool = nullptr);
+                   ImageF* diffmap = nullptr, ThreadPool* pool = nullptr,
+                   bool ignore_alpha = false);
 
 }  // namespace jxl
 
index 82e5e61..5775658 100644 (file)
@@ -16,7 +16,9 @@
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/entropy_coder.h"
+#include "lib/jxl/pack_signed.h"
 
 namespace jxl {
 
@@ -69,7 +71,7 @@ void EncodeContextMap(const std::vector<uint8_t>& context_map,
   std::vector<uint8_t> transformed_symbols = MoveToFrontTransform(context_map);
   std::vector<std::vector<Token>> tokens(1), mtf_tokens(1);
   EntropyEncodingData codes;
-  std::vector<uint8_t> dummy_context_map;
+  std::vector<uint8_t> sink_context_map;
   for (size_t i = 0; i < context_map.size(); i++) {
     tokens[0].emplace_back(0, context_map[i]);
   }
@@ -79,9 +81,9 @@ void EncodeContextMap(const std::vector<uint8_t>& context_map,
   HistogramParams params;
   params.uint_method = HistogramParams::HybridUintMethod::kContextMap;
   size_t ans_cost = BuildAndEncodeHistograms(
-      params, 1, tokens, &codes, &dummy_context_map, nullptr, 0, nullptr);
+      params, 1, tokens, &codes, &sink_context_map, nullptr, 0, nullptr);
   size_t mtf_cost = BuildAndEncodeHistograms(
-      params, 1, mtf_tokens, &codes, &dummy_context_map, nullptr, 0, nullptr);
+      params, 1, mtf_tokens, &codes, &sink_context_map, nullptr, 0, nullptr);
   bool use_mtf = mtf_cost < ans_cost;
   // Rebuild token list.
   tokens[0].clear();
@@ -100,9 +102,9 @@ void EncodeContextMap(const std::vector<uint8_t>& context_map,
   } else {
     writer->Write(1, 0);
     writer->Write(1, use_mtf);  // Use/don't use MTF.
-    BuildAndEncodeHistograms(params, 1, tokens, &codes, &dummy_context_map,
+    BuildAndEncodeHistograms(params, 1, tokens, &codes, &sink_context_map,
                              writer, layer, aux_out);
-    WriteTokens(tokens[0], codes, dummy_context_map, writer);
+    WriteTokens(tokens[0], codes, sink_context_map, writer);
   }
 }
 
@@ -119,7 +121,7 @@ void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer,
       ctx_map.size() == 21 &&
       std::equal(ctx_map.begin(), ctx_map.end(), BlockCtxMap::kDefaultCtxMap)) {
     writer->Write(1, 1);  // default
-    ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out);
+    allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
     return;
   }
   writer->Write(1, 0);
@@ -134,7 +136,7 @@ void EncodeBlockCtxMap(const BlockCtxMap& block_ctx_map, BitWriter* writer,
     JXL_CHECK(U32Coder::Write(kQFThresholdDist, i - 1, writer));
   }
   EncodeContextMap(ctx_map, block_ctx_map.num_ctxs, writer, kLayerAC, aux_out);
-  ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out);
+  allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
 }
 
 }  // namespace jxl
index 57e79a1..041e71d 100644 (file)
 #include <vector>
 
 #include "lib/jxl/ac_context.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 // Max limit is 255 because encoding assumes numbers < 255
 // More clusters can help compression, but makes encode/decode somewhat slower
 static const size_t kClustersLimit = 128;
diff --git a/lib/jxl/enc_debug_image.cc b/lib/jxl/enc_debug_image.cc
new file mode 100644 (file)
index 0000000..261570e
--- /dev/null
@@ -0,0 +1,115 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_debug_image.h"
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_external_image.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+namespace {
+template <typename From>
+Plane<float> ConvertToFloat(const Plane<From>& from) {
+  float factor = 1.0f / std::numeric_limits<From>::max();
+  if (std::is_same<From, double>::value || std::is_same<From, float>::value) {
+    factor = 1.0f;
+  }
+  Plane<float> to(from.xsize(), from.ysize());
+  for (size_t y = 0; y < from.ysize(); ++y) {
+    const From* const JXL_RESTRICT row_from = from.Row(y);
+    float* const JXL_RESTRICT row_to = to.Row(y);
+    for (size_t x = 0; x < from.xsize(); ++x) {
+      row_to[x] = row_from[x] * factor;
+    }
+  }
+  return to;
+}
+template <typename From>
+Image3F ConvertToFloat(const Image3<From>& from) {
+  return Image3F(ConvertToFloat(from.Plane(0)), ConvertToFloat(from.Plane(1)),
+                 ConvertToFloat(from.Plane(2)));
+}
+
+template <typename T>
+void DumpImageT(const CompressParams& cparams, const char* label,
+                const ColorEncoding& color_encoding, const Image3<T>& image) {
+  if (!cparams.debug_image) return;
+  Image3F float_image = ConvertToFloat(image);
+  JxlColorEncoding color = color_encoding.ToExternal();
+  size_t num_pixels = 3 * image.xsize() * image.ysize();
+  std::vector<uint16_t> pixels(num_pixels);
+  const ImageF* channels[3];
+  for (int c = 0; c < 3; ++c) {
+    channels[c] = &float_image.Plane(c);
+  }
+  JXL_CHECK(ConvertChannelsToExternal(
+      channels, 3, 16, false, JXL_BIG_ENDIAN, 6 * image.xsize(), nullptr,
+      &pixels[0], 2 * num_pixels, PixelCallback(), Orientation::kIdentity));
+  (*cparams.debug_image)(cparams.debug_image_opaque, label, image.xsize(),
+                         image.ysize(), &color, &pixels[0]);
+}
+
+template <typename T>
+void DumpPlaneNormalizedT(const CompressParams& cparams, const char* label,
+                          const Plane<T>& image) {
+  T min;
+  T max;
+  ImageMinMax(image, &min, &max);
+  Image3B normalized(image.xsize(), image.ysize());
+  for (size_t c = 0; c < 3; ++c) {
+    float mul = min == max ? 0 : (255.0f / (max - min));
+    for (size_t y = 0; y < image.ysize(); ++y) {
+      const T* JXL_RESTRICT row_in = image.ConstRow(y);
+      uint8_t* JXL_RESTRICT row_out = normalized.PlaneRow(c, y);
+      for (size_t x = 0; x < image.xsize(); ++x) {
+        row_out[x] = static_cast<uint8_t>((row_in[x] - min) * mul);
+      }
+    }
+  }
+  DumpImageT(cparams, label, ColorEncoding::SRGB(), normalized);
+}
+
+}  // namespace
+
+void DumpImage(const CompressParams& cparams, const char* label,
+               const Image3<float>& image) {
+  DumpImageT(cparams, label, ColorEncoding::SRGB(), image);
+}
+
+void DumpImage(const CompressParams& cparams, const char* label,
+               const Image3<uint8_t>& image) {
+  DumpImageT(cparams, label, ColorEncoding::SRGB(), image);
+}
+
+void DumpXybImage(const CompressParams& cparams, const char* label,
+                  const Image3F& image) {
+  if (!cparams.debug_image) return;
+
+  Image3F linear(image.xsize(), image.ysize());
+  OpsinParams opsin_params;
+  opsin_params.Init(kDefaultIntensityTarget);
+  OpsinToLinear(image, Rect(linear), nullptr, &linear, opsin_params);
+
+  DumpImageT(cparams, label, ColorEncoding::LinearSRGB(), linear);
+}
+
+void DumpPlaneNormalized(const CompressParams& cparams, const char* label,
+                         const Plane<float>& image) {
+  DumpPlaneNormalizedT(cparams, label, image);
+}
+
+void DumpPlaneNormalized(const CompressParams& cparams, const char* label,
+                         const Plane<uint8_t>& image) {
+  DumpPlaneNormalizedT(cparams, label, image);
+}
+
+}  // namespace jxl
diff --git a/lib/jxl/enc_debug_image.h b/lib/jxl/enc_debug_image.h
new file mode 100644 (file)
index 0000000..33799a5
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_DEBUG_IMAGE_H_
+#define LIB_JXL_ENC_DEBUG_IMAGE_H_
+
+// Optional output images for debugging.
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+void DumpImage(const CompressParams& cparams, const char* label,
+               const Image3<float>& image);
+void DumpImage(const CompressParams& cparams, const char* label,
+               const Image3<uint8_t>& image);
+void DumpXybImage(const CompressParams& cparams, const char* label,
+                  const Image3<float>& image);
+void DumpPlaneNormalized(const CompressParams& cparams, const char* label,
+                         const Plane<float>& image);
+void DumpPlaneNormalized(const CompressParams& cparams, const char* label,
+                         const Plane<uint8_t>& image);
+
+// Used to skip image creation if they won't be written to debug directory.
+static inline bool WantDebugOutput(const CompressParams& cparams) {
+  return cparams.debug_image != nullptr;
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_DEBUG_IMAGE_H_
index f7021d6..4ee8808 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/convolve.h"
+#include "lib/jxl/enc_linalg.h"
+#include "lib/jxl/enc_optimize.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_ops.h"
-#include "lib/jxl/linalg.h"
-#include "lib/jxl/optimize.h"
 
 // Set JXL_DEBUG_DOT_DETECT to 1 to enable debugging.
 #ifndef JXL_DEBUG_DOT_DETECT
 #define JXL_DEBUG_DOT_DETECT 0
 #endif
 
-#if JXL_DEBUG_DOT_DETECT
-#include "lib/jxl/aux_out.h"
-#endif
-
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -150,8 +144,6 @@ const WeightsSeparable5& WeightsSeparable5Gaussian3() {
 
 ImageF ComputeEnergyImage(const Image3F& orig, Image3F* smooth,
                           ThreadPool* pool) {
-  PROFILER_FUNC;
-
   // Prepare guidance images for dot selection.
   Image3F forig(orig.xsize(), orig.ysize());
   *smooth = Image3F(orig.xsize(), orig.ysize());
@@ -167,13 +159,6 @@ ImageF ComputeEnergyImage(const Image3F& orig, Image3F* smooth,
     Separable5(orig.Plane(c), rect, weights1, pool, &forig.Plane(c));
   }
 
-#if JXL_DEBUG_DOT_DETECT
-  AuxOut aux;
-  aux.debug_prefix = "/tmp/sebastian/";
-  aux.DumpImage("filtered", forig);
-  aux.DumpImage("sm", *smooth);
-#endif
-
   return HWY_DYNAMIC_DISPATCH(SumOfSquareDifferences)(forig, *smooth, pool);
 }
 
@@ -193,7 +178,6 @@ const size_t kMaxCCSize = 1000;
 // of the component
 bool ExtractComponent(ImageF* img, std::vector<Pixel>* pixels,
                       const Pixel& seed, double threshold) {
-  PROFILER_FUNC;
   static const std::vector<Pixel> neighbors{{1, -1}, {1, 0},   {1, 1},  {0, -1},
                                             {0, 1},  {-1, -1}, {-1, 1}, {1, 0}};
   std::vector<Pixel> q{seed};
@@ -238,7 +222,6 @@ struct ConnectedComponent {
   Pixel mode;
 
   void CompStats(const ImageF& energy, int extra) {
-    PROFILER_FUNC;
     maxEnergy = 0.0;
     meanEnergy = 0.0;
     varEnergy = 0.0;
@@ -282,7 +265,6 @@ struct ConnectedComponent {
 };
 
 Rect BoundingRectangle(const std::vector<Pixel>& pixels) {
-  PROFILER_FUNC;
   JXL_ASSERT(!pixels.empty());
   int low_x, high_x, low_y, high_y;
   low_x = high_x = pixels[0].x;
@@ -299,9 +281,9 @@ Rect BoundingRectangle(const std::vector<Pixel>& pixels) {
 std::vector<ConnectedComponent> FindCC(const ImageF& energy, double t_low,
                                        double t_high, uint32_t maxWindow,
                                        double minScore) {
-  PROFILER_FUNC;
   const int kExtraRect = 4;
-  ImageF img = CopyImage(energy);
+  ImageF img(energy.xsize(), energy.ysize());
+  CopyImageTo(energy, &img);
   std::vector<ConnectedComponent> ans;
   for (size_t y = 0; y < img.ysize(); y++) {
     float* JXL_RESTRICT row = img.Row(y);
@@ -338,11 +320,10 @@ std::vector<ConnectedComponent> FindCC(const ImageF& energy, double t_low,
   return ans;
 }
 
-// TODO (sggonzalez): Adapt this function for the different color spaces or
+// TODO(sggonzalez): Adapt this function for the different color spaces or
 // remove it if the color space with the best performance does not need it
 void ComputeDotLosses(GaussianEllipse* ellipse, const ConnectedComponent& cc,
                       const Image3F& img, const Image3F& background) {
-  PROFILER_FUNC;
   const int rectBounds = 2;
   const double kIntensityR = 0.0;   // 0.015;
   const double kSigmaR = 0.0;       // 0.01;
@@ -407,7 +388,6 @@ void ComputeDotLosses(GaussianEllipse* ellipse, const ConnectedComponent& cc,
 GaussianEllipse FitGaussianFast(const ConnectedComponent& cc,
                                 const ImageF& energy, const Image3F& img,
                                 const Image3F& background) {
-  PROFILER_FUNC;
   constexpr bool leastSqIntensity = true;
   constexpr double kEpsilon = 1e-6;
   GaussianEllipse ans;
@@ -545,16 +525,9 @@ GaussianEllipse FitGaussian(const ConnectedComponent& cc, const ImageF& energy,
 std::vector<PatchInfo> DetectGaussianEllipses(
     const Image3F& opsin, const GaussianDetectParams& params,
     const EllipseQuantParams& qParams, ThreadPool* pool) {
-  PROFILER_FUNC;
   std::vector<PatchInfo> dots;
   Image3F smooth(opsin.xsize(), opsin.ysize());
   ImageF energy = ComputeEnergyImage(opsin, &smooth, pool);
-#if JXL_DEBUG_DOT_DETECT
-  AuxOut aux;
-  aux.debug_prefix = "/tmp/sebastian/";
-  aux.DumpXybImage("smooth", smooth);
-  aux.DumpPlaneNormalized("energy", energy);
-#endif  // JXL_DEBUG_DOT_DETECT
   std::vector<ConnectedComponent> components = FindCC(
       energy, params.t_low, params.t_high, params.maxWinSize, params.minScore);
   size_t numCC =
@@ -607,19 +580,6 @@ std::vector<PatchInfo> DetectGaussianEllipses(
       }
     }
   }
-#if JXL_DEBUG_DOT_DETECT
-  JXL_DEBUG(JXL_DEBUG_DOT_DETECT, "Candidates: %" PRIuS ", Dots: %" PRIuS "\n",
-            components.size(), dots.size());
-  ApplyGaussianEllipses(&smooth, dots, 1.0);
-  aux.DumpXybImage("draw", smooth);
-  ApplyGaussianEllipses(&smooth, dots, -1.0);
-
-  auto qdots = QuantizeGaussianEllipses(dots, qParams);
-  auto deq = DequantizeGaussianEllipses(qdots, qParams);
-  ApplyGaussianEllipses(&smooth, deq, 1.0);
-  aux.DumpXybImage("qdraw", smooth);
-  ApplyGaussianEllipses(&smooth, deq, -1.0);
-#endif  // JXL_DEBUG_DOT_DETECT
   return dots;
 }
 
index 1b5413b..a5b1af6 100644 (file)
@@ -11,7 +11,6 @@
 #include <array>
 #include <utility>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/override.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
@@ -34,7 +33,7 @@ const double kEllipseMinSigma = 0.1;  // Minimum sigma value
 const double kEllipseMaxSigma = 3.1;  // Maximum Sigma value
 const size_t kEllipseSigmaQ = 16;     // Number of quantization levels for sigma
 const size_t kEllipseAngleQ = 8;      // Quantization level for the angle
-// TODO: fix these values.
+// TODO(user): fix these values.
 const std::array<double, 3> kEllipseMinIntensity{{-0.05, 0.0, -0.5}};
 const std::array<double, 3> kEllipseMaxIntensity{{0.05, 1.0, 0.4}};
 const std::array<size_t, 3> kEllipseIntensityQ{{10, 36, 10}};
index af76bfc..2ba4393 100644 (file)
@@ -13,8 +13,6 @@
 
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/dec_bit_reader.h"
index c634445..a25766a 100644 (file)
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_context_map.h"
@@ -33,6 +31,7 @@
 #include "lib/jxl/epf.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_ops.h"
+#include "lib/jxl/pack_signed.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
index 346182b..1096f2d 100644 (file)
 
 #include "lib/jxl/enc_external_image.h"
 
+#include <jxl/types.h>
 #include <string.h>
 
 #include <algorithm>
 #include <array>
+#include <atomic>
 #include <functional>
 #include <utility>
 #include <vector>
 
-#include "jxl/types.h"
 #include "lib/jxl/alpha.h"
 #include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/float.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
 
 namespace jxl {
 namespace {
 
-// Based on highway scalar implementation, for testing
-float LoadFloat16(uint16_t bits16) {
-  const uint32_t sign = bits16 >> 15;
-  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
-  const uint32_t mantissa = bits16 & 0x3FF;
-
-  // Subnormal or zero
-  if (biased_exp == 0) {
-    const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
-    return sign ? -subnormal : subnormal;
-  }
-
-  // Normalized: convert the representation directly (faster than ldexp/tables).
-  const uint32_t biased_exp32 = biased_exp + (127 - 15);
-  const uint32_t mantissa32 = mantissa << (23 - 10);
-  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
-
-  float result;
-  memcpy(&result, &bits32, 4);
-  return result;
-}
-
-float LoadLEFloat16(const uint8_t* p) {
-  uint16_t bits16 = LoadLE16(p);
-  return LoadFloat16(bits16);
-}
-
-float LoadBEFloat16(const uint8_t* p) {
-  uint16_t bits16 = LoadBE16(p);
-  return LoadFloat16(bits16);
-}
-
-// Loads a float in big endian
-float LoadBEFloat(const uint8_t* p) {
-  float value;
-  const uint32_t u = LoadBE32(p);
-  memcpy(&value, &u, 4);
-  return value;
-}
-
-// Loads a float in little endian
-float LoadLEFloat(const uint8_t* p) {
-  float value;
-  const uint32_t u = LoadLE32(p);
-  memcpy(&value, &u, 4);
-  return value;
-}
-
-typedef uint32_t(LoadFuncType)(const uint8_t* p);
-template <LoadFuncType LoadFunc>
-void JXL_INLINE LoadFloatRow(float* JXL_RESTRICT row_out, const uint8_t* in,
-                             float mul, size_t xsize, size_t bytes_per_pixel) {
-  size_t i = 0;
-  for (size_t x = 0; x < xsize; ++x) {
-    row_out[x] = mul * LoadFunc(in + i);
-    i += bytes_per_pixel;
+size_t JxlDataTypeBytes(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 1;
+    case JXL_TYPE_UINT16:
+      return 2;
+    case JXL_TYPE_FLOAT16:
+      return 2;
+    case JXL_TYPE_FLOAT:
+      return 4;
+    default:
+      return 0;
   }
 }
 
-uint32_t JXL_INLINE Load8(const uint8_t* p) { return *p; }
-
-Status PixelFormatToExternal(const JxlPixelFormat& pixel_format,
-                             size_t* bitdepth, bool* float_in) {
-  if (pixel_format.data_type == JXL_TYPE_FLOAT) {
-    *bitdepth = 32;
-    *float_in = true;
-  } else if (pixel_format.data_type == JXL_TYPE_FLOAT16) {
-    *bitdepth = 16;
-    *float_in = true;
-  } else if (pixel_format.data_type == JXL_TYPE_UINT8) {
-    *bitdepth = 8;
-    *float_in = false;
-  } else if (pixel_format.data_type == JXL_TYPE_UINT16) {
-    *bitdepth = 16;
-    *float_in = false;
-  } else {
-    return JXL_FAILURE("unsupported pixel format data type");
-  }
-  return true;
-}
 }  // namespace
 
 Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
                            size_t ysize, size_t bits_per_sample,
-                           JxlEndianness endianness, ThreadPool* pool,
-                           ImageF* channel, bool float_in, size_t align) {
-  // TODO(firsching): Avoid code duplication with the function below.
-  JXL_CHECK(float_in ? bits_per_sample == 16 || bits_per_sample == 32
-                     : bits_per_sample > 0 && bits_per_sample <= 16);
-  const size_t bytes_per_pixel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
+                           JxlPixelFormat format, size_t c, ThreadPool* pool,
+                           ImageF* channel) {
+  if (format.data_type == JXL_TYPE_UINT8) {
+    JXL_RETURN_IF_ERROR(bits_per_sample > 0 && bits_per_sample <= 8);
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    JXL_RETURN_IF_ERROR(bits_per_sample > 8 && bits_per_sample <= 16);
+  } else if (format.data_type != JXL_TYPE_FLOAT16 &&
+             format.data_type != JXL_TYPE_FLOAT) {
+    JXL_FAILURE("unsupported pixel format data type %d", format.data_type);
+  }
+  size_t bytes_per_channel = JxlDataTypeBytes(format.data_type);
+  size_t bytes_per_pixel = format.num_channels * bytes_per_channel;
+  size_t pixel_offset = c * bytes_per_channel;
+  // Only for uint8/16.
+  float scale = 1. / ((1ull << bits_per_sample) - 1);
+
   const size_t last_row_size = xsize * bytes_per_pixel;
+  const size_t align = format.align;
   const size_t row_size =
       (align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
   const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
   if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
   if (bytes.size() < bytes_to_read) {
-    return JXL_FAILURE("Buffer size is too small");
+    return JXL_FAILURE("Buffer size is too small, expected: %" PRIuS
+                       " got: %" PRIuS " (Image: %" PRIuS "x%" PRIuS
+                       "x%u, bytes_per_channel: %" PRIuS ")",
+                       bytes_to_read, bytes.size(), xsize, ysize,
+                       format.num_channels, bytes_per_channel);
   }
   JXL_ASSERT(channel->xsize() == xsize);
   JXL_ASSERT(channel->ysize() == ysize);
@@ -130,259 +81,73 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
   }
 
   const bool little_endian =
-      endianness == JXL_LITTLE_ENDIAN ||
-      (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
+      format.endianness == JXL_LITTLE_ENDIAN ||
+      (format.endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
 
   const uint8_t* const in = bytes.data();
-  if (float_in) {
-    JXL_RETURN_IF_ERROR(RunOnPool(
-        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
-        [&](const uint32_t task, size_t /*thread*/) {
-          const size_t y = task;
-          size_t i = row_size * task;
-          float* JXL_RESTRICT row_out = channel->Row(y);
-          if (bits_per_sample == 16) {
-            if (little_endian) {
-              for (size_t x = 0; x < xsize; ++x) {
-                row_out[x] = LoadLEFloat16(in + i);
-                i += bytes_per_pixel;
-              }
-            } else {
-              for (size_t x = 0; x < xsize; ++x) {
-                row_out[x] = LoadBEFloat16(in + i);
-                i += bytes_per_pixel;
-              }
-            }
-          } else {
-            if (little_endian) {
-              for (size_t x = 0; x < xsize; ++x) {
-                row_out[x] = LoadLEFloat(in + i);
-                i += bytes_per_pixel;
-              }
-            } else {
-              for (size_t x = 0; x < xsize; ++x) {
-                row_out[x] = LoadBEFloat(in + i);
-                i += bytes_per_pixel;
-              }
-            }
-          }
-        },
-        "ConvertExtraChannelFloat"));
-  } else {
-    float mul = 1. / ((1ull << bits_per_sample) - 1);
-    JXL_RETURN_IF_ERROR(RunOnPool(
-        pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
-        [&](const uint32_t task, size_t /*thread*/) {
-          const size_t y = task;
-          size_t i = row_size * task;
-          float* JXL_RESTRICT row_out = channel->Row(y);
-          if (bits_per_sample <= 8) {
-            LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
-          } else {
-            if (little_endian) {
-              LoadFloatRow<LoadLE16>(row_out, in + i, mul, xsize,
-                                     bytes_per_pixel);
-            } else {
-              LoadFloatRow<LoadBE16>(row_out, in + i, mul, xsize,
-                                     bytes_per_pixel);
-            }
-          }
-        },
-        "ConvertExtraChannelUint"));
+
+  std::atomic<size_t> error_count = {0};
+
+  const auto convert_row = [&](const uint32_t task, size_t /*thread*/) {
+    const size_t y = task;
+    size_t offset = row_size * task + pixel_offset;
+    float* JXL_RESTRICT row_out = channel->Row(y);
+    const auto save_value = [&](size_t index, float value) {
+      row_out[index] = value;
+    };
+    if (!LoadFloatRow(in + offset, xsize, bytes_per_pixel, format.data_type,
+                      little_endian, scale, save_value)) {
+      error_count++;
+    }
+  };
+  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize),
+                                ThreadPool::NoInit, convert_row,
+                                "ConvertExtraChannel"));
+
+  if (error_count) {
+    JXL_FAILURE("unsupported pixel format data type");
   }
 
   return true;
 }
 Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
                            size_t ysize, const ColorEncoding& c_current,
-                           size_t channels, bool alpha_is_premultiplied,
-                           size_t bits_per_sample, JxlEndianness endianness,
-                           ThreadPool* pool, ImageBundle* ib, bool float_in,
-                           size_t align) {
-  JXL_CHECK(float_in ? bits_per_sample == 16 || bits_per_sample == 32
-                     : bits_per_sample > 0 && bits_per_sample <= 16);
-
+                           size_t bits_per_sample, JxlPixelFormat format,
+                           ThreadPool* pool, ImageBundle* ib) {
   const size_t color_channels = c_current.Channels();
-  bool has_alpha = channels == 2 || channels == 4;
-  if (channels < color_channels) {
+  bool has_alpha = format.num_channels == 2 || format.num_channels == 4;
+  if (format.num_channels < color_channels) {
     return JXL_FAILURE("Expected %" PRIuS
-                       " color channels, received only %" PRIuS " channels",
-                       color_channels, channels);
+                       " color channels, received only %u channels",
+                       color_channels, format.num_channels);
   }
 
-  const size_t bytes_per_channel = DivCeil(bits_per_sample, jxl::kBitsPerByte);
-  const size_t bytes_per_pixel = channels * bytes_per_channel;
-  if (bits_per_sample > 16 && bits_per_sample < 32) {
-    return JXL_FAILURE("not supported, try bits_per_sample=32");
-  }
-
-  const size_t last_row_size = xsize * bytes_per_pixel;
-  const size_t row_size =
-      (align > 1 ? jxl::DivCeil(last_row_size, align) * align : last_row_size);
-  const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
-  if (xsize == 0 || ysize == 0) return JXL_FAILURE("Empty image");
-  if (bytes.size() < bytes_to_read) {
-    return JXL_FAILURE(
-        "Buffer size is too small: expected at least %" PRIuS
-        " bytes (= %" PRIuS " * %" PRIuS " * %" PRIuS "), got %" PRIuS " bytes",
-        bytes_to_read, xsize, ysize, bytes_per_pixel, bytes.size());
-  }
-  // Too large buffer is likely an application bug, so also fail for that.
-  // Do allow padding to stride in last row though.
-  if (bytes.size() > row_size * ysize) {
-    return JXL_FAILURE(
-        "Buffer size is too large: expected at most %" PRIuS " bytes (= %" PRIuS
-        " * %" PRIuS " * %" PRIuS "), got %" PRIuS " bytes",
-        row_size * ysize, xsize, ysize, bytes_per_pixel, bytes.size());
-  }
-  const bool little_endian =
-      endianness == JXL_LITTLE_ENDIAN ||
-      (endianness == JXL_NATIVE_ENDIAN && IsLittleEndian());
-
-  const uint8_t* const in = bytes.data();
-
   Image3F color(xsize, ysize);
-
-  if (float_in) {
-    for (size_t c = 0; c < color_channels; ++c) {
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
-          [&](const uint32_t task, size_t /*thread*/) {
-            const size_t y = task;
-            size_t i =
-                row_size * task + (c * bits_per_sample / jxl::kBitsPerByte);
-            float* JXL_RESTRICT row_out = color.PlaneRow(c, y);
-            if (bits_per_sample == 16) {
-              if (little_endian) {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadLEFloat16(in + i);
-                  i += bytes_per_pixel;
-                }
-              } else {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadBEFloat16(in + i);
-                  i += bytes_per_pixel;
-                }
-              }
-            } else {
-              if (little_endian) {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadLEFloat(in + i);
-                  i += bytes_per_pixel;
-                }
-              } else {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadBEFloat(in + i);
-                  i += bytes_per_pixel;
-                }
-              }
-            }
-          },
-          "ConvertRGBFloat"));
-    }
-  } else {
-    // Multiplier to convert from the integer range to floating point 0-1 range.
-    float mul = 1. / ((1ull << bits_per_sample) - 1);
-    for (size_t c = 0; c < color_channels; ++c) {
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
-          [&](const uint32_t task, size_t /*thread*/) {
-            const size_t y = task;
-            size_t i = row_size * task + c * bytes_per_channel;
-            float* JXL_RESTRICT row_out = color.PlaneRow(c, y);
-            if (bits_per_sample <= 8) {
-              LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
-            } else {
-              if (little_endian) {
-                LoadFloatRow<LoadLE16>(row_out, in + i, mul, xsize,
-                                       bytes_per_pixel);
-              } else {
-                LoadFloatRow<LoadBE16>(row_out, in + i, mul, xsize,
-                                       bytes_per_pixel);
-              }
-            }
-          },
-          "ConvertRGBUint"));
-    }
+  for (size_t c = 0; c < color_channels; ++c) {
+    JXL_RETURN_IF_ERROR(ConvertFromExternal(bytes, xsize, ysize,
+                                            bits_per_sample, format, c, pool,
+                                            &color.Plane(c)));
   }
-
   if (color_channels == 1) {
     CopyImageTo(color.Plane(0), &color.Plane(1));
     CopyImageTo(color.Plane(0), &color.Plane(2));
   }
-
   ib->SetFromImage(std::move(color), c_current);
 
   // Passing an interleaved image with an alpha channel to an image that doesn't
   // have alpha channel just discards the passed alpha channel.
   if (has_alpha && ib->HasAlpha()) {
     ImageF alpha(xsize, ysize);
-
-    if (float_in) {
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
-          [&](const uint32_t task, size_t /*thread*/) {
-            const size_t y = task;
-            size_t i = row_size * task +
-                       ((channels - 1) * bits_per_sample / jxl::kBitsPerByte);
-            float* JXL_RESTRICT row_out = alpha.Row(y);
-            if (bits_per_sample == 16) {
-              if (little_endian) {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadLEFloat16(in + i);
-                  i += bytes_per_pixel;
-                }
-              } else {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadBEFloat16(in + i);
-                  i += bytes_per_pixel;
-                }
-              }
-            } else {
-              if (little_endian) {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadLEFloat(in + i);
-                  i += bytes_per_pixel;
-                }
-              } else {
-                for (size_t x = 0; x < xsize; ++x) {
-                  row_out[x] = LoadBEFloat(in + i);
-                  i += bytes_per_pixel;
-                }
-              }
-            }
-          },
-          "ConvertAlphaFloat"));
-    } else {
-      float mul = 1. / ((1ull << bits_per_sample) - 1);
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
-          [&](const uint32_t task, size_t /*thread*/) {
-            const size_t y = task;
-            size_t i = row_size * task + (channels - 1) * bytes_per_channel;
-            float* JXL_RESTRICT row_out = alpha.Row(y);
-            if (bits_per_sample <= 8) {
-              LoadFloatRow<Load8>(row_out, in + i, mul, xsize, bytes_per_pixel);
-            } else {
-              if (little_endian) {
-                LoadFloatRow<LoadLE16>(row_out, in + i, mul, xsize,
-                                       bytes_per_pixel);
-              } else {
-                LoadFloatRow<LoadBE16>(row_out, in + i, mul, xsize,
-                                       bytes_per_pixel);
-              }
-            }
-          },
-          "ConvertAlphaUint"));
-    }
-
-    ib->SetAlpha(std::move(alpha), alpha_is_premultiplied);
+    JXL_RETURN_IF_ERROR(
+        ConvertFromExternal(bytes, xsize, ysize, bits_per_sample, format,
+                            format.num_channels - 1, pool, &alpha));
+    ib->SetAlpha(std::move(alpha));
   } else if (!has_alpha && ib->HasAlpha()) {
     // if alpha is not passed, but it is expected, then assume
     // it is all-opaque
     ImageF alpha(xsize, ysize);
     FillImage(1.0f, &alpha);
-    ib->SetAlpha(std::move(alpha), alpha_is_premultiplied);
+    ib->SetAlpha(std::move(alpha));
   }
 
   return true;
@@ -391,18 +156,10 @@ Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
 Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
                       size_t ysize, const void* buffer, size_t size,
                       ThreadPool* pool, ImageF* channel) {
-  size_t bitdepth;
-  bool float_in;
-
-  JXL_RETURN_IF_ERROR(
-      PixelFormatToExternal(pixel_format, &bitdepth, &float_in));
-
-  JXL_RETURN_IF_ERROR(ConvertFromExternal(
-      jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
-      xsize, ysize, bitdepth, pixel_format.endianness, pool, channel, float_in,
-      pixel_format.align));
-
-  return true;
+  size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
+  return ConvertFromExternal(
+      jxl::Bytes(static_cast<const uint8_t*>(buffer), size), xsize, ysize,
+      bitdepth, pixel_format, 0, pool, channel);
 }
 
 Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
@@ -410,16 +167,10 @@ Status BufferToImageBundle(const JxlPixelFormat& pixel_format, uint32_t xsize,
                            jxl::ThreadPool* pool,
                            const jxl::ColorEncoding& c_current,
                            jxl::ImageBundle* ib) {
-  size_t bitdepth;
-  bool float_in;
-  JXL_RETURN_IF_ERROR(
-      PixelFormatToExternal(pixel_format, &bitdepth, &float_in));
-
+  size_t bitdepth = JxlDataTypeBytes(pixel_format.data_type) * kBitsPerByte;
   JXL_RETURN_IF_ERROR(ConvertFromExternal(
-      jxl::Span<const uint8_t>(static_cast<const uint8_t*>(buffer), size),
-      xsize, ysize, c_current, pixel_format.num_channels,
-      /*alpha_is_premultiplied=*/false, bitdepth, pixel_format.endianness, pool,
-      ib, float_in, pixel_format.align));
+      jxl::Bytes(static_cast<const uint8_t*>(buffer), size), xsize, ysize,
+      c_current, bitdepth, pixel_format, pool, ib));
   ib->VerifyMetadata();
 
   return true;
index 73b7175..d2968a0 100644 (file)
@@ -8,14 +8,12 @@
 
 // Interleaved image for color transforms and Codec.
 
+#include <jxl/types.h>
 #include <stddef.h>
 #include <stdint.h>
 
-#include "jxl/types.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 namespace jxl {
 Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
                            size_t ysize, size_t bits_per_sample,
-                           JxlEndianness endianness, ThreadPool* pool,
-                           ImageF* channel, bool float_in, size_t align);
+                           JxlPixelFormat format, size_t c, ThreadPool* pool,
+                           ImageF* channel);
 
 // Convert an interleaved pixel buffer to the internal ImageBundle
 // representation. This is the opposite of ConvertToExternal().
 Status ConvertFromExternal(Span<const uint8_t> bytes, size_t xsize,
                            size_t ysize, const ColorEncoding& c_current,
-                           size_t channels, bool alpha_is_premultiplied,
-                           size_t bits_per_sample, JxlEndianness endianness,
-                           ThreadPool* pool, ImageBundle* ib, bool float_in,
-                           size_t align);
+                           size_t bits_per_sample, JxlPixelFormat format,
+                           ThreadPool* pool, ImageBundle* ib);
 Status BufferToImageF(const JxlPixelFormat& pixel_format, size_t xsize,
                       size_t ysize, const void* buffer, size_t size,
                       ThreadPool* pool, ImageF* channel);
index a123d4b..64e9cf6 100644 (file)
@@ -21,17 +21,14 @@ void BM_EncExternalImage_ConvertImageRGBA(benchmark::State& state) {
   ImageBundle ib(&im);
 
   std::vector<uint8_t> interleaved(xsize * ysize * 4);
-
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
   for (auto _ : state) {
     for (size_t i = 0; i < kNumIter; ++i) {
       JXL_CHECK(ConvertFromExternal(
-          Span<const uint8_t>(interleaved.data(), interleaved.size()), xsize,
-          ysize,
+          Bytes(interleaved.data(), interleaved.size()), xsize, ysize,
           /*c_current=*/ColorEncoding::SRGB(),
-          /*channels=*/4,
-          /*alpha_is_premultiplied=*/false,
-          /*bits_per_sample=*/8, JXL_NATIVE_ENDIAN,
-          /*pool=*/nullptr, &ib, /*float_in=*/false, /*align=*/0));
+          /*bits_per_sample=*/8, format,
+          /*pool=*/nullptr, &ib));
     }
   }
 
index 2c5fa5a..de2e15e 100644 (file)
@@ -8,13 +8,12 @@
 #include <array>
 #include <new>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -25,23 +24,20 @@ TEST(ExternalImageTest, InvalidSize) {
   im.SetAlphaBits(8);
   ImageBundle ib(&im);
 
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
   const uint8_t buf[10 * 100 * 8] = {};
+  EXPECT_FALSE(ConvertFromExternal(Bytes(buf, 10), /*xsize=*/10, /*ysize=*/100,
+                                   /*c_current=*/ColorEncoding::SRGB(),
+                                   /*bits_per_sample=*/16, format, nullptr,
+                                   &ib));
   EXPECT_FALSE(ConvertFromExternal(
-      Span<const uint8_t>(buf, 10), /*xsize=*/10, /*ysize=*/100,
-      /*c_current=*/ColorEncoding::SRGB(), /*channels=*/4,
-      /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
-      nullptr, &ib, /*float_in=*/false, /*align=*/0));
-  EXPECT_FALSE(ConvertFromExternal(
-      Span<const uint8_t>(buf, sizeof(buf) - 1), /*xsize=*/10, /*ysize=*/100,
-      /*c_current=*/ColorEncoding::SRGB(), /*channels=*/4,
-      /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
-      nullptr, &ib, /*float_in=*/false, /*align=*/0));
+      Bytes(buf, sizeof(buf) - 1), /*xsize=*/10, /*ysize=*/100,
+      /*c_current=*/ColorEncoding::SRGB(),
+      /*bits_per_sample=*/16, format, nullptr, &ib));
   EXPECT_TRUE(
-      ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), /*xsize=*/10,
+      ConvertFromExternal(Bytes(buf, sizeof(buf)), /*xsize=*/10,
                           /*ysize=*/100, /*c_current=*/ColorEncoding::SRGB(),
-                          /*channels=*/4, /*alpha_is_premultiplied=*/false,
-                          /*bits_per_sample=*/16, JXL_BIG_ENDIAN, nullptr, &ib,
-                          /*float_in=*/false, /*align=*/0));
+                          /*bits_per_sample=*/16, format, nullptr, &ib));
 }
 #endif
 
@@ -54,16 +50,29 @@ TEST(ExternalImageTest, AlphaMissing) {
   const size_t ysize = 20;
   const uint8_t buf[xsize * ysize * 4] = {};
 
+  JxlPixelFormat format = {4, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
   // has_alpha is true but the ImageBundle has no alpha. Alpha channel should
   // be ignored.
-  EXPECT_TRUE(
-      ConvertFromExternal(Span<const uint8_t>(buf, sizeof(buf)), xsize, ysize,
-                          /*c_current=*/ColorEncoding::SRGB(),
-                          /*channels=*/4, /*alpha_is_premultiplied=*/false,
-                          /*bits_per_sample=*/8, JXL_BIG_ENDIAN, nullptr, &ib,
-                          /*float_in=*/false, /*align=*/0));
+  EXPECT_TRUE(ConvertFromExternal(Bytes(buf, sizeof(buf)), xsize, ysize,
+                                  /*c_current=*/ColorEncoding::SRGB(),
+                                  /*bits_per_sample=*/8, format, nullptr, &ib));
   EXPECT_FALSE(ib.HasAlpha());
 }
 
+TEST(ExternalImageTest, AlphaPremultiplied) {
+  ImageMetadata im;
+  im.SetAlphaBits(8, true);
+
+  ImageBundle ib(&im);
+  const size_t xsize = 10;
+  const size_t ysize = 20;
+  const size_t size = xsize * ysize * 8;
+  const uint8_t buf[size] = {};
+
+  JxlPixelFormat format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  EXPECT_TRUE(BufferToImageBundle(format, xsize, ysize, buf, size, nullptr,
+                                  ColorEncoding::SRGB(), &ib));
+}
+
 }  // namespace
 }  // namespace jxl
diff --git a/lib/jxl/enc_fast_lossless.cc b/lib/jxl/enc_fast_lossless.cc
new file mode 100644 (file)
index 0000000..90cd37d
--- /dev/null
@@ -0,0 +1,3877 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef FJXL_SELF_INCLUDE
+
+#include "lib/jxl/enc_fast_lossless.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <memory>
+#include <vector>
+
+// Enable NEON and AVX2/AVX512 if not asked to do otherwise and the compilers
+// support it.
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include <arm_neon.h>
+
+#ifndef FJXL_ENABLE_NEON
+#define FJXL_ENABLE_NEON 1
+#endif
+
+#elif (defined(__x86_64__) || defined(_M_X64)) && !defined(_MSC_VER)
+#include <immintrin.h>
+
+// manually add _mm512_cvtsi512_si32 definition if missing
+// (e.g. with Xcode on macOS Mojave)
+// copied from gcc 11.1.0 include/avx512fintrin.h line 14367-14373
+#if defined(__clang__) &&                                           \
+    ((!defined(__apple_build_version__) && __clang_major__ < 10) || \
+     (defined(__apple_build_version__) && __apple_build_version__ < 12000032))
+inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsi512_si32(__m512i __A) {
+  __v16si __B = (__v16si)__A;
+  return __B[0];
+}
+#endif
+
+// TODO(veluca): MSVC support for dynamic dispatch.
+#if defined(__clang__) || defined(__GNUC__)
+
+#ifndef FJXL_ENABLE_AVX2
+#define FJXL_ENABLE_AVX2 1
+#endif
+
+#ifndef FJXL_ENABLE_AVX512
+// On clang-7 or earlier, and gcc-10 or earlier, AVX512 seems broken.
+#if (defined(__clang__) &&                                             \
+         (!defined(__apple_build_version__) && __clang_major__ > 7) || \
+     (defined(__apple_build_version__) &&                              \
+      __apple_build_version__ > 10010046)) ||                          \
+    (defined(__GNUC__) && __GNUC__ > 10)
+#define FJXL_ENABLE_AVX512 1
+#endif
+#endif
+
+#endif
+
+#endif
+
+#ifndef FJXL_ENABLE_NEON
+#define FJXL_ENABLE_NEON 0
+#endif
+
+#ifndef FJXL_ENABLE_AVX2
+#define FJXL_ENABLE_AVX2 0
+#endif
+
+#ifndef FJXL_ENABLE_AVX512
+#define FJXL_ENABLE_AVX512 0
+#endif
+
+namespace {
+#if defined(_MSC_VER) && !defined(__clang__)
+#define FJXL_INLINE __forceinline
+FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
+  unsigned long index;
+  _BitScanReverse(&index, v);
+  return index;
+}
+FJXL_INLINE uint32_t CtzNonZero(uint64_t v) {
+  unsigned long index;
+  _BitScanForward(&index, v);
+  return index;
+}
+#else
+#define FJXL_INLINE inline __attribute__((always_inline))
+FJXL_INLINE uint32_t FloorLog2(uint32_t v) {
+  return v ? 31 - __builtin_clz(v) : 0;
+}
+FJXL_INLINE uint32_t CtzNonZero(uint64_t v) { return __builtin_ctzll(v); }
+#endif
+
+// Compiles to a memcpy on little-endian systems.
+FJXL_INLINE void StoreLE64(uint8_t* tgt, uint64_t data) {
+#if (!defined(__BYTE_ORDER__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__))
+  for (int i = 0; i < 8; i++) {
+    tgt[i] = (data >> (i * 8)) & 0xFF;
+  }
+#else
+  memcpy(tgt, &data, 8);
+#endif
+}
+
+FJXL_INLINE size_t AddBits(uint32_t count, uint64_t bits, uint8_t* data_buf,
+                           size_t& bits_in_buffer, uint64_t& bit_buffer) {
+  bit_buffer |= bits << bits_in_buffer;
+  bits_in_buffer += count;
+  StoreLE64(data_buf, bit_buffer);
+  size_t bytes_in_buffer = bits_in_buffer / 8;
+  bits_in_buffer -= bytes_in_buffer * 8;
+  bit_buffer >>= bytes_in_buffer * 8;
+  return bytes_in_buffer;
+}
+
+struct BitWriter {
+  void Allocate(size_t maximum_bit_size) {
+    assert(data == nullptr);
+    // Leave some padding.
+    data.reset(static_cast<uint8_t*>(malloc(maximum_bit_size / 8 + 64)));
+  }
+
+  void Write(uint32_t count, uint64_t bits) {
+    bytes_written += AddBits(count, bits, data.get() + bytes_written,
+                             bits_in_buffer, buffer);
+  }
+
+  void ZeroPadToByte() {
+    if (bits_in_buffer != 0) {
+      Write(8 - bits_in_buffer, 0);
+    }
+  }
+
+  FJXL_INLINE void WriteMultiple(const uint64_t* nbits, const uint64_t* bits,
+                                 size_t n) {
+    // Necessary because Write() is only guaranteed to work with <=56 bits.
+    // Trying to SIMD-fy this code results in lower speed (and definitely less
+    // clarity).
+    {
+      for (size_t i = 0; i < n; i++) {
+        this->buffer |= bits[i] << this->bits_in_buffer;
+        memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
+        uint64_t shift = 64 - this->bits_in_buffer;
+        this->bits_in_buffer += nbits[i];
+        // This `if` seems to be faster than using ternaries.
+        if (this->bits_in_buffer >= 64) {
+          uint64_t next_buffer = bits[i] >> shift;
+          this->buffer = next_buffer;
+          this->bits_in_buffer -= 64;
+          this->bytes_written += 8;
+        }
+      }
+      memcpy(this->data.get() + this->bytes_written, &this->buffer, 8);
+      size_t bytes_in_buffer = this->bits_in_buffer / 8;
+      this->bits_in_buffer -= bytes_in_buffer * 8;
+      this->buffer >>= bytes_in_buffer * 8;
+      this->bytes_written += bytes_in_buffer;
+    }
+  }
+
+  std::unique_ptr<uint8_t[], void (*)(void*)> data = {nullptr, free};
+  size_t bytes_written = 0;
+  size_t bits_in_buffer = 0;
+  uint64_t buffer = 0;
+};
+
+}  // namespace
+
+extern "C" {
+
+struct JxlFastLosslessFrameState {
+  size_t width;
+  size_t height;
+  size_t nb_chans;
+  size_t bitdepth;
+  BitWriter header;
+  std::vector<std::array<BitWriter, 4>> group_data;
+  size_t current_bit_writer = 0;
+  size_t bit_writer_byte_pos = 0;
+  size_t bits_in_buffer = 0;
+  uint64_t bit_buffer = 0;
+};
+
+size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame) {
+  size_t total_size_groups = 0;
+  for (size_t i = 0; i < frame->group_data.size(); i++) {
+    size_t sz = 0;
+    for (size_t j = 0; j < frame->nb_chans; j++) {
+      const auto& writer = frame->group_data[i][j];
+      sz += writer.bytes_written * 8 + writer.bits_in_buffer;
+    }
+    sz = (sz + 7) / 8;
+    total_size_groups += sz;
+  }
+  return frame->header.bytes_written + total_size_groups;
+}
+
+size_t JxlFastLosslessMaxRequiredOutput(
+    const JxlFastLosslessFrameState* frame) {
+  return JxlFastLosslessOutputSize(frame) + 32;
+}
+
+void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
+                                  int add_image_header, int is_last) {
+  BitWriter* output = &frame->header;
+  output->Allocate(1000 + frame->group_data.size() * 32);
+
+  std::vector<size_t> group_sizes(frame->group_data.size());
+  for (size_t i = 0; i < frame->group_data.size(); i++) {
+    size_t sz = 0;
+    for (size_t j = 0; j < frame->nb_chans; j++) {
+      const auto& writer = frame->group_data[i][j];
+      sz += writer.bytes_written * 8 + writer.bits_in_buffer;
+    }
+    sz = (sz + 7) / 8;
+    group_sizes[i] = sz;
+  }
+
+  bool have_alpha = (frame->nb_chans == 2 || frame->nb_chans == 4);
+
+#if FJXL_STANDALONE
+  if (add_image_header) {
+    // Signature
+    output->Write(16, 0x0AFF);
+
+    // Size header, hand-crafted.
+    // Not small
+    output->Write(1, 0);
+
+    auto wsz = [output](size_t size) {
+      if (size - 1 < (1 << 9)) {
+        output->Write(2, 0b00);
+        output->Write(9, size - 1);
+      } else if (size - 1 < (1 << 13)) {
+        output->Write(2, 0b01);
+        output->Write(13, size - 1);
+      } else if (size - 1 < (1 << 18)) {
+        output->Write(2, 0b10);
+        output->Write(18, size - 1);
+      } else {
+        output->Write(2, 0b11);
+        output->Write(30, size - 1);
+      }
+    };
+
+    wsz(frame->height);
+
+    // No special ratio.
+    output->Write(3, 0);
+
+    wsz(frame->width);
+
+    // Hand-crafted ImageMetadata.
+    output->Write(1, 0);  // all_default
+    output->Write(1, 0);  // extra_fields
+    output->Write(1, 0);  // bit_depth.floating_point_sample
+    if (frame->bitdepth == 8) {
+      output->Write(2, 0b00);  // bit_depth.bits_per_sample = 8
+    } else if (frame->bitdepth == 10) {
+      output->Write(2, 0b01);  // bit_depth.bits_per_sample = 10
+    } else if (frame->bitdepth == 12) {
+      output->Write(2, 0b10);  // bit_depth.bits_per_sample = 12
+    } else {
+      output->Write(2, 0b11);  // 1 + u(6)
+      output->Write(6, frame->bitdepth - 1);
+    }
+    if (frame->bitdepth <= 14) {
+      output->Write(1, 1);  // 16-bit-buffer sufficient
+    } else {
+      output->Write(1, 0);  // 16-bit-buffer NOT sufficient
+    }
+    if (have_alpha) {
+      output->Write(2, 0b01);  // One extra channel
+      output->Write(1, 1);     // ... all_default (ie. 8-bit alpha)
+    } else {
+      output->Write(2, 0b00);  // No extra channel
+    }
+    output->Write(1, 0);  // Not XYB
+    if (frame->nb_chans > 2) {
+      output->Write(1, 1);  // color_encoding.all_default (sRGB)
+    } else {
+      output->Write(1, 0);     // color_encoding.all_default false
+      output->Write(1, 0);     // color_encoding.want_icc false
+      output->Write(2, 1);     // grayscale
+      output->Write(2, 1);     // D65
+      output->Write(1, 0);     // no gamma transfer function
+      output->Write(2, 0b10);  // tf: 2 + u(4)
+      output->Write(4, 11);    // tf of sRGB
+      output->Write(2, 1);     // relative rendering intent
+    }
+    output->Write(2, 0b00);  // No extensions.
+
+    output->Write(1, 1);  // all_default transform data
+
+    // No ICC, no preview. Frame should start at byte boundery.
+    output->ZeroPadToByte();
+  }
+#else
+  assert(!add_image_header);
+#endif
+
+  // Handcrafted frame header.
+  output->Write(1, 0);     // all_default
+  output->Write(2, 0b00);  // regular frame
+  output->Write(1, 1);     // modular
+  output->Write(2, 0b00);  // default flags
+  output->Write(1, 0);     // not YCbCr
+  output->Write(2, 0b00);  // no upsampling
+  if (have_alpha) {
+    output->Write(2, 0b00);  // no alpha upsampling
+  }
+  output->Write(2, 0b01);  // default group size
+  output->Write(2, 0b00);  // exactly one pass
+  output->Write(1, 0);     // no custom size or origin
+  output->Write(2, 0b00);  // kReplace blending mode
+  if (have_alpha) {
+    output->Write(2, 0b00);  // kReplace blending mode for alpha channel
+  }
+  output->Write(1, is_last);  // is_last
+  output->Write(2, 0b00);     // a frame has no name
+  output->Write(1, 0);        // loop filter is not all_default
+  output->Write(1, 0);        // no gaborish
+  output->Write(2, 0);        // 0 EPF iters
+  output->Write(2, 0b00);     // No LF extensions
+  output->Write(2, 0b00);     // No FH extensions
+
+  output->Write(1, 0);      // No TOC permutation
+  output->ZeroPadToByte();  // TOC is byte-aligned.
+  for (size_t i = 0; i < frame->group_data.size(); i++) {
+    size_t sz = group_sizes[i];
+    if (sz < (1 << 10)) {
+      output->Write(2, 0b00);
+      output->Write(10, sz);
+    } else if (sz - 1024 < (1 << 14)) {
+      output->Write(2, 0b01);
+      output->Write(14, sz - 1024);
+    } else if (sz - 17408 < (1 << 22)) {
+      output->Write(2, 0b10);
+      output->Write(22, sz - 17408);
+    } else {
+      output->Write(2, 0b11);
+      output->Write(30, sz - 4211712);
+    }
+  }
+  output->ZeroPadToByte();  // Groups are byte-aligned.
+}
+
+#if FJXL_ENABLE_AVX512
+__attribute__((target("avx512vbmi2"))) static size_t AppendBytesWithBitOffset(
+    const uint8_t* data, size_t n, size_t bit_buffer_nbits,
+    unsigned char* output, uint64_t& bit_buffer) {
+  if (n < 128) {
+    return 0;
+  }
+
+  size_t i = 0;
+  __m512i shift = _mm512_set1_epi64(64 - bit_buffer_nbits);
+  __m512i carry = _mm512_set1_epi64(bit_buffer << (64 - bit_buffer_nbits));
+
+  for (; i + 64 <= n; i += 64) {
+    __m512i current = _mm512_loadu_si512(data + i);
+    __m512i previous_u64 = _mm512_alignr_epi64(current, carry, 7);
+    carry = current;
+    __m512i out = _mm512_shrdv_epi64(previous_u64, current, shift);
+    _mm512_storeu_si512(output + i, out);
+  }
+
+  bit_buffer = data[i - 1] >> (8 - bit_buffer_nbits);
+
+  return i;
+}
+#endif
+
+size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
+                                  unsigned char* output, size_t output_size) {
+  assert(output_size >= 32);
+  unsigned char* initial_output = output;
+  size_t (*append_bytes_with_bit_offset)(const uint8_t*, size_t, size_t,
+                                         unsigned char*, uint64_t&) = nullptr;
+
+#if FJXL_ENABLE_AVX512
+  if (__builtin_cpu_supports("avx512vbmi2")) {
+    append_bytes_with_bit_offset = AppendBytesWithBitOffset;
+  }
+#endif
+
+  while (true) {
+    size_t& cur = frame->current_bit_writer;
+    size_t& bw_pos = frame->bit_writer_byte_pos;
+    if (cur >= 1 + frame->group_data.size() * frame->nb_chans) {
+      return output - initial_output;
+    }
+    if (output_size <= 8) {
+      return output - initial_output;
+    }
+    size_t nbc = frame->nb_chans;
+    const BitWriter& writer =
+        cur == 0 ? frame->header
+                 : frame->group_data[(cur - 1) / nbc][(cur - 1) % nbc];
+    size_t full_byte_count =
+        std::min(output_size - 8, writer.bytes_written - bw_pos);
+    if (frame->bits_in_buffer == 0) {
+      memcpy(output, writer.data.get() + bw_pos, full_byte_count);
+    } else {
+      size_t i = 0;
+      if (append_bytes_with_bit_offset) {
+        i += append_bytes_with_bit_offset(
+            writer.data.get() + bw_pos, full_byte_count, frame->bits_in_buffer,
+            output, frame->bit_buffer);
+      }
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      // Copy 8 bytes at a time until we reach the border.
+      for (; i + 8 < full_byte_count; i += 8) {
+        uint64_t chunk;
+        memcpy(&chunk, writer.data.get() + bw_pos + i, 8);
+        uint64_t out = frame->bit_buffer | (chunk << frame->bits_in_buffer);
+        memcpy(output + i, &out, 8);
+        frame->bit_buffer = chunk >> (64 - frame->bits_in_buffer);
+      }
+#endif
+      for (; i < full_byte_count; i++) {
+        AddBits(8, writer.data.get()[bw_pos + i], output + i,
+                frame->bits_in_buffer, frame->bit_buffer);
+      }
+    }
+    output += full_byte_count;
+    output_size -= full_byte_count;
+    bw_pos += full_byte_count;
+    if (bw_pos == writer.bytes_written) {
+      auto write = [&](size_t num, uint64_t bits) {
+        size_t n = AddBits(num, bits, output, frame->bits_in_buffer,
+                           frame->bit_buffer);
+        output += n;
+        output_size -= n;
+      };
+      if (writer.bits_in_buffer) {
+        write(writer.bits_in_buffer, writer.buffer);
+      }
+      bw_pos = 0;
+      cur++;
+      if ((cur - 1) % nbc == 0 && frame->bits_in_buffer != 0) {
+        write(8 - frame->bits_in_buffer, 0);
+      }
+    }
+  }
+}
+
+void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame) {
+  delete frame;
+}
+
+}  // extern "C"
+
+#endif
+
+#ifdef FJXL_SELF_INCLUDE
+
+namespace {
+
+constexpr size_t kNumRawSymbols = 19;
+constexpr size_t kNumLZ77 = 33;
+constexpr size_t kLZ77CacheSize = 32;
+
+constexpr size_t kLZ77Offset = 224;
+constexpr size_t kLZ77MinLength = 7;
+
+void EncodeHybridUintLZ77(uint32_t value, uint32_t* token, uint32_t* nbits,
+                          uint32_t* bits) {
+  // 400 config
+  uint32_t n = FloorLog2(value);
+  *token = value < 16 ? value : 16 + n - 4;
+  *nbits = value < 16 ? 0 : n;
+  *bits = value < 16 ? 0 : value - (1 << *nbits);
+}
+
+struct PrefixCode {
+  uint8_t raw_nbits[kNumRawSymbols] = {};
+  uint8_t raw_bits[kNumRawSymbols] = {};
+
+  alignas(64) uint8_t raw_nbits_simd[16] = {};
+  alignas(64) uint8_t raw_bits_simd[16] = {};
+
+  uint8_t lz77_nbits[kNumLZ77] = {};
+  uint16_t lz77_bits[kNumLZ77] = {};
+
+  uint64_t lz77_cache_bits[kLZ77CacheSize] = {};
+  uint8_t lz77_cache_nbits[kLZ77CacheSize] = {};
+
+  static uint16_t BitReverse(size_t nbits, uint16_t bits) {
+    constexpr uint16_t kNibbleLookup[16] = {
+        0b0000, 0b1000, 0b0100, 0b1100, 0b0010, 0b1010, 0b0110, 0b1110,
+        0b0001, 0b1001, 0b0101, 0b1101, 0b0011, 0b1011, 0b0111, 0b1111,
+    };
+    uint16_t rev16 = (kNibbleLookup[bits & 0xF] << 12) |
+                     (kNibbleLookup[(bits >> 4) & 0xF] << 8) |
+                     (kNibbleLookup[(bits >> 8) & 0xF] << 4) |
+                     (kNibbleLookup[bits >> 12]);
+    return rev16 >> (16 - nbits);
+  }
+
+  // Create the prefix codes given the code lengths.
+  // Supports the code lengths being split into two halves.
+  static void ComputeCanonicalCode(const uint8_t* first_chunk_nbits,
+                                   uint8_t* first_chunk_bits,
+                                   size_t first_chunk_size,
+                                   const uint8_t* second_chunk_nbits,
+                                   uint16_t* second_chunk_bits,
+                                   size_t second_chunk_size) {
+    constexpr size_t kMaxCodeLength = 15;
+    uint8_t code_length_counts[kMaxCodeLength + 1] = {};
+    for (size_t i = 0; i < first_chunk_size; i++) {
+      code_length_counts[first_chunk_nbits[i]]++;
+      assert(first_chunk_nbits[i] <= kMaxCodeLength);
+      assert(first_chunk_nbits[i] <= 8);
+      assert(first_chunk_nbits[i] > 0);
+    }
+    for (size_t i = 0; i < second_chunk_size; i++) {
+      code_length_counts[second_chunk_nbits[i]]++;
+      assert(second_chunk_nbits[i] <= kMaxCodeLength);
+    }
+
+    uint16_t next_code[kMaxCodeLength + 1] = {};
+
+    uint16_t code = 0;
+    for (size_t i = 1; i < kMaxCodeLength + 1; i++) {
+      code = (code + code_length_counts[i - 1]) << 1;
+      next_code[i] = code;
+    }
+
+    for (size_t i = 0; i < first_chunk_size; i++) {
+      first_chunk_bits[i] =
+          BitReverse(first_chunk_nbits[i], next_code[first_chunk_nbits[i]]++);
+    }
+    for (size_t i = 0; i < second_chunk_size; i++) {
+      second_chunk_bits[i] =
+          BitReverse(second_chunk_nbits[i], next_code[second_chunk_nbits[i]]++);
+    }
+  }
+
+  template <typename T>
+  static void ComputeCodeLengthsNonZeroImpl(const uint64_t* freqs, size_t n,
+                                            size_t precision, T infty,
+                                            uint8_t* min_limit,
+                                            uint8_t* max_limit,
+                                            uint8_t* nbits) {
+    std::vector<T> dynp(((1U << precision) + 1) * (n + 1), infty);
+    auto d = [&](size_t sym, size_t off) -> T& {
+      return dynp[sym * ((1 << precision) + 1) + off];
+    };
+    d(0, 0) = 0;
+    for (size_t sym = 0; sym < n; sym++) {
+      for (T bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
+        size_t off_delta = 1U << (precision - bits);
+        for (size_t off = 0; off + off_delta <= (1U << precision); off++) {
+          d(sym + 1, off + off_delta) =
+              std::min(d(sym, off) + static_cast<T>(freqs[sym]) * bits,
+                       d(sym + 1, off + off_delta));
+        }
+      }
+    }
+
+    size_t sym = n;
+    size_t off = 1U << precision;
+
+    assert(d(sym, off) != infty);
+
+    while (sym-- > 0) {
+      assert(off > 0);
+      for (size_t bits = min_limit[sym]; bits <= max_limit[sym]; bits++) {
+        size_t off_delta = 1U << (precision - bits);
+        if (off_delta <= off &&
+            d(sym + 1, off) == d(sym, off - off_delta) + freqs[sym] * bits) {
+          off -= off_delta;
+          nbits[sym] = bits;
+          break;
+        }
+      }
+    }
+  }
+
+  // Computes nbits[i] for i <= n, subject to min_limit[i] <= nbits[i] <=
+  // max_limit[i] and sum 2**-nbits[i] == 1, so to minimize sum(nbits[i] *
+  // freqs[i]).
+  static void ComputeCodeLengthsNonZero(const uint64_t* freqs, size_t n,
+                                        uint8_t* min_limit, uint8_t* max_limit,
+                                        uint8_t* nbits) {
+    size_t precision = 0;
+    size_t shortest_length = 255;
+    uint64_t freqsum = 0;
+    for (size_t i = 0; i < n; i++) {
+      assert(freqs[i] != 0);
+      freqsum += freqs[i];
+      if (min_limit[i] < 1) min_limit[i] = 1;
+      assert(min_limit[i] <= max_limit[i]);
+      precision = std::max<size_t>(max_limit[i], precision);
+      shortest_length = std::min<size_t>(min_limit[i], shortest_length);
+    }
+    // If all the minimum limits are greater than 1, shift precision so that we
+    // behave as if the shortest was 1.
+    precision -= shortest_length - 1;
+    uint64_t infty = freqsum * precision;
+    if (infty < std::numeric_limits<uint32_t>::max() / 2) {
+      ComputeCodeLengthsNonZeroImpl(freqs, n, precision,
+                                    static_cast<uint32_t>(infty), min_limit,
+                                    max_limit, nbits);
+    } else {
+      ComputeCodeLengthsNonZeroImpl(freqs, n, precision, infty, min_limit,
+                                    max_limit, nbits);
+    }
+  }
+
+  static constexpr size_t kMaxNumSymbols =
+      kNumRawSymbols + 1 < kNumLZ77 ? kNumLZ77 : kNumRawSymbols + 1;
+  static void ComputeCodeLengths(const uint64_t* freqs, size_t n,
+                                 const uint8_t* min_limit_in,
+                                 const uint8_t* max_limit_in, uint8_t* nbits) {
+    assert(n <= kMaxNumSymbols);
+    uint64_t compact_freqs[kMaxNumSymbols];
+    uint8_t min_limit[kMaxNumSymbols];
+    uint8_t max_limit[kMaxNumSymbols];
+    size_t ni = 0;
+    for (size_t i = 0; i < n; i++) {
+      if (freqs[i]) {
+        compact_freqs[ni] = freqs[i];
+        min_limit[ni] = min_limit_in[i];
+        max_limit[ni] = max_limit_in[i];
+        ni++;
+      }
+    }
+    uint8_t num_bits[kMaxNumSymbols] = {};
+    ComputeCodeLengthsNonZero(compact_freqs, ni, min_limit, max_limit,
+                              num_bits);
+    ni = 0;
+    for (size_t i = 0; i < n; i++) {
+      nbits[i] = 0;
+      if (freqs[i]) {
+        nbits[i] = num_bits[ni++];
+      }
+    }
+  }
+
+  // Invalid code, used to construct arrays.
+  PrefixCode() {}
+
+  template <typename BitDepth>
+  PrefixCode(BitDepth, uint64_t* raw_counts, uint64_t* lz77_counts) {
+    // "merge" together all the lz77 counts in a single symbol for the level 1
+    // table (containing just the raw symbols, up to length 7).
+    uint64_t level1_counts[kNumRawSymbols + 1];
+    memcpy(level1_counts, raw_counts, kNumRawSymbols * sizeof(uint64_t));
+    size_t numraw = kNumRawSymbols;
+    while (numraw > 0 && level1_counts[numraw - 1] == 0) numraw--;
+
+    level1_counts[numraw] = 0;
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      level1_counts[numraw] += lz77_counts[i];
+    }
+    uint8_t level1_nbits[kNumRawSymbols + 1] = {};
+    ComputeCodeLengths(level1_counts, numraw + 1, BitDepth::kMinRawLength,
+                       BitDepth::kMaxRawLength, level1_nbits);
+
+    uint8_t level2_nbits[kNumLZ77] = {};
+    uint8_t min_lengths[kNumLZ77] = {};
+    uint8_t l = 15 - level1_nbits[numraw];
+    uint8_t max_lengths[kNumLZ77];
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      max_lengths[i] = l;
+    }
+    size_t num_lz77 = kNumLZ77;
+    while (num_lz77 > 0 && lz77_counts[num_lz77 - 1] == 0) num_lz77--;
+    ComputeCodeLengths(lz77_counts, num_lz77, min_lengths, max_lengths,
+                       level2_nbits);
+    for (size_t i = 0; i < numraw; i++) {
+      raw_nbits[i] = level1_nbits[i];
+    }
+    for (size_t i = 0; i < num_lz77; i++) {
+      lz77_nbits[i] =
+          level2_nbits[i] ? level1_nbits[numraw] + level2_nbits[i] : 0;
+    }
+
+    ComputeCanonicalCode(raw_nbits, raw_bits, numraw, lz77_nbits, lz77_bits,
+                         kNumLZ77);
+    BitDepth::PrepareForSimd(raw_nbits, raw_bits, numraw, raw_nbits_simd,
+                             raw_bits_simd);
+
+    // Prepare lz77 cache
+    for (size_t count = 0; count < kLZ77CacheSize; count++) {
+      unsigned token, nbits, bits;
+      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
+      lz77_cache_nbits[count] = lz77_nbits[token] + nbits + raw_nbits[0];
+      lz77_cache_bits[count] =
+          (((bits << lz77_nbits[token]) | lz77_bits[token]) << raw_nbits[0]) |
+          raw_bits[0];
+    }
+  }
+
+  void WriteTo(BitWriter* writer) const {
+    uint64_t code_length_counts[18] = {};
+    code_length_counts[17] = 3 + 2 * (kNumLZ77 - 1);
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      code_length_counts[raw_nbits[i]]++;
+    }
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      code_length_counts[lz77_nbits[i]]++;
+    }
+    uint8_t code_length_nbits[18] = {};
+    uint8_t code_length_nbits_min[18] = {};
+    uint8_t code_length_nbits_max[18] = {
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    };
+    ComputeCodeLengths(code_length_counts, 18, code_length_nbits_min,
+                       code_length_nbits_max, code_length_nbits);
+    writer->Write(2, 0b00);  // HSKIP = 0, i.e. don't skip code lengths.
+
+    // As per Brotli RFC.
+    uint8_t code_length_order[18] = {1, 2, 3, 4,  0,  5,  17, 6,  16,
+                                     7, 8, 9, 10, 11, 12, 13, 14, 15};
+    uint8_t code_length_length_nbits[] = {2, 4, 3, 2, 2, 4};
+    uint8_t code_length_length_bits[] = {0, 7, 3, 2, 1, 15};
+
+    // Encode lengths of code lengths.
+    size_t num_code_lengths = 18;
+    while (code_length_nbits[code_length_order[num_code_lengths - 1]] == 0) {
+      num_code_lengths--;
+    }
+    for (size_t i = 0; i < num_code_lengths; i++) {
+      int symbol = code_length_nbits[code_length_order[i]];
+      writer->Write(code_length_length_nbits[symbol],
+                    code_length_length_bits[symbol]);
+    }
+
+    // Compute the canonical codes for the codes that represent the lengths of
+    // the actual codes for data.
+    uint16_t code_length_bits[18] = {};
+    ComputeCanonicalCode(nullptr, nullptr, 0, code_length_nbits,
+                         code_length_bits, 18);
+    // Encode raw bit code lengths.
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      writer->Write(code_length_nbits[raw_nbits[i]],
+                    code_length_bits[raw_nbits[i]]);
+    }
+    size_t num_lz77 = kNumLZ77;
+    while (lz77_nbits[num_lz77 - 1] == 0) {
+      num_lz77--;
+    }
+    // Encode 0s until 224 (start of LZ77 symbols). This is in total 224-19 =
+    // 205.
+    static_assert(kLZ77Offset == 224, "");
+    static_assert(kNumRawSymbols == 19, "");
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b010);  // 5
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b000);  // (5-2)*8 + 3 = 27
+    writer->Write(code_length_nbits[17], code_length_bits[17]);
+    writer->Write(3, 0b010);  // (27-2)*8 + 5 = 205
+    // Encode LZ77 symbols, with values 224+i.
+    for (size_t i = 0; i < num_lz77; i++) {
+      writer->Write(code_length_nbits[lz77_nbits[i]],
+                    code_length_bits[lz77_nbits[i]]);
+    }
+  }
+};
+
+template <typename T>
+struct VecPair {
+  T low;
+  T hi;
+};
+
+#ifdef FJXL_GENERIC_SIMD
+#undef FJXL_GENERIC_SIMD
+#endif
+
+#ifdef FJXL_AVX512
+#define FJXL_GENERIC_SIMD
+struct SIMDVec32;
+struct Mask32 {
+  __mmask16 mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
+  size_t CountPrefix() const {
+    return CtzNonZero(~uint64_t{_cvtmask16_u32(mask)});
+  }
+};
+
+struct SIMDVec32 {
+  __m512i vec;
+
+  static constexpr size_t kLanes = 16;
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
+    return SIMDVec32{_mm512_loadu_si512((__m512i*)data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) {
+    _mm512_storeu_si512((__m512i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
+    return SIMDVec32{_mm512_set1_epi32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {
+    return SIMDVec32{
+        _mm512_sub_epi32(_mm512_set1_epi32(32), _mm512_lzcnt_epi32(vec))};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm512_sub_epi32(_mm512_max_epu32(vec, to_subtract.vec),
+                                      to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm512_sub_epi32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm512_add_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm512_xor_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
+    return Mask32{_mm512_cmpeq_epi32_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
+    return Mask32{_mm512_cmpgt_epi32_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {
+    return SIMDVec32{_mm512_sllv_epi32(_mm512_set1_epi32(1), vec)};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
+    return SIMDVec32{_mm512_srai_epi32(vec, i)};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {
+  __mmask32 mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
+  Mask16 And(const Mask16& oth) const {
+    return Mask16{_kand_mask32(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    return CtzNonZero(~uint64_t{_cvtmask32_u32(mask)});
+  }
+};
+
+struct SIMDVec16 {
+  __m512i vec;
+
+  static constexpr size_t kLanes = 32;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{_mm512_loadu_si512((__m512i*)data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) {
+    _mm512_storeu_si512((__m512i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
+    return SIMDVec16{_mm512_set1_epi16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    auto tmp = _mm512_packus_epi32(lo.vec, hi.vec);
+    alignas(64) uint64_t perm[8] = {0, 2, 4, 6, 1, 3, 5, 7};
+    return SIMDVec16{
+        _mm512_permutex2var_epi64(tmp, _mm512_load_si512((__m512i*)perm), tmp)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {
+    auto c16 = _mm512_set1_epi32(16);
+    auto c32 = _mm512_set1_epi32(32);
+    auto low16bit = _mm512_set1_epi32(0x0000FFFF);
+    auto lzhi =
+        _mm512_sub_epi32(c16, _mm512_min_epu32(c16, _mm512_lzcnt_epi32(vec)));
+    auto lzlo = _mm512_sub_epi32(
+        c32, _mm512_lzcnt_epi32(_mm512_and_si512(low16bit, vec)));
+    return SIMDVec16{_mm512_or_si512(lzlo, _mm512_slli_epi32(lzhi, 16))};
+  }
+
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm512_subs_epu16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm512_sub_epi16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_add_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_min_epu16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{_mm512_cmpeq_epi16_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
+    return Mask16{_mm512_cmpgt_epi16_mask(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {
+    return SIMDVec16{_mm512_sllv_epi16(_mm512_set1_epi16(1), vec)};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_or_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_xor_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_and_si512(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm512_srai_epi16(_mm512_add_epi16(vec, oth.vec), 1)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
+    return SIMDVec16{_mm512_or_si512(vec, _mm512_set1_epi16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
+    return SIMDVec16{_mm512_shuffle_epi8(
+        _mm512_broadcast_i32x4(_mm_loadu_si128((__m128i*)table)), vec)};
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
+    auto lo = _mm512_unpacklo_epi16(low.vec, vec);
+    auto hi = _mm512_unpackhi_epi16(low.vec, vec);
+    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
+    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+    return {SIMDVec16{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm1), hi)},
+            SIMDVec16{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
+    auto lo = _mm512_unpacklo_epi16(vec, _mm512_setzero_si512());
+    auto hi = _mm512_unpackhi_epi16(vec, _mm512_setzero_si512());
+    alignas(64) uint64_t perm1[8] = {0, 1, 8, 9, 2, 3, 10, 11};
+    alignas(64) uint64_t perm2[8] = {4, 5, 12, 13, 6, 7, 14, 15};
+    return {SIMDVec32{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm1), hi)},
+            SIMDVec32{_mm512_permutex2var_epi64(
+                lo, _mm512_load_si512((__m512i*)perm2), hi)}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
+    return SIMDVec16{_mm512_srai_epi16(vec, i)};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+    return {SIMDVec16{_mm512_cvtepu8_epi16(bytes)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    __m512i bytes = _mm512_loadu_si512((__m512i*)data);
+    __m512i gray = _mm512_and_si512(bytes, _mm512_set1_epi16(0xFF));
+    __m512i alpha = _mm512_srli_epi16(bytes, 8);
+    return {SIMDVec16{gray}, SIMDVec16{alpha}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i g_mask = _mm512_set1_epi32(0xFFFF);
+    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+    __m512i g = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, g_mask),
+                                        _mm512_and_si512(bytes2, g_mask)));
+    __m512i a = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
+                                        _mm512_srli_epi32(bytes2, 16)));
+    return {SIMDVec16{g}, SIMDVec16{a}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 =
+        _mm512_zextsi256_si512(_mm256_loadu_si256((__m256i*)(data + 64)));
+
+    // 0x7A = element of upper half of second vector = 0 after lookup; still in
+    // the upper half once we add 1 or 2.
+    uint8_t z = 0x7A;
+    __m512i ridx =
+        _mm512_set_epi8(z, 93, z, 90, z, 87, z, 84, z, 81, z, 78, z, 75, z, 72,
+                        z, 69, z, 66, z, 63, z, 60, z, 57, z, 54, z, 51, z, 48,
+                        z, 45, z, 42, z, 39, z, 36, z, 33, z, 30, z, 27, z, 24,
+                        z, 21, z, 18, z, 15, z, 12, z, 9, z, 6, z, 3, z, 0);
+    __m512i gidx = _mm512_add_epi8(ridx, _mm512_set1_epi8(1));
+    __m512i bidx = _mm512_add_epi8(gidx, _mm512_set1_epi8(1));
+    __m512i r = _mm512_permutex2var_epi8(bytes0, ridx, bytes1);
+    __m512i g = _mm512_permutex2var_epi8(bytes0, gidx, bytes1);
+    __m512i b = _mm512_permutex2var_epi8(bytes0, bidx, bytes1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
+
+    __m512i ridx_lo = _mm512_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 63, 60, 57,
+                                       54, 51, 48, 45, 42, 39, 36, 33, 30, 27,
+                                       24, 21, 18, 15, 12, 9, 6, 3, 0);
+    // -1 is such that when adding 1 or 2, we get the correct index for
+    // green/blue.
+    __m512i ridx_hi =
+        _mm512_set_epi16(29, 26, 23, 20, 17, 14, 11, 8, 5, 2, -1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+    __m512i gidx_lo = _mm512_add_epi16(ridx_lo, _mm512_set1_epi16(1));
+    __m512i gidx_hi = _mm512_add_epi16(ridx_hi, _mm512_set1_epi16(1));
+    __m512i bidx_lo = _mm512_add_epi16(gidx_lo, _mm512_set1_epi16(1));
+    __m512i bidx_hi = _mm512_add_epi16(gidx_hi, _mm512_set1_epi16(1));
+
+    __mmask32 rmask = _cvtu32_mask32(0b11111111110000000000000000000000);
+    __mmask32 gbmask = _cvtu32_mask32(0b11111111111000000000000000000000);
+
+    __m512i rlo = _mm512_permutex2var_epi16(bytes0, ridx_lo, bytes1);
+    __m512i glo = _mm512_permutex2var_epi16(bytes0, gidx_lo, bytes1);
+    __m512i blo = _mm512_permutex2var_epi16(bytes0, bidx_lo, bytes1);
+    __m512i r = _mm512_mask_permutexvar_epi16(rlo, rmask, ridx_hi, bytes2);
+    __m512i g = _mm512_mask_permutexvar_epi16(glo, gbmask, gidx_hi, bytes2);
+    __m512i b = _mm512_mask_permutexvar_epi16(blo, gbmask, bidx_hi, bytes2);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i rg_mask = _mm512_set1_epi32(0xFFFF);
+    __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+    __m512i rg = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_and_si512(bytes1, rg_mask),
+                                        _mm512_and_si512(bytes2, rg_mask)));
+    __m512i ba = _mm512_permutexvar_epi64(
+        permuteidx, _mm512_packus_epi32(_mm512_srli_epi32(bytes1, 16),
+                                        _mm512_srli_epi32(bytes2, 16)));
+    __m512i r = _mm512_and_si512(rg, _mm512_set1_epi16(0xFF));
+    __m512i g = _mm512_srli_epi16(rg, 8);
+    __m512i b = _mm512_and_si512(ba, _mm512_set1_epi16(0xFF));
+    __m512i a = _mm512_srli_epi16(ba, 8);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    __m512i bytes0 = _mm512_loadu_si512((__m512i*)data);
+    __m512i bytes1 = _mm512_loadu_si512((__m512i*)(data + 64));
+    __m512i bytes2 = _mm512_loadu_si512((__m512i*)(data + 128));
+    __m512i bytes3 = _mm512_loadu_si512((__m512i*)(data + 192));
+
+    auto pack32 = [](__m512i a, __m512i b) {
+      __m512i permuteidx = _mm512_set_epi64(7, 5, 3, 1, 6, 4, 2, 0);
+      return _mm512_permutexvar_epi64(permuteidx, _mm512_packus_epi32(a, b));
+    };
+    auto packlow32 = [&pack32](__m512i a, __m512i b) {
+      __m512i mask = _mm512_set1_epi32(0xFFFF);
+      return pack32(_mm512_and_si512(a, mask), _mm512_and_si512(b, mask));
+    };
+    auto packhi32 = [&pack32](__m512i a, __m512i b) {
+      return pack32(_mm512_srli_epi32(a, 16), _mm512_srli_epi32(b, 16));
+    };
+
+    __m512i rb0 = packlow32(bytes0, bytes1);
+    __m512i rb1 = packlow32(bytes2, bytes3);
+    __m512i ga0 = packhi32(bytes0, bytes1);
+    __m512i ga1 = packhi32(bytes2, bytes3);
+
+    __m512i r = packlow32(rb0, rb1);
+    __m512i g = packlow32(ga0, ga1);
+    __m512i b = packhi32(rb0, rb1);
+    __m512i a = packhi32(ga0, ga1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+
+  void SwapEndian() {
+    auto indices = _mm512_broadcast_i32x4(
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
+    vec = _mm512_shuffle_epi8(vec, indices);
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {
+  return SIMDVec16{_mm512_mask_blend_epi16(mask, if_false.vec, if_true.vec)};
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {
+  return SIMDVec32{_mm512_mask_blend_epi32(mask, if_false.vec, if_true.vec)};
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 8;
+
+  __m512i nbits;
+  __m512i bits;
+
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    _mm512_storeu_si512((__m512i*)nbits_out, nbits);
+    _mm512_storeu_si512((__m512i*)bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  __m512i nbits;
+  __m512i bits;
+
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+
+  Bits64 Merge() const {
+    auto nbits_hi32 = _mm512_srli_epi64(nbits, 32);
+    auto nbits_lo32 = _mm512_and_si512(nbits, _mm512_set1_epi64(0xFFFFFFFF));
+    auto bits_hi32 = _mm512_srli_epi64(bits, 32);
+    auto bits_lo32 = _mm512_and_si512(bits, _mm512_set1_epi64(0xFFFFFFFF));
+
+    auto nbits64 = _mm512_add_epi64(nbits_hi32, nbits_lo32);
+    auto bits64 =
+        _mm512_or_si512(_mm512_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+
+  void Interleave(const Bits32& low) {
+    bits = _mm512_or_si512(_mm512_sllv_epi32(bits, low.nbits), low.bits);
+    nbits = _mm512_add_epi32(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint32_t kMask[32] = {
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+        ~0u, ~0u, ~0u, ~0u, ~0u, 0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint32_t kMask[32] = {
+        0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+        0,   0,   0,   0,   0,   ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 16 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+};
+
+struct Bits16 {
+  __m512i nbits;
+  __m512i bits;
+
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+
+  Bits32 Merge() const {
+    auto nbits_hi16 = _mm512_srli_epi32(nbits, 16);
+    auto nbits_lo16 = _mm512_and_si512(nbits, _mm512_set1_epi32(0xFFFF));
+    auto bits_hi16 = _mm512_srli_epi32(bits, 16);
+    auto bits_lo16 = _mm512_and_si512(bits, _mm512_set1_epi32(0xFFFF));
+
+    auto nbits32 = _mm512_add_epi32(nbits_hi16, nbits_lo16);
+    auto bits32 =
+        _mm512_or_si512(_mm512_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+
+  void Interleave(const Bits16& low) {
+    bits = _mm512_or_si512(_mm512_sllv_epi16(bits, low.nbits), low.bits);
+    nbits = _mm512_add_epi16(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 32);
+    constexpr uint16_t kMask[64] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 32);
+    constexpr uint16_t kMask[64] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    __m512i mask = _mm512_loadu_si512((__m512i*)(kMask + 32 - n));
+    nbits = _mm512_and_si512(mask, nbits);
+    bits = _mm512_and_si512(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_AVX2
+#define FJXL_GENERIC_SIMD
+
+struct SIMDVec32;
+
+struct Mask32 {
+  __m256i mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
+  size_t CountPrefix() const {
+    return CtzNonZero(~static_cast<uint64_t>(
+        (uint8_t)_mm256_movemask_ps(_mm256_castsi256_ps(mask))));
+  }
+};
+
+struct SIMDVec32 {
+  __m256i vec;
+
+  static constexpr size_t kLanes = 8;
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
+    return SIMDVec32{_mm256_loadu_si256((__m256i*)data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) {
+    _mm256_storeu_si256((__m256i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
+    return SIMDVec32{_mm256_set1_epi32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {
+    // we know that each value has at most 20 bits, so we just need 5 nibbles
+    // and don't need to mask the fifth. However we do need to set the higher
+    // bytes to 0xFF, which will make table lookups return 0.
+    auto nibble0 =
+        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi32(0xF)),
+                        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble1 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 4), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble2 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 8), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble3 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi32(vec, 12), _mm256_set1_epi32(0xF)),
+        _mm256_set1_epi32(0xFFFFFF00));
+    auto nibble4 = _mm256_or_si256(_mm256_srli_epi32(vec, 16),
+                                   _mm256_set1_epi32(0xFFFFFF00));
+
+    auto lut0 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
+    auto lut1 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
+    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
+    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
+    auto lut4 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 17, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20));
+
+    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
+    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
+    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
+    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
+    auto token4 = _mm256_shuffle_epi8(lut4, nibble4);
+
+    auto token =
+        _mm256_max_epi32(_mm256_max_epi32(_mm256_max_epi32(token0, token1),
+                                          _mm256_max_epi32(token2, token3)),
+                         token4);
+    return SIMDVec32{token};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm256_sub_epi32(_mm256_max_epu32(vec, to_subtract.vec),
+                                      to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{_mm256_sub_epi32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm256_add_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
+    return SIMDVec32{_mm256_xor_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {
+    return SIMDVec32{_mm256_sllv_epi32(_mm256_set1_epi32(1), vec)};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
+    return Mask32{_mm256_cmpeq_epi32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
+    return Mask32{_mm256_cmpgt_epi32(vec, oth.vec)};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
+    return SIMDVec32{_mm256_srai_epi32(vec, i)};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {
+  __m256i mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
+  Mask16 And(const Mask16& oth) const {
+    return Mask16{_mm256_and_si256(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    return CtzNonZero(
+               ~static_cast<uint64_t>((uint32_t)_mm256_movemask_epi8(mask))) /
+           2;
+  }
+};
+
+struct SIMDVec16 {
+  __m256i vec;
+
+  static constexpr size_t kLanes = 16;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{_mm256_loadu_si256((__m256i*)data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) {
+    _mm256_storeu_si256((__m256i*)data, vec);
+  }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
+    return SIMDVec16{_mm256_set1_epi16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    auto tmp = _mm256_packus_epi32(lo.vec, hi.vec);
+    return SIMDVec16{_mm256_permute4x64_epi64(tmp, 0b11011000)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {
+    auto nibble0 =
+        _mm256_or_si256(_mm256_and_si256(vec, _mm256_set1_epi16(0xF)),
+                        _mm256_set1_epi16(0xFF00));
+    auto nibble1 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi16(vec, 4), _mm256_set1_epi16(0xF)),
+        _mm256_set1_epi16(0xFF00));
+    auto nibble2 = _mm256_or_si256(
+        _mm256_and_si256(_mm256_srli_epi16(vec, 8), _mm256_set1_epi16(0xF)),
+        _mm256_set1_epi16(0xFF00));
+    auto nibble3 =
+        _mm256_or_si256(_mm256_srli_epi16(vec, 12), _mm256_set1_epi16(0xFF00));
+
+    auto lut0 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4));
+    auto lut1 = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8));
+    auto lut2 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 9, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12));
+    auto lut3 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 13, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16));
+
+    auto token0 = _mm256_shuffle_epi8(lut0, nibble0);
+    auto token1 = _mm256_shuffle_epi8(lut1, nibble1);
+    auto token2 = _mm256_shuffle_epi8(lut2, nibble2);
+    auto token3 = _mm256_shuffle_epi8(lut3, nibble3);
+
+    auto token = _mm256_max_epi16(_mm256_max_epi16(token0, token1),
+                                  _mm256_max_epi16(token2, token3));
+    return SIMDVec16{token};
+  }
+
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm256_subs_epu16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{_mm256_sub_epi16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_add_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_min_epu16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{_mm256_cmpeq_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
+    return Mask16{_mm256_cmpgt_epi16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {
+    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
+                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
+    auto pow2_hi_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1 << 0, 1 << 1, 1 << 2, 1 << 3,
+                      1 << 4, 1 << 5, 1 << 6, 1u << 7));
+
+    auto masked = _mm256_or_si256(vec, _mm256_set1_epi16(0xFF00));
+
+    auto pow2_lo = _mm256_shuffle_epi8(pow2_lo_lut, masked);
+    auto pow2_hi = _mm256_shuffle_epi8(pow2_hi_lut, masked);
+
+    auto pow2 = _mm256_or_si256(_mm256_slli_epi16(pow2_hi, 8), pow2_lo);
+    return SIMDVec16{pow2};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_or_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_xor_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_and_si256(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
+    return SIMDVec16{_mm256_srai_epi16(_mm256_add_epi16(vec, oth.vec), 1)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
+    return SIMDVec16{_mm256_or_si256(vec, _mm256_set1_epi16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
+    return SIMDVec16{_mm256_shuffle_epi8(
+        _mm256_broadcastsi128_si256(_mm_loadu_si128((__m128i*)table)), vec)};
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
+    auto v02 = _mm256_unpacklo_epi16(low.vec, vec);
+    auto v13 = _mm256_unpackhi_epi16(low.vec, vec);
+    return {SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x20)},
+            SIMDVec16{_mm256_permute2x128_si256(v02, v13, 0x31)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
+    auto v02 = _mm256_unpacklo_epi16(vec, _mm256_setzero_si256());
+    auto v13 = _mm256_unpackhi_epi16(vec, _mm256_setzero_si256());
+    return {SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x20)},
+            SIMDVec32{_mm256_permute2x128_si256(v02, v13, 0x31)}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
+    return SIMDVec16{_mm256_srai_epi16(vec, i)};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    __m128i bytes = _mm_loadu_si128((__m128i*)data);
+    return {SIMDVec16{_mm256_cvtepu8_epi16(bytes)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+    __m256i gray = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
+    __m256i alpha = _mm256_srli_epi16(bytes, 8);
+    return {SIMDVec16{gray}, SIMDVec16{alpha}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i g_mask = _mm256_set1_epi32(0xFFFF);
+    __m256i g = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_and_si256(bytes1, g_mask),
+                            _mm256_and_si256(bytes2, g_mask)),
+        0b11011000);
+    __m256i a = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
+                            _mm256_srli_epi32(bytes2, 16)),
+        0b11011000);
+    return {SIMDVec16{g}, SIMDVec16{a}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    __m128i bytes0 = _mm_loadu_si128((__m128i*)data);
+    __m128i bytes1 = _mm_loadu_si128((__m128i*)(data + 16));
+    __m128i bytes2 = _mm_loadu_si128((__m128i*)(data + 32));
+
+    __m128i idx =
+        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13);
+
+    __m128i r6b5g5_0 = _mm_shuffle_epi8(bytes0, idx);
+    __m128i g6r5b5_1 = _mm_shuffle_epi8(bytes1, idx);
+    __m128i b6g5r5_2 = _mm_shuffle_epi8(bytes2, idx);
+
+    __m128i mask010 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF,
+                                    0xFF, 0, 0, 0, 0, 0);
+    __m128i mask001 = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF,
+                                    0xFF, 0xFF, 0xFF);
+
+    __m128i b2g2b1 = _mm_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
+    __m128i b2b0b1 = _mm_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
+
+    __m128i r0r1b1 = _mm_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
+    __m128i r0r1r2 = _mm_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
+
+    __m128i g1r1g0 = _mm_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
+    __m128i g1g2g0 = _mm_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
+
+    __m128i g0g1g2 = _mm_alignr_epi8(g1g2g0, g1g2g0, 11);
+    __m128i b0b1b2 = _mm_alignr_epi8(b2b0b1, b2b0b1, 6);
+
+    return {SIMDVec16{_mm256_cvtepu8_epi16(r0r1r2)},
+            SIMDVec16{_mm256_cvtepu8_epi16(g0g1g2)},
+            SIMDVec16{_mm256_cvtepu8_epi16(b0b1b2)}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    auto load_and_split_lohi = [](const unsigned char* data) {
+      // LHLHLH...
+      __m256i bytes = _mm256_loadu_si256((__m256i*)data);
+      // L0L0L0...
+      __m256i lo = _mm256_and_si256(bytes, _mm256_set1_epi16(0xFF));
+      // H0H0H0...
+      __m256i hi = _mm256_srli_epi16(bytes, 8);
+      // LLLLLLLLHHHHHHHHLLLLLLLLHHHHHHHH
+      __m256i packed = _mm256_packus_epi16(lo, hi);
+      return _mm256_permute4x64_epi64(packed, 0b11011000);
+    };
+    __m256i bytes0 = load_and_split_lohi(data);
+    __m256i bytes1 = load_and_split_lohi(data + 32);
+    __m256i bytes2 = load_and_split_lohi(data + 64);
+
+    __m256i idx = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(0, 3, 6, 9, 12, 15, 2, 5, 8, 11, 14, 1, 4, 7, 10, 13));
+
+    __m256i r6b5g5_0 = _mm256_shuffle_epi8(bytes0, idx);
+    __m256i g6r5b5_1 = _mm256_shuffle_epi8(bytes1, idx);
+    __m256i b6g5r5_2 = _mm256_shuffle_epi8(bytes2, idx);
+
+    __m256i mask010 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0));
+    __m256i mask001 = _mm256_broadcastsi128_si256(_mm_setr_epi8(
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF));
+
+    __m256i b2g2b1 = _mm256_blendv_epi8(b6g5r5_2, g6r5b5_1, mask001);
+    __m256i b2b0b1 = _mm256_blendv_epi8(b2g2b1, r6b5g5_0, mask010);
+
+    __m256i r0r1b1 = _mm256_blendv_epi8(r6b5g5_0, g6r5b5_1, mask010);
+    __m256i r0r1r2 = _mm256_blendv_epi8(r0r1b1, b6g5r5_2, mask001);
+
+    __m256i g1r1g0 = _mm256_blendv_epi8(g6r5b5_1, r6b5g5_0, mask001);
+    __m256i g1g2g0 = _mm256_blendv_epi8(g1r1g0, b6g5r5_2, mask010);
+
+    __m256i g0g1g2 = _mm256_alignr_epi8(g1g2g0, g1g2g0, 11);
+    __m256i b0b1b2 = _mm256_alignr_epi8(b2b0b1, b2b0b1, 6);
+
+    // Now r0r1r2, g0g1g2, b0b1b2 have the low bytes of the RGB pixels in their
+    // lower half, and the high bytes in their upper half.
+
+    auto combine_low_hi = [](__m256i v) {
+      __m128i low = _mm256_extracti128_si256(v, 0);
+      __m128i hi = _mm256_extracti128_si256(v, 1);
+      __m256i low16 = _mm256_cvtepu8_epi16(low);
+      __m256i hi16 = _mm256_cvtepu8_epi16(hi);
+      return _mm256_or_si256(_mm256_slli_epi16(hi16, 8), low16);
+    };
+
+    return {SIMDVec16{combine_low_hi(r0r1r2)},
+            SIMDVec16{combine_low_hi(g0g1g2)},
+            SIMDVec16{combine_low_hi(b0b1b2)}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i rg_mask = _mm256_set1_epi32(0xFFFF);
+    __m256i rg = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_and_si256(bytes1, rg_mask),
+                            _mm256_and_si256(bytes2, rg_mask)),
+        0b11011000);
+    __m256i ba = _mm256_permute4x64_epi64(
+        _mm256_packus_epi32(_mm256_srli_epi32(bytes1, 16),
+                            _mm256_srli_epi32(bytes2, 16)),
+        0b11011000);
+    __m256i r = _mm256_and_si256(rg, _mm256_set1_epi16(0xFF));
+    __m256i g = _mm256_srli_epi16(rg, 8);
+    __m256i b = _mm256_and_si256(ba, _mm256_set1_epi16(0xFF));
+    __m256i a = _mm256_srli_epi16(ba, 8);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    __m256i bytes0 = _mm256_loadu_si256((__m256i*)data);
+    __m256i bytes1 = _mm256_loadu_si256((__m256i*)(data + 32));
+    __m256i bytes2 = _mm256_loadu_si256((__m256i*)(data + 64));
+    __m256i bytes3 = _mm256_loadu_si256((__m256i*)(data + 96));
+
+    auto pack32 = [](__m256i a, __m256i b) {
+      return _mm256_permute4x64_epi64(_mm256_packus_epi32(a, b), 0b11011000);
+    };
+    auto packlow32 = [&pack32](__m256i a, __m256i b) {
+      __m256i mask = _mm256_set1_epi32(0xFFFF);
+      return pack32(_mm256_and_si256(a, mask), _mm256_and_si256(b, mask));
+    };
+    auto packhi32 = [&pack32](__m256i a, __m256i b) {
+      return pack32(_mm256_srli_epi32(a, 16), _mm256_srli_epi32(b, 16));
+    };
+
+    __m256i rb0 = packlow32(bytes0, bytes1);
+    __m256i rb1 = packlow32(bytes2, bytes3);
+    __m256i ga0 = packhi32(bytes0, bytes1);
+    __m256i ga1 = packhi32(bytes2, bytes3);
+
+    __m256i r = packlow32(rb0, rb1);
+    __m256i g = packlow32(ga0, ga1);
+    __m256i b = packhi32(rb0, rb1);
+    __m256i a = packhi32(ga0, ga1);
+    return {SIMDVec16{r}, SIMDVec16{g}, SIMDVec16{b}, SIMDVec16{a}};
+  }
+
+  void SwapEndian() {
+    auto indices = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14));
+    vec = _mm256_shuffle_epi8(vec, indices);
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {
+  return SIMDVec16{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {
+  return SIMDVec32{_mm256_blendv_epi8(if_false.vec, if_true.vec, mask)};
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 4;
+
+  __m256i nbits;
+  __m256i bits;
+
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    _mm256_storeu_si256((__m256i*)nbits_out, nbits);
+    _mm256_storeu_si256((__m256i*)bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  __m256i nbits;
+  __m256i bits;
+
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+
+  Bits64 Merge() const {
+    auto nbits_hi32 = _mm256_srli_epi64(nbits, 32);
+    auto nbits_lo32 = _mm256_and_si256(nbits, _mm256_set1_epi64x(0xFFFFFFFF));
+    auto bits_hi32 = _mm256_srli_epi64(bits, 32);
+    auto bits_lo32 = _mm256_and_si256(bits, _mm256_set1_epi64x(0xFFFFFFFF));
+
+    auto nbits64 = _mm256_add_epi64(nbits_hi32, nbits_lo32);
+    auto bits64 =
+        _mm256_or_si256(_mm256_sllv_epi64(bits_hi32, nbits_lo32), bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+
+  void Interleave(const Bits32& low) {
+    bits = _mm256_or_si256(_mm256_sllv_epi32(bits, low.nbits), low.bits);
+    nbits = _mm256_add_epi32(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint32_t kMask[16] = {
+        ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0, 0, 0, 0, 0,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint32_t kMask[16] = {
+        0, 0, 0, 0, 0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u, ~0u,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 8 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+};
+
+struct Bits16 {
+  __m256i nbits;
+  __m256i bits;
+
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+
+  Bits32 Merge() const {
+    auto nbits_hi16 = _mm256_srli_epi32(nbits, 16);
+    auto nbits_lo16 = _mm256_and_si256(nbits, _mm256_set1_epi32(0xFFFF));
+    auto bits_hi16 = _mm256_srli_epi32(bits, 16);
+    auto bits_lo16 = _mm256_and_si256(bits, _mm256_set1_epi32(0xFFFF));
+
+    auto nbits32 = _mm256_add_epi32(nbits_hi16, nbits_lo16);
+    auto bits32 =
+        _mm256_or_si256(_mm256_sllv_epi32(bits_hi16, nbits_lo16), bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+
+  void Interleave(const Bits16& low) {
+    auto pow2_lo_lut = _mm256_broadcastsi128_si256(
+        _mm_setr_epi8(1 << 0, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6,
+                      1u << 7, 0, 0, 0, 0, 0, 0, 0, 0));
+    auto low_nbits_masked =
+        _mm256_or_si256(low.nbits, _mm256_set1_epi16(0xFF00));
+
+    auto bits_shifted = _mm256_mullo_epi16(
+        bits, _mm256_shuffle_epi8(pow2_lo_lut, low_nbits_masked));
+
+    nbits = _mm256_add_epi16(nbits, low.nbits);
+    bits = _mm256_or_si256(bits_shifted, low.bits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint16_t kMask[32] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 16);
+    constexpr uint16_t kMask[32] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    __m256i mask = _mm256_loadu_si256((__m256i*)(kMask + 16 - n));
+    nbits = _mm256_and_si256(mask, nbits);
+    bits = _mm256_and_si256(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_NEON
+#define FJXL_GENERIC_SIMD
+
+struct SIMDVec32;
+
+struct Mask32 {
+  uint32x4_t mask;
+  SIMDVec32 IfThenElse(const SIMDVec32& if_true, const SIMDVec32& if_false);
+  Mask32 And(const Mask32& oth) const {
+    return Mask32{vandq_u32(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    uint32_t val_unset[4] = {0, 1, 2, 3};
+    uint32_t val_set[4] = {4, 4, 4, 4};
+    uint32x4_t val = vbslq_u32(mask, vld1q_u32(val_set), vld1q_u32(val_unset));
+    return vminvq_u32(val);
+  }
+};
+
+struct SIMDVec32 {
+  uint32x4_t vec;
+
+  static constexpr size_t kLanes = 4;
+
+  FJXL_INLINE static SIMDVec32 Load(const uint32_t* data) {
+    return SIMDVec32{vld1q_u32(data)};
+  }
+  FJXL_INLINE void Store(uint32_t* data) { vst1q_u32(data, vec); }
+  FJXL_INLINE static SIMDVec32 Val(uint32_t v) {
+    return SIMDVec32{vdupq_n_u32(v)};
+  }
+  FJXL_INLINE SIMDVec32 ValToToken() const {
+    return SIMDVec32{vsubq_u32(vdupq_n_u32(32), vclzq_u32(vec))};
+  }
+  FJXL_INLINE SIMDVec32 SatSubU(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{vqsubq_u32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Sub(const SIMDVec32& to_subtract) const {
+    return SIMDVec32{vsubq_u32(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Add(const SIMDVec32& oth) const {
+    return SIMDVec32{vaddq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Xor(const SIMDVec32& oth) const {
+    return SIMDVec32{veorq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec32 Pow2() const {
+    return SIMDVec32{vshlq_u32(vdupq_n_u32(1), vreinterpretq_s32_u32(vec))};
+  }
+  FJXL_INLINE Mask32 Eq(const SIMDVec32& oth) const {
+    return Mask32{vceqq_u32(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask32 Gt(const SIMDVec32& oth) const {
+    return Mask32{
+        vcgtq_s32(vreinterpretq_s32_u32(vec), vreinterpretq_s32_u32(oth.vec))};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec32 SignedShiftRight() const {
+    return SIMDVec32{
+        vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_u32(vec), i))};
+  }
+};
+
+struct SIMDVec16;
+
+struct Mask16 {
+  uint16x8_t mask;
+  SIMDVec16 IfThenElse(const SIMDVec16& if_true, const SIMDVec16& if_false);
+  Mask16 And(const Mask16& oth) const {
+    return Mask16{vandq_u16(mask, oth.mask)};
+  }
+  size_t CountPrefix() const {
+    uint16_t val_unset[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+    uint16_t val_set[8] = {8, 8, 8, 8, 8, 8, 8, 8};
+    uint16x8_t val = vbslq_u16(mask, vld1q_u16(val_set), vld1q_u16(val_unset));
+    return vminvq_u16(val);
+  }
+};
+
+struct SIMDVec16 {
+  uint16x8_t vec;
+
+  static constexpr size_t kLanes = 8;
+
+  FJXL_INLINE static SIMDVec16 Load(const uint16_t* data) {
+    return SIMDVec16{vld1q_u16(data)};
+  }
+  FJXL_INLINE void Store(uint16_t* data) { vst1q_u16(data, vec); }
+  FJXL_INLINE static SIMDVec16 Val(uint16_t v) {
+    return SIMDVec16{vdupq_n_u16(v)};
+  }
+  FJXL_INLINE static SIMDVec16 FromTwo32(const SIMDVec32& lo,
+                                         const SIMDVec32& hi) {
+    return SIMDVec16{vmovn_high_u32(vmovn_u32(lo.vec), hi.vec)};
+  }
+
+  FJXL_INLINE SIMDVec16 ValToToken() const {
+    return SIMDVec16{vsubq_u16(vdupq_n_u16(16), vclzq_u16(vec))};
+  }
+  FJXL_INLINE SIMDVec16 SatSubU(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{vqsubq_u16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Sub(const SIMDVec16& to_subtract) const {
+    return SIMDVec16{vsubq_u16(vec, to_subtract.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Add(const SIMDVec16& oth) const {
+    return SIMDVec16{vaddq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Min(const SIMDVec16& oth) const {
+    return SIMDVec16{vminq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Eq(const SIMDVec16& oth) const {
+    return Mask16{vceqq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE Mask16 Gt(const SIMDVec16& oth) const {
+    return Mask16{
+        vcgtq_s16(vreinterpretq_s16_u16(vec), vreinterpretq_s16_u16(oth.vec))};
+  }
+  FJXL_INLINE SIMDVec16 Pow2() const {
+    return SIMDVec16{vshlq_u16(vdupq_n_u16(1), vreinterpretq_s16_u16(vec))};
+  }
+  FJXL_INLINE SIMDVec16 Or(const SIMDVec16& oth) const {
+    return SIMDVec16{vorrq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 Xor(const SIMDVec16& oth) const {
+    return SIMDVec16{veorq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 And(const SIMDVec16& oth) const {
+    return SIMDVec16{vandq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 HAdd(const SIMDVec16& oth) const {
+    return SIMDVec16{vhaddq_u16(vec, oth.vec)};
+  }
+  FJXL_INLINE SIMDVec16 PrepareForU8Lookup() const {
+    return SIMDVec16{vorrq_u16(vec, vdupq_n_u16(0xFF00))};
+  }
+  FJXL_INLINE SIMDVec16 U8Lookup(const uint8_t* table) const {
+    uint8x16_t tbl = vld1q_u8(table);
+    uint8x16_t indices = vreinterpretq_u8_u16(vec);
+    return SIMDVec16{vreinterpretq_u16_u8(vqtbl1q_u8(tbl, indices))};
+  }
+  FJXL_INLINE VecPair<SIMDVec16> Interleave(const SIMDVec16& low) const {
+    return {SIMDVec16{vzip1q_u16(low.vec, vec)},
+            SIMDVec16{vzip2q_u16(low.vec, vec)}};
+  }
+  FJXL_INLINE VecPair<SIMDVec32> Upcast() const {
+    uint32x4_t lo = vmovl_u16(vget_low_u16(vec));
+    uint32x4_t hi = vmovl_high_u16(vec);
+    return {SIMDVec32{lo}, SIMDVec32{hi}};
+  }
+  template <size_t i>
+  FJXL_INLINE SIMDVec16 SignedShiftRight() const {
+    return SIMDVec16{
+        vreinterpretq_u16_s16(vshrq_n_s16(vreinterpretq_s16_u16(vec), i))};
+  }
+
+  static std::array<SIMDVec16, 1> LoadG8(const unsigned char* data) {
+    uint8x8_t v = vld1_u8(data);
+    return {SIMDVec16{vmovl_u8(v)}};
+  }
+  static std::array<SIMDVec16, 1> LoadG16(const unsigned char* data) {
+    return {Load((const uint16_t*)data)};
+  }
+
+  static std::array<SIMDVec16, 2> LoadGA8(const unsigned char* data) {
+    uint8x8x2_t v = vld2_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])}};
+  }
+  static std::array<SIMDVec16, 2> LoadGA16(const unsigned char* data) {
+    uint16x8x2_t v = vld2q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}};
+  }
+
+  static std::array<SIMDVec16, 3> LoadRGB8(const unsigned char* data) {
+    uint8x8x3_t v = vld3_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
+            SIMDVec16{vmovl_u8(v.val[2])}};
+  }
+  static std::array<SIMDVec16, 3> LoadRGB16(const unsigned char* data) {
+    uint16x8x3_t v = vld3q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]}};
+  }
+
+  static std::array<SIMDVec16, 4> LoadRGBA8(const unsigned char* data) {
+    uint8x8x4_t v = vld4_u8(data);
+    return {SIMDVec16{vmovl_u8(v.val[0])}, SIMDVec16{vmovl_u8(v.val[1])},
+            SIMDVec16{vmovl_u8(v.val[2])}, SIMDVec16{vmovl_u8(v.val[3])}};
+  }
+  static std::array<SIMDVec16, 4> LoadRGBA16(const unsigned char* data) {
+    uint16x8x4_t v = vld4q_u16((const uint16_t*)data);
+    return {SIMDVec16{v.val[0]}, SIMDVec16{v.val[1]}, SIMDVec16{v.val[2]},
+            SIMDVec16{v.val[3]}};
+  }
+
+  void SwapEndian() {
+    vec = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(vec)));
+  }
+};
+
+SIMDVec16 Mask16::IfThenElse(const SIMDVec16& if_true,
+                             const SIMDVec16& if_false) {
+  return SIMDVec16{vbslq_u16(mask, if_true.vec, if_false.vec)};
+}
+
+SIMDVec32 Mask32::IfThenElse(const SIMDVec32& if_true,
+                             const SIMDVec32& if_false) {
+  return SIMDVec32{vbslq_u32(mask, if_true.vec, if_false.vec)};
+}
+
+struct Bits64 {
+  static constexpr size_t kLanes = 2;
+
+  uint64x2_t nbits;
+  uint64x2_t bits;
+
+  FJXL_INLINE void Store(uint64_t* nbits_out, uint64_t* bits_out) {
+    vst1q_u64(nbits_out, nbits);
+    vst1q_u64(bits_out, bits);
+  }
+};
+
+struct Bits32 {
+  uint32x4_t nbits;
+  uint32x4_t bits;
+
+  static Bits32 FromRaw(SIMDVec32 nbits, SIMDVec32 bits) {
+    return Bits32{nbits.vec, bits.vec};
+  }
+
+  Bits64 Merge() const {
+    // TODO(veluca): can probably be optimized.
+    uint64x2_t nbits_lo32 =
+        vandq_u64(vreinterpretq_u64_u32(nbits), vdupq_n_u64(0xFFFFFFFF));
+    uint64x2_t bits_hi32 =
+        vshlq_u64(vshrq_n_u64(vreinterpretq_u64_u32(bits), 32),
+                  vreinterpretq_s64_u64(nbits_lo32));
+    uint64x2_t bits_lo32 =
+        vandq_u64(vreinterpretq_u64_u32(bits), vdupq_n_u64(0xFFFFFFFF));
+    uint64x2_t nbits64 =
+        vsraq_n_u64(nbits_lo32, vreinterpretq_u64_u32(nbits), 32);
+    uint64x2_t bits64 = vorrq_u64(bits_hi32, bits_lo32);
+    return Bits64{nbits64, bits64};
+  }
+
+  void Interleave(const Bits32& low) {
+    bits =
+        vorrq_u32(vshlq_u32(bits, vreinterpretq_s32_u32(low.nbits)), low.bits);
+    nbits = vaddq_u32(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 4);
+    constexpr uint32_t kMask[8] = {
+        ~0u, ~0u, ~0u, ~0u, 0, 0, 0, 0,
+    };
+    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
+    nbits = vandq_u32(mask, nbits);
+    bits = vandq_u32(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 4);
+    constexpr uint32_t kMask[8] = {
+        0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u,
+    };
+    uint32x4_t mask = vld1q_u32(kMask + 4 - n);
+    nbits = vandq_u32(mask, nbits);
+    bits = vandq_u32(mask, bits);
+  }
+};
+
+struct Bits16 {
+  uint16x8_t nbits;
+  uint16x8_t bits;
+
+  static Bits16 FromRaw(SIMDVec16 nbits, SIMDVec16 bits) {
+    return Bits16{nbits.vec, bits.vec};
+  }
+
+  Bits32 Merge() const {
+    // TODO(veluca): can probably be optimized.
+    uint32x4_t nbits_lo16 =
+        vandq_u32(vreinterpretq_u32_u16(nbits), vdupq_n_u32(0xFFFF));
+    uint32x4_t bits_hi16 =
+        vshlq_u32(vshrq_n_u32(vreinterpretq_u32_u16(bits), 16),
+                  vreinterpretq_s32_u32(nbits_lo16));
+    uint32x4_t bits_lo16 =
+        vandq_u32(vreinterpretq_u32_u16(bits), vdupq_n_u32(0xFFFF));
+    uint32x4_t nbits32 =
+        vsraq_n_u32(nbits_lo16, vreinterpretq_u32_u16(nbits), 16);
+    uint32x4_t bits32 = vorrq_u32(bits_hi16, bits_lo16);
+    return Bits32{nbits32, bits32};
+  }
+
+  void Interleave(const Bits16& low) {
+    bits =
+        vorrq_u16(vshlq_u16(bits, vreinterpretq_s16_u16(low.nbits)), low.bits);
+    nbits = vaddq_u16(nbits, low.nbits);
+  }
+
+  void ClipTo(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint16_t kMask[16] = {
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+        0,      0,      0,      0,      0,      0,      0,      0,
+    };
+    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
+    nbits = vandq_u16(mask, nbits);
+    bits = vandq_u16(mask, bits);
+  }
+  void Skip(size_t n) {
+    n = std::min<size_t>(n, 8);
+    constexpr uint16_t kMask[16] = {
+        0,      0,      0,      0,      0,      0,      0,      0,
+        0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+    };
+    uint16x8_t mask = vld1q_u16(kMask + 8 - n);
+    nbits = vandq_u16(mask, nbits);
+    bits = vandq_u16(mask, bits);
+  }
+};
+
+#endif
+
+#ifdef FJXL_GENERIC_SIMD
+constexpr size_t SIMDVec32::kLanes;
+constexpr size_t SIMDVec16::kLanes;
+
+//  Each of these functions will process SIMDVec16::kLanes worth of values.
+
+FJXL_INLINE void TokenizeSIMD(const uint16_t* residuals, uint16_t* token_out,
+                              uint16_t* nbits_out, uint16_t* bits_out) {
+  SIMDVec16 res = SIMDVec16::Load(residuals);
+  SIMDVec16 token = res.ValToToken();
+  SIMDVec16 nbits = token.SatSubU(SIMDVec16::Val(1));
+  SIMDVec16 bits = res.SatSubU(nbits.Pow2());
+  token.Store(token_out);
+  nbits.Store(nbits_out);
+  bits.Store(bits_out);
+}
+
+FJXL_INLINE void TokenizeSIMD(const uint32_t* residuals, uint16_t* token_out,
+                              uint32_t* nbits_out, uint32_t* bits_out) {
+  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, "");
+  SIMDVec32 res_lo = SIMDVec32::Load(residuals);
+  SIMDVec32 res_hi = SIMDVec32::Load(residuals + SIMDVec32::kLanes);
+  SIMDVec32 token_lo = res_lo.ValToToken();
+  SIMDVec32 token_hi = res_hi.ValToToken();
+  SIMDVec32 nbits_lo = token_lo.SatSubU(SIMDVec32::Val(1));
+  SIMDVec32 nbits_hi = token_hi.SatSubU(SIMDVec32::Val(1));
+  SIMDVec32 bits_lo = res_lo.SatSubU(nbits_lo.Pow2());
+  SIMDVec32 bits_hi = res_hi.SatSubU(nbits_hi.Pow2());
+  SIMDVec16 token = SIMDVec16::FromTwo32(token_lo, token_hi);
+  token.Store(token_out);
+  nbits_lo.Store(nbits_out);
+  nbits_hi.Store(nbits_out + SIMDVec32::kLanes);
+  bits_lo.Store(bits_out);
+  bits_hi.Store(bits_out + SIMDVec32::kLanes);
+}
+
+FJXL_INLINE void HuffmanSIMDUpTo13(const uint16_t* tokens,
+                                   const PrefixCode& code, uint16_t* nbits_out,
+                                   uint16_t* bits_out) {
+  SIMDVec16 tok = SIMDVec16::Load(tokens).PrepareForU8Lookup();
+  tok.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+  tok.U8Lookup(code.raw_bits_simd).Store(bits_out);
+}
+
+FJXL_INLINE void HuffmanSIMD14(const uint16_t* tokens, const PrefixCode& code,
+                               uint16_t* nbits_out, uint16_t* bits_out) {
+  SIMDVec16 token_cap = SIMDVec16::Val(15);
+  SIMDVec16 tok = SIMDVec16::Load(tokens);
+  SIMDVec16 tok_index = tok.Min(token_cap).PrepareForU8Lookup();
+  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(code.raw_bits_simd);
+  // Set the highest bit when token == 16; the Huffman code is constructed in
+  // such a way that the code for token 15 is the same as the code for 16,
+  // except for the highest bit.
+  Mask16 needs_high_bit = tok.Eq(SIMDVec16::Val(16));
+  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
+      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
+  huff_bits.Store(bits_out);
+  tok_index.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+}
+
+FJXL_INLINE void HuffmanSIMDAbove14(const uint16_t* tokens,
+                                    const PrefixCode& code, uint16_t* nbits_out,
+                                    uint16_t* bits_out) {
+  SIMDVec16 tok = SIMDVec16::Load(tokens);
+  // We assume `tok` fits in a *signed* 16-bit integer.
+  Mask16 above = tok.Gt(SIMDVec16::Val(12));
+  // 13, 14 -> 13
+  // 15, 16 -> 14
+  // 17, 18 -> 15
+  SIMDVec16 remap_tok = above.IfThenElse(tok.HAdd(SIMDVec16::Val(13)), tok);
+  SIMDVec16 tok_index = remap_tok.PrepareForU8Lookup();
+  SIMDVec16 huff_bits_pre = tok_index.U8Lookup(code.raw_bits_simd);
+  // Set the highest bit when token == 14, 16, 18.
+  Mask16 needs_high_bit = above.And(tok.Eq(tok.And(SIMDVec16::Val(0xFFFE))));
+  SIMDVec16 huff_bits = needs_high_bit.IfThenElse(
+      huff_bits_pre.Or(SIMDVec16::Val(128)), huff_bits_pre);
+  huff_bits.Store(bits_out);
+  tok_index.U8Lookup(code.raw_nbits_simd).Store(nbits_out);
+}
+
+FJXL_INLINE void StoreSIMDUpTo8(const uint16_t* nbits_tok,
+                                const uint16_t* bits_tok,
+                                const uint16_t* nbits_huff,
+                                const uint16_t* bits_huff, size_t n,
+                                size_t skip, Bits32* bits_out) {
+  Bits16 bits =
+      Bits16::FromRaw(SIMDVec16::Load(nbits_tok), SIMDVec16::Load(bits_tok));
+  Bits16 huff_bits =
+      Bits16::FromRaw(SIMDVec16::Load(nbits_huff), SIMDVec16::Load(bits_huff));
+  bits.Interleave(huff_bits);
+  bits.ClipTo(n);
+  bits.Skip(skip);
+  bits_out[0] = bits.Merge();
+}
+
+// Huffman and raw bits don't necessarily fit in a single u16 here.
+FJXL_INLINE void StoreSIMDUpTo14(const uint16_t* nbits_tok,
+                                 const uint16_t* bits_tok,
+                                 const uint16_t* nbits_huff,
+                                 const uint16_t* bits_huff, size_t n,
+                                 size_t skip, Bits32* bits_out) {
+  VecPair<SIMDVec16> bits =
+      SIMDVec16::Load(bits_tok).Interleave(SIMDVec16::Load(bits_huff));
+  VecPair<SIMDVec16> nbits =
+      SIMDVec16::Load(nbits_tok).Interleave(SIMDVec16::Load(nbits_huff));
+  Bits16 low = Bits16::FromRaw(nbits.low, bits.low);
+  Bits16 hi = Bits16::FromRaw(nbits.hi, bits.hi);
+  low.ClipTo(2 * n);
+  low.Skip(2 * skip);
+  hi.ClipTo(std::max(2 * n, SIMDVec16::kLanes) - SIMDVec16::kLanes);
+  hi.Skip(std::max(2 * skip, SIMDVec16::kLanes) - SIMDVec16::kLanes);
+
+  bits_out[0] = low.Merge();
+  bits_out[1] = hi.Merge();
+}
+
+FJXL_INLINE void StoreSIMDAbove14(const uint32_t* nbits_tok,
+                                  const uint32_t* bits_tok,
+                                  const uint16_t* nbits_huff,
+                                  const uint16_t* bits_huff, size_t n,
+                                  size_t skip, Bits32* bits_out) {
+  static_assert(SIMDVec16::kLanes == 2 * SIMDVec32::kLanes, "");
+  Bits32 bits_low =
+      Bits32::FromRaw(SIMDVec32::Load(nbits_tok), SIMDVec32::Load(bits_tok));
+  Bits32 bits_hi =
+      Bits32::FromRaw(SIMDVec32::Load(nbits_tok + SIMDVec32::kLanes),
+                      SIMDVec32::Load(bits_tok + SIMDVec32::kLanes));
+
+  VecPair<SIMDVec32> huff_bits = SIMDVec16::Load(bits_huff).Upcast();
+  VecPair<SIMDVec32> huff_nbits = SIMDVec16::Load(nbits_huff).Upcast();
+
+  Bits32 huff_low = Bits32::FromRaw(huff_nbits.low, huff_bits.low);
+  Bits32 huff_hi = Bits32::FromRaw(huff_nbits.hi, huff_bits.hi);
+
+  bits_low.Interleave(huff_low);
+  bits_low.ClipTo(n);
+  bits_low.Skip(skip);
+  bits_out[0] = bits_low;
+  bits_hi.Interleave(huff_hi);
+  bits_hi.ClipTo(std::max(n, SIMDVec32::kLanes) - SIMDVec32::kLanes);
+  bits_hi.Skip(std::max(skip, SIMDVec32::kLanes) - SIMDVec32::kLanes);
+  bits_out[1] = bits_hi;
+}
+
+#ifdef FJXL_AVX512
+FJXL_INLINE void StoreToWriterAVX512(const Bits32& bits32, BitWriter& output) {
+  __m512i bits = bits32.bits;
+  __m512i nbits = bits32.nbits;
+
+  // Insert the leftover bits from the bit buffer at the bottom of the vector
+  // and extract the top of the vector.
+  uint64_t trail_bits =
+      _mm512_cvtsi512_si32(_mm512_alignr_epi32(bits, bits, 15));
+  uint64_t trail_nbits =
+      _mm512_cvtsi512_si32(_mm512_alignr_epi32(nbits, nbits, 15));
+  __m512i lead_bits = _mm512_set1_epi32(output.buffer);
+  __m512i lead_nbits = _mm512_set1_epi32(output.bits_in_buffer);
+  bits = _mm512_alignr_epi32(bits, lead_bits, 15);
+  nbits = _mm512_alignr_epi32(nbits, lead_nbits, 15);
+
+  // Merge 32 -> 64 bits.
+  Bits32 b{nbits, bits};
+  Bits64 b64 = b.Merge();
+  bits = b64.bits;
+  nbits = b64.nbits;
+
+  __m512i zero = _mm512_setzero_si512();
+
+  auto sh1 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 7); };
+  auto sh2 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 6); };
+  auto sh4 = [zero](__m512i vec) { return _mm512_alignr_epi64(vec, zero, 4); };
+
+  // Compute first-past-end-bit-position.
+  __m512i end_interm0 = _mm512_add_epi64(nbits, sh1(nbits));
+  __m512i end_interm1 = _mm512_add_epi64(end_interm0, sh2(end_interm0));
+  __m512i end = _mm512_add_epi64(end_interm1, sh4(end_interm1));
+
+  uint64_t simd_nbits = _mm512_cvtsi512_si32(_mm512_alignr_epi64(end, end, 7));
+
+  // Compute begin-bit-position.
+  __m512i begin = _mm512_sub_epi64(end, nbits);
+
+  // Index of the last bit in the chunk, or the end bit if nbits==0.
+  __m512i last = _mm512_mask_sub_epi64(
+      end, _mm512_cmpneq_epi64_mask(nbits, zero), end, _mm512_set1_epi64(1));
+
+  __m512i lane_offset_mask = _mm512_set1_epi64(63);
+
+  // Starting position of the chunk that each lane will ultimately belong to.
+  __m512i chunk_start = _mm512_andnot_si512(lane_offset_mask, last);
+
+  // For all lanes that contain bits belonging to two different 64-bit chunks,
+  // compute the number of bits that belong to the first chunk.
+  // total # of bits fit in a u16, so we can satsub_u16 here.
+  __m512i first_chunk_nbits = _mm512_subs_epu16(chunk_start, begin);
+
+  // Move all the previous-chunk-bits to the previous lane.
+  __m512i negnbits = _mm512_sub_epi64(_mm512_set1_epi64(64), first_chunk_nbits);
+  __m512i first_chunk_bits =
+      _mm512_srlv_epi64(_mm512_sllv_epi64(bits, negnbits), negnbits);
+  __m512i first_chunk_bits_down =
+      _mm512_alignr_epi32(zero, first_chunk_bits, 2);
+  bits = _mm512_srlv_epi64(bits, first_chunk_nbits);
+  nbits = _mm512_sub_epi64(nbits, first_chunk_nbits);
+  bits = _mm512_or_si512(bits, _mm512_sllv_epi64(first_chunk_bits_down, nbits));
+  begin = _mm512_add_epi64(begin, first_chunk_nbits);
+
+  // We now know that every lane should give bits to only one chunk. We can
+  // shift the bits and then horizontally-or-reduce them within the same chunk.
+  __m512i offset = _mm512_and_si512(begin, lane_offset_mask);
+  __m512i aligned_bits = _mm512_sllv_epi64(bits, offset);
+  // h-or-reduce within same chunk
+  __m512i red0 = _mm512_mask_or_epi64(
+      aligned_bits, _mm512_cmpeq_epi64_mask(sh1(chunk_start), chunk_start),
+      sh1(aligned_bits), aligned_bits);
+  __m512i red1 = _mm512_mask_or_epi64(
+      red0, _mm512_cmpeq_epi64_mask(sh2(chunk_start), chunk_start), sh2(red0),
+      red0);
+  __m512i reduced = _mm512_mask_or_epi64(
+      red1, _mm512_cmpeq_epi64_mask(sh4(chunk_start), chunk_start), sh4(red1),
+      red1);
+  // Extract the highest lane that belongs to each chunk (the lane that ends up
+  // with the OR-ed value of all the other lanes of that chunk).
+  __m512i next_chunk_start =
+      _mm512_alignr_epi32(_mm512_set1_epi64(~0), chunk_start, 2);
+  __m512i result = _mm512_maskz_compress_epi64(
+      _mm512_cmpneq_epi64_mask(chunk_start, next_chunk_start), reduced);
+
+  _mm512_storeu_si512((__m512i*)(output.data.get() + output.bytes_written),
+                      result);
+
+  // Update the bit writer and add the last 32-bit lane.
+  // Note that since trail_nbits was at most 32 to begin with, operating on
+  // trail_bits does not risk overflowing.
+  output.bytes_written += simd_nbits / 8;
+  // Here we are implicitly relying on the fact that simd_nbits < 512 to know
+  // that the byte of bitreader data we access is initialized. This is
+  // guaranteed because the remaining bits in the bitreader buffer are at most
+  // 7, so simd_nbits <= 505 always.
+  trail_bits = (trail_bits << (simd_nbits % 8)) +
+               output.data.get()[output.bytes_written];
+  trail_nbits += simd_nbits % 8;
+  StoreLE64(output.data.get() + output.bytes_written, trail_bits);
+  size_t trail_bytes = trail_nbits / 8;
+  output.bits_in_buffer = trail_nbits % 8;
+  output.buffer = trail_bits >> (trail_bytes * 8);
+  output.bytes_written += trail_bytes;
+}
+
+#endif
+
+template <size_t n>
+FJXL_INLINE void StoreToWriter(const Bits32* bits, BitWriter& output) {
+#ifdef FJXL_AVX512
+  static_assert(n <= 2, "");
+  StoreToWriterAVX512(bits[0], output);
+  if (n == 2) {
+    StoreToWriterAVX512(bits[1], output);
+  }
+  return;
+#endif
+  static_assert(n <= 4, "");
+  alignas(64) uint64_t nbits64[Bits64::kLanes * n];
+  alignas(64) uint64_t bits64[Bits64::kLanes * n];
+  bits[0].Merge().Store(nbits64, bits64);
+  if (n > 1) {
+    bits[1].Merge().Store(nbits64 + Bits64::kLanes, bits64 + Bits64::kLanes);
+  }
+  if (n > 2) {
+    bits[2].Merge().Store(nbits64 + 2 * Bits64::kLanes,
+                          bits64 + 2 * Bits64::kLanes);
+  }
+  if (n > 3) {
+    bits[3].Merge().Store(nbits64 + 3 * Bits64::kLanes,
+                          bits64 + 3 * Bits64::kLanes);
+  }
+  output.WriteMultiple(nbits64, bits64, Bits64::kLanes * n);
+}
+
+namespace detail {
+template <typename T>
+struct IntegerTypes;
+
+template <>
+struct IntegerTypes<SIMDVec16> {
+  using signed_ = int16_t;
+  using unsigned_ = uint16_t;
+};
+
+template <>
+struct IntegerTypes<SIMDVec32> {
+  using signed_ = int32_t;
+  using unsigned_ = uint32_t;
+};
+
+template <typename T>
+struct SIMDType;
+
+template <>
+struct SIMDType<int16_t> {
+  using type = SIMDVec16;
+};
+
+template <>
+struct SIMDType<int32_t> {
+  using type = SIMDVec32;
+};
+
+}  // namespace detail
+
+template <typename T>
+using signed_t = typename detail::IntegerTypes<T>::signed_;
+
+template <typename T>
+using unsigned_t = typename detail::IntegerTypes<T>::unsigned_;
+
+template <typename T>
+using simd_t = typename detail::SIMDType<T>::type;
+
+// This function will process exactly one vector worth of pixels.
+
+template <typename T>
+size_t PredictPixels(const signed_t<T>* pixels, const signed_t<T>* pixels_left,
+                     const signed_t<T>* pixels_top,
+                     const signed_t<T>* pixels_topleft,
+                     unsigned_t<T>* residuals) {
+  T px = T::Load((unsigned_t<T>*)pixels);
+  T left = T::Load((unsigned_t<T>*)pixels_left);
+  T top = T::Load((unsigned_t<T>*)pixels_top);
+  T topleft = T::Load((unsigned_t<T>*)pixels_topleft);
+  T ac = left.Sub(topleft);
+  T ab = left.Sub(top);
+  T bc = top.Sub(topleft);
+  T grad = ac.Add(top);
+  T d = ab.Xor(bc);
+  T zero = T::Val(0);
+  T clamp = zero.Gt(d).IfThenElse(top, left);
+  T s = ac.Xor(bc);
+  T pred = zero.Gt(s).IfThenElse(grad, clamp);
+  T res = px.Sub(pred);
+  T res_times_2 = res.Add(res);
+  res = zero.Gt(res).IfThenElse(T::Val(-1).Sub(res_times_2), res_times_2);
+  res.Store(residuals);
+  return res.Eq(T::Val(0)).CountPrefix();
+}
+
+#endif
+
+void EncodeHybridUint000(uint32_t value, uint32_t* token, uint32_t* nbits,
+                         uint32_t* bits) {
+  uint32_t n = FloorLog2(value);
+  *token = value ? n + 1 : 0;
+  *nbits = value ? n : 0;
+  *bits = value ? value - (1 << n) : 0;
+}
+
+#ifdef FJXL_AVX512
+constexpr static size_t kLogChunkSize = 5;
+#elif defined(FJXL_AVX2) || defined(FJXL_NEON)
+// Even if NEON only has 128-bit lanes, it is still significantly (~1.3x) faster
+// to process two vectors at a time.
+constexpr static size_t kLogChunkSize = 4;
+#else
+constexpr static size_t kLogChunkSize = 3;
+#endif
+
+constexpr static size_t kChunkSize = 1 << kLogChunkSize;
+
+template <typename Residual>
+void GenericEncodeChunk(const Residual* residuals, size_t n, size_t skip,
+                        const PrefixCode& code, BitWriter& output) {
+  for (size_t ix = skip; ix < n; ix++) {
+    unsigned token, nbits, bits;
+    EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
+    output.Write(code.raw_nbits[token] + nbits,
+                 code.raw_bits[token] | bits << code.raw_nbits[token]);
+  }
+}
+
+struct UpTo8Bits {
+  size_t bitdepth;
+  explicit UpTo8Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth <= 8);
+  }
+  // Here we can fit up to 9 extra bits + 7 Huffman bits in a u16; for all other
+  // symbols, we could actually go up to 8 Huffman bits as we have at most 8
+  // extra bits; however, the SIMD bit merging logic for AVX2 assumes that no
+  // Huffman length is 8 or more, so we cap at 8 anyway. Last symbol is used for
+  // LZ77 lengths and has no limitations except allowing to represent 32 symbols
+  // in total.
+  static constexpr uint8_t kMinRawLength[12] = {};
+  static constexpr uint8_t kMaxRawLength[12] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 16; }
+  static constexpr size_t kInputBytes = 1;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n <= 16);
+    memcpy(nbits_simd, nbits, 16);
+    memcpy(bits_simd, bits, 16);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo8(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                     std::max(skip, i) - i, bits32 + i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+
+  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
+    // values gain 1 bit for YCoCg, 1 bit for prediction.
+    // Maximum symbol is 1 + effective bit depth of residuals.
+    if (doing_ycocg_or_large_palette) {
+      return bitdepth + 3;
+    } else {
+      return bitdepth + 2;
+    }
+  }
+};
+constexpr uint8_t UpTo8Bits::kMinRawLength[];
+constexpr uint8_t UpTo8Bits::kMaxRawLength[];
+
+struct From9To13Bits {
+  size_t bitdepth;
+  explicit From9To13Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth <= 13 && bitdepth >= 9);
+  }
+  // Last symbol is used for LZ77 lengths and has no limitations except allowing
+  // to represent 32 symbols in total.
+  // We cannot fit all the bits in a u16, so do not even try and use up to 8
+  // bits per raw symbol.
+  // There are at most 16 raw symbols, so Huffman coding can be SIMDfied without
+  // any special tricks.
+  static constexpr uint8_t kMinRawLength[17] = {};
+  static constexpr uint8_t kMaxRawLength[17] = {
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 21; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n <= 16);
+    memcpy(nbits_simd, nbits, 16);
+    memcpy(bits_simd, bits, 16);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMDUpTo13(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                      std::max(skip, i) - i,
+                      bits32 + 2 * i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+
+  size_t NumSymbols(bool doing_ycocg_or_large_palette) const {
+    // values gain 1 bit for YCoCg, 1 bit for prediction.
+    // Maximum symbol is 1 + effective bit depth of residuals.
+    if (doing_ycocg_or_large_palette) {
+      return bitdepth + 3;
+    } else {
+      return bitdepth + 2;
+    }
+  }
+};
+constexpr uint8_t From9To13Bits::kMinRawLength[];
+constexpr uint8_t From9To13Bits::kMaxRawLength[];
+
+void CheckHuffmanBitsSIMD(int bits1, int nbits1, int bits2, int nbits2) {
+  assert(nbits1 == 8);
+  assert(nbits2 == 8);
+  assert(bits2 == (bits1 | 128));
+}
+
+struct Exactly14Bits {
+  explicit Exactly14Bits(size_t bitdepth) { assert(bitdepth == 14); }
+  // Force LZ77 symbols to have at least 8 bits, and raw symbols 15 and 16 to
+  // have exactly 8, and no other symbol to have 8 or more. This ensures that
+  // the representation for 15 and 16 is identical up to one bit.
+  static constexpr uint8_t kMinRawLength[18] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 7,
+  };
+  static constexpr uint8_t kMaxRawLength[18] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 10,
+  };
+  static constexpr size_t bitdepth = 14;
+  static size_t MaxEncodedBitsPerSample() { return 22; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int16_t;
+  using upixel_t = uint16_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n == 17);
+    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
+    memcpy(nbits_simd, nbits, 16);
+    memcpy(bits_simd, bits, 16);
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint16_t bits[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMD14(token, code, nbits_huff, bits_huff);
+      StoreSIMDUpTo14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                      std::max(skip, i) - i,
+                      bits32 + 2 * i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+
+  size_t NumSymbols(bool) const { return 17; }
+};
+constexpr uint8_t Exactly14Bits::kMinRawLength[];
+constexpr uint8_t Exactly14Bits::kMaxRawLength[];
+
+struct MoreThan14Bits {
+  size_t bitdepth;
+  explicit MoreThan14Bits(size_t bitdepth) : bitdepth(bitdepth) {
+    assert(bitdepth > 14);
+    assert(bitdepth <= 16);
+  }
+  // Force LZ77 symbols to have at least 8 bits, and raw symbols 13 to 18 to
+  // have exactly 8, and no other symbol to have 8 or more. This ensures that
+  // the representation for (13, 14), (15, 16), (17, 18) is identical up to one
+  // bit.
+  static constexpr uint8_t kMinRawLength[20] = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 7,
+  };
+  static constexpr uint8_t kMaxRawLength[20] = {
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 10,
+  };
+  static size_t MaxEncodedBitsPerSample() { return 24; }
+  static constexpr size_t kInputBytes = 2;
+  using pixel_t = int32_t;
+  using upixel_t = uint32_t;
+
+  static void PrepareForSimd(const uint8_t* nbits, const uint8_t* bits,
+                             size_t n, uint8_t* nbits_simd,
+                             uint8_t* bits_simd) {
+    assert(n == 19);
+    CheckHuffmanBitsSIMD(bits[13], nbits[13], bits[14], nbits[14]);
+    CheckHuffmanBitsSIMD(bits[15], nbits[15], bits[16], nbits[16]);
+    CheckHuffmanBitsSIMD(bits[17], nbits[17], bits[18], nbits[18]);
+    for (size_t i = 0; i < 14; i++) {
+      nbits_simd[i] = nbits[i];
+      bits_simd[i] = bits[i];
+    }
+    nbits_simd[14] = nbits[15];
+    bits_simd[14] = bits[15];
+    nbits_simd[15] = nbits[17];
+    bits_simd[15] = bits[17];
+  }
+
+  static void EncodeChunk(upixel_t* residuals, size_t n, size_t skip,
+                          const PrefixCode& code, BitWriter& output) {
+#ifdef FJXL_GENERIC_SIMD
+    Bits32 bits32[2 * kChunkSize / SIMDVec16::kLanes];
+    alignas(64) uint32_t bits[SIMDVec16::kLanes];
+    alignas(64) uint32_t nbits[SIMDVec16::kLanes];
+    alignas(64) uint16_t bits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t nbits_huff[SIMDVec16::kLanes];
+    alignas(64) uint16_t token[SIMDVec16::kLanes];
+    for (size_t i = 0; i < kChunkSize; i += SIMDVec16::kLanes) {
+      TokenizeSIMD(residuals + i, token, nbits, bits);
+      HuffmanSIMDAbove14(token, code, nbits_huff, bits_huff);
+      StoreSIMDAbove14(nbits, bits, nbits_huff, bits_huff, std::max(n, i) - i,
+                       std::max(skip, i) - i,
+                       bits32 + 2 * i / SIMDVec16::kLanes);
+    }
+    StoreToWriter<2 * kChunkSize / SIMDVec16::kLanes>(bits32, output);
+    return;
+#endif
+    GenericEncodeChunk(residuals, n, skip, code, output);
+  }
+  size_t NumSymbols(bool) const { return 19; }
+};
+constexpr uint8_t MoreThan14Bits::kMinRawLength[];
+constexpr uint8_t MoreThan14Bits::kMaxRawLength[];
+
+void PrepareDCGlobalCommon(bool is_single_group, size_t width, size_t height,
+                           const PrefixCode code[4], BitWriter* output) {
+  output->Allocate(100000 + (is_single_group ? width * height * 16 : 0));
+  // No patches, spline or noise.
+  output->Write(1, 1);  // default DC dequantization factors (?)
+  output->Write(1, 1);  // use global tree / histograms
+  output->Write(1, 0);  // no lz77 for the tree
+
+  output->Write(1, 1);         // simple code for the tree's context map
+  output->Write(2, 0);         // all contexts clustered together
+  output->Write(1, 1);         // use prefix code for tree
+  output->Write(4, 0);         // 000 hybrid uint
+  output->Write(6, 0b100011);  // Alphabet size is 4 (var16)
+  output->Write(2, 1);         // simple prefix code
+  output->Write(2, 3);         // with 4 symbols
+  output->Write(2, 0);
+  output->Write(2, 1);
+  output->Write(2, 2);
+  output->Write(2, 3);
+  output->Write(1, 0);  // First tree encoding option
+  // Huffman table + extra bits for the tree.
+  uint8_t symbol_bits[6] = {0b00, 0b10, 0b001, 0b101, 0b0011, 0b0111};
+  uint8_t symbol_nbits[6] = {2, 2, 3, 3, 4, 4};
+  // Write a tree with a leaf per channel, and gradient predictor for every
+  // leaf.
+  for (auto v : {1, 2, 1, 4, 1, 0, 0, 5, 0, 0, 0, 0, 5,
+                 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 0, 0}) {
+    output->Write(symbol_nbits[v], symbol_bits[v]);
+  }
+
+  output->Write(1, 1);     // Enable lz77 for the main bitstream
+  output->Write(2, 0b00);  // lz77 offset 224
+  static_assert(kLZ77Offset == 224, "");
+  output->Write(4, 0b1010);  // lz77 min length 7
+  // 400 hybrid uint config for lz77
+  output->Write(4, 4);
+  output->Write(3, 0);
+  output->Write(3, 0);
+
+  output->Write(1, 1);  // simple code for the context map
+  output->Write(2, 3);  // 3 bits per entry
+  output->Write(3, 4);  // channel 3
+  output->Write(3, 3);  // channel 2
+  output->Write(3, 2);  // channel 1
+  output->Write(3, 1);  // channel 0
+  output->Write(3, 0);  // distance histogram first
+
+  output->Write(1, 1);  // use prefix codes
+  output->Write(4, 0);  // 000 hybrid uint config for distances (only need 0)
+  for (size_t i = 0; i < 4; i++) {
+    output->Write(4, 0);  // 000 hybrid uint config for symbols (only <= 10)
+  }
+
+  // Distance alphabet size:
+  output->Write(5, 0b00001);  // 2: just need 1 for RLE (i.e. distance 1)
+  // Symbol + LZ77 alphabet size:
+  for (size_t i = 0; i < 4; i++) {
+    output->Write(1, 1);    // > 1
+    output->Write(4, 8);    // <= 512
+    output->Write(8, 256);  // == 512
+  }
+
+  // Distance histogram:
+  output->Write(2, 1);  // simple prefix code
+  output->Write(2, 0);  // with one symbol
+  output->Write(1, 1);  // 1
+
+  // Symbol + lz77 histogram:
+  for (size_t i = 0; i < 4; i++) {
+    code[i].WriteTo(output);
+  }
+
+  // Group header for global modular image.
+  output->Write(1, 1);  // Global tree
+  output->Write(1, 1);  // All default wp
+}
+
+void PrepareDCGlobal(bool is_single_group, size_t width, size_t height,
+                     size_t nb_chans, const PrefixCode code[4],
+                     BitWriter* output) {
+  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
+  if (nb_chans > 2) {
+    output->Write(2, 0b01);     // 1 transform
+    output->Write(2, 0b00);     // RCT
+    output->Write(5, 0b00000);  // Starting from ch 0
+    output->Write(2, 0b00);     // YCoCg
+  } else {
+    output->Write(2, 0b00);  // no transforms
+  }
+  if (!is_single_group) {
+    output->ZeroPadToByte();
+  }
+}
+
+template <typename BitDepth>
+struct ChunkEncoder {
+  FJXL_INLINE static void EncodeRle(size_t count, const PrefixCode& code,
+                                    BitWriter& output) {
+    if (count == 0) return;
+    count -= kLZ77MinLength + 1;
+    if (count < kLZ77CacheSize) {
+      output.Write(code.lz77_cache_nbits[count], code.lz77_cache_bits[count]);
+    } else {
+      unsigned token, nbits, bits;
+      EncodeHybridUintLZ77(count, &token, &nbits, &bits);
+      uint64_t wbits = bits;
+      wbits = (wbits << code.lz77_nbits[token]) | code.lz77_bits[token];
+      wbits = (wbits << code.raw_nbits[0]) | code.raw_bits[0];
+      output.Write(code.lz77_nbits[token] + nbits + code.raw_nbits[0], wbits);
+    }
+  }
+
+  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
+                         size_t skip, size_t n) {
+    EncodeRle(run, *code, *output);
+    BitDepth::EncodeChunk(residuals, n, skip, *code, *output);
+  }
+
+  inline void Finalize(size_t run) { EncodeRle(run, *code, *output); }
+
+  const PrefixCode* code;
+  BitWriter* output;
+};
+
+template <typename BitDepth>
+struct ChunkSampleCollector {
+  FJXL_INLINE void Rle(size_t count, uint64_t* lz77_counts) {
+    if (count == 0) return;
+    raw_counts[0] += 1;
+    count -= kLZ77MinLength + 1;
+    unsigned token, nbits, bits;
+    EncodeHybridUintLZ77(count, &token, &nbits, &bits);
+    lz77_counts[token]++;
+  }
+
+  FJXL_INLINE void Chunk(size_t run, typename BitDepth::upixel_t* residuals,
+                         size_t skip, size_t n) {
+    // Run is broken. Encode the run and encode the individual vector.
+    Rle(run, lz77_counts);
+    for (size_t ix = skip; ix < n; ix++) {
+      unsigned token, nbits, bits;
+      EncodeHybridUint000(residuals[ix], &token, &nbits, &bits);
+      raw_counts[token]++;
+    }
+  }
+
+  // don't count final run since we don't know how long it really is
+  void Finalize(size_t run) {}
+
+  uint64_t* raw_counts;
+  uint64_t* lz77_counts;
+};
+
+constexpr uint32_t PackSigned(int32_t value) {
+  return (static_cast<uint32_t>(value) << 1) ^
+         ((static_cast<uint32_t>(~value) >> 31) - 1);
+}
+
+template <typename T, typename BitDepth>
+struct ChannelRowProcessor {
+  using upixel_t = typename BitDepth::upixel_t;
+  using pixel_t = typename BitDepth::pixel_t;
+  T* t;
+  void ProcessChunk(const pixel_t* row, const pixel_t* row_left,
+                    const pixel_t* row_top, const pixel_t* row_topleft,
+                    size_t n) {
+    alignas(64) upixel_t residuals[kChunkSize] = {};
+    size_t prefix_size = 0;
+    size_t required_prefix_size = 0;
+#ifdef FJXL_GENERIC_SIMD
+    constexpr size_t kNum =
+        sizeof(pixel_t) == 2 ? SIMDVec16::kLanes : SIMDVec32::kLanes;
+    for (size_t ix = 0; ix < kChunkSize; ix += kNum) {
+      size_t c =
+          PredictPixels<simd_t<pixel_t>>(row + ix, row_left + ix, row_top + ix,
+                                         row_topleft + ix, residuals + ix);
+      prefix_size =
+          prefix_size == required_prefix_size ? prefix_size + c : prefix_size;
+      required_prefix_size += kNum;
+    }
+#else
+    for (size_t ix = 0; ix < kChunkSize; ix++) {
+      pixel_t px = row[ix];
+      pixel_t left = row_left[ix];
+      pixel_t top = row_top[ix];
+      pixel_t topleft = row_topleft[ix];
+      pixel_t ac = left - topleft;
+      pixel_t ab = left - top;
+      pixel_t bc = top - topleft;
+      pixel_t grad = static_cast<pixel_t>(static_cast<upixel_t>(ac) +
+                                          static_cast<upixel_t>(top));
+      pixel_t d = ab ^ bc;
+      pixel_t clamp = d < 0 ? top : left;
+      pixel_t s = ac ^ bc;
+      pixel_t pred = s < 0 ? grad : clamp;
+      residuals[ix] = PackSigned(px - pred);
+      prefix_size = prefix_size == required_prefix_size
+                        ? prefix_size + (residuals[ix] == 0)
+                        : prefix_size;
+      required_prefix_size += 1;
+    }
+#endif
+    prefix_size = std::min(n, prefix_size);
+    if (prefix_size == n && (run > 0 || prefix_size > kLZ77MinLength)) {
+      // Run continues, nothing to do.
+      run += prefix_size;
+    } else if (prefix_size + run > kLZ77MinLength) {
+      // Run is broken. Encode the run and encode the individual vector.
+      t->Chunk(run + prefix_size, residuals, prefix_size, n);
+      run = 0;
+    } else {
+      // There was no run to begin with.
+      t->Chunk(0, residuals, 0, n);
+    }
+  }
+
+  void ProcessRow(const pixel_t* row, const pixel_t* row_left,
+                  const pixel_t* row_top, const pixel_t* row_topleft,
+                  size_t xs) {
+    for (size_t x = 0; x < xs; x += kChunkSize) {
+      ProcessChunk(row + x, row_left + x, row_top + x, row_topleft + x,
+                   std::min(kChunkSize, xs - x));
+    }
+  }
+
+  void Finalize() { t->Finalize(run); }
+  // Invariant: run == 0 or run > kLZ77MinLength.
+  size_t run = 0;
+};
+
+uint16_t LoadLE16(const unsigned char* ptr) {
+  return uint16_t{ptr[0]} | (uint16_t{ptr[1]} << 8);
+}
+
+uint16_t SwapEndian(uint16_t in) { return (in >> 8) | (in << 8); }
+
+#ifdef FJXL_GENERIC_SIMD
+void StorePixels(SIMDVec16 p, int16_t* dest) { p.Store((uint16_t*)dest); }
+
+void StorePixels(SIMDVec16 p, int32_t* dest) {
+  VecPair<SIMDVec32> p_up = p.Upcast();
+  p_up.low.Store((uint32_t*)dest);
+  p_up.hi.Store((uint32_t*)dest + SIMDVec32::kLanes);
+}
+#endif
+
+template <typename pixel_t>
+void FillRowG8(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadG8(rgba + x);
+    StorePixels(rgb[0], luma + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    luma[x] = rgba[x];
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowG16(const unsigned char* rgba, size_t oxs, pixel_t* luma) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadG16(rgba + 2 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+    }
+    StorePixels(rgb[0], luma + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t val = LoadLE16(rgba + 2 * x);
+    if (big_endian) {
+      val = SwapEndian(val);
+    }
+    luma[x] = val;
+  }
+}
+
+template <typename pixel_t>
+void FillRowGA8(const unsigned char* rgba, size_t oxs, pixel_t* luma,
+                pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadGA8(rgba + 2 * x);
+    StorePixels(rgb[0], luma + x);
+    StorePixels(rgb[1], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    luma[x] = rgba[2 * x];
+    alpha[x] = rgba[2 * x + 1];
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowGA16(const unsigned char* rgba, size_t oxs, pixel_t* luma,
+                 pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadGA16(rgba + 4 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+      rgb[1].SwapEndian();
+    }
+    StorePixels(rgb[0], luma + x);
+    StorePixels(rgb[1], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t l = LoadLE16(rgba + 4 * x);
+    uint16_t a = LoadLE16(rgba + 4 * x + 2);
+    if (big_endian) {
+      l = SwapEndian(l);
+      a = SwapEndian(a);
+    }
+    luma[x] = l;
+    alpha[x] = a;
+  }
+}
+
+template <typename pixel_t>
+void StoreYCoCg(pixel_t r, pixel_t g, pixel_t b, pixel_t* y, pixel_t* co,
+                pixel_t* cg) {
+  *co = r - b;
+  pixel_t tmp = b + (*co >> 1);
+  *cg = g - tmp;
+  *y = tmp + (*cg >> 1);
+}
+
+#ifdef FJXL_GENERIC_SIMD
+void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int16_t* y, int16_t* co,
+                int16_t* cg) {
+  SIMDVec16 co_v = r.Sub(b);
+  SIMDVec16 tmp = b.Add(co_v.SignedShiftRight<1>());
+  SIMDVec16 cg_v = g.Sub(tmp);
+  SIMDVec16 y_v = tmp.Add(cg_v.SignedShiftRight<1>());
+  y_v.Store((uint16_t*)y);
+  co_v.Store((uint16_t*)co);
+  cg_v.Store((uint16_t*)cg);
+}
+
+void StoreYCoCg(SIMDVec16 r, SIMDVec16 g, SIMDVec16 b, int32_t* y, int32_t* co,
+                int32_t* cg) {
+  VecPair<SIMDVec32> r_up = r.Upcast();
+  VecPair<SIMDVec32> g_up = g.Upcast();
+  VecPair<SIMDVec32> b_up = b.Upcast();
+  SIMDVec32 co_lo_v = r_up.low.Sub(b_up.low);
+  SIMDVec32 tmp_lo = b_up.low.Add(co_lo_v.SignedShiftRight<1>());
+  SIMDVec32 cg_lo_v = g_up.low.Sub(tmp_lo);
+  SIMDVec32 y_lo_v = tmp_lo.Add(cg_lo_v.SignedShiftRight<1>());
+  SIMDVec32 co_hi_v = r_up.hi.Sub(b_up.hi);
+  SIMDVec32 tmp_hi = b_up.hi.Add(co_hi_v.SignedShiftRight<1>());
+  SIMDVec32 cg_hi_v = g_up.hi.Sub(tmp_hi);
+  SIMDVec32 y_hi_v = tmp_hi.Add(cg_hi_v.SignedShiftRight<1>());
+  y_lo_v.Store((uint32_t*)y);
+  co_lo_v.Store((uint32_t*)co);
+  cg_lo_v.Store((uint32_t*)cg);
+  y_hi_v.Store((uint32_t*)y + SIMDVec32::kLanes);
+  co_hi_v.Store((uint32_t*)co + SIMDVec32::kLanes);
+  cg_hi_v.Store((uint32_t*)cg + SIMDVec32::kLanes);
+}
+#endif
+
+template <typename pixel_t>
+void FillRowRGB8(const unsigned char* rgba, size_t oxs, pixel_t* y, pixel_t* co,
+                 pixel_t* cg) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGB8(rgba + 3 * x);
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = rgba[3 * x];
+    uint16_t g = rgba[3 * x + 1];
+    uint16_t b = rgba[3 * x + 2];
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowRGB16(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                  pixel_t* co, pixel_t* cg) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGB16(rgba + 6 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+      rgb[1].SwapEndian();
+      rgb[2].SwapEndian();
+    }
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = LoadLE16(rgba + 6 * x);
+    uint16_t g = LoadLE16(rgba + 6 * x + 2);
+    uint16_t b = LoadLE16(rgba + 6 * x + 4);
+    if (big_endian) {
+      r = SwapEndian(r);
+      g = SwapEndian(g);
+      b = SwapEndian(b);
+    }
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+  }
+}
+
+template <typename pixel_t>
+void FillRowRGBA8(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                  pixel_t* co, pixel_t* cg, pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGBA8(rgba + 4 * x);
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+    StorePixels(rgb[3], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = rgba[4 * x];
+    uint16_t g = rgba[4 * x + 1];
+    uint16_t b = rgba[4 * x + 2];
+    uint16_t a = rgba[4 * x + 3];
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+    alpha[x] = a;
+  }
+}
+
+template <bool big_endian, typename pixel_t>
+void FillRowRGBA16(const unsigned char* rgba, size_t oxs, pixel_t* y,
+                   pixel_t* co, pixel_t* cg, pixel_t* alpha) {
+  size_t x = 0;
+#ifdef FJXL_GENERIC_SIMD
+  for (; x + SIMDVec16::kLanes <= oxs; x += SIMDVec16::kLanes) {
+    auto rgb = SIMDVec16::LoadRGBA16(rgba + 8 * x);
+    if (big_endian) {
+      rgb[0].SwapEndian();
+      rgb[1].SwapEndian();
+      rgb[2].SwapEndian();
+      rgb[3].SwapEndian();
+    }
+    StoreYCoCg(rgb[0], rgb[1], rgb[2], y + x, co + x, cg + x);
+    StorePixels(rgb[3], alpha + x);
+  }
+#endif
+  for (; x < oxs; x++) {
+    uint16_t r = LoadLE16(rgba + 8 * x);
+    uint16_t g = LoadLE16(rgba + 8 * x + 2);
+    uint16_t b = LoadLE16(rgba + 8 * x + 4);
+    uint16_t a = LoadLE16(rgba + 8 * x + 6);
+    if (big_endian) {
+      r = SwapEndian(r);
+      g = SwapEndian(g);
+      b = SwapEndian(b);
+      a = SwapEndian(a);
+    }
+    StoreYCoCg<pixel_t>(r, g, b, y + x, co + x, cg + x);
+    alpha[x] = a;
+  }
+}
+
+template <typename Processor, typename BitDepth>
+void ProcessImageArea(const unsigned char* rgba, size_t x0, size_t y0,
+                      size_t xs, size_t yskip, size_t ys, size_t row_stride,
+                      BitDepth bitdepth, size_t nb_chans, bool big_endian,
+                      Processor* processors) {
+  constexpr size_t kPadding = 32;
+
+  using pixel_t = typename BitDepth::pixel_t;
+
+  constexpr size_t kAlign = 64;
+  constexpr size_t kAlignPixels = kAlign / sizeof(pixel_t);
+
+  auto align = [=](pixel_t* ptr) {
+    size_t offset = reinterpret_cast<uintptr_t>(ptr) % kAlign;
+    if (offset) {
+      ptr += offset / sizeof(pixel_t);
+    }
+    return ptr;
+  };
+
+  constexpr size_t kNumPx =
+      (256 + kPadding * 2 + kAlignPixels + kAlignPixels - 1) / kAlignPixels *
+      kAlignPixels;
+
+  std::vector<std::array<std::array<pixel_t, kNumPx>, 2>> group_data(nb_chans);
+
+  for (size_t y = 0; y < ys; y++) {
+    const auto rgba_row =
+        rgba + row_stride * (y0 + y) + x0 * nb_chans * BitDepth::kInputBytes;
+    pixel_t* crow[4] = {};
+    pixel_t* prow[4] = {};
+    for (size_t i = 0; i < nb_chans; i++) {
+      crow[i] = align(&group_data[i][y & 1][kPadding]);
+      prow[i] = align(&group_data[i][(y - 1) & 1][kPadding]);
+    }
+
+    // Pre-fill rows with YCoCg converted pixels.
+    if (nb_chans == 1) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowG8(rgba_row, xs, crow[0]);
+      } else if (big_endian) {
+        FillRowG16</*big_endian=*/true>(rgba_row, xs, crow[0]);
+      } else {
+        FillRowG16</*big_endian=*/false>(rgba_row, xs, crow[0]);
+      }
+    } else if (nb_chans == 2) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowGA8(rgba_row, xs, crow[0], crow[1]);
+      } else if (big_endian) {
+        FillRowGA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1]);
+      } else {
+        FillRowGA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1]);
+      }
+    } else if (nb_chans == 3) {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowRGB8(rgba_row, xs, crow[0], crow[1], crow[2]);
+      } else if (big_endian) {
+        FillRowRGB16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
+                                          crow[2]);
+      } else {
+        FillRowRGB16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
+                                           crow[2]);
+      }
+    } else {
+      if (BitDepth::kInputBytes == 1) {
+        FillRowRGBA8(rgba_row, xs, crow[0], crow[1], crow[2], crow[3]);
+      } else if (big_endian) {
+        FillRowRGBA16</*big_endian=*/true>(rgba_row, xs, crow[0], crow[1],
+                                           crow[2], crow[3]);
+      } else {
+        FillRowRGBA16</*big_endian=*/false>(rgba_row, xs, crow[0], crow[1],
+                                            crow[2], crow[3]);
+      }
+    }
+    // Deal with x == 0.
+    for (size_t c = 0; c < nb_chans; c++) {
+      *(crow[c] - 1) = y > 0 ? *(prow[c]) : 0;
+      // Fix topleft.
+      *(prow[c] - 1) = y > 0 ? *(prow[c]) : 0;
+    }
+    if (y < yskip) continue;
+    for (size_t c = 0; c < nb_chans; c++) {
+      // Get pointers to px/left/top/topleft data to speedup loop.
+      const pixel_t* row = crow[c];
+      const pixel_t* row_left = crow[c] - 1;
+      const pixel_t* row_top = y == 0 ? row_left : prow[c];
+      const pixel_t* row_topleft = y == 0 ? row_left : prow[c] - 1;
+
+      processors[c].ProcessRow(row, row_left, row_top, row_topleft, xs);
+    }
+  }
+  for (size_t c = 0; c < nb_chans; c++) {
+    processors[c].Finalize();
+  }
+}
+
+template <typename BitDepth>
+void WriteACSection(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
+                    size_t ys, size_t row_stride, bool is_single_group,
+                    BitDepth bitdepth, size_t nb_chans, bool big_endian,
+                    const PrefixCode code[4],
+                    std::array<BitWriter, 4>& output) {
+  for (size_t i = 0; i < nb_chans; i++) {
+    if (is_single_group && i == 0) continue;
+    output[i].Allocate(xs * ys * bitdepth.MaxEncodedBitsPerSample() + 4);
+  }
+  if (!is_single_group) {
+    // Group header for modular image.
+    // When the image is single-group, the global modular image is the one
+    // that contains the pixel data, and there is no group header.
+    output[0].Write(1, 1);     // Global tree
+    output[0].Write(1, 1);     // All default wp
+    output[0].Write(2, 0b00);  // 0 transforms
+  }
+
+  ChunkEncoder<BitDepth> encoders[4];
+  ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth> row_encoders[4];
+  for (size_t c = 0; c < nb_chans; c++) {
+    row_encoders[c].t = &encoders[c];
+    encoders[c].output = &output[c];
+    encoders[c].code = &code[c];
+  }
+  ProcessImageArea<ChannelRowProcessor<ChunkEncoder<BitDepth>, BitDepth>>(
+      rgba, x0, y0, xs, 0, ys, row_stride, bitdepth, nb_chans, big_endian,
+      row_encoders);
+}
+
+constexpr int kHashExp = 16;
+constexpr uint32_t kHashSize = 1 << kHashExp;
+constexpr uint32_t kHashMultiplier = 2654435761;
+constexpr int kMaxColors = 512;
+
+// can be any function that returns a value in 0 .. kHashSize-1
+// has to map 0 to 0
+inline uint32_t pixel_hash(uint32_t p) {
+  return (p * kHashMultiplier) >> (32 - kHashExp);
+}
+
+template <size_t nb_chans>
+void FillRowPalette(const unsigned char* inrow, size_t xs,
+                    const int16_t* lookup, int16_t* out) {
+  for (size_t x = 0; x < xs; x++) {
+    uint32_t p = 0;
+    memcpy(&p, inrow + x * nb_chans, nb_chans);
+    out[x] = lookup[pixel_hash(p)];
+  }
+}
+
+template <typename Processor>
+void ProcessImageAreaPalette(const unsigned char* rgba, size_t x0, size_t y0,
+                             size_t xs, size_t yskip, size_t ys,
+                             size_t row_stride, const int16_t* lookup,
+                             size_t nb_chans, Processor* processors) {
+  constexpr size_t kPadding = 32;
+
+  std::vector<std::array<int16_t, 256 + kPadding * 2>> group_data(2);
+  Processor& row_encoder = processors[0];
+
+  for (size_t y = 0; y < ys; y++) {
+    // Pre-fill rows with palette converted pixels.
+    const unsigned char* inrow = rgba + row_stride * (y0 + y) + x0 * nb_chans;
+    int16_t* outrow = &group_data[y & 1][kPadding];
+    if (nb_chans == 1) {
+      FillRowPalette<1>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 2) {
+      FillRowPalette<2>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 3) {
+      FillRowPalette<3>(inrow, xs, lookup, outrow);
+    } else if (nb_chans == 4) {
+      FillRowPalette<4>(inrow, xs, lookup, outrow);
+    }
+    // Deal with x == 0.
+    group_data[y & 1][kPadding - 1] =
+        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
+    // Fix topleft.
+    group_data[(y - 1) & 1][kPadding - 1] =
+        y > 0 ? group_data[(y - 1) & 1][kPadding] : 0;
+    // Get pointers to px/left/top/topleft data to speedup loop.
+    const int16_t* row = &group_data[y & 1][kPadding];
+    const int16_t* row_left = &group_data[y & 1][kPadding - 1];
+    const int16_t* row_top =
+        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding];
+    const int16_t* row_topleft =
+        y == 0 ? row_left : &group_data[(y - 1) & 1][kPadding - 1];
+
+    row_encoder.ProcessRow(row, row_left, row_top, row_topleft, xs);
+  }
+  row_encoder.Finalize();
+}
+
+void WriteACSectionPalette(const unsigned char* rgba, size_t x0, size_t y0,
+                           size_t xs, size_t ys, size_t row_stride,
+                           bool is_single_group, const PrefixCode code[4],
+                           const int16_t* lookup, size_t nb_chans,
+                           BitWriter& output) {
+  if (!is_single_group) {
+    output.Allocate(16 * xs * ys + 4);
+    // Group header for modular image.
+    // When the image is single-group, the global modular image is the one
+    // that contains the pixel data, and there is no group header.
+    output.Write(1, 1);     // Global tree
+    output.Write(1, 1);     // All default wp
+    output.Write(2, 0b00);  // 0 transforms
+  }
+
+  ChunkEncoder<UpTo8Bits> encoder;
+  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
+
+  row_encoder.t = &encoder;
+  encoder.output = &output;
+  encoder.code = &code[is_single_group ? 1 : 0];
+  ProcessImageAreaPalette<
+      ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits>>(
+      rgba, x0, y0, xs, 0, ys, row_stride, lookup, nb_chans, &row_encoder);
+}
+
+template <typename BitDepth>
+void CollectSamples(const unsigned char* rgba, size_t x0, size_t y0, size_t xs,
+                    size_t row_stride, size_t row_count,
+                    uint64_t raw_counts[4][kNumRawSymbols],
+                    uint64_t lz77_counts[4][kNumLZ77], bool is_single_group,
+                    bool palette, BitDepth bitdepth, size_t nb_chans,
+                    bool big_endian, const int16_t* lookup) {
+  if (palette) {
+    ChunkSampleCollector<UpTo8Bits> sample_collectors[4];
+    ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>
+        row_sample_collectors[4];
+    for (size_t c = 0; c < nb_chans; c++) {
+      row_sample_collectors[c].t = &sample_collectors[c];
+      sample_collectors[c].raw_counts = raw_counts[is_single_group ? 1 : 0];
+      sample_collectors[c].lz77_counts = lz77_counts[is_single_group ? 1 : 0];
+    }
+    ProcessImageAreaPalette<
+        ChannelRowProcessor<ChunkSampleCollector<UpTo8Bits>, UpTo8Bits>>(
+        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, lookup, nb_chans,
+        row_sample_collectors);
+  } else {
+    ChunkSampleCollector<BitDepth> sample_collectors[4];
+    ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>
+        row_sample_collectors[4];
+    for (size_t c = 0; c < nb_chans; c++) {
+      row_sample_collectors[c].t = &sample_collectors[c];
+      sample_collectors[c].raw_counts = raw_counts[c];
+      sample_collectors[c].lz77_counts = lz77_counts[c];
+    }
+    ProcessImageArea<
+        ChannelRowProcessor<ChunkSampleCollector<BitDepth>, BitDepth>>(
+        rgba, x0, y0, xs, 1, 1 + row_count, row_stride, bitdepth, nb_chans,
+        big_endian, row_sample_collectors);
+  }
+}
+
+void PrepareDCGlobalPalette(bool is_single_group, size_t width, size_t height,
+                            size_t nb_chans, const PrefixCode code[4],
+                            const std::vector<uint32_t>& palette,
+                            size_t pcolors, BitWriter* output) {
+  PrepareDCGlobalCommon(is_single_group, width, height, code, output);
+  output->Write(2, 0b01);     // 1 transform
+  output->Write(2, 0b01);     // Palette
+  output->Write(5, 0b00000);  // Starting from ch 0
+  if (nb_chans == 1) {
+    output->Write(2, 0b00);  // 1-channel palette (Gray)
+  } else if (nb_chans == 3) {
+    output->Write(2, 0b01);  // 3-channel palette (RGB)
+  } else if (nb_chans == 4) {
+    output->Write(2, 0b10);  // 4-channel palette (RGBA)
+  } else {
+    output->Write(2, 0b11);
+    output->Write(13, nb_chans - 1);
+  }
+  // pcolors <= kMaxColors + kChunkSize - 1
+  static_assert(kMaxColors + kChunkSize < 1281,
+                "add code to signal larger palette sizes");
+  if (pcolors < 256) {
+    output->Write(2, 0b00);
+    output->Write(8, pcolors);
+  } else {
+    output->Write(2, 0b01);
+    output->Write(10, pcolors - 256);
+  }
+
+  output->Write(2, 0b00);  // nb_deltas == 0
+  output->Write(4, 0);     // Zero predictor for delta palette
+  // Encode palette
+  ChunkEncoder<UpTo8Bits> encoder;
+  ChannelRowProcessor<ChunkEncoder<UpTo8Bits>, UpTo8Bits> row_encoder;
+  row_encoder.t = &encoder;
+  encoder.output = output;
+  encoder.code = &code[0];
+  int16_t p[4][32 + 1024] = {};
+  uint8_t prgba[4];
+  size_t i = 0;
+  size_t have_zero = 0;
+  if (palette[pcolors - 1] == 0) have_zero = 1;
+  for (; i < pcolors; i++) {
+    memcpy(prgba, &palette[i], 4);
+    p[0][16 + i + have_zero] = prgba[0];
+    p[1][16 + i + have_zero] = prgba[1];
+    p[2][16 + i + have_zero] = prgba[2];
+    p[3][16 + i + have_zero] = prgba[3];
+  }
+  p[0][15] = 0;
+  row_encoder.ProcessRow(p[0] + 16, p[0] + 15, p[0] + 15, p[0] + 15, pcolors);
+  p[1][15] = p[0][16];
+  p[0][15] = p[0][16];
+  if (nb_chans > 1) {
+    row_encoder.ProcessRow(p[1] + 16, p[1] + 15, p[0] + 16, p[0] + 15, pcolors);
+  }
+  p[2][15] = p[1][16];
+  p[1][15] = p[1][16];
+  if (nb_chans > 2) {
+    row_encoder.ProcessRow(p[2] + 16, p[2] + 15, p[1] + 16, p[1] + 15, pcolors);
+  }
+  p[3][15] = p[2][16];
+  p[2][15] = p[2][16];
+  if (nb_chans > 3) {
+    row_encoder.ProcessRow(p[3] + 16, p[3] + 15, p[2] + 16, p[2] + 15, pcolors);
+  }
+  row_encoder.Finalize();
+
+  if (!is_single_group) {
+    output->ZeroPadToByte();
+  }
+}
+
+template <size_t nb_chans>
+bool detect_palette(const unsigned char* r, size_t width,
+                    std::vector<uint32_t>& palette) {
+  size_t x = 0;
+  bool collided = false;
+  // this is just an unrolling of the next loop
+  for (; x + 7 < width; x += 8) {
+    uint32_t p[8] = {}, index[8];
+    for (int i = 0; i < 8; i++) memcpy(&p[i], r + (x + i) * nb_chans, 4);
+    for (int i = 0; i < 8; i++) p[i] &= ((1llu << (8 * nb_chans)) - 1);
+    for (int i = 0; i < 8; i++) index[i] = pixel_hash(p[i]);
+    for (int i = 0; i < 8; i++) {
+      collided |= (palette[index[i]] != 0 && p[i] != palette[index[i]]);
+    }
+    for (int i = 0; i < 8; i++) palette[index[i]] = p[i];
+  }
+  for (; x < width; x++) {
+    uint32_t p = 0;
+    memcpy(&p, r + x * nb_chans, nb_chans);
+    uint32_t index = pixel_hash(p);
+    collided |= (palette[index] != 0 && p != palette[index]);
+    palette[index] = p;
+  }
+  return collided;
+}
+
+template <typename BitDepth>
+JxlFastLosslessFrameState* LLEnc(const unsigned char* rgba, size_t width,
+                                 size_t stride, size_t height,
+                                 BitDepth bitdepth, size_t nb_chans,
+                                 bool big_endian, int effort,
+                                 void* runner_opaque,
+                                 FJxlParallelRunner runner) {
+  assert(width != 0);
+  assert(height != 0);
+  assert(stride >= nb_chans * BitDepth::kInputBytes * width);
+
+  // Count colors to try palette
+  std::vector<uint32_t> palette(kHashSize);
+  std::vector<int16_t> lookup(kHashSize);
+  lookup[0] = 0;
+  int pcolors = 0;
+  bool collided = effort < 2 || bitdepth.bitdepth != 8;
+  for (size_t y = 0; y < height && !collided; y++) {
+    const unsigned char* r = rgba + stride * y;
+    if (nb_chans == 1) collided = detect_palette<1>(r, width, palette);
+    if (nb_chans == 2) collided = detect_palette<2>(r, width, palette);
+    if (nb_chans == 3) collided = detect_palette<3>(r, width, palette);
+    if (nb_chans == 4) collided = detect_palette<4>(r, width, palette);
+  }
+
+  int nb_entries = 0;
+  if (!collided) {
+    pcolors = 1;  // always have all-zero as a palette color
+    bool have_color = false;
+    uint8_t minG = 255, maxG = 0;
+    for (uint32_t k = 0; k < kHashSize; k++) {
+      if (palette[k] == 0) continue;
+      uint8_t p[4];
+      memcpy(p, &palette[k], 4);
+      // move entries to front so sort has less work
+      palette[nb_entries] = palette[k];
+      if (p[0] != p[1] || p[0] != p[2]) have_color = true;
+      if (p[1] < minG) minG = p[1];
+      if (p[1] > maxG) maxG = p[1];
+      nb_entries++;
+      // don't do palette if too many colors are needed
+      if (nb_entries + pcolors > kMaxColors) {
+        collided = true;
+        break;
+      }
+    }
+    if (!have_color) {
+      // don't do palette if it's just grayscale without many holes
+      if (maxG - minG < nb_entries * 1.4f) collided = true;
+    }
+  }
+  if (!collided) {
+    std::sort(
+        palette.begin(), palette.begin() + nb_entries,
+        [&nb_chans](uint32_t ap, uint32_t bp) {
+          if (ap == 0) return false;
+          if (bp == 0) return true;
+          uint8_t a[4], b[4];
+          memcpy(a, &ap, 4);
+          memcpy(b, &bp, 4);
+          float ay, by;
+          if (nb_chans == 4) {
+            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f) * a[3];
+            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f) * b[3];
+          } else {
+            ay = (0.299f * a[0] + 0.587f * a[1] + 0.114f * a[2] + 0.01f);
+            by = (0.299f * b[0] + 0.587f * b[1] + 0.114f * b[2] + 0.01f);
+          }
+          return ay < by;  // sort on alpha*luma
+        });
+    for (int k = 0; k < nb_entries; k++) {
+      if (palette[k] == 0) break;
+      lookup[pixel_hash(palette[k])] = pcolors++;
+    }
+  }
+
+  size_t num_groups_x = (width + 255) / 256;
+  size_t num_groups_y = (height + 255) / 256;
+  size_t num_dc_groups_x = (width + 2047) / 2048;
+  size_t num_dc_groups_y = (height + 2047) / 2048;
+
+  uint64_t raw_counts[4][kNumRawSymbols] = {};
+  uint64_t lz77_counts[4][kNumLZ77] = {};
+
+  bool onegroup = num_groups_x == 1 && num_groups_y == 1;
+
+  // sample the middle (effort * 2) rows of every group
+  for (size_t g = 0; g < num_groups_y * num_groups_x; g++) {
+    size_t xg = g % num_groups_x;
+    size_t yg = g / num_groups_x;
+    int y_offset = yg * 256;
+    int y_max = std::min<size_t>(height - yg * 256, 256);
+    int y_begin = y_offset + std::max<int>(0, y_max - 2 * effort) / 2;
+    int y_count =
+        std::min<int>(2 * effort * y_max / 256, y_offset + y_max - y_begin - 1);
+    int x_max =
+        std::min<size_t>(width - xg * 256, 256) / kChunkSize * kChunkSize;
+    CollectSamples(rgba, xg * 256, y_begin, x_max, stride, y_count, raw_counts,
+                   lz77_counts, onegroup, !collided, bitdepth, nb_chans,
+                   big_endian, lookup.data());
+  }
+
+  // TODO(veluca): can probably improve this and make it bitdepth-dependent.
+  uint64_t base_raw_counts[kNumRawSymbols] = {
+      3843, 852, 1270, 1214, 1014, 727, 481, 300, 159, 51,
+      5,    1,   1,    1,    1,    1,   1,   1,   1};
+
+  bool doing_ycocg = nb_chans > 2 && collided;
+  bool large_palette = !collided || pcolors >= 256;
+  for (size_t i = bitdepth.NumSymbols(doing_ycocg || large_palette);
+       i < kNumRawSymbols; i++) {
+    base_raw_counts[i] = 0;
+  }
+
+  for (size_t c = 0; c < 4; c++) {
+    for (size_t i = 0; i < kNumRawSymbols; i++) {
+      raw_counts[c][i] = (raw_counts[c][i] << 8) + base_raw_counts[i];
+    }
+  }
+
+  if (!collided) {
+    unsigned token, nbits, bits;
+    EncodeHybridUint000(PackSigned(pcolors - 1), &token, &nbits, &bits);
+    // ensure all palette indices can actually be encoded
+    for (size_t i = 0; i < token + 1; i++)
+      raw_counts[0][i] = std::max<uint64_t>(raw_counts[0][i], 1);
+    // these tokens are only used for the palette itself so they can get a bad
+    // code
+    for (size_t i = token + 1; i < 10; i++) raw_counts[0][i] = 1;
+  }
+
+  uint64_t base_lz77_counts[kNumLZ77] = {
+      29, 27, 25,  23, 21, 21, 19, 18, 21, 17, 16, 15, 15, 14,
+      13, 13, 137, 98, 61, 34, 1,  1,  1,  1,  1,  1,  1,  1,
+  };
+
+  for (size_t c = 0; c < 4; c++) {
+    for (size_t i = 0; i < kNumLZ77; i++) {
+      lz77_counts[c][i] = (lz77_counts[c][i] << 8) + base_lz77_counts[i];
+    }
+  }
+
+  alignas(64) PrefixCode hcode[4];
+  for (size_t i = 0; i < 4; i++) {
+    hcode[i] = PrefixCode(bitdepth, raw_counts[i], lz77_counts[i]);
+  }
+
+  size_t num_groups = onegroup ? 1
+                               : (2 + num_dc_groups_x * num_dc_groups_y +
+                                  num_groups_x * num_groups_y);
+
+  JxlFastLosslessFrameState* frame_state = new JxlFastLosslessFrameState();
+
+  frame_state->width = width;
+  frame_state->height = height;
+  frame_state->nb_chans = nb_chans;
+  frame_state->bitdepth = bitdepth.bitdepth;
+
+  frame_state->group_data = std::vector<std::array<BitWriter, 4>>(num_groups);
+  if (collided) {
+    PrepareDCGlobal(onegroup, width, height, nb_chans, hcode,
+                    &frame_state->group_data[0][0]);
+  } else {
+    PrepareDCGlobalPalette(onegroup, width, height, nb_chans, hcode, palette,
+                           pcolors, &frame_state->group_data[0][0]);
+  }
+
+  auto run_one = [&](size_t g) {
+    size_t xg = g % num_groups_x;
+    size_t yg = g / num_groups_x;
+    size_t group_id =
+        onegroup ? 0 : (2 + num_dc_groups_x * num_dc_groups_y + g);
+    size_t xs = std::min<size_t>(width - xg * 256, 256);
+    size_t ys = std::min<size_t>(height - yg * 256, 256);
+    size_t x0 = xg * 256;
+    size_t y0 = yg * 256;
+    auto& gd = frame_state->group_data[group_id];
+    if (collided) {
+      WriteACSection(rgba, x0, y0, xs, ys, stride, onegroup, bitdepth, nb_chans,
+                     big_endian, hcode, gd);
+
+    } else {
+      WriteACSectionPalette(rgba, x0, y0, xs, ys, stride, onegroup, hcode,
+                            lookup.data(), nb_chans, gd[0]);
+    }
+  };
+
+  runner(
+      runner_opaque, &run_one,
+      +[](void* r, size_t i) { (*reinterpret_cast<decltype(&run_one)>(r))(i); },
+      num_groups_x * num_groups_y);
+
+  return frame_state;
+}
+
+JxlFastLosslessFrameState* JxlFastLosslessEncodeImpl(
+    const unsigned char* rgba, size_t width, size_t stride, size_t height,
+    size_t nb_chans, size_t bitdepth, bool big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner) {
+  assert(bitdepth > 0);
+  assert(nb_chans <= 4);
+  assert(nb_chans != 0);
+  if (bitdepth <= 8) {
+    return LLEnc(rgba, width, stride, height, UpTo8Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  if (bitdepth <= 13) {
+    return LLEnc(rgba, width, stride, height, From9To13Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  if (bitdepth == 14) {
+    return LLEnc(rgba, width, stride, height, Exactly14Bits(bitdepth), nb_chans,
+                 big_endian, effort, runner_opaque, runner);
+  }
+  return LLEnc(rgba, width, stride, height, MoreThan14Bits(bitdepth), nb_chans,
+               big_endian, effort, runner_opaque, runner);
+}
+
+}  // namespace
+
+#endif  // FJXL_SELF_INCLUDE
+
+#ifndef FJXL_SELF_INCLUDE
+
+#define FJXL_SELF_INCLUDE
+
+// If we have NEON enabled, it is the default target.
+#if FJXL_ENABLE_NEON
+
+namespace default_implementation {
+#define FJXL_NEON
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_NEON
+}  // namespace default_implementation
+
+#else  // FJXL_ENABLE_NEON
+
+namespace default_implementation {
+#include "lib/jxl/enc_fast_lossless.cc"
+}
+
+#if FJXL_ENABLE_AVX2
+#ifdef __clang__
+#pragma clang attribute push(__attribute__((target("avx,avx2"))), \
+                             apply_to = function)
+// Causes spurious warnings on clang5.
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmissing-braces"
+#elif defined(__GNUC__)
+#pragma GCC push_options
+// Seems to cause spurious errors on GCC8.
+#pragma GCC diagnostic ignored "-Wpsabi"
+#pragma GCC target "avx,avx2"
+#endif
+
+namespace AVX2 {
+#define FJXL_AVX2
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_AVX2
+}  // namespace AVX2
+
+#ifdef __clang__
+#pragma clang attribute pop
+#pragma clang diagnostic pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#endif  // FJXL_ENABLE_AVX2
+
+#if FJXL_ENABLE_AVX512
+#ifdef __clang__
+#pragma clang attribute push(                                                 \
+    __attribute__((target("avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"))), \
+    apply_to = function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC target "avx512cd,avx512bw,avx512vl,avx512f,avx512vbmi"
+#endif
+
+namespace AVX512 {
+#define FJXL_AVX512
+#include "lib/jxl/enc_fast_lossless.cc"
+#undef FJXL_AVX512
+}  // namespace AVX512
+
+#ifdef __clang__
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+#endif  // FJXL_ENABLE_AVX512
+
+#endif
+
+extern "C" {
+
+#if FJXL_STANDALONE
+size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
+                             size_t row_stride, size_t height, size_t nb_chans,
+                             size_t bitdepth, int big_endian, int effort,
+                             unsigned char** output, void* runner_opaque,
+                             FJxlParallelRunner runner) {
+  auto frame_state = JxlFastLosslessPrepareFrame(
+      rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
+      runner_opaque, runner);
+  JxlFastLosslessPrepareHeader(frame_state, /*add_image_header=*/1,
+                               /*is_last=*/1);
+  size_t output_size = JxlFastLosslessMaxRequiredOutput(frame_state);
+  *output = (unsigned char*)malloc(output_size);
+  size_t written = 0;
+  size_t total = 0;
+  while ((written = JxlFastLosslessWriteOutput(frame_state, *output + total,
+                                               output_size - total)) != 0) {
+    total += written;
+  }
+  return total;
+}
+#endif
+
+JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
+    const unsigned char* rgba, size_t width, size_t row_stride, size_t height,
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner) {
+  auto trivial_runner =
+      +[](void*, void* opaque, void fun(void*, size_t), size_t count) {
+        for (size_t i = 0; i < count; i++) {
+          fun(opaque, i);
+        }
+      };
+
+  if (runner == nullptr) {
+    runner = trivial_runner;
+  }
+
+#if FJXL_ENABLE_AVX512
+  if (__builtin_cpu_supports("avx512cd") &&
+      __builtin_cpu_supports("avx512vbmi") &&
+      __builtin_cpu_supports("avx512bw") && __builtin_cpu_supports("avx512f") &&
+      __builtin_cpu_supports("avx512vl")) {
+    return AVX512::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height,
+                                             nb_chans, bitdepth, big_endian,
+                                             effort, runner_opaque, runner);
+  }
+#endif
+#if FJXL_ENABLE_AVX2
+  if (__builtin_cpu_supports("avx2")) {
+    return AVX2::JxlFastLosslessEncodeImpl(rgba, width, row_stride, height,
+                                           nb_chans, bitdepth, big_endian,
+                                           effort, runner_opaque, runner);
+  }
+#endif
+
+  return default_implementation::JxlFastLosslessEncodeImpl(
+      rgba, width, row_stride, height, nb_chans, bitdepth, big_endian, effort,
+      runner_opaque, runner);
+}
+
+}  // extern "C"
+
+#endif  // FJXL_SELF_INCLUDE
diff --git a/lib/jxl/enc_fast_lossless.h b/lib/jxl/enc_fast_lossless.h
new file mode 100644 (file)
index 0000000..f0bcd72
--- /dev/null
@@ -0,0 +1,85 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FAST_LOSSLESS_H_
+#define LIB_JXL_ENC_FAST_LOSSLESS_H_
+#include <stdlib.h>
+
+// FJXL_STANDALONE=1 for a stand-alone jxl encoder
+// FJXL_STANDALONE=0 for use in libjxl to encode frames (but no image header)
+#ifndef FJXL_STANDALONE
+#ifdef JPEGXL_MAJOR_VERSION
+#define FJXL_STANDALONE 0
+#else
+#define FJXL_STANDALONE 1
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Simple encoding API.
+
+// A FJxlParallelRunner must call fun(opaque, i) for all i from 0 to count. It
+// may do so in parallel.
+typedef void(FJxlParallelRunner)(void* runner_opaque, void* opaque,
+                                 void fun(void*, size_t), size_t count);
+
+#if FJXL_STANDALONE
+// You may pass `nullptr` as a runner: encoding will be sequential.
+size_t JxlFastLosslessEncode(const unsigned char* rgba, size_t width,
+                             size_t row_stride, size_t height, size_t nb_chans,
+                             size_t bitdepth, int big_endian, int effort,
+                             unsigned char** output, void* runner_opaque,
+                             FJxlParallelRunner runner);
+#endif
+
+// More complex API for cases in which you may want to allocate your own buffer
+// and other advanced use cases.
+
+// Opaque struct that represents an intermediate state of the computation.
+struct JxlFastLosslessFrameState;
+
+// Returned JxlFastLosslessFrameState must be freed by calling
+// JxlFastLosslessFreeFrameState.
+JxlFastLosslessFrameState* JxlFastLosslessPrepareFrame(
+    const unsigned char* rgba, size_t width, size_t row_stride, size_t height,
+    size_t nb_chans, size_t bitdepth, int big_endian, int effort,
+    void* runner_opaque, FJxlParallelRunner runner);
+
+// Prepare the (image/frame) header. You may encode animations by concatenating
+// the output of multiple frames, of which the first one has add_image_header =
+// 1 and subsequent ones have add_image_header = 0, and all frames but the last
+// one have is_last = 0.
+// (when FJXL_STANDALONE=0, add_image_header has to be 0)
+void JxlFastLosslessPrepareHeader(JxlFastLosslessFrameState* frame,
+                                  int add_image_header, int is_last);
+
+// Upper bound on the required output size, including any padding that may be
+// required by JxlFastLosslessWriteOutput. Cannot be called before
+// JxlFastLosslessPrepareHeader.
+size_t JxlFastLosslessMaxRequiredOutput(const JxlFastLosslessFrameState* frame);
+
+// Actual size of the frame once it is encoded. This is not identical to
+// JxlFastLosslessMaxRequiredOutput because JxlFastLosslessWriteOutput may
+// require extra padding.
+size_t JxlFastLosslessOutputSize(const JxlFastLosslessFrameState* frame);
+
+// Writes the frame to the given output buffer. Returns the number of bytes that
+// were written, which is at least 1 unless the entire output has been written
+// already. It is required that `output_size >= 32` when calling this function.
+// This function must be called repeatedly until it returns 0.
+size_t JxlFastLosslessWriteOutput(JxlFastLosslessFrameState* frame,
+                                  unsigned char* output, size_t output_size);
+
+// Frees the provided frame state.
+void JxlFastLosslessFreeFrameState(JxlFastLosslessFrameState* frame);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // LIB_JXL_ENC_FAST_LOSSLESS_H_
diff --git a/lib/jxl/enc_fields.cc b/lib/jxl/enc_fields.cc
new file mode 100644 (file)
index 0000000..22c763e
--- /dev/null
@@ -0,0 +1,239 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_fields.h"
+
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/fields.h"
+
+namespace jxl {
+
+namespace {
+using ::jxl::fields_internal::VisitorBase;
+class WriteVisitor : public VisitorBase {
+ public:
+  WriteVisitor(const size_t extension_bits, BitWriter* JXL_RESTRICT writer)
+      : extension_bits_(extension_bits), writer_(writer) {}
+
+  Status Bits(const size_t bits, const uint32_t /*default_value*/,
+              uint32_t* JXL_RESTRICT value) override {
+    ok_ &= BitsCoder::Write(bits, *value, writer_);
+    return true;
+  }
+  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
+             uint32_t* JXL_RESTRICT value) override {
+    ok_ &= U32Coder::Write(enc, *value, writer_);
+    return true;
+  }
+
+  Status U64(const uint64_t /*default_value*/,
+             uint64_t* JXL_RESTRICT value) override {
+    ok_ &= U64Coder::Write(*value, writer_);
+    return true;
+  }
+
+  Status F16(const float /*default_value*/,
+             float* JXL_RESTRICT value) override {
+    ok_ &= F16Coder::Write(*value, writer_);
+    return true;
+  }
+
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
+    if (*extensions == 0) {
+      JXL_ASSERT(extension_bits_ == 0);
+      return true;
+    }
+    // TODO(janwas): extend API to pass in array of extension_bits, one per
+    // extension. We currently ascribe all bits to the first extension, but
+    // this is only an encoder limitation. NOTE: extension_bits_ can be zero
+    // if an extension does not require any additional fields.
+    ok_ &= U64Coder::Write(extension_bits_, writer_);
+    // For each nonzero bit except the lowest/first (already written):
+    for (uint64_t remaining_extensions = *extensions & (*extensions - 1);
+         remaining_extensions != 0;
+         remaining_extensions &= remaining_extensions - 1) {
+      ok_ &= U64Coder::Write(0, writer_);
+    }
+    return true;
+  }
+  // EndExtensions = default.
+
+  Status OK() const { return ok_; }
+
+ private:
+  const size_t extension_bits_;
+  BitWriter* JXL_RESTRICT writer_;
+  bool ok_ = true;
+};
+}  // namespace
+
+Status Bundle::Write(const Fields& fields, BitWriter* writer, size_t layer,
+                     AuxOut* aux_out) {
+  size_t extension_bits, total_bits;
+  JXL_RETURN_IF_ERROR(Bundle::CanEncode(fields, &extension_bits, &total_bits));
+
+  BitWriter::Allotment allotment(writer, total_bits);
+  WriteVisitor visitor(extension_bits, writer);
+  JXL_RETURN_IF_ERROR(visitor.VisitConst(fields));
+  JXL_RETURN_IF_ERROR(visitor.OK());
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
+  return true;
+}
+
+// Returns false if the value is too large to encode.
+Status BitsCoder::Write(const size_t bits, const uint32_t value,
+                        BitWriter* JXL_RESTRICT writer) {
+  if (value >= (1ULL << bits)) {
+    return JXL_FAILURE("Value %d too large to encode in %" PRIu64 " bits",
+                       value, static_cast<uint64_t>(bits));
+  }
+  writer->Write(bits, value);
+  return true;
+}
+
+// Returns false if the value is too large to encode.
+Status U32Coder::Write(const U32Enc enc, const uint32_t value,
+                       BitWriter* JXL_RESTRICT writer) {
+  uint32_t selector;
+  size_t total_bits;
+  JXL_RETURN_IF_ERROR(ChooseSelector(enc, value, &selector, &total_bits));
+
+  writer->Write(2, selector);
+
+  const U32Distr d = enc.GetDistr(selector);
+  if (!d.IsDirect()) {  // Nothing more to write for direct encoding
+    const uint32_t offset = d.Offset();
+    JXL_ASSERT(value >= offset);
+    writer->Write(total_bits - 2, value - offset);
+  }
+
+  return true;
+}
+
+// Returns false if the value is too large to encode.
+Status U64Coder::Write(uint64_t value, BitWriter* JXL_RESTRICT writer) {
+  if (value == 0) {
+    // Selector: use 0 bits, value 0
+    writer->Write(2, 0);
+  } else if (value <= 16) {
+    // Selector: use 4 bits, value 1..16
+    writer->Write(2, 1);
+    writer->Write(4, value - 1);
+  } else if (value <= 272) {
+    // Selector: use 8 bits, value 17..272
+    writer->Write(2, 2);
+    writer->Write(8, value - 17);
+  } else {
+    // Selector: varint, first a 12-bit group, after that per 8-bit group.
+    writer->Write(2, 3);
+    writer->Write(12, value & 4095);
+    value >>= 12;
+    int shift = 12;
+    while (value > 0 && shift < 60) {
+      // Indicate varint not done
+      writer->Write(1, 1);
+      writer->Write(8, value & 255);
+      value >>= 8;
+      shift += 8;
+    }
+    if (value > 0) {
+      // This only could happen if shift == N - 4.
+      writer->Write(1, 1);
+      writer->Write(4, value & 15);
+      // Implicitly closed sequence, no extra stop bit is required.
+    } else {
+      // Indicate end of varint
+      writer->Write(1, 0);
+    }
+  }
+
+  return true;
+}
+
+Status F16Coder::Write(float value, BitWriter* JXL_RESTRICT writer) {
+  uint32_t bits32;
+  memcpy(&bits32, &value, sizeof(bits32));
+  const uint32_t sign = bits32 >> 31;
+  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
+  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
+
+  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127;
+  if (JXL_UNLIKELY(exp > 15)) {
+    return JXL_FAILURE("Too big to encode, CanEncode should return false");
+  }
+
+  // Tiny or zero => zero.
+  if (exp < -24) {
+    writer->Write(16, 0);
+    return true;
+  }
+
+  uint32_t biased_exp16, mantissa16;
+
+  // exp = [-24, -15] => subnormal
+  if (JXL_UNLIKELY(exp < -14)) {
+    biased_exp16 = 0;
+    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
+    JXL_ASSERT(1 <= sub_exp && sub_exp < 11);
+    mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
+  } else {
+    // exp = [-14, 15]
+    biased_exp16 = static_cast<uint32_t>(exp + 15);
+    JXL_ASSERT(1 <= biased_exp16 && biased_exp16 < 31);
+    mantissa16 = mantissa32 >> 13;
+  }
+
+  JXL_ASSERT(mantissa16 < 1024);
+  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
+  JXL_ASSERT(bits16 < 0x10000);
+  writer->Write(16, bits16);
+  return true;
+}
+
+Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer,
+                              AuxOut* aux_out) {
+  // Marker/signature
+  BitWriter::Allotment allotment(writer, 16);
+  writer->Write(8, 0xFF);
+  writer->Write(8, kCodestreamMarker);
+  allotment.ReclaimAndCharge(writer, kLayerHeader, aux_out);
+
+  JXL_RETURN_IF_ERROR(
+      WriteSizeHeader(metadata->size, writer, kLayerHeader, aux_out));
+
+  JXL_RETURN_IF_ERROR(
+      WriteImageMetadata(metadata->m, writer, kLayerHeader, aux_out));
+
+  metadata->transform_data.nonserialized_xyb_encoded = metadata->m.xyb_encoded;
+  JXL_RETURN_IF_ERROR(
+      Bundle::Write(metadata->transform_data, writer, kLayerHeader, aux_out));
+
+  return true;
+}
+
+Status WriteFrameHeader(const FrameHeader& frame,
+                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) {
+  return Bundle::Write(frame, writer, kLayerHeader, aux_out);
+}
+
+Status WriteImageMetadata(const ImageMetadata& metadata,
+                          BitWriter* JXL_RESTRICT writer, size_t layer,
+                          AuxOut* aux_out) {
+  return Bundle::Write(metadata, writer, layer, aux_out);
+}
+
+Status WriteQuantizerParams(const QuantizerParams& params,
+                            BitWriter* JXL_RESTRICT writer, size_t layer,
+                            AuxOut* aux_out) {
+  return Bundle::Write(params, writer, layer, aux_out);
+}
+
+Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
+                       size_t layer, AuxOut* aux_out) {
+  return Bundle::Write(size, writer, layer, aux_out);
+}
+
+}  // namespace jxl
diff --git a/lib/jxl/enc_fields.h b/lib/jxl/enc_fields.h
new file mode 100644 (file)
index 0000000..5bb179a
--- /dev/null
@@ -0,0 +1,37 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_ENC_FIELDS_H_
+#define LIB_JXL_ENC_FIELDS_H_
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/quantizer.h"
+
+namespace jxl {
+
+struct AuxOut;
+
+// Write headers from the CodecMetadata. Also may modify nonserialized_...
+// fields of the metadata.
+Status WriteCodestreamHeaders(CodecMetadata* metadata, BitWriter* writer,
+                              AuxOut* aux_out);
+
+Status WriteFrameHeader(const FrameHeader& frame,
+                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out);
+
+Status WriteQuantizerParams(const QuantizerParams& params,
+                            BitWriter* JXL_RESTRICT writer, size_t layer,
+                            AuxOut* aux_out);
+
+Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
+                       size_t layer, AuxOut* aux_out);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_ENC_FIELDS_H_
diff --git a/lib/jxl/enc_file.cc b/lib/jxl/enc_file.cc
deleted file mode 100644 (file)
index 0f29bd9..0000000
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/enc_file.h"
-
-#include <stddef.h>
-
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/enc_bit_writer.h"
-#include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_frame.h"
-#include "lib/jxl/enc_icc_codec.h"
-#include "lib/jxl/frame_header.h"
-#include "lib/jxl/headers.h"
-#include "lib/jxl/image_bundle.h"
-
-namespace jxl {
-
-namespace {
-
-// DC + 'Very Low Frequency'
-PassDefinition progressive_passes_dc_vlf[] = {
-    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/4}};
-
-PassDefinition progressive_passes_dc_lf[] = {
-    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/4},
-    {/*num_coefficients=*/3, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/2}};
-
-PassDefinition progressive_passes_dc_lf_salient_ac[] = {
-    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/4},
-    {/*num_coefficients=*/3, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/2},
-    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/true,
-     /*suitable_for_downsampling_of_at_least=*/0}};
-
-PassDefinition progressive_passes_dc_lf_salient_ac_other_ac[] = {
-    {/*num_coefficients=*/2, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/4},
-    {/*num_coefficients=*/3, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/2},
-    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/true,
-     /*suitable_for_downsampling_of_at_least=*/0},
-    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/0}};
-
-PassDefinition progressive_passes_dc_quant_ac_full_ac[] = {
-    {/*num_coefficients=*/8, /*shift=*/1, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/2},
-    {/*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/false,
-     /*suitable_for_downsampling_of_at_least=*/0},
-};
-
-Status PrepareCodecMetadataFromIO(const CompressParams& cparams,
-                                  const CodecInOut* io,
-                                  CodecMetadata* metadata) {
-  *metadata = io->metadata;
-  size_t ups = 1;
-  if (cparams.already_downsampled) ups = cparams.resampling;
-
-  JXL_RETURN_IF_ERROR(metadata->size.Set(io->xsize() * ups, io->ysize() * ups));
-
-  // Keep ICC profile in lossless modes because a reconstructed profile may be
-  // slightly different (quantization).
-  // Also keep ICC in JPEG reconstruction mode as we need byte-exact profiles.
-  if (!cparams.IsLossless() && !io->Main().IsJPEG()) {
-    metadata->m.color_encoding.DecideIfWantICC();
-  }
-
-  metadata->m.xyb_encoded =
-      cparams.color_transform == ColorTransform::kXYB ? true : false;
-
-  // TODO(firsching): move this EncodeFile to test_utils / re-implement this
-  // using API functions
-  return true;
-}
-
-}  // namespace
-
-Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib,
-                     const CodecMetadata* metadata, const JxlCmsInterface& cms,
-                     ThreadPool* pool, BitWriter* JXL_RESTRICT writer) {
-  BitWriter preview_writer;
-  // TODO(janwas): also support generating preview by downsampling
-  if (ib.HasColor()) {
-    AuxOut aux_out;
-    PassesEncoderState passes_enc_state;
-    // TODO(lode): check if we want all extra channels and matching xyb_encoded
-    // for the preview, such that using the main ImageMetadata object for
-    // encoding this frame is warrented.
-    FrameInfo frame_info;
-    frame_info.is_preview = true;
-    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, frame_info, metadata, ib,
-                                    &passes_enc_state, cms, pool,
-                                    &preview_writer, &aux_out));
-    preview_writer.ZeroPadToByte();
-  }
-
-  if (preview_writer.BitsWritten() != 0) {
-    writer->ZeroPadToByte();
-    writer->AppendByteAligned(preview_writer);
-  }
-
-  return true;
-}
-
-Status WriteHeaders(CodecMetadata* metadata, BitWriter* writer,
-                    AuxOut* aux_out) {
-  // Marker/signature
-  BitWriter::Allotment allotment(writer, 16);
-  writer->Write(8, 0xFF);
-  writer->Write(8, kCodestreamMarker);
-  ReclaimAndCharge(writer, &allotment, kLayerHeader, aux_out);
-
-  JXL_RETURN_IF_ERROR(
-      WriteSizeHeader(metadata->size, writer, kLayerHeader, aux_out));
-
-  JXL_RETURN_IF_ERROR(
-      WriteImageMetadata(metadata->m, writer, kLayerHeader, aux_out));
-
-  metadata->transform_data.nonserialized_xyb_encoded = metadata->m.xyb_encoded;
-  JXL_RETURN_IF_ERROR(
-      Bundle::Write(metadata->transform_data, writer, kLayerHeader, aux_out));
-
-  return true;
-}
-
-Status EncodeFile(const CompressParams& params, const CodecInOut* io,
-                  PassesEncoderState* passes_enc_state, PaddedBytes* compressed,
-                  const JxlCmsInterface& cms, AuxOut* aux_out,
-                  ThreadPool* pool) {
-  io->CheckMetadata();
-  BitWriter writer;
-
-  CompressParams cparams = params;
-  if (io->Main().color_transform != ColorTransform::kNone) {
-    // Set the color transform to YCbCr or XYB if the original image is such.
-    cparams.color_transform = io->Main().color_transform;
-  }
-
-  JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
-
-  std::unique_ptr<CodecMetadata> metadata = jxl::make_unique<CodecMetadata>();
-  JXL_RETURN_IF_ERROR(PrepareCodecMetadataFromIO(cparams, io, metadata.get()));
-  JXL_RETURN_IF_ERROR(WriteHeaders(metadata.get(), &writer, aux_out));
-
-  // Only send ICC (at least several hundred bytes) if fields aren't enough.
-  if (metadata->m.color_encoding.WantICC()) {
-    JXL_RETURN_IF_ERROR(WriteICC(metadata->m.color_encoding.ICC(), &writer,
-                                 kLayerHeader, aux_out));
-  }
-
-  if (metadata->m.have_preview) {
-    JXL_RETURN_IF_ERROR(EncodePreview(cparams, io->preview_frame,
-                                      metadata.get(), cms, pool, &writer));
-  }
-
-  // Each frame should start on byte boundaries.
-  BitWriter::Allotment allotment(&writer, 8);
-  writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, kLayerHeader, aux_out);
-
-  if (cparams.progressive_mode || cparams.qprogressive_mode) {
-    if (cparams.saliency_map != nullptr) {
-      passes_enc_state->progressive_splitter.SetSaliencyMap(
-          cparams.saliency_map);
-    }
-    passes_enc_state->progressive_splitter.SetSaliencyThreshold(
-        cparams.saliency_threshold);
-    if (cparams.qprogressive_mode) {
-      passes_enc_state->progressive_splitter.SetProgressiveMode(
-          ProgressiveMode{progressive_passes_dc_quant_ac_full_ac});
-    } else {
-      switch (cparams.saliency_num_progressive_steps) {
-        case 1:
-          passes_enc_state->progressive_splitter.SetProgressiveMode(
-              ProgressiveMode{progressive_passes_dc_vlf});
-          break;
-        case 2:
-          passes_enc_state->progressive_splitter.SetProgressiveMode(
-              ProgressiveMode{progressive_passes_dc_lf});
-          break;
-        case 3:
-          passes_enc_state->progressive_splitter.SetProgressiveMode(
-              ProgressiveMode{progressive_passes_dc_lf_salient_ac});
-          break;
-        case 4:
-          if (cparams.saliency_threshold == 0.0f) {
-            // No need for a 4th pass if saliency-threshold regards everything
-            // as salient.
-            passes_enc_state->progressive_splitter.SetProgressiveMode(
-                ProgressiveMode{progressive_passes_dc_lf_salient_ac});
-          } else {
-            passes_enc_state->progressive_splitter.SetProgressiveMode(
-                ProgressiveMode{progressive_passes_dc_lf_salient_ac_other_ac});
-          }
-          break;
-        default:
-          return JXL_FAILURE("Invalid saliency_num_progressive_steps.");
-      }
-    }
-  }
-
-  for (size_t i = 0; i < io->frames.size(); i++) {
-    FrameInfo info;
-    info.is_last = i == io->frames.size() - 1;
-    if (io->frames[i].use_for_next_frame) {
-      info.save_as_reference = 1;
-    }
-    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, info, metadata.get(),
-                                    io->frames[i], passes_enc_state, cms, pool,
-                                    &writer, aux_out));
-  }
-
-  // Clean up passes_enc_state in case it gets reused.
-  for (size_t i = 0; i < 4; i++) {
-    passes_enc_state->shared.dc_frames[i] = Image3F();
-    passes_enc_state->shared.reference_frames[i].storage = ImageBundle();
-  }
-
-  *compressed = std::move(writer).TakeBytes();
-  return true;
-}
-
-}  // namespace jxl
diff --git a/lib/jxl/enc_file.h b/lib/jxl/enc_file.h
deleted file mode 100644 (file)
index 37b3a27..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_ENC_FILE_H_
-#define LIB_JXL_ENC_FILE_H_
-
-// Facade for JXL encoding.
-
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_params.h"
-
-namespace jxl {
-
-// Write preview from `io`.
-Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib,
-                     const CodecMetadata* metadata, const JxlCmsInterface& cms,
-                     ThreadPool* pool, BitWriter* JXL_RESTRICT writer);
-
-// Write headers from the CodecMetadata. Also may modify nonserialized_...
-// fields of the metadata.
-Status WriteHeaders(CodecMetadata* metadata, BitWriter* writer,
-                    AuxOut* aux_out);
-
-// Compresses pixels from `io` (given in any ColorEncoding).
-// `io->metadata.m.original` must be set.
-Status EncodeFile(const CompressParams& params, const CodecInOut* io,
-                  PassesEncoderState* passes_enc_state, PaddedBytes* compressed,
-                  const JxlCmsInterface& cms, AuxOut* aux_out = nullptr,
-                  ThreadPool* pool = nullptr);
-
-// Backwards-compatible interface. Don't use in new code.
-// TODO(deymo): Remove this function once we migrate users to C encoder API.
-struct FrameEncCache {};
-JXL_INLINE Status EncodeFile(const CompressParams& params, const CodecInOut* io,
-                             FrameEncCache* /* unused */,
-                             PaddedBytes* compressed,
-                             const JxlCmsInterface& cms,
-                             AuxOut* aux_out = nullptr,
-                             ThreadPool* pool = nullptr) {
-  PassesEncoderState passes_enc_state;
-  return EncodeFile(params, io, &passes_enc_state, compressed, cms, aux_out,
-                    pool);
-}
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_ENC_FILE_H_
index f57175b..cfd97e9 100644 (file)
 #include "lib/jxl/ac_context.h"
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/compressed_dc.h"
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/enc_adaptive_quantization.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_chroma_from_luma.h"
 #include "lib/jxl/enc_coeff_order.h"
 #include "lib/jxl/enc_context_map.h"
 #include "lib/jxl/enc_entropy_coder.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_gaborish.h"
 #include "lib/jxl/enc_group.h"
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/enc_noise.h"
 #include "lib/jxl/enc_toc.h"
 #include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
-#include "lib/jxl/gaborish.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/loop_filter.h"
+#include "lib/jxl/modular/options.h"
 #include "lib/jxl/quant_weights.h"
 #include "lib/jxl/quantizer.h"
 #include "lib/jxl/splines.h"
 namespace jxl {
 namespace {
 
-void ClusterGroups(PassesEncoderState* enc_state) {
-  if (enc_state->shared.frame_header.passes.num_passes > 1) {
-    // TODO(veluca): implement this for progressive modes.
-    return;
-  }
-  // This only considers pass 0 for now.
-  std::vector<uint8_t> context_map;
-  EntropyEncodingData codes;
-  auto& ac = enc_state->passes[0].ac_tokens;
-  size_t limit = std::ceil(std::sqrt(ac.size()));
-  if (limit == 1) return;
-  size_t num_contexts = enc_state->shared.block_ctx_map.NumACContexts();
-  std::vector<float> costs(ac.size());
-  HistogramParams params;
-  params.uint_method = HistogramParams::HybridUintMethod::kNone;
-  params.lz77_method = HistogramParams::LZ77Method::kNone;
-  params.ans_histogram_strategy =
-      HistogramParams::ANSHistogramStrategy::kApproximate;
-  size_t max = 0;
-  auto token_cost = [&](std::vector<std::vector<Token>>& tokens, size_t num_ctx,
-                        bool estimate = true) {
-    // TODO(veluca): not estimating is very expensive.
-    BitWriter writer;
-    size_t c = BuildAndEncodeHistograms(
-        params, num_ctx, tokens, &codes, &context_map,
-        estimate ? nullptr : &writer, 0, /*aux_out=*/0);
-    if (estimate) return c;
-    for (size_t i = 0; i < tokens.size(); i++) {
-      WriteTokens(tokens[i], codes, context_map, &writer, 0, nullptr);
-    }
-    return writer.BitsWritten();
-  };
-  for (size_t i = 0; i < ac.size(); i++) {
-    std::vector<std::vector<Token>> tokens{ac[i]};
-    costs[i] =
-        token_cost(tokens, enc_state->shared.block_ctx_map.NumACContexts());
-    if (costs[i] > costs[max]) {
-      max = i;
-    }
-  }
-  auto dist = [&](int i, int j) {
-    std::vector<std::vector<Token>> tokens{ac[i], ac[j]};
-    return token_cost(tokens, num_contexts) - costs[i] - costs[j];
-  };
-  std::vector<size_t> out{max};
-  std::vector<float> dists(ac.size());
-  size_t farthest = 0;
-  for (size_t i = 0; i < ac.size(); i++) {
-    if (i == max) continue;
-    dists[i] = dist(max, i);
-    if (dists[i] > dists[farthest]) {
-      farthest = i;
-    }
-  }
-
-  while (dists[farthest] > 0 && out.size() < limit) {
-    out.push_back(farthest);
-    dists[farthest] = 0;
-    enc_state->histogram_idx[farthest] = out.size() - 1;
-    for (size_t i = 0; i < ac.size(); i++) {
-      float d = dist(out.back(), i);
-      if (d < dists[i]) {
-        dists[i] = d;
-        enc_state->histogram_idx[i] = out.size() - 1;
-      }
-      if (dists[i] > dists[farthest]) {
-        farthest = i;
-      }
-    }
-  }
-
-  std::vector<size_t> remap(out.size());
-  std::iota(remap.begin(), remap.end(), 0);
-  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
-    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
-  }
-  auto remap_cost = [&](std::vector<size_t> remap) {
-    std::vector<size_t> re_remap(remap.size(), remap.size());
-    size_t r = 0;
-    for (size_t i = 0; i < remap.size(); i++) {
-      if (re_remap[remap[i]] == remap.size()) {
-        re_remap[remap[i]] = r++;
-      }
-      remap[i] = re_remap[remap[i]];
-    }
-    auto tokens = ac;
-    size_t max_hist = 0;
-    for (size_t i = 0; i < tokens.size(); i++) {
-      for (size_t j = 0; j < tokens[i].size(); j++) {
-        size_t hist = remap[enc_state->histogram_idx[i]];
-        tokens[i][j].context += hist * num_contexts;
-        max_hist = std::max(hist + 1, max_hist);
-      }
-    }
-    return token_cost(tokens, max_hist * num_contexts, /*estimate=*/false);
-  };
-
-  for (size_t src = 0; src < out.size(); src++) {
-    float cost = remap_cost(remap);
-    size_t best = src;
-    for (size_t j = src + 1; j < out.size(); j++) {
-      if (remap[src] == remap[j]) continue;
-      auto remap_c = remap;
-      std::replace(remap_c.begin(), remap_c.end(), remap[src], remap[j]);
-      float c = remap_cost(remap_c);
-      if (c < cost) {
-        best = j;
-        cost = c;
-      }
-    }
-    if (src != best) {
-      std::replace(remap.begin(), remap.end(), remap[src], remap[best]);
-    }
-  }
-  std::vector<size_t> re_remap(remap.size(), remap.size());
-  size_t r = 0;
-  for (size_t i = 0; i < remap.size(); i++) {
-    if (re_remap[remap[i]] == remap.size()) {
-      re_remap[remap[i]] = r++;
-    }
-    remap[i] = re_remap[remap[i]];
-  }
+PassDefinition progressive_passes_dc_vlf_lf_full_ac[] = {
+    {/*num_coefficients=*/2, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/4},
+    {/*num_coefficients=*/3, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/0},
+};
 
-  enc_state->shared.num_histograms =
-      *std::max_element(remap.begin(), remap.end()) + 1;
-  for (size_t i = 0; i < enc_state->histogram_idx.size(); i++) {
-    enc_state->histogram_idx[i] = remap[enc_state->histogram_idx[i]];
-  }
-  for (size_t i = 0; i < ac.size(); i++) {
-    for (size_t j = 0; j < ac[i].size(); j++) {
-      ac[i][j].context += enc_state->histogram_idx[i] * num_contexts;
-    }
-  }
-}
+PassDefinition progressive_passes_dc_quant_ac_full_ac[] = {
+    {/*num_coefficients=*/8, /*shift=*/1,
+     /*suitable_for_downsampling_of_at_least=*/2},
+    {/*num_coefficients=*/8, /*shift=*/0,
+     /*suitable_for_downsampling_of_at_least=*/0},
+};
 
 uint64_t FrameFlagsFromParams(const CompressParams& cparams) {
   uint64_t flags = 0;
@@ -281,7 +162,17 @@ Status MakeFrameHeader(const CompressParams& cparams,
 
   if (cparams.modular_mode) {
     frame_header->encoding = FrameEncoding::kModular;
-    frame_header->group_size_shift = cparams.modular_group_size_shift;
+    if (cparams.modular_group_size_shift == -1) {
+      frame_header->group_size_shift = 1;
+      // no point using groups when only one group is full and the others are
+      // less than half full: multithreading will not really help much, while
+      // compression does suffer
+      if (ib.xsize() <= 400 && ib.ysize() <= 400) {
+        frame_header->group_size_shift = 2;
+      }
+    } else {
+      frame_header->group_size_shift = cparams.modular_group_size_shift;
+    }
   }
 
   frame_header->chroma_subsampling = ib.chroma_subsampling;
@@ -299,6 +190,13 @@ Status MakeFrameHeader(const CompressParams& cparams,
           "recompressing JPEGs");
     }
   }
+  if (frame_header->color_transform != ColorTransform::kYCbCr &&
+      (frame_header->chroma_subsampling.MaxHShift() != 0 ||
+       frame_header->chroma_subsampling.MaxVShift() != 0)) {
+    return JXL_FAILURE(
+        "Chroma subsampling is not supported when color transform is not "
+        "YCbCr");
+  }
 
   frame_header->flags = FrameFlagsFromParams(cparams);
   // Non-photon noise is not supported in the Modular encoder for now.
@@ -477,6 +375,78 @@ void SimplifyInvisible(Image3F* image, const ImageF& alpha, bool lossless) {
   }
 }
 
+struct PixelStatsForChromacityAdjustment {
+  float dx = 0;
+  float db = 0;
+  float exposed_blue = 0;
+  float CalcPlane(const ImageF* JXL_RESTRICT plane) const {
+    float xmax = 0;
+    float ymax = 0;
+    for (size_t ty = 1; ty < plane->ysize(); ++ty) {
+      for (size_t tx = 1; tx < plane->xsize(); ++tx) {
+        float cur = plane->Row(ty)[tx];
+        float prev_row = plane->Row(ty - 1)[tx];
+        float prev = plane->Row(ty)[tx - 1];
+        xmax = std::max(xmax, std::abs(cur - prev));
+        ymax = std::max(ymax, std::abs(cur - prev_row));
+      }
+    }
+    return std::max(xmax, ymax);
+  }
+  void CalcExposedBlue(const ImageF* JXL_RESTRICT plane_y,
+                       const ImageF* JXL_RESTRICT plane_b) {
+    float eb = 0;
+    float xmax = 0;
+    float ymax = 0;
+    for (size_t ty = 1; ty < plane_y->ysize(); ++ty) {
+      for (size_t tx = 1; tx < plane_y->xsize(); ++tx) {
+        float cur_y = plane_y->Row(ty)[tx];
+        float cur_b = plane_b->Row(ty)[tx];
+        float exposed_b = cur_b - cur_y * 1.2;
+        float diff_b = cur_b - cur_y;
+        float prev_row = plane_b->Row(ty - 1)[tx];
+        float prev = plane_b->Row(ty)[tx - 1];
+        float diff_prev_row = prev_row - plane_y->Row(ty - 1)[tx];
+        float diff_prev = prev - plane_y->Row(ty)[tx - 1];
+        xmax = std::max(xmax, std::abs(diff_b - diff_prev));
+        ymax = std::max(ymax, std::abs(diff_b - diff_prev_row));
+        if (exposed_b >= 0) {
+          exposed_b *= fabs(cur_b - prev) + fabs(cur_b - prev_row);
+          eb = std::max(eb, exposed_b);
+        }
+      }
+    }
+    exposed_blue = eb;
+    db = std::max(xmax, ymax);
+  }
+  void Calc(const Image3F* JXL_RESTRICT opsin) {
+    dx = CalcPlane(&opsin->Plane(0));
+    CalcExposedBlue(&opsin->Plane(1), &opsin->Plane(2));
+  }
+  int HowMuchIsXChannelPixelized() {
+    if (dx >= 0.03) {
+      return 2;
+    }
+    if (dx >= 0.017) {
+      return 1;
+    }
+    return 0;
+  }
+  int HowMuchIsBChannelPixelized() {
+    int add = exposed_blue >= 0.13 ? 1 : 0;
+    if (db > 0.38) {
+      return 2 + add;
+    }
+    if (db > 0.33) {
+      return 1 + add;
+    }
+    if (db > 0.28) {
+      return add;
+    }
+    return 0;
+  }
+};
+
 }  // namespace
 
 class LossyFrameEncoder {
@@ -498,16 +468,18 @@ class LossyFrameEncoder {
                              const JxlCmsInterface& cms, ThreadPool* pool,
                              ModularFrameEncoder* modular_frame_encoder,
                              FrameHeader* frame_header) {
-    PROFILER_ZONE("ComputeEncodingData uninstrumented");
     JXL_ASSERT((opsin->xsize() % kBlockDim) == 0 &&
                (opsin->ysize() % kBlockDim) == 0);
     PassesSharedState& shared = enc_state_->shared;
 
     if (!enc_state_->cparams.max_error_mode) {
-      float x_qm_scale_steps[2] = {1.25f, 9.0f};
+      // Compute chromacity adjustments using two approaches.
+      // 1) Distance based approach for chromacity adjustment:
+      float x_qm_scale_steps[4] = {1.25f, 7.0f, 15.0f, 24.0f};
       shared.frame_header.x_qm_scale = 2;
       for (float x_qm_scale_step : x_qm_scale_steps) {
-        if (enc_state_->cparams.butteraugli_distance > x_qm_scale_step) {
+        if (enc_state_->cparams.original_butteraugli_distance >
+            x_qm_scale_step) {
           shared.frame_header.x_qm_scale++;
         }
       }
@@ -516,6 +488,20 @@ class LossyFrameEncoder {
         // faithful to original even with extreme (5-10x) zooming.
         shared.frame_header.x_qm_scale++;
       }
+      // 2) Pixel-based approach for chromacity adjustment:
+      // look at the individual pixels and make a guess how difficult
+      // the image would be based on the worst case pixel.
+      PixelStatsForChromacityAdjustment pixel_stats;
+      if (enc_state_->cparams.speed_tier <= SpeedTier::kWombat) {
+        pixel_stats.Calc(opsin);
+      }
+      // For X take the most severe adjustment.
+      shared.frame_header.x_qm_scale =
+          std::max<int>(shared.frame_header.x_qm_scale,
+                        2 + pixel_stats.HowMuchIsXChannelPixelized());
+      // B only adjusted by pixel-based approach.
+      shared.frame_header.b_qm_scale =
+          2 + pixel_stats.HowMuchIsBChannelPixelized();
     }
 
     JXL_RETURN_IF_ERROR(enc_state_->heuristics->LossyFrameHeuristics(
@@ -571,7 +557,6 @@ class LossyFrameEncoder {
   Status ComputeJPEGTranscodingData(const jpeg::JPEGData& jpeg_data,
                                     ModularFrameEncoder* modular_frame_encoder,
                                     FrameHeader* frame_header) {
-    PROFILER_ZONE("ComputeJPEGTranscodingData uninstrumented");
     PassesSharedState& shared = enc_state_->shared;
 
     frame_header->x_qm_scale = 2;
@@ -589,9 +574,16 @@ class LossyFrameEncoder {
     shared.ac_strategy.FillDCT8();
     FillImage(uint8_t(0), &shared.epf_sharpness);
 
+    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
+    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
+      pass.ac_tokens.resize(shared.frame_dim.num_groups);
+    }
+
     enc_state_->coeffs.clear();
-    enc_state_->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
-        kGroupDim * kGroupDim, frame_dim.num_groups));
+    while (enc_state_->coeffs.size() < enc_state_->passes.size()) {
+      enc_state_->coeffs.emplace_back(make_unique<ACImageT<int32_t>>(
+          kGroupDim * kGroupDim, frame_dim.num_groups));
+    }
 
     // convert JPEG quantization table to a Quantizer object
     float dcquantization[3];
@@ -735,9 +727,12 @@ class LossyFrameEncoder {
                                       "FindCorrelation"));
       }
     }
+
     if (!frame_header->chroma_subsampling.Is444()) {
       ZeroFillImage(&dc);
-      enc_state_->coeffs[0]->ZeroFill();
+      for (auto& coeff : enc_state_->coeffs) {
+        coeff->ZeroFill();
+      }
     }
     // JPEG DC is from -1024 to 1023.
     std::vector<size_t> dc_counts[3] = {};
@@ -747,7 +742,9 @@ class LossyFrameEncoder {
     size_t total_dc[3] = {};
     for (size_t c : {1, 0, 2}) {
       if (jpeg_data.components.size() == 1 && c != 1) {
-        enc_state_->coeffs[0]->ZeroFillPlane(c);
+        for (auto& coeff : enc_state_->coeffs) {
+          coeff->ZeroFillPlane(c);
+        }
         ZeroFillImage(&dc.Plane(c));
         // Ensure no division by 0.
         dc_counts[c][1024] = 1;
@@ -761,9 +758,11 @@ class LossyFrameEncoder {
            group_index++) {
         const size_t gx = group_index % frame_dim.xsize_groups;
         const size_t gy = group_index / frame_dim.xsize_groups;
-        size_t offset = 0;
-        int32_t* JXL_RESTRICT ac =
-            enc_state_->coeffs[0]->PlaneRow(c, group_index, 0).ptr32;
+        int32_t* coeffs[kMaxNumPasses];
+        for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+          coeffs[i] = enc_state_->coeffs[i]->PlaneRow(c, group_index, 0).ptr32;
+        }
+        int32_t block[64];
         for (size_t by = gy * kGroupDimInBlocks;
              by < ysize_blocks && by < (gy + 1) * kGroupDimInBlocks; ++by) {
           if ((by >> vshift) << vshift != by) continue;
@@ -790,7 +789,7 @@ class LossyFrameEncoder {
                 !frame_header->chroma_subsampling.Is444()) {
               for (size_t y = 0; y < 8; y++) {
                 for (size_t x = 0; x < 8; x++) {
-                  ac[offset + y * 8 + x] = inputjpeg[base + x * 8 + y];
+                  block[y * 8 + x] = inputjpeg[base + x * 8 + y];
                 }
               }
             } else {
@@ -810,11 +809,16 @@ class LossyFrameEncoder {
                                     (1 << (kCFLFixedPointPrecision - 1))) >>
                                    kCFLFixedPointPrecision;
                   int QCR = QChroma - cfl_factor;
-                  ac[offset + y * 8 + x] = QCR;
+                  block[y * 8 + x] = QCR;
                 }
               }
             }
-            offset += 64;
+            enc_state_->progressive_splitter.SplitACCoefficients(
+                block, AcStrategy::FromRawStrategy(AcStrategy::Type::DCT), bx,
+                by, coeffs);
+            for (size_t i = 0; i < enc_state_->coeffs.size(); i++) {
+              coeffs[i] += kDCTBlockSize;
+            }
           }
         }
       }
@@ -875,14 +879,6 @@ class LossyFrameEncoder {
     // Must happen before WriteFrameHeader!
     shared.frame_header.UpdateFlag(true, FrameHeader::kSkipAdaptiveDCSmoothing);
 
-    enc_state_->passes.resize(enc_state_->progressive_splitter.GetNumPasses());
-    for (PassesEncoderState::PassData& pass : enc_state_->passes) {
-      pass.ac_tokens.resize(shared.frame_dim.num_groups);
-    }
-
-    JXL_CHECK(enc_state_->passes.size() ==
-              1);  // skipping coeff splitting so need to have only one pass
-
     ComputeAllCoeffOrders(frame_dim);
     shared.num_histograms = 1;
 
@@ -924,8 +920,9 @@ class LossyFrameEncoder {
   Status EncodeGlobalDCInfo(const FrameHeader& frame_header,
                             BitWriter* writer) const {
     // Encode quantizer DC and global scale.
+    QuantizerParams params = enc_state_->shared.quantizer.GetParams();
     JXL_RETURN_IF_ERROR(
-        enc_state_->shared.quantizer.Encode(writer, kLayerQuant, aux_out_));
+        WriteQuantizerParams(params, writer, kLayerQuant, aux_out_));
     EncodeBlockCtxMap(enc_state_->shared.block_ctx_map, writer, aux_out_);
     ColorCorrelationMapEncodeDC(&enc_state_->shared.cmap, writer, kLayerDC,
                                 aux_out_);
@@ -937,15 +934,12 @@ class LossyFrameEncoder {
     JXL_RETURN_IF_ERROR(DequantMatricesEncode(&enc_state_->shared.matrices,
                                               writer, kLayerQuant, aux_out_,
                                               modular_frame_encoder));
-    if (enc_state_->cparams.speed_tier <= SpeedTier::kTortoise) {
-      if (!doing_jpeg_recompression) ClusterGroups(enc_state_);
-    }
     size_t num_histo_bits =
         CeilLog2Nonzero(enc_state_->shared.frame_dim.num_groups);
     if (num_histo_bits != 0) {
       BitWriter::Allotment allotment(writer, num_histo_bits);
       writer->Write(num_histo_bits, enc_state_->shared.num_histograms - 1);
-      ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out_);
+      allotment.ReclaimAndCharge(writer, kLayerAC, aux_out_);
     }
 
     for (size_t i = 0; i < enc_state_->progressive_splitter.GetNumPasses();
@@ -956,7 +950,7 @@ class LossyFrameEncoder {
           kOrderEnc, enc_state_->used_orders[i], &order_bits));
       BitWriter::Allotment allotment(writer, order_bits);
       JXL_CHECK(U32Coder::Write(kOrderEnc, enc_state_->used_orders[i], writer));
-      ReclaimAndCharge(writer, &allotment, kLayerOrder, aux_out_);
+      allotment.ReclaimAndCharge(writer, kLayerOrder, aux_out_);
       EncodeCoeffOrders(
           enc_state_->used_orders[i],
           &enc_state_->shared
@@ -995,7 +989,6 @@ class LossyFrameEncoder {
 
  private:
   void ComputeAllCoeffOrders(const FrameDimensions& frame_dim) {
-    PROFILER_FUNC;
     // No coefficient reordering in Falcon or faster.
     auto used_orders_info = ComputeUsedOrders(
         enc_state_->cparams.speed_tier, enc_state_->shared.ac_strategy,
@@ -1077,56 +1070,90 @@ Status EncodeFrame(const CompressParams& cparams_orig,
                    const JxlCmsInterface& cms, ThreadPool* pool,
                    BitWriter* writer, AuxOut* aux_out) {
   CompressParams cparams = cparams_orig;
-  if (cparams_orig.target_bitrate > 0.0f &&
-      frame_info.frame_type == FrameType::kRegularFrame) {
-    cparams.target_bitrate = 0.0f;
-    const float target_bitrate = cparams_orig.target_bitrate;
-    float bitrate = 0.0f;
-    float prev_bitrate = 0.0f;
-    float rescale = 1.0f;
-    size_t prev_bits = 0;
-    float error = 0.0f;
-    float best_error = 100.0f;
-    float best_rescale = 1.0f;
-    for (size_t i = 0; i < 10; ++i) {
-      std::unique_ptr<PassesEncoderState> state =
-          jxl::make_unique<PassesEncoderState>();
-      BitWriter bw;
-      JXL_CHECK(EncodeFrame(cparams, frame_info, metadata, ib, state.get(), cms,
-                            pool, &bw, nullptr));
-      bitrate = bw.BitsWritten() * 1.0 / (ib.xsize() * ib.ysize());
-      error = target_bitrate / bitrate - 1.0f;
-      if (std::abs(error) < std::abs(best_error)) {
-        best_error = error;
-        best_rescale = cparams.quant_ac_rescale;
-      }
-      if (bw.BitsWritten() == prev_bits || std::abs(error) < 0.0005f) {
-        break;
-      }
-      float lambda = 1.0f;
-      if (i > 0) {
-        lambda = (((bitrate / prev_bitrate) - 1.0f) / (rescale - 1.0f));
-      }
-      rescale = (1.0f + ((target_bitrate / bitrate) - 1.0f) / lambda);
-      if (rescale < 0.0f) {
-        break;
+  if (cparams.speed_tier == SpeedTier::kGlacier && !cparams.IsLossless()) {
+    cparams.speed_tier = SpeedTier::kTortoise;
+  }
+  if (cparams.speed_tier == SpeedTier::kGlacier) {
+    std::vector<CompressParams> all_params;
+    std::vector<size_t> size;
+
+    CompressParams cparams_attempt = cparams_orig;
+    cparams_attempt.speed_tier = SpeedTier::kTortoise;
+    cparams_attempt.options.max_properties = 4;
+
+    for (float x : {0.0f, 80.f}) {
+      cparams_attempt.channel_colors_percent = x;
+      for (float y : {0.0f, 95.0f}) {
+        cparams_attempt.channel_colors_pre_transform_percent = y;
+        // 70000 ensures that the number of palette colors is representable in
+        // modular headers.
+        for (int K : {0, 1 << 10, 70000}) {
+          cparams_attempt.palette_colors = K;
+          for (int tree_mode : {-1, (int)ModularOptions::TreeMode::kNoWP,
+                                (int)ModularOptions::TreeMode::kDefault}) {
+            if (tree_mode == -1) {
+              // LZ77 only
+              cparams_attempt.options.nb_repeats = 0;
+            } else {
+              cparams_attempt.options.nb_repeats = 1;
+              cparams_attempt.options.wp_tree_mode =
+                  static_cast<ModularOptions::TreeMode>(tree_mode);
+            }
+            for (Predictor pred : {Predictor::Zero, Predictor::Variable}) {
+              cparams_attempt.options.predictor = pred;
+              for (int g : {0, -1, 3}) {
+                cparams_attempt.modular_group_size_shift = g;
+                for (Override patches : {Override::kDefault, Override::kOff}) {
+                  cparams_attempt.patches = patches;
+                  all_params.push_back(cparams_attempt);
+                }
+              }
+            }
+          }
+        }
       }
-      cparams.quant_ac_rescale *= rescale;
-      prev_bitrate = bitrate;
-      prev_bits = bw.BitsWritten();
     }
-    if (aux_out) {
-      aux_out->max_quant_rescale = best_rescale;
-      aux_out->min_quant_rescale = best_rescale;
-      aux_out->min_bitrate_error = best_error;
-      aux_out->max_bitrate_error = best_error;
+
+    size.resize(all_params.size());
+
+    std::atomic<int> num_errors{0};
+
+    JXL_RETURN_IF_ERROR(RunOnPool(
+        pool, 0, all_params.size(), ThreadPool::NoInit,
+        [&](size_t task, size_t) {
+          BitWriter w;
+          PassesEncoderState state;
+          if (!EncodeFrame(all_params[task], frame_info, metadata, ib, &state,
+                           cms, nullptr, &w, aux_out)) {
+            num_errors.fetch_add(1, std::memory_order_relaxed);
+            return;
+          }
+          size[task] = w.BitsWritten();
+        },
+        "Compress kGlacier"));
+    JXL_RETURN_IF_ERROR(num_errors.load(std::memory_order_relaxed) == 0);
+
+    size_t best_idx = 0;
+    for (size_t i = 1; i < all_params.size(); i++) {
+      if (size[best_idx] > size[i]) {
+        best_idx = i;
+      }
     }
-    cparams.quant_ac_rescale = best_rescale;
+    cparams = all_params[best_idx];
   }
+
   ib.VerifyMetadata();
 
   passes_enc_state->special_frames.clear();
 
+  if (cparams.qprogressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_quant_ac_full_ac});
+  } else if (cparams.progressive_mode) {
+    passes_enc_state->progressive_splitter.SetProgressiveMode(
+        ProgressiveMode{progressive_passes_dc_vlf_lf_full_ac});
+  }
+
   JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
 
   if (cparams.progressive_dc < 0) {
@@ -1135,13 +1162,6 @@ Status EncodeFrame(const CompressParams& cparams_orig,
                          cparams.progressive_dc);
     }
     cparams.progressive_dc = 0;
-    // Enable progressive_dc for lower qualities, except for fast speeds where
-    // the modular encoder uses fixed tree.
-    if (cparams.speed_tier <= SpeedTier::kCheetah &&
-        cparams.butteraugli_distance >=
-            kMinButteraugliDistanceForProgressiveDc) {
-      cparams.progressive_dc = 1;
-    }
   }
   if (cparams.ec_resampling < cparams.resampling) {
     cparams.ec_resampling = cparams.resampling;
@@ -1210,7 +1230,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   metadata_linear->color_encoding = c_linear;
   ImageBundle linear_storage(metadata_linear.get());
 
-  std::vector<AuxOut> aux_outs;
+  std::vector<std::unique_ptr<AuxOut>> aux_outs;
   // LossyFrameEncoder stores a reference to a std::function<Status(size_t)>
   // so we need to keep the std::function<Status(size_t)> being referenced
   // alive while lossy_frame_encoder is used. We could make resize_aux_outs a
@@ -1218,18 +1238,15 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   // simpler.
   const std::function<Status(size_t)> resize_aux_outs =
       [&aux_outs, aux_out](const size_t num_threads) -> Status {
-    if (aux_out != nullptr) {
-      size_t old_size = aux_outs.size();
-      for (size_t i = num_threads; i < old_size; i++) {
-        aux_out->Assimilate(aux_outs[i]);
-      }
+    if (aux_out == nullptr) {
       aux_outs.resize(num_threads);
-      // Each thread needs these INPUTS. Don't copy the entire AuxOut
-      // because it may contain stats which would be Assimilated multiple
-      // times below.
-      for (size_t i = old_size; i < aux_outs.size(); i++) {
-        aux_outs[i].dump_image = aux_out->dump_image;
-        aux_outs[i].debug_prefix = aux_out->debug_prefix;
+    } else {
+      while (aux_outs.size() > num_threads) {
+        aux_out->Assimilate(*aux_outs.back());
+        aux_outs.pop_back();
+      }
+      while (num_threads > aux_outs.size()) {
+        aux_outs.emplace_back(jxl::make_unique<AuxOut>());
       }
     }
     return true;
@@ -1288,10 +1305,6 @@ Status EncodeFrame(const CompressParams& cparams_orig,
                           ib.alpha(), lossless);
       }
     }
-    if (aux_out != nullptr) {
-      JXL_RETURN_IF_ERROR(
-          aux_out->InspectImage3F("enc_frame:OpsinDynamicsImage", opsin));
-    }
     if (frame_header->encoding == FrameEncoding::kVarDCT) {
       PadImageToBlockMultipleInPlace(&opsin);
       JXL_RETURN_IF_ERROR(lossy_frame_encoder.ComputeEncodingData(
@@ -1309,9 +1322,11 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   }
   if (cparams.ec_resampling != 1 && !cparams.already_downsampled) {
     extra_channels = &extra_channels_storage;
-    for (size_t i = 0; i < ib.extra_channels().size(); i++) {
-      extra_channels_storage.emplace_back(CopyImage(ib.extra_channels()[i]));
-      DownsampleImage(&extra_channels_storage.back(), cparams.ec_resampling);
+    for (const ImageF& ec : ib.extra_channels()) {
+      ImageF d_ec(ec.xsize(), ec.ysize());
+      CopyImageTo(ec, &d_ec);
+      DownsampleImage(&d_ec, cparams.ec_resampling);
+      extra_channels_storage.emplace_back(std::move(d_ec));
     }
   }
   // needs to happen *AFTER* VarDCT-ComputeEncodingData.
@@ -1388,13 +1403,13 @@ Status EncodeFrame(const CompressParams& cparams_orig,
 
   const auto process_dc_group = [&](const uint32_t group_index,
                                     const size_t thread) {
-    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+    AuxOut* my_aux_out = aux_outs[thread].get();
     BitWriter* output = get_output(group_index + 1);
     if (frame_header->encoding == FrameEncoding::kVarDCT &&
         !(frame_header->flags & FrameHeader::kUseDcFrame)) {
       BitWriter::Allotment allotment(output, 2);
       output->Write(2, modular_frame_encoder->extra_dc_precision[group_index]);
-      ReclaimAndCharge(output, &allotment, kLayerDC, my_aux_out);
+      allotment.ReclaimAndCharge(output, kLayerDC, my_aux_out);
       JXL_CHECK(modular_frame_encoder->EncodeStream(
           output, my_aux_out, kLayerDC,
           ModularStreamId::VarDCTDC(group_index)));
@@ -1410,7 +1425,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
         BitWriter::Allotment allotment(output, nb_bits);
         output->Write(nb_bits,
                       modular_frame_encoder->ac_metadata_size[group_index] - 1);
-        ReclaimAndCharge(output, &allotment, kLayerControlFields, my_aux_out);
+        allotment.ReclaimAndCharge(output, kLayerControlFields, my_aux_out);
       }
       JXL_CHECK(modular_frame_encoder->EncodeStream(
           output, my_aux_out, kLayerControlFields,
@@ -1429,7 +1444,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   std::atomic<int> num_errors{0};
   const auto process_group = [&](const uint32_t group_index,
                                  const size_t thread) {
-    AuxOut* my_aux_out = aux_out ? &aux_outs[thread] : nullptr;
+    AuxOut* my_aux_out = aux_outs[thread].get();
 
     for (size_t i = 0; i < num_passes; i++) {
       if (frame_header->encoding == FrameEncoding::kVarDCT) {
@@ -1458,7 +1473,7 @@ Status EncodeFrame(const CompressParams& cparams_orig,
   for (BitWriter& bw : group_codes) {
     BitWriter::Allotment allotment(&bw, 8);
     bw.ZeroPadToByte();  // end of group.
-    ReclaimAndCharge(&bw, &allotment, kLayerAC, aux_out);
+    allotment.ReclaimAndCharge(&bw, kLayerAC, aux_out);
   }
 
   std::vector<coeff_order_t>* permutation_ptr = nullptr;
index c046014..b1dc637 100644 (file)
@@ -6,8 +6,6 @@
 #ifndef LIB_JXL_ENC_FRAME_H_
 #define LIB_JXL_ENC_FRAME_H_
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/enc_bit_writer.h"
@@ -18,6 +16,8 @@
 
 namespace jxl {
 
+struct AuxOut;
+
 // Information needed for encoding a frame that is not contained elsewhere and
 // does not belong to `cparams`.
 // TODO(lode): if possible, it might be better to replace FrameInfo and several
diff --git a/lib/jxl/enc_gaborish.cc b/lib/jxl/enc_gaborish.cc
new file mode 100644 (file)
index 0000000..32914a0
--- /dev/null
@@ -0,0 +1,62 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_gaborish.h"
+
+#include <stddef.h>
+
+#include <hwy/base.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/image_ops.h"
+
+namespace jxl {
+
+void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool) {
+  WeightsSymmetric5 weights[3];
+  // Only an approximation. One or even two 3x3, and rank-1 (separable) 5x5
+  // are insufficient. The numbers here have been obtained by butteraugli
+  // based optimizing the whole system and the errors produced are likely
+  // more favorable for good rate-distortion compromises rather than
+  // just using mathematical optimization to find the inverse.
+  static const float kGaborish[5] = {
+      -0.090881924078487886f, -0.043663953593472138f, 0.01392497846646211f,
+      0.0036189602184591141f, 0.0030557936884763499f};
+  for (int i = 0; i < 3; ++i) {
+    double sum = 1.0 + mul[i] * 4 *
+                           (kGaborish[0] + kGaborish[1] + kGaborish[2] +
+                            kGaborish[4] + 2 * kGaborish[3]);
+    if (sum < 1e-5) {
+      sum = 1e-5;
+    }
+    const float normalize = static_cast<float>(1.0 / sum);
+    const float normalize_mul = mul[i] * normalize;
+    weights[i] = WeightsSymmetric5{{HWY_REP4(normalize)},
+                                   {HWY_REP4(normalize_mul * kGaborish[0])},
+                                   {HWY_REP4(normalize_mul * kGaborish[2])},
+                                   {HWY_REP4(normalize_mul * kGaborish[1])},
+                                   {HWY_REP4(normalize_mul * kGaborish[4])},
+                                   {HWY_REP4(normalize_mul * kGaborish[3])}};
+  }
+  // Reduce memory footprint by only allocating a single plane and swapping it
+  // into the output Image3F. Better still would be tiling.
+  // Note that we cannot *allocate* a plane, as doing so might cause Image3F to
+  // have planes of different stride. Instead, we copy one plane in a temporary
+  // image and reuse the existing planes of the in/out image.
+  ImageF temp(in_out->Plane(2).xsize(), in_out->Plane(2).ysize());
+  CopyImageTo(in_out->Plane(2), &temp);
+  Symmetric5(in_out->Plane(0), Rect(*in_out), weights[0], pool,
+             &in_out->Plane(2));
+  Symmetric5(in_out->Plane(1), Rect(*in_out), weights[1], pool,
+             &in_out->Plane(0));
+  Symmetric5(temp, Rect(*in_out), weights[2], pool, &in_out->Plane(1));
+  // Now planes are 1, 2, 0.
+  in_out->Plane(0).Swap(in_out->Plane(1));
+  // 2 1 0
+  in_out->Plane(0).Swap(in_out->Plane(2));
+}
+
+}  // namespace jxl
similarity index 90%
rename from lib/jxl/gaborish.h
rename to lib/jxl/enc_gaborish.h
index e43411d..102064f 100644 (file)
@@ -19,7 +19,7 @@ namespace jxl {
 // Used in encoder to reduce the impact of the decoder's smoothing.
 // This is not exact. Works in-place to reduce memory use.
 // The input is typically in XYB space.
-void GaborishInverse(Image3F* in_out, float mul, ThreadPool* pool);
+void GaborishInverse(Image3F* in_out, float mul[3], ThreadPool* pool);
 
 }  // namespace jxl
 
similarity index 86%
rename from lib/jxl/gaborish_test.cc
rename to lib/jxl/enc_gaborish_test.cc
index 55b17a0..57a18e3 100644 (file)
@@ -3,14 +3,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/gaborish.h"
+#include "lib/jxl/enc_gaborish.h"
 
 #include <hwy/base.h>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/convolve.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -41,8 +41,14 @@ void TestRoundTrip(const Image3F& in, float max_l1) {
   ConvolveGaborish(in.Plane(0), 0, 0, null_pool, &fwd.Plane(0));
   ConvolveGaborish(in.Plane(1), 0, 0, null_pool, &fwd.Plane(1));
   ConvolveGaborish(in.Plane(2), 0, 0, null_pool, &fwd.Plane(2));
-  GaborishInverse(&fwd, 0.92718927264540152f, null_pool);
-  VerifyRelativeError(in, fwd, max_l1, 1E-4f);
+  float w = 0.92718927264540152f;
+  float weights[3] = {
+      w,
+      w,
+      w,
+  };
+  GaborishInverse(&fwd, weights, null_pool);
+  JXL_ASSERT_OK(VerifyRelativeError(in, fwd, max_l1, 1E-4f, _));
 }
 
 TEST(GaborishTest, TestZero) {
index 0db7012..0d1b912 100644 (file)
@@ -6,12 +6,11 @@
 #ifndef LIB_JXL_ENC_GAMMA_CORRECT_H_
 #define LIB_JXL_ENC_GAMMA_CORRECT_H_
 
-// Deprecated: sRGB transfer function. Use color_management.h instead.
+// Deprecated: sRGB transfer function. Use JxlCms instead.
 
 #include <cmath>
 
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/transfer_functions-inl.h"
 
 namespace jxl {
 
index bf85306..b50f602 100644 (file)
@@ -5,29 +5,28 @@
 
 #include "lib/jxl/enc_group.h"
 
+#include <hwy/aligned_allocator.h>
 #include <utility>
 
-#include "hwy/aligned_allocator.h"
-
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "lib/jxl/enc_group.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_transforms-inl.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/quantizer-inl.h"
 #include "lib/jxl/quantizer.h"
+#include "lib/jxl/simd_util.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -42,153 +41,323 @@ using hwy::HWY_NAMESPACE::Round;
 
 // NOTE: caller takes care of extracting quant from rect of RawQuantField.
 void QuantizeBlockAC(const Quantizer& quantizer, const bool error_diffusion,
-                     size_t c, int32_t quant, float qm_multiplier,
-                     size_t quant_kind, size_t xsize, size_t ysize,
-                     const float* JXL_RESTRICT block_in,
+                     size_t c, float qm_multiplier, size_t quant_kind,
+                     size_t xsize, size_t ysize, float* thresholds,
+                     const float* JXL_RESTRICT block_in, int32_t* quant,
                      int32_t* JXL_RESTRICT block_out) {
-  PROFILER_FUNC;
   const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
-  const float qac = quantizer.Scale() * quant;
-  // Not SIMD-fied for now.
-  float thres[4] = {0.58f, 0.635f, 0.66f, 0.7f};
-  if (c == 0) {
-    for (int i = 1; i < 4; ++i) {
-      thres[i] += 0.08f;
-    }
-  }
-  if (c == 2) {
-    for (int i = 1; i < 4; ++i) {
-      thres[i] = 0.75f;
+  float qac = quantizer.Scale() * (*quant);
+  // Not SIMD-ified for now.
+  if (c != 1 && xsize * ysize >= 4) {
+    for (int i = 0; i < 4; ++i) {
+      thresholds[i] -= 0.00744f * xsize * ysize;
+      if (thresholds[i] < 0.5) {
+        thresholds[i] = 0.5;
+      }
     }
   }
-  if (xsize > 1 || ysize > 1) {
-    for (int i = 0; i < 4; ++i) {
-      thres[i] -= Clamp1(0.003f * xsize * ysize, 0.f, (c > 0 ? 0.08f : 0.12f));
+  HWY_CAPPED(float, kBlockDim) df;
+  HWY_CAPPED(int32_t, kBlockDim) di;
+  HWY_CAPPED(uint32_t, kBlockDim) du;
+  const auto quantv = Set(df, qac * qm_multiplier);
+  for (size_t y = 0; y < ysize * kBlockDim; y++) {
+    size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
+    const size_t off = y * kBlockDim * xsize;
+    for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
+      auto thr = Zero(df);
+      if (xsize == 1) {
+        HWY_ALIGN uint32_t kMask[kBlockDim] = {0, 0, 0, 0, ~0u, ~0u, ~0u, ~0u};
+        const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
+        thr = IfThenElse(mask, Set(df, thresholds[yfix + 1]),
+                         Set(df, thresholds[yfix]));
+      } else {
+        // Same for all lanes in the vector.
+        thr = Set(
+            df,
+            thresholds[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
+      }
+      const auto q = Mul(Load(df, qm + off + x), quantv);
+      const auto in = Load(df, block_in + off + x);
+      const auto val = Mul(q, in);
+      const auto nzero_mask = Ge(Abs(val), thr);
+      const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
+      Store(v, di, block_out + off + x);
     }
   }
+}
 
-  if (!error_diffusion) {
-    HWY_CAPPED(float, kBlockDim) df;
-    HWY_CAPPED(int32_t, kBlockDim) di;
-    HWY_CAPPED(uint32_t, kBlockDim) du;
-    const auto quant = Set(df, qac * qm_multiplier);
-
-    for (size_t y = 0; y < ysize * kBlockDim; y++) {
-      size_t yfix = static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2;
-      const size_t off = y * kBlockDim * xsize;
-      for (size_t x = 0; x < xsize * kBlockDim; x += Lanes(df)) {
-        auto thr = Zero(df);
-        if (xsize == 1) {
-          HWY_ALIGN uint32_t kMask[kBlockDim] = {0,   0,   0,   0,
-                                                 ~0u, ~0u, ~0u, ~0u};
-          const auto mask = MaskFromVec(BitCast(df, Load(du, kMask + x)));
-          thr =
-              IfThenElse(mask, Set(df, thres[yfix + 1]), Set(df, thres[yfix]));
-        } else {
-          // Same for all lanes in the vector.
-          thr = Set(
-              df,
-              thres[yfix + static_cast<size_t>(x >= xsize * kBlockDim / 2)]);
-        }
+void AdjustQuantBlockAC(const Quantizer& quantizer, size_t c,
+                        float qm_multiplier, size_t quant_kind, size_t xsize,
+                        size_t ysize, float* thresholds,
+                        const float* JXL_RESTRICT block_in, int32_t* quant) {
+  // No quantization adjusting for these small blocks.
+  // Quantization adjusting attempts to fix some known issues
+  // with larger blocks and on the 8x8 dct's emerging 8x8 blockiness
+  // when there are not many non-zeros.
+  constexpr size_t kPartialBlockKinds =
+      (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) |
+      (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) |
+      (1 << AcStrategy::Type::DCT8X4) | (1 << AcStrategy::Type::AFV0) |
+      (1 << AcStrategy::Type::AFV1) | (1 << AcStrategy::Type::AFV2) |
+      (1 << AcStrategy::Type::AFV3);
+  if ((1 << quant_kind) & kPartialBlockKinds) {
+    return;
+  }
 
-        const auto q = Mul(Load(df, qm + off + x), quant);
-        const auto in = Load(df, block_in + off + x);
-        const auto val = Mul(q, in);
-        const auto nzero_mask = Ge(Abs(val), thr);
-        const auto v = ConvertTo(di, IfThenElseZero(nzero_mask, Round(val)));
-        Store(v, di, block_out + off + x);
+  const float* JXL_RESTRICT qm = quantizer.InvDequantMatrix(quant_kind, c);
+  float qac = quantizer.Scale() * (*quant);
+  if (xsize > 1 || ysize > 1) {
+    for (int i = 0; i < 4; ++i) {
+      thresholds[i] -= Clamp1(0.003f * xsize * ysize, 0.f, 0.08f);
+      if (thresholds[i] < 0.54) {
+        thresholds[i] = 0.54;
       }
     }
-    return;
   }
-
-retry:
-  int hfNonZeros[4] = {};
-  float hfError[4] = {};
+  float sum_of_highest_freq_row_and_column = 0;
+  float sum_of_error = 0;
+  float sum_of_vals = 0;
+  float hfNonZeros[4] = {};
   float hfMaxError[4] = {};
-  size_t hfMaxErrorIx[4] = {};
+
   for (size_t y = 0; y < ysize * kBlockDim; y++) {
     for (size_t x = 0; x < xsize * kBlockDim; x++) {
       const size_t pos = y * kBlockDim * xsize + x;
       if (x < xsize && y < ysize) {
-        // Ensure block is initialized
-        block_out[pos] = 0;
         continue;
       }
       const size_t hfix = (static_cast<size_t>(y >= ysize * kBlockDim / 2) * 2 +
                            static_cast<size_t>(x >= xsize * kBlockDim / 2));
       const float val = block_in[pos] * (qm[pos] * qac * qm_multiplier);
-      float v = (std::abs(val) < thres[hfix]) ? 0 : rintf(val);
-      const float error = std::abs(val) - std::abs(v);
-      hfError[hfix] += error * error;
-      if (hfMaxError[hfix] < error) {
-        hfMaxError[hfix] = error;
-        hfMaxErrorIx[hfix] = pos;
+      const float v = (std::abs(val) < thresholds[hfix]) ? 0 : rintf(val);
+      const float error = std::abs(val - v);
+      sum_of_error += error;
+      sum_of_vals += std::abs(v);
+      if (c == 1 && v == 0) {
+        if (hfMaxError[hfix] < error) {
+          hfMaxError[hfix] = error;
+        }
       }
       if (v != 0.0f) {
         hfNonZeros[hfix] += std::abs(v);
+        bool in_corner = y >= 7 * ysize && x >= 7 * xsize;
+        bool on_border =
+            y == ysize * kBlockDim - 1 || x == xsize * kBlockDim - 1;
+        bool in_larger_corner = x >= 4 * xsize && y >= 4 * ysize;
+        if (in_corner || (on_border && in_larger_corner)) {
+          sum_of_highest_freq_row_and_column += std::abs(val);
+        }
       }
-      block_out[pos] = static_cast<int32_t>(rintf(v));
     }
   }
-  if (c != 1) return;
-  constexpr size_t kPartialBlockKinds =
-      (1 << AcStrategy::Type::IDENTITY) | (1 << AcStrategy::Type::DCT2X2) |
-      (1 << AcStrategy::Type::DCT4X4) | (1 << AcStrategy::Type::DCT4X8) |
-      (1 << AcStrategy::Type::DCT8X4) | (1 << AcStrategy::Type::AFV0) |
-      (1 << AcStrategy::Type::AFV1) | (1 << AcStrategy::Type::AFV2) |
-      (1 << AcStrategy::Type::AFV3);
-  if ((1 << quant_kind) & kPartialBlockKinds) return;
-  float hfErrorLimit = 0.029f * (xsize * ysize) * kDCTBlockSize * 0.25f;
-  bool goretry = false;
-  for (int i = 1; i < 4; ++i) {
-    if (hfError[i] >= hfErrorLimit &&
-        hfNonZeros[i] <= (xsize + ysize) * 0.25f) {
-      if (thres[i] >= 0.4f) {
-        thres[i] -= 0.01f;
-        goretry = true;
+  if (c == 1 && sum_of_vals * 8 < xsize * ysize) {
+    static const double kLimit[4] = {
+        0.46,
+        0.46,
+        0.46,
+        0.46,
+    };
+    static const double kMul[4] = {
+        0.9999,
+        0.9999,
+        0.9999,
+        0.9999,
+    };
+    const int32_t orig_quant = *quant;
+    int32_t new_quant = *quant;
+    for (int i = 1; i < 4; ++i) {
+      if (hfNonZeros[i] == 0.0 && hfMaxError[i] > kLimit[i]) {
+        new_quant = orig_quant + 1;
+        break;
       }
     }
+    *quant = new_quant;
+    if (hfNonZeros[3] == 0.0 && hfMaxError[3] > kLimit[3]) {
+      thresholds[3] = kMul[3] * hfMaxError[3] * new_quant / orig_quant;
+    } else if ((hfNonZeros[1] == 0.0 && hfMaxError[1] > kLimit[1]) ||
+               (hfNonZeros[2] == 0.0 && hfMaxError[2] > kLimit[2])) {
+      thresholds[1] = kMul[1] * std::max(hfMaxError[1], hfMaxError[2]) *
+                      new_quant / orig_quant;
+      thresholds[2] = thresholds[1];
+    } else if (hfNonZeros[0] == 0.0 && hfMaxError[0] > kLimit[0]) {
+      thresholds[0] = kMul[0] * hfMaxError[0] * new_quant / orig_quant;
+    }
   }
-  if (goretry) goto retry;
-  for (int i = 1; i < 4; ++i) {
-    if (hfError[i] >= hfErrorLimit && hfNonZeros[i] == 0) {
-      const size_t pos = hfMaxErrorIx[i];
-      if (hfMaxError[i] >= 0.4f) {
-        block_out[pos] = block_in[pos] > 0.0f ? 1.0f : -1.0f;
+  // Heuristic for improving accuracy of high-frequency patterns
+  // occurring in an environment with no medium-frequency masking
+  // patterns.
+  {
+    float all =
+        hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] + 1;
+    float mul[3] = {70, 30, 60};
+    if (mul[c] * sum_of_highest_freq_row_and_column >= all) {
+      *quant += mul[c] * sum_of_highest_freq_row_and_column / all;
+      if (*quant >= Quantizer::kQuantMax) {
+        *quant = Quantizer::kQuantMax - 1;
       }
     }
   }
+  if (quant_kind == AcStrategy::Type::DCT) {
+    // If this 8x8 block is too flat, increase the adaptive quantization level
+    // a bit to reduce visible block boundaries and requantize the block.
+    if (hfNonZeros[0] + hfNonZeros[1] + hfNonZeros[2] + hfNonZeros[3] < 11) {
+      *quant += 1;
+      if (*quant >= Quantizer::kQuantMax) {
+        *quant = Quantizer::kQuantMax - 1;
+      }
+    }
+  }
+  {
+    static const double kMul1[4][3] = {
+        {
+            0.22080615753848404,
+            0.45797479824262011,
+            0.29859235095977965,
+        },
+        {
+            0.70109486510286834,
+            0.16185281305512639,
+            0.14387691730035473,
+        },
+        {
+            0.114985964456218638,
+            0.44656840441027695,
+            0.10587658215149048,
+        },
+        {
+            0.46849665264409396,
+            0.41239077937781954,
+            0.088667407767185444,
+        },
+    };
+    static const double kMul2[4][3] = {
+        {
+            0.27450281941822197,
+            1.1255766549984996,
+            0.98950459134128388,
+        },
+        {
+            0.4652168675598285,
+            0.40945807983455818,
+            0.36581899811751367,
+        },
+        {
+            0.28034972424715715,
+            0.9182653201929738,
+            1.5581531543057416,
+        },
+        {
+            0.26873118114033728,
+            0.68863712390392484,
+            1.2082185408666786,
+        },
+    };
+    static const double kQuantNormalizer = 2.2942708343284721;
+    sum_of_error *= kQuantNormalizer;
+    sum_of_vals *= kQuantNormalizer;
+    if (quant_kind >= AcStrategy::Type::DCT16X16) {
+      int ix = 3;
+      if (quant_kind == AcStrategy::Type::DCT32X16 ||
+          quant_kind == AcStrategy::Type::DCT16X32) {
+        ix = 1;
+      } else if (quant_kind == AcStrategy::Type::DCT16X16) {
+        ix = 0;
+      } else if (quant_kind == AcStrategy::Type::DCT32X32) {
+        ix = 2;
+      }
+      int step =
+          sum_of_error / (kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
+                          kMul2[ix][c] * sum_of_vals);
+      if (step >= 2) {
+        step = 2;
+      }
+      if (step < 0) {
+        step = 0;
+      }
+      if (sum_of_error > kMul1[ix][c] * xsize * ysize * kBlockDim * kBlockDim +
+                             kMul2[ix][c] * sum_of_vals) {
+        *quant += step;
+        if (*quant >= Quantizer::kQuantMax) {
+          *quant = Quantizer::kQuantMax - 1;
+        }
+      }
+    }
+  }
+  {
+    // Reduce quant in highly active areas.
+    int32_t div = (xsize * ysize);
+    int32_t activity = (hfNonZeros[0] + div / 2) / div;
+    int32_t orig_qp_limit = std::max(4, *quant / 2);
+    for (int i = 1; i < 4; ++i) {
+      activity = std::min<int32_t>(activity, (hfNonZeros[i] + div / 2) / div);
+    }
+    if (activity >= 15) {
+      activity = 15;
+    }
+    int32_t qp = *quant - activity;
+    if (c == 1) {
+      for (int i = 1; i < 4; ++i) {
+        thresholds[i] += 0.01 * activity;
+      }
+    }
+    if (qp < orig_qp_limit) {
+      qp = orig_qp_limit;
+    }
+    *quant = qp;
+  }
 }
 
 // NOTE: caller takes care of extracting quant from rect of RawQuantField.
-void QuantizeRoundtripYBlockAC(const Quantizer& quantizer,
-                               const bool error_diffusion, int32_t quant,
-                               size_t quant_kind, size_t xsize, size_t ysize,
-                               const float* JXL_RESTRICT biases,
+void QuantizeRoundtripYBlockAC(PassesEncoderState* enc_state, const size_t size,
+                               const Quantizer& quantizer,
+                               const bool error_diffusion, size_t quant_kind,
+                               size_t xsize, size_t ysize,
+                               const float* JXL_RESTRICT biases, int32_t* quant,
                                float* JXL_RESTRICT inout,
                                int32_t* JXL_RESTRICT quantized) {
-  QuantizeBlockAC(quantizer, error_diffusion, 1, quant, 1.0f, quant_kind, xsize,
-                  ysize, inout, quantized);
+  float thres_y[4] = {0.58f, 0.64f, 0.64f, 0.64f};
+  {
+    int32_t max_quant = 0;
+    int quant_orig = *quant;
+    float val[3] = {enc_state->x_qm_multiplier, 1.0f,
+                    enc_state->b_qm_multiplier};
+    int clut[3] = {1, 0, 2};
+    for (int ii = 0; ii < 3; ++ii) {
+      float thres[4] = {0.58f, 0.64f, 0.64f, 0.64f};
+      int c = clut[ii];
+      *quant = quant_orig;
+      AdjustQuantBlockAC(quantizer, c, val[c], quant_kind, xsize, ysize,
+                         &thres[0], inout + c * size, quant);
+      // Dead zone adjustment
+      if (c == 1) {
+        for (int k = 0; k < 4; ++k) {
+          thres_y[k] = thres[k];
+        }
+      }
+      max_quant = std::max(*quant, max_quant);
+    }
+    *quant = max_quant;
+  }
+
+  QuantizeBlockAC(quantizer, error_diffusion, 1, 1.0f, quant_kind, xsize, ysize,
+                  &thres_y[0], inout + size, quant, quantized + size);
 
-  PROFILER_ZONE("enc quant adjust bias");
   const float* JXL_RESTRICT dequant_matrix =
       quantizer.DequantMatrix(quant_kind, 1);
 
   HWY_CAPPED(float, kDCTBlockSize) df;
   HWY_CAPPED(int32_t, kDCTBlockSize) di;
-  const auto inv_qac = Set(df, quantizer.inv_quant_ac(quant));
+  const auto inv_qac = Set(df, quantizer.inv_quant_ac(*quant));
   for (size_t k = 0; k < kDCTBlockSize * xsize * ysize; k += Lanes(df)) {
-    const auto quant = Load(di, quantized + k);
+    const auto quant = Load(di, quantized + size + k);
     const auto adj_quant = AdjustQuantBias(di, 1, quant, biases);
     const auto dequantm = Load(df, dequant_matrix + k);
-    Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + k);
+    Store(Mul(Mul(adj_quant, dequantm), inv_qac), df, inout + size + k);
   }
 }
 
 void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
                          const Image3F& opsin, Image3F* dc) {
-  PROFILER_FUNC;
   const Rect block_group_rect = enc_state->shared.BlockGroupRect(group_idx);
   const Rect group_rect = enc_state->shared.GroupRect(group_idx);
   const Rect cmap_rect(
@@ -203,12 +372,16 @@ void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
   const size_t dc_stride = static_cast<size_t>(dc->PixelsPerRow());
   const size_t opsin_stride = static_cast<size_t>(opsin.PixelsPerRow());
 
-  const ImageI& full_quant_field = enc_state->shared.raw_quant_field;
+  ImageI& full_quant_field = enc_state->shared.raw_quant_field;
   const CompressParams& cparams = enc_state->cparams;
 
+  const size_t dct_scratch_size =
+      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
+
   // TODO(veluca): consider strategies to reduce this memory.
   auto mem = hwy::AllocateAligned<int32_t>(3 * AcStrategy::kMaxCoeffArea);
-  auto fmem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea);
+  auto fmem = hwy::AllocateAligned<float>(5 * AcStrategy::kMaxCoeffArea +
+                                          dct_scratch_size);
   float* JXL_RESTRICT scratch_space =
       fmem.get() + 3 * AcStrategy::kMaxCoeffArea;
   {
@@ -216,25 +389,23 @@ void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
     const bool error_diffusion = cparams.speed_tier <= SpeedTier::kSquirrel;
     constexpr HWY_CAPPED(float, kDCTBlockSize) d;
 
-    int32_t* JXL_RESTRICT coeffs[kMaxNumPasses][3] = {};
+    int32_t* JXL_RESTRICT coeffs[3][kMaxNumPasses] = {};
     size_t num_passes = enc_state->progressive_splitter.GetNumPasses();
     JXL_DASSERT(num_passes > 0);
     for (size_t i = 0; i < num_passes; i++) {
       // TODO(veluca): 16-bit quantized coeffs are not implemented yet.
       JXL_ASSERT(enc_state->coeffs[i]->Type() == ACType::k32);
       for (size_t c = 0; c < 3; c++) {
-        coeffs[i][c] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
+        coeffs[c][i] = enc_state->coeffs[i]->PlaneRow(c, group_idx, 0).ptr32;
       }
     }
 
     HWY_ALIGN float* coeffs_in = fmem.get();
     HWY_ALIGN int32_t* quantized = mem.get();
 
-    size_t offset = 0;
-
     for (size_t by = 0; by < ysize_blocks; ++by) {
-      const int32_t* JXL_RESTRICT row_quant_ac =
-          block_group_rect.ConstRow(full_quant_field, by);
+      int32_t* JXL_RESTRICT row_quant_ac =
+          block_group_rect.Row(&full_quant_field, by);
       size_t ty = by / kColorTileDimInBlocks;
       const int8_t* JXL_RESTRICT row_cmap[3] = {
           cmap_rect.ConstRow(enc_state->shared.cmap.ytox_map, ty),
@@ -272,22 +443,19 @@ void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
           size_t size = kDCTBlockSize * xblocks * yblocks;
 
           // DCT Y channel, roundtrip-quantize it and set DC.
-          const int32_t quant_ac = row_quant_ac[bx];
-          TransformFromPixels(acs.Strategy(), opsin_rows[1] + bx * kBlockDim,
-                              opsin_stride, coeffs_in + size, scratch_space);
-          DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
-                                  dc_rows[1] + bx, dc_stride);
-          QuantizeRoundtripYBlockAC(
-              enc_state->shared.quantizer, error_diffusion, quant_ac,
-              acs.RawStrategy(), xblocks, yblocks, kDefaultQuantBias,
-              coeffs_in + size, quantized + size);
-
-          // DCT X and B channels
-          for (size_t c : {0, 2}) {
+          int32_t quant_ac = row_quant_ac[bx];
+          for (size_t c : {0, 1, 2}) {
             TransformFromPixels(acs.Strategy(), opsin_rows[c] + bx * kBlockDim,
                                 opsin_stride, coeffs_in + c * size,
                                 scratch_space);
           }
+          DCFromLowestFrequencies(acs.Strategy(), coeffs_in + size,
+                                  dc_rows[1] + bx, dc_stride);
+
+          QuantizeRoundtripYBlockAC(
+              enc_state, size, enc_state->shared.quantizer, error_diffusion,
+              acs.RawStrategy(), xblocks, yblocks, kDefaultQuantBias, &quant_ac,
+              coeffs_in, quantized);
 
           // Unapply color correlation
           for (size_t k = 0; k < size; k += Lanes(d)) {
@@ -302,18 +470,24 @@ void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
 
           // Quantize X and B channels and set DC.
           for (size_t c : {0, 2}) {
+            float thres[4] = {0.58f, 0.62f, 0.62f, 0.62f};
             QuantizeBlockAC(enc_state->shared.quantizer, error_diffusion, c,
-                            quant_ac,
                             c == 0 ? enc_state->x_qm_multiplier
                                    : enc_state->b_qm_multiplier,
-                            acs.RawStrategy(), xblocks, yblocks,
-                            coeffs_in + c * size, quantized + c * size);
+                            acs.RawStrategy(), xblocks, yblocks, &thres[0],
+                            coeffs_in + c * size, &quant_ac,
+                            quantized + c * size);
             DCFromLowestFrequencies(acs.Strategy(), coeffs_in + c * size,
                                     dc_rows[c] + bx, dc_stride);
           }
-          enc_state->progressive_splitter.SplitACCoefficients(
-              quantized, size, acs, bx, by, offset, coeffs);
-          offset += size;
+          row_quant_ac[bx] = quant_ac;
+          for (size_t c = 0; c < 3; c++) {
+            enc_state->progressive_splitter.SplitACCoefficients(
+                quantized + c * size, acs, bx, by, coeffs[c]);
+            for (size_t p = 0; p < num_passes; p++) {
+              coeffs[c][p] += size;
+            }
+          }
         }
       }
     }
@@ -347,7 +521,7 @@ Status EncodeGroupTokenizedCoefficients(size_t group_idx, size_t pass_idx,
   if (histo_selector_bits != 0) {
     BitWriter::Allotment allotment(writer, histo_selector_bits);
     writer->Write(histo_selector_bits, histogram_idx);
-    ReclaimAndCharge(writer, &allotment, kLayerAC, aux_out);
+    allotment.ReclaimAndCharge(writer, kLayerAC, aux_out);
   }
   WriteTokens(enc_state.passes[pass_idx].ac_tokens[group_idx],
               enc_state.passes[pass_idx].codes,
index 62468dd..0caf408 100644 (file)
@@ -9,13 +9,15 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/enc_bit_writer.h"
-#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/image.h"
 
 namespace jxl {
 
+struct AuxOut;
+struct PassesEncoderState;
+
 // Fills DC
 void ComputeCoefficients(size_t group_idx, PassesEncoderState* enc_state,
                          const Image3F& opsin, Image3F* dc);
index 1ab4ea5..0e6ea76 100644 (file)
@@ -17,6 +17,7 @@
 #include "lib/jxl/enc_ar_control_field.h"
 #include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_chroma_from_luma.h"
+#include "lib/jxl/enc_gaborish.h"
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/enc_noise.h"
 #include "lib/jxl/enc_patch_dictionary.h"
 #include "lib/jxl/enc_quant_weights.h"
 #include "lib/jxl/enc_splines.h"
 #include "lib/jxl/enc_xyb.h"
-#include "lib/jxl/gaborish.h"
 
 namespace jxl {
+
+struct AuxOut;
+
 namespace {
 void FindBestBlockEntropyModel(PassesEncoderState& enc_state) {
   if (enc_state.cparams.decoding_speed_tier >= 1) {
@@ -302,7 +305,8 @@ void DownsampleImage2_Sharper(const ImageF& input, ImageF* output) {
   int64_t xsize = input.xsize();
   int64_t ysize = input.ysize();
 
-  ImageF box_downsample = CopyImage(input);
+  ImageF box_downsample(xsize, ysize);
+  CopyImageTo(input, &box_downsample);
   DownsampleImage(&box_downsample, 2);
 
   ImageF mask(box_downsample.xsize(), box_downsample.ysize());
@@ -613,7 +617,8 @@ void DownsampleImage2_Iterative(const ImageF& orig, ImageF* output) {
   int64_t xsize2 = DivCeil(orig.xsize(), 2);
   int64_t ysize2 = DivCeil(orig.ysize(), 2);
 
-  ImageF box_downsample = CopyImage(orig);
+  ImageF box_downsample(xsize, ysize);
+  CopyImageTo(orig, &box_downsample);
   DownsampleImage(&box_downsample, 2);
   ImageF mask(box_downsample.xsize(), box_downsample.ysize());
   CreateMask(box_downsample, mask);
@@ -627,7 +632,8 @@ void DownsampleImage2_Iterative(const ImageF& orig, ImageF* output) {
   initial.ShrinkTo(initial.xsize() - kBlockDim, initial.ysize() - kBlockDim);
   DownsampleImage2_Sharper(orig, &initial);
 
-  ImageF down = CopyImage(initial);
+  ImageF down(initial.xsize(), initial.ysize());
+  CopyImageTo(initial, &down);
   ImageF up(xsize, ysize);
   ImageF corr(xsize, ysize);
   ImageF corr2(xsize2, ysize2);
@@ -680,7 +686,7 @@ void DownsampleImage2_Iterative(Image3F* opsin) {
                        downsampled.ysize() - kBlockDim);
 
   Image3F rgb(opsin->xsize(), opsin->ysize());
-  OpsinParams opsin_params;  // TODO: use the ones that are actually used
+  OpsinParams opsin_params;  // TODO(user): use the ones that are actually used
   opsin_params.Init(kDefaultIntensityTarget);
   OpsinToLinear(*opsin, Rect(rgb), nullptr, &rgb, opsin_params);
 
@@ -701,14 +707,11 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
     PassesEncoderState* enc_state, ModularFrameEncoder* modular_frame_encoder,
     const ImageBundle* original_pixels, Image3F* opsin,
     const JxlCmsInterface& cms, ThreadPool* pool, AuxOut* aux_out) {
-  PROFILER_ZONE("JxlLossyFrameHeuristics uninstrumented");
-
   CompressParams& cparams = enc_state->cparams;
   PassesSharedState& shared = enc_state->shared;
 
   // Compute parameters for noise synthesis.
   if (shared.frame_header.flags & FrameHeader::kNoise) {
-    PROFILER_ZONE("enc GetNoiseParam");
     if (cparams.photon_noise_iso == 0) {
       // Don't start at zero amplitude since adding noise is expensive -- it
       // significantly slows down decoding, and this is unlikely to
@@ -825,13 +828,14 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
   // Call InitialQuantField only in Hare mode or slower. Otherwise, rely
   // on simple heuristics in FindBestAcStrategy, or set a constant for Falcon
   // mode.
-  if (cparams.speed_tier > SpeedTier::kHare || cparams.uniform_quant > 0) {
+  if (cparams.speed_tier > SpeedTier::kHare) {
     enc_state->initial_quant_field =
         ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
-    float q = cparams.uniform_quant > 0
-                  ? cparams.uniform_quant
-                  : kAcQuant / cparams.butteraugli_distance;
+    enc_state->initial_quant_masking =
+        ImageF(shared.frame_dim.xsize_blocks, shared.frame_dim.ysize_blocks);
+    float q = kAcQuant / cparams.butteraugli_distance;
     FillImage(q, &enc_state->initial_quant_field);
+    FillImage(1.0f / (q + 0.001f), &enc_state->initial_quant_masking);
   } else {
     // Call this here, as it relies on pre-gaborish values.
     float butteraugli_distance_for_iqf = cparams.butteraugli_distance;
@@ -840,7 +844,8 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
     }
     enc_state->initial_quant_field = InitialQuantField(
         butteraugli_distance_for_iqf, *opsin, shared.frame_dim, pool, 1.0f,
-        &enc_state->initial_quant_masking);
+        &enc_state->initial_quant_masking,
+        &enc_state->initial_quant_masking1x1);
     quantizer.SetQuantField(quant_dc, enc_state->initial_quant_field, nullptr);
   }
 
@@ -848,7 +853,13 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
 
   // Apply inverse-gaborish.
   if (shared.frame_header.loop_filter.gab) {
-    GaborishInverse(opsin, 0.9908511000000001f, pool);
+    // Unsure why better to do some more gaborish on X and B than Y.
+    float weight[3] = {
+        1.0036278514398933f,
+        0.99406123118127299f,
+        0.99719338015886894f,
+    };
+    GaborishInverse(opsin, weight, pool);
   }
 
   FindBestDequantMatrices(cparams, *opsin, modular_frame_encoder,
@@ -875,6 +886,7 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
     if (cparams.speed_tier <= SpeedTier::kSquirrel) {
       cfl_heuristics.ComputeTile(r, *opsin, enc_state->shared.matrices,
                                  /*ac_strategy=*/nullptr,
+                                 /*raw_quant_field=*/nullptr,
                                  /*quantizer=*/nullptr, /*fast=*/false, thread,
                                  &enc_state->shared.cmap);
     }
@@ -891,6 +903,7 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
     // adjusting the quant field with butteraugli when all the other encoding
     // parameters are fixed is likely a more reliable choice anyway.
     AdjustQuantField(enc_state->shared.ac_strategy, r,
+                     cparams.butteraugli_distance,
                      &enc_state->initial_quant_field);
     quantizer.SetQuantFieldRect(enc_state->initial_quant_field, r,
                                 &enc_state->shared.raw_quant_field);
@@ -899,7 +912,7 @@ Status DefaultEncoderHeuristics::LossyFrameHeuristics(
     if (cparams.speed_tier <= SpeedTier::kHare) {
       cfl_heuristics.ComputeTile(
           r, *opsin, enc_state->shared.matrices, &enc_state->shared.ac_strategy,
-          &enc_state->shared.quantizer,
+          &enc_state->shared.raw_quant_field, &enc_state->shared.quantizer,
           /*fast=*/cparams.speed_tier >= SpeedTier::kWombat, thread,
           &enc_state->shared.cmap);
     }
index 16509f0..3cb9b50 100644 (file)
@@ -8,12 +8,12 @@
 
 // Hook for custom encoder heuristics (VarDCT only for now).
 
+#include <jxl/cms_interface.h>
 #include <stddef.h>
 #include <stdint.h>
 
 #include <string>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/image.h"
@@ -21,7 +21,9 @@
 
 namespace jxl {
 
+struct AuxOut;
 struct PassesEncoderState;
+class DequantMatrices;
 class ImageBundle;
 class ModularFrameEncoder;
 
index 04b5669..3eab2c2 100644 (file)
@@ -8,7 +8,7 @@
 #include <algorithm>
 #include <memory>
 
-#include "lib/jxl/huffman_tree.h"
+#include "lib/jxl/enc_huffman_tree.h"
 
 namespace jxl {
 
similarity index 99%
rename from lib/jxl/huffman_tree.cc
rename to lib/jxl/enc_huffman_tree.cc
index 77107b0..5c40dea 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/huffman_tree.h"
+#include "lib/jxl/enc_huffman_tree.h"
 
 #include <algorithm>
 #include <limits>
similarity index 92%
rename from lib/jxl/huffman_tree.h
rename to lib/jxl/enc_huffman_tree.h
index e4ccac4..7d716cd 100644 (file)
@@ -34,8 +34,8 @@ void SetDepth(const HuffmanTree& p, HuffmanTree* pool, uint8_t* depth,
 // the symbol.
 //
 // See http://en.wikipedia.org/wiki/Huffman_coding
-void CreateHuffmanTree(const uint32_t* data, const size_t length,
-                       const int tree_limit, uint8_t* depth);
+void CreateHuffmanTree(const uint32_t* data, size_t length, int tree_limit,
+                       uint8_t* depth);
 
 // Write a Huffman tree from bit depths into the bitstream representation
 // of a Huffman tree. The generated Huffman tree is to be compressed once
index 32e9b6b..b9c3f25 100644 (file)
 #include <string>
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/icc_codec_common.h"
+#include "lib/jxl/padded_bytes.h"
 
 namespace jxl {
 namespace {
@@ -70,6 +69,30 @@ Status PredictAndShuffle(size_t stride, size_t width, int order, size_t num,
   if (width > 1) Unshuffle(result->data() + start, num, width);
   return true;
 }
+
+static inline void EncodeVarInt(uint64_t value, PaddedBytes* data) {
+  size_t pos = data->size();
+  data->resize(data->size() + 9);
+  size_t output_size = data->size();
+  uint8_t* output = data->data();
+
+  // While more than 7 bits of data are left,
+  // store 7 bits and set the next byte flag
+  while (value > 127) {
+    // TODO(eustas): should it be `<` ?
+    JXL_CHECK(pos <= output_size);
+    // |128: Set the next byte flag
+    output[pos++] = ((uint8_t)(value & 127)) | 128;
+    // Remove the seven bits we just wrote
+    value >>= 7;
+  }
+  // TODO(eustas): should it be `<` ?
+  JXL_CHECK(pos <= output_size);
+  output[pos++] = ((uint8_t)value) & 127;
+
+  data->resize(pos);
+}
+
 }  // namespace
 
 // Outputs a transformed form of the given icc profile. The result itself is
@@ -83,7 +106,8 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
   EncodeVarInt(size, result);
 
   // Header
-  PaddedBytes header = ICCInitialHeaderPrediction();
+  PaddedBytes header;
+  header.append(ICCInitialHeaderPrediction());
   EncodeUint32(0, size, &header);
   for (size_t i = 0; i < kICCHeaderSize && i < size; i++) {
     ICCPredictHeader(icc, size, header.data(), i);
@@ -377,7 +401,7 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
   return true;
 }
 
-Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
+Status WriteICC(const IccBytes& icc, BitWriter* JXL_RESTRICT writer,
                 size_t layer, AuxOut* JXL_RESTRICT aux_out) {
   if (icc.empty()) return JXL_FAILURE("ICC must be non-empty");
   PaddedBytes enc;
@@ -385,7 +409,7 @@ Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
   std::vector<std::vector<Token>> tokens(1);
   BitWriter::Allotment allotment(writer, 128);
   JXL_RETURN_IF_ERROR(U64Coder::Write(enc.size(), writer));
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
 
   for (size_t i = 0; i < enc.size(); i++) {
     tokens[0].emplace_back(
index 2480e3a..224c2e5 100644 (file)
 #include <stddef.h>
 #include <stdint.h>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
+#include <vector>
+
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/enc_bit_writer.h"
 
 namespace jxl {
 
+struct AuxOut;
+class PaddedBytes;
+
 // Should still be called if `icc.empty()` - if so, writes only 1 bit.
-Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
+Status WriteICC(const std::vector<uint8_t>& icc, BitWriter* JXL_RESTRICT writer,
                 size_t layer, AuxOut* JXL_RESTRICT aux_out);
 
 // Exposed only for testing
index fe6d282..918547b 100644 (file)
@@ -5,19 +5,17 @@
 
 #include "lib/jxl/enc_image_bundle.h"
 
+#include <jxl/cms_interface.h>
+
 #include <atomic>
 #include <limits>
 #include <utility>
 
 #include "lib/jxl/alpha.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/image_bundle.h"
-#include "lib/jxl/luminance.h"
 
 namespace jxl {
 
@@ -27,7 +25,6 @@ namespace {
 Status CopyToT(const ImageMetadata* metadata, const ImageBundle* ib,
                const Rect& rect, const ColorEncoding& c_desired,
                const JxlCmsInterface& cms, ThreadPool* pool, Image3F* out) {
-  PROFILER_FUNC;
   ColorSpaceTransform c_transform(cms);
   // Changing IsGray is probably a bug.
   JXL_CHECK(ib->IsGray() == c_desired.IsGray());
@@ -114,7 +111,6 @@ Status CopyToT(const ImageMetadata* metadata, const ImageBundle* ib,
 
 Status ImageBundle::TransformTo(const ColorEncoding& c_desired,
                                 const JxlCmsInterface& cms, ThreadPool* pool) {
-  PROFILER_FUNC;
   JXL_RETURN_IF_ERROR(CopyTo(Rect(color_), c_desired, cms, &color_, pool));
   c_current_ = c_desired;
   return true;
@@ -133,13 +129,17 @@ Status TransformIfNeeded(const ImageBundle& in, const ColorEncoding& c_desired,
   }
   // TODO(janwas): avoid copying via createExternal+copyBackToIO
   // instead of copy+createExternal+copyBackToIO
-  store->SetFromImage(CopyImage(in.color()), in.c_current());
+  Image3F color(in.color().xsize(), in.color().ysize());
+  CopyImageTo(in.color(), &color);
+  store->SetFromImage(std::move(color), in.c_current());
 
   // Must at least copy the alpha channel for use by external_image.
   if (in.HasExtraChannels()) {
     std::vector<ImageF> extra_channels;
     for (const ImageF& extra_channel : in.extra_channels()) {
-      extra_channels.emplace_back(CopyImage(extra_channel));
+      ImageF ec(extra_channel.xsize(), extra_channel.ysize());
+      CopyImageTo(extra_channel, &ec);
+      extra_channels.emplace_back(std::move(ec));
     }
     store->SetExtraChannels(std::move(extra_channels));
   }
diff --git a/lib/jxl/enc_jxl_skcms.h b/lib/jxl/enc_jxl_skcms.h
deleted file mode 100644 (file)
index 4be5420..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_ENC_JXL_SKCMS_H_
-#define LIB_JXL_ENC_JXL_SKCMS_H_
-
-// skcms wrapper to rename the skcms symbols to avoid conflicting names with
-// other projects using skcms as well. When using JPEGXL_BUNDLE_SKCMS the
-// bundled functions will be renamed from skcms_ to jxl_skcms_
-
-#ifdef SKCMS_API
-#error "Must include jxl_skcms.h and not skcms.h directly"
-#endif  // SKCMS_API
-
-#if JPEGXL_BUNDLE_SKCMS
-
-#define skcms_252_random_bytes jxl_skcms_252_random_bytes
-#define skcms_AdaptToXYZD50 jxl_skcms_AdaptToXYZD50
-#define skcms_ApproximateCurve jxl_skcms_ApproximateCurve
-#define skcms_ApproximatelyEqualProfiles jxl_skcms_ApproximatelyEqualProfiles
-#define skcms_AreApproximateInverses jxl_skcms_AreApproximateInverses
-#define skcms_GetCHAD jxl_skcms_GetCHAD
-#define skcms_GetTagByIndex jxl_skcms_GetTagByIndex
-#define skcms_GetTagBySignature jxl_skcms_GetTagBySignature
-#define skcms_GetWTPT jxl_skcms_GetWTPT
-#define skcms_Identity_TransferFunction jxl_skcms_Identity_TransferFunction
-#define skcms_MakeUsableAsDestination jxl_skcms_MakeUsableAsDestination
-#define skcms_MakeUsableAsDestinationWithSingleCurve \
-  jxl_skcms_MakeUsableAsDestinationWithSingleCurve
-#define skcms_Matrix3x3_concat jxl_skcms_Matrix3x3_concat
-#define skcms_Matrix3x3_invert jxl_skcms_Matrix3x3_invert
-#define skcms_MaxRoundtripError jxl_skcms_MaxRoundtripError
-#define skcms_Parse jxl_skcms_Parse
-#define skcms_PrimariesToXYZD50 jxl_skcms_PrimariesToXYZD50
-#define skcms_sRGB_Inverse_TransferFunction \
-  jxl_skcms_sRGB_Inverse_TransferFunction
-#define skcms_sRGB_profile jxl_skcms_sRGB_profile
-#define skcms_sRGB_TransferFunction jxl_skcms_sRGB_TransferFunction
-#define skcms_TransferFunction_eval jxl_skcms_TransferFunction_eval
-#define skcms_TransferFunction_invert jxl_skcms_TransferFunction_invert
-#define skcms_TransferFunction_makeHLGish jxl_skcms_TransferFunction_makeHLGish
-#define skcms_TransferFunction_makePQish jxl_skcms_TransferFunction_makePQish
-#define skcms_Transform jxl_skcms_Transform
-#define skcms_TransformWithPalette jxl_skcms_TransformWithPalette
-#define skcms_TRCs_AreApproximateInverse jxl_skcms_TRCs_AreApproximateInverse
-#define skcms_XYZD50_profile jxl_skcms_XYZD50_profile
-
-#endif  // JPEGXL_BUNDLE_SKCMS
-
-#include "skcms.h"
-
-#endif  // LIB_JXL_ENC_JXL_SKCMS_H_
diff --git a/lib/jxl/enc_linalg.cc b/lib/jxl/enc_linalg.cc
new file mode 100644 (file)
index 0000000..fe2090a
--- /dev/null
@@ -0,0 +1,52 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_linalg.h"
+
+#include <cmath>
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+
+namespace jxl {
+
+void ConvertToDiagonal(const ImageD& A, ImageD* const JXL_RESTRICT diag,
+                       ImageD* const JXL_RESTRICT U) {
+#if JXL_ENABLE_ASSERT
+  JXL_ASSERT(A.xsize() == 2);
+  JXL_ASSERT(A.ysize() == 2);
+  JXL_ASSERT(std::abs(A.Row(0)[1] - A.Row(1)[0]) < 1e-15);
+#endif
+
+  if (std::abs(A.ConstRow(0)[1]) < 1e-15) {
+    // Already diagonal.
+    diag->Row(0)[0] = A.ConstRow(0)[0];
+    diag->Row(0)[1] = A.ConstRow(1)[1];
+    U->Row(0)[0] = U->Row(1)[1] = 1.0;
+    U->Row(0)[1] = U->Row(1)[0] = 0.0;
+    return;
+  }
+  double b = -(A.Row(0)[0] + A.Row(1)[1]);
+  double c = A.Row(0)[0] * A.Row(1)[1] - A.Row(0)[1] * A.Row(0)[1];
+  double d = b * b - 4.0 * c;
+  double sqd = std::sqrt(d);
+  double l1 = (-b - sqd) * 0.5;
+  double l2 = (-b + sqd) * 0.5;
+
+  double v1[2] = {A.Row(0)[0] - l1, A.Row(1)[0]};
+  double v1n = 1.0 / std::hypot(v1[0], v1[1]);
+  v1[0] = v1[0] * v1n;
+  v1[1] = v1[1] * v1n;
+
+  diag->Row(0)[0] = l1;
+  diag->Row(0)[1] = l2;
+
+  U->Row(0)[0] = v1[1];
+  U->Row(0)[1] = -v1[0];
+  U->Row(1)[0] = v1[0];
+  U->Row(1)[1] = v1[1];
+}
+
+}  // namespace jxl
diff --git a/lib/jxl/enc_linalg.h b/lib/jxl/enc_linalg.h
new file mode 100644 (file)
index 0000000..791770d
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_LINALG_H_
+#define LIB_JXL_LINALG_H_
+
+// Linear algebra.
+
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+using ImageD = Plane<double>;
+
+// A is symmetric, U is orthogonal, and A = U * Diagonal(diag) * Transpose(U).
+void ConvertToDiagonal(const ImageD& A, ImageD* JXL_RESTRICT diag,
+                       ImageD* JXL_RESTRICT U);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_LINALG_H_
diff --git a/lib/jxl/enc_linalg_test.cc b/lib/jxl/enc_linalg_test.cc
new file mode 100644 (file)
index 0000000..967b9a3
--- /dev/null
@@ -0,0 +1,118 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_linalg.h"
+
+#include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
+
+namespace jxl {
+namespace {
+
+ImageD Identity(const size_t N) {
+  ImageD out(N, N);
+  for (size_t i = 0; i < N; ++i) {
+    double* JXL_RESTRICT row = out.Row(i);
+    std::fill(row, row + N, 0);
+    row[i] = 1.0;
+  }
+  return out;
+}
+
+ImageD Diagonal(const ImageD& d) {
+  JXL_ASSERT(d.ysize() == 1);
+  ImageD out(d.xsize(), d.xsize());
+  const double* JXL_RESTRICT row_diag = d.Row(0);
+  for (size_t k = 0; k < d.xsize(); ++k) {
+    double* JXL_RESTRICT row_out = out.Row(k);
+    std::fill(row_out, row_out + d.xsize(), 0.0);
+    row_out[k] = row_diag[k];
+  }
+  return out;
+}
+
+ImageD MatMul(const ImageD& A, const ImageD& B) {
+  JXL_ASSERT(A.ysize() == B.xsize());
+  ImageD out(A.xsize(), B.ysize());
+  for (size_t y = 0; y < B.ysize(); ++y) {
+    const double* const JXL_RESTRICT row_b = B.Row(y);
+    double* const JXL_RESTRICT row_out = out.Row(y);
+    for (size_t x = 0; x < A.xsize(); ++x) {
+      row_out[x] = 0.0;
+      for (size_t k = 0; k < B.xsize(); ++k) {
+        row_out[x] += A.Row(k)[x] * row_b[k];
+      }
+    }
+  }
+  return out;
+}
+
+ImageD Transpose(const ImageD& A) {
+  ImageD out(A.ysize(), A.xsize());
+  for (size_t x = 0; x < A.xsize(); ++x) {
+    double* const JXL_RESTRICT row_out = out.Row(x);
+    for (size_t y = 0; y < A.ysize(); ++y) {
+      row_out[y] = A.Row(y)[x];
+    }
+  }
+  return out;
+}
+
+ImageD RandomSymmetricMatrix(const size_t N, Rng& rng, const double vmin,
+                             const double vmax) {
+  ImageD A(N, N);
+  GenerateImage(rng, &A, vmin, vmax);
+  for (size_t i = 0; i < N; ++i) {
+    for (size_t j = 0; j < i; ++j) {
+      A.Row(j)[i] = A.Row(i)[j];
+    }
+  }
+  return A;
+}
+
+void VerifyMatrixEqual(const ImageD& A, const ImageD& B, const double eps) {
+  ASSERT_EQ(A.xsize(), B.xsize());
+  ASSERT_EQ(A.ysize(), B.ysize());
+  for (size_t y = 0; y < A.ysize(); ++y) {
+    for (size_t x = 0; x < A.xsize(); ++x) {
+      ASSERT_NEAR(A.Row(y)[x], B.Row(y)[x], eps);
+    }
+  }
+}
+
+void VerifyOrthogonal(const ImageD& A, const double eps) {
+  VerifyMatrixEqual(Identity(A.xsize()), MatMul(Transpose(A), A), eps);
+}
+
+TEST(LinAlgTest, ConvertToDiagonal) {
+  {
+    ImageD I = Identity(2);
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(I, &d, &U);
+    VerifyMatrixEqual(I, U, 1e-15);
+    for (size_t k = 0; k < 2; ++k) {
+      ASSERT_NEAR(d.Row(0)[k], 1.0, 1e-15);
+    }
+  }
+  {
+    ImageD A = Identity(2);
+    A.Row(0)[1] = A.Row(1)[0] = 2.0;
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(A, &d, &U);
+    VerifyOrthogonal(U, 1e-12);
+    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
+  }
+  Rng rng(0);
+  for (size_t i = 0; i < 100; ++i) {
+    ImageD A = RandomSymmetricMatrix(2, rng, -1.0, 1.0);
+    ImageD U(2, 2), d(2, 1);
+    ConvertToDiagonal(A, &d, &U);
+    VerifyOrthogonal(U, 1e-12);
+    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
+  }
+}
+
+}  // namespace
+}  // namespace jxl
index 9e34fe8..bcb7a33 100644 (file)
 #include <utility>
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/compressed_dc.h"
 #include "lib/jxl/dec_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_cluster.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_gaborish.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_patch_dictionary.h"
 #include "lib/jxl/enc_quant_weights.h"
 #include "lib/jxl/frame_header.h"
-#include "lib/jxl/gaborish.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
 #include "lib/jxl/modular/encoding/enc_debug_tree.h"
 #include "lib/jxl/modular/encoding/enc_encoding.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/options.h"
 #include "lib/jxl/modular/transform/enc_transform.h"
+#include "lib/jxl/pack_signed.h"
 #include "lib/jxl/toc.h"
 
 namespace jxl {
 
 namespace {
+// constexpr bool kPrintTree = false;
+
 // Squeeze default quantization factors
 // these quantization factors are for -Q 50  (other qualities simply scale the
 // factors; things are rounded down and obviously cannot get below 1)
@@ -175,7 +178,7 @@ Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels) {
     return MakeFixedTree(kGradientProp, cutoffs, Predictor::Gradient,
                          total_pixels);
   }
-  JXL_ABORT("Unreachable");
+  JXL_UNREACHABLE("Unreachable");
   return {};
 }
 
@@ -303,7 +306,7 @@ ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
     : frame_dim_(frame_header.ToFrameDimensions()), cparams_(cparams_orig) {
   size_t num_streams =
       ModularStreamId::Num(frame_dim_, frame_header.passes.num_passes);
-  if (cparams_.IsLossless()) {
+  if (cparams_.ModularPartIsLossless()) {
     switch (cparams_.decoding_speed_tier) {
       case 0:
         break;
@@ -328,7 +331,7 @@ ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
     }
   }
   if (cparams_.decoding_speed_tier >= 1 && cparams_.responsive &&
-      cparams_.IsLossless()) {
+      cparams_.ModularPartIsLossless()) {
     cparams_.options.tree_kind =
         ModularOptions::TreeKind::kTrivialTreeNoPredictor;
     cparams_.options.nb_repeats = 0;
@@ -338,18 +341,16 @@ ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
   // use a sensible default if nothing explicit is specified:
   // Squeeze for lossy, no squeeze for lossless
   if (cparams_.responsive < 0) {
-    if (cparams_.IsLossless()) {
+    if (cparams_.ModularPartIsLossless()) {
       cparams_.responsive = 0;
     } else {
       cparams_.responsive = 1;
     }
   }
 
-  if (cparams_.speed_tier > SpeedTier::kWombat) {
-    cparams_.options.splitting_heuristics_node_threshold = 192;
-  } else {
-    cparams_.options.splitting_heuristics_node_threshold = 96;
-  }
+  cparams_.options.splitting_heuristics_node_threshold =
+      82 + 14 * static_cast<int>(cparams_.speed_tier);
+
   {
     // Set properties.
     std::vector<uint32_t> prop_order;
@@ -360,17 +361,31 @@ ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
     } else {
       // Same, but for the non-Squeeze case.
       prop_order = {0, 1, 15, 9, 10, 11, 12, 13, 14, 2, 3, 4, 5, 6, 7, 8};
+      // if few groups, don't use group as a property
+      if (num_streams < 30 && cparams_.speed_tier > SpeedTier::kTortoise) {
+        prop_order.erase(prop_order.begin() + 1);
+      }
     }
     switch (cparams_.speed_tier) {
-      case SpeedTier::kSquirrel:
+      case SpeedTier::kHare:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 4);
+        cparams_.options.max_property_values = 24;
+        break;
+      case SpeedTier::kWombat:
         cparams_.options.splitting_heuristics_properties.assign(
-            prop_order.begin(), prop_order.begin() + 8);
+            prop_order.begin(), prop_order.begin() + 5);
         cparams_.options.max_property_values = 32;
         break;
+      case SpeedTier::kSquirrel:
+        cparams_.options.splitting_heuristics_properties.assign(
+            prop_order.begin(), prop_order.begin() + 7);
+        cparams_.options.max_property_values = 48;
+        break;
       case SpeedTier::kKitten:
         cparams_.options.splitting_heuristics_properties.assign(
             prop_order.begin(), prop_order.begin() + 10);
-        cparams_.options.max_property_values = 64;
+        cparams_.options.max_property_values = 96;
         break;
       case SpeedTier::kTortoise:
         cparams_.options.splitting_heuristics_properties = prop_order;
@@ -378,7 +393,7 @@ ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
         break;
       default:
         cparams_.options.splitting_heuristics_properties.assign(
-            prop_order.begin(), prop_order.begin() + 6);
+            prop_order.begin(), prop_order.begin() + 3);
         cparams_.options.max_property_values = 16;
         break;
     }
@@ -425,7 +440,7 @@ ModularFrameEncoder::ModularFrameEncoder(const FrameHeader& frame_header,
     delta_pred_ = cparams_.options.predictor;
     if (cparams_.lossy_palette) cparams_.options.predictor = Predictor::Zero;
   }
-  if (!cparams_.IsLossless()) {
+  if (!cparams_.ModularPartIsLossless()) {
     if (cparams_.options.predictor == Predictor::Weighted ||
         cparams_.options.predictor == Predictor::Variable ||
         cparams_.options.predictor == Predictor::Best)
@@ -473,7 +488,9 @@ Status ModularFrameEncoder::ComputeEncodingData(
               frame_header.DebugString().c_str());
 
   if (do_color && frame_header.loop_filter.gab) {
-    GaborishInverse(color, 0.9908511000000001f, pool);
+    float w = 0.9908511000000001f;
+    float weights[3] = {w, w, w};
+    GaborishInverse(color, weights, pool);
   }
 
   if (do_color && metadata.bit_depth.bits_per_sample <= 16 &&
@@ -634,8 +651,7 @@ Status ModularFrameEncoder::ComputeEncodingData(
         cparams_.level, max_bitdepth, level_max_bitdepth);
 
   // Set options and apply transformations
-
-  if (cparams_.butteraugli_distance > 0) {
+  if (!cparams_.ModularPartIsLossless()) {
     if (cparams_.palette_colors != 0) {
       JXL_DEBUG_V(3, "Lossy encode, not doing palette transforms");
     }
@@ -647,26 +663,41 @@ Status ModularFrameEncoder::ComputeEncodingData(
     cparams_.lossy_palette = false;
   }
 
-  // if few colors, do all-channel palette before trying channel palette
-  // Logic is as follows:
-  // - if you can make a palette with few colors (arbitrary threshold: 200),
-  //   then you can also make channel palettes, but they will just be extra
-  //   signaling cost for almost no benefit
-  // - if the palette needs more colors, then channel palette might help to
-  //   reduce palette signaling cost
-  if (cparams_.palette_colors != 0 &&
-      cparams_.speed_tier < SpeedTier::kFalcon) {
+  // Global palette
+  if (cparams_.palette_colors != 0 || cparams_.lossy_palette) {
     // all-channel palette (e.g. RGBA)
-    if (gi.channel.size() > 1) {
+    if (gi.channel.size() - gi.nb_meta_channels > 1) {
       Transform maybe_palette(TransformId::kPalette);
       maybe_palette.begin_c = gi.nb_meta_channels;
       maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
       maybe_palette.nb_colors =
-          std::min(std::min(200, (int)(xsize * ysize / 8)),
-                   std::abs(cparams_.palette_colors) / 16);
+          std::min((int)(xsize * ysize / 2), std::abs(cparams_.palette_colors));
       maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
-      maybe_palette.lossy_palette = false;
-      do_transform(gi, maybe_palette, weighted::Header(), pool);
+      maybe_palette.lossy_palette =
+          (cparams_.lossy_palette && maybe_palette.num_c == 3);
+      if (maybe_palette.lossy_palette) {
+        maybe_palette.predictor = delta_pred_;
+      }
+      // TODO(veluca): use a custom weighted header if using the weighted
+      // predictor.
+      do_transform(gi, maybe_palette, weighted::Header(), pool,
+                   cparams_.options.zero_tokens);
+    }
+    // all-minus-one-channel palette (RGB with separate alpha, or CMY with
+    // separate K)
+    if (gi.channel.size() - gi.nb_meta_channels > 3) {
+      Transform maybe_palette_3(TransformId::kPalette);
+      maybe_palette_3.begin_c = gi.nb_meta_channels;
+      maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
+      maybe_palette_3.nb_colors =
+          std::min((int)(xsize * ysize / 3), std::abs(cparams_.palette_colors));
+      maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
+      maybe_palette_3.lossy_palette = cparams_.lossy_palette;
+      if (maybe_palette_3.lossy_palette) {
+        maybe_palette_3.predictor = delta_pred_;
+      }
+      do_transform(gi, maybe_palette_3, weighted::Header(), pool,
+                   cparams_.options.zero_tokens);
     }
   }
 
@@ -682,7 +713,7 @@ Status ModularFrameEncoder::ComputeEncodingData(
     for (size_t i = 0; i < nb_channels; i++) {
       int32_t min, max;
       compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
-      int64_t colors = max - min + 1;
+      int64_t colors = (int64_t)max - min + 1;
       JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
       Transform maybe_palette_1(TransformId::kPalette);
       maybe_palette_1.begin_c = i + gi.nb_meta_channels;
@@ -706,51 +737,12 @@ Status ModularFrameEncoder::ComputeEncodingData(
     }
   }
 
-  // Global palette
-  if ((cparams_.palette_colors != 0 || cparams_.lossy_palette) &&
-      cparams_.speed_tier < SpeedTier::kFalcon) {
-    // all-channel palette (e.g. RGBA)
-    if (gi.channel.size() - gi.nb_meta_channels > 1) {
-      Transform maybe_palette(TransformId::kPalette);
-      maybe_palette.begin_c = gi.nb_meta_channels;
-      maybe_palette.num_c = gi.channel.size() - gi.nb_meta_channels;
-      maybe_palette.nb_colors =
-          std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors));
-      maybe_palette.ordered_palette = cparams_.palette_colors >= 0;
-      maybe_palette.lossy_palette =
-          (cparams_.lossy_palette && maybe_palette.num_c == 3);
-      if (maybe_palette.lossy_palette) {
-        maybe_palette.predictor = delta_pred_;
-      }
-      // TODO(veluca): use a custom weighted header if using the weighted
-      // predictor.
-      do_transform(gi, maybe_palette, weighted::Header(), pool,
-                   cparams_.options.zero_tokens);
-    }
-    // all-minus-one-channel palette (RGB with separate alpha, or CMY with
-    // separate K)
-    if (gi.channel.size() - gi.nb_meta_channels > 3) {
-      Transform maybe_palette_3(TransformId::kPalette);
-      maybe_palette_3.begin_c = gi.nb_meta_channels;
-      maybe_palette_3.num_c = gi.channel.size() - gi.nb_meta_channels - 1;
-      maybe_palette_3.nb_colors =
-          std::min((int)(xsize * ysize / 8), std::abs(cparams_.palette_colors));
-      maybe_palette_3.ordered_palette = cparams_.palette_colors >= 0;
-      maybe_palette_3.lossy_palette = cparams_.lossy_palette;
-      if (maybe_palette_3.lossy_palette) {
-        maybe_palette_3.predictor = delta_pred_;
-      }
-      do_transform(gi, maybe_palette_3, weighted::Header(), pool,
-                   cparams_.options.zero_tokens);
-    }
-  }
-
   // don't do an RCT if we're short on bits
   if (cparams_.color_transform == ColorTransform::kNone && do_color &&
       gi.channel.size() - gi.nb_meta_channels >= 3 &&
       max_bitdepth + 1 < level_max_bitdepth) {
-    if (cparams_.colorspace < 0 &&
-        (!cparams_.IsLossless() || cparams_.speed_tier > SpeedTier::kHare)) {
+    if (cparams_.colorspace < 0 && (!cparams_.ModularPartIsLossless() ||
+                                    cparams_.speed_tier > SpeedTier::kHare)) {
       Transform ycocg{TransformId::kRCT};
       ycocg.rct_type = 6;
       ycocg.begin_c = gi.nb_meta_channels;
@@ -782,20 +774,32 @@ Status ModularFrameEncoder::ComputeEncodingData(
 
   std::vector<uint32_t> quants;
 
-  if (cparams_.butteraugli_distance > 0) {
+  if (!cparams_.ModularPartIsLossless()) {
     quants.resize(gi.channel.size(), 1);
-    float quality = 0.25f * cparams_.butteraugli_distance;
-    JXL_DEBUG_V(2,
-                "Adding quantization constants corresponding to distance %.3f ",
-                quality);
+    float quantizer = 0.25f;
     if (!cparams_.responsive) {
       JXL_DEBUG_V(1,
                   "Warning: lossy compression without Squeeze "
                   "transform is just color quantization.");
-      quality *= 0.1f;
+      quantizer *= 0.1f;
     }
+    float bitdepth_correction = 1.f;
     if (cparams_.color_transform != ColorTransform::kXYB) {
-      quality *= maxval / 255.f;
+      bitdepth_correction = maxval / 255.f;
+    }
+    std::vector<float> quantizers;
+    float dist = cparams_.butteraugli_distance;
+    for (size_t i = 0; i < 3; i++) {
+      quantizers.push_back(quantizer * dist * bitdepth_correction);
+    }
+    for (size_t i = 0; i < extra_channels.size(); i++) {
+      int ec_bitdepth =
+          metadata.extra_channel_info[i].bit_depth.bits_per_sample;
+      pixel_type ec_maxval = ec_bitdepth < 32 ? (1u << ec_bitdepth) - 1 : 0;
+      bitdepth_correction = ec_maxval / 255.f;
+      if (i < cparams_.ec_distance.size()) dist = cparams_.ec_distance[i];
+      if (dist < 0) dist = cparams_.butteraugli_distance;
+      quantizers.push_back(quantizer * dist * bitdepth_correction);
     }
     if (cparams_.options.nb_repeats == 0) {
       return JXL_FAILURE("nb_repeats = 0 not supported with modular lossy!");
@@ -814,14 +818,15 @@ Status ModularFrameEncoder::ComputeEncodingData(
         component = 1;
       }
       if (cparams_.color_transform == ColorTransform::kXYB && component < 3) {
-        q = quality * squeeze_quality_factor_xyb *
+        q = quantizers[component] * squeeze_quality_factor_xyb *
             squeeze_xyb_qtable[component][shift];
       } else {
         if (cparams_.colorspace != 0 && component > 0 && component < 3) {
-          q = quality * squeeze_quality_factor * squeeze_chroma_qtable[shift];
+          q = quantizers[component] * squeeze_quality_factor *
+              squeeze_chroma_qtable[shift];
         } else {
-          q = quality * squeeze_quality_factor * squeeze_luma_factor *
-              squeeze_luma_qtable[shift];
+          q = quantizers[component] * squeeze_quality_factor *
+              squeeze_luma_factor * squeeze_luma_qtable[shift];
         }
       }
       if (q < 1) q = 1;
@@ -1090,24 +1095,21 @@ Status ModularFrameEncoder::PrepareEncoding(const FrameHeader& frame_header,
   JXL_ASSERT(tree_.size() == decoded_tree.size());
   tree_ = std::move(decoded_tree);
 
-  if (WantDebugOutput(aux_out)) {
+  /* TODO(szabadka) Add text output callback to cparams
+  if (kPrintTree && WantDebugOutput(aux_out)) {
     if (frame_header.dc_level > 0) {
       PrintTree(tree_, aux_out->debug_prefix + "/dc_frame_level" +
                            std::to_string(frame_header.dc_level) + "_tree");
     } else {
       PrintTree(tree_, aux_out->debug_prefix + "/global_tree");
     }
-  }
+  } */
 
   image_widths_.resize(num_streams);
   JXL_RETURN_IF_ERROR(RunOnPool(
       pool, 0, num_streams, ThreadPool::NoInit,
       [&](const uint32_t stream_id, size_t /* thread */) {
         AuxOut my_aux_out;
-        if (aux_out) {
-          my_aux_out.dump_image = aux_out->dump_image;
-          my_aux_out.debug_prefix = aux_out->debug_prefix;
-        }
         tokens_[stream_id].clear();
         JXL_CHECK(ModularGenericCompress(
             stream_images_[stream_id], stream_options_[stream_id],
@@ -1128,11 +1130,11 @@ Status ModularFrameEncoder::EncodeGlobalInfo(BitWriter* writer,
   // If we are using brotli, or not using modular mode.
   if (tree_tokens_.empty() || tree_tokens_[0].empty()) {
     writer->Write(1, 0);
-    ReclaimAndCharge(writer, &allotment, kLayerModularTree, aux_out);
+    allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
     return true;
   }
   writer->Write(1, 1);
-  ReclaimAndCharge(writer, &allotment, kLayerModularTree, aux_out);
+  allotment.ReclaimAndCharge(writer, kLayerModularTree, aux_out);
 
   // Write tree
   HistogramParams params;
@@ -1369,7 +1371,7 @@ Status ModularFrameEncoder::PrepareStreamParams(const Rect& rect,
       for (size_t i = 0; i < nb_channels; i++) {
         int32_t min, max;
         compute_minmax(gi.channel[gi.nb_meta_channels + i], &min, &max);
-        int colors = max - min + 1;
+        int64_t colors = (int64_t)max - min + 1;
         JXL_DEBUG_V(10, "Channel %" PRIuS ": range=%i..%i", i, min, max);
         Transform maybe_palette_1(TransformId::kPalette);
         maybe_palette_1.begin_c = i + gi.nb_meta_channels;
@@ -1415,6 +1417,7 @@ Status ModularFrameEncoder::PrepareStreamParams(const Rect& rect,
       case SpeedTier::kKitten:
         nb_rcts_to_try = 9;
         break;
+      case SpeedTier::kGlacier:
       case SpeedTier::kTortoise:
         nb_rcts_to_try = 19;
         break;
index 02477ee..2af66e9 100644 (file)
@@ -6,8 +6,6 @@
 #ifndef LIB_JXL_ENC_MODULAR_H_
 #define LIB_JXL_ENC_MODULAR_H_
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_modular.h"
 #include "lib/jxl/enc_bit_writer.h"
@@ -21,6 +19,8 @@
 
 namespace jxl {
 
+struct AuxOut;
+
 class ModularFrameEncoder {
  public:
   ModularFrameEncoder(const FrameHeader& frame_header,
index 3628761..a12a9e6 100644 (file)
@@ -6,7 +6,6 @@
 #include "lib/jxl/enc_noise.h"
 
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 
 #include <algorithm>
@@ -16,9 +15,9 @@
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/convolve.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_optimize.h"
 #include "lib/jxl/image_ops.h"
-#include "lib/jxl/opsin_params.h"
-#include "lib/jxl/optimize.h"
 
 namespace jxl {
 namespace {
@@ -367,7 +366,7 @@ void EncodeNoise(const NoiseParams& noise_params, BitWriter* writer,
   for (float i : noise_params.lut) {
     EncodeFloatParam(i, kNoisePrecision, writer);
   }
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
 }
 
 }  // namespace jxl
index 15fb07a..851fdd1 100644 (file)
@@ -10,7 +10,6 @@
 
 #include <stddef.h>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/image.h"
@@ -18,6 +17,8 @@
 
 namespace jxl {
 
+struct AuxOut;
+
 // Get parameters of the noise for NoiseParams model
 // Returns whether a valid noise model (with HasAny()) is set.
 Status GetNoiseParameter(const Image3F& opsin, NoiseParams* noise_params,
similarity index 99%
rename from lib/jxl/optimize.cc
rename to lib/jxl/enc_optimize.cc
index 0816596..6865ff6 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/optimize.h"
+#include "lib/jxl/enc_optimize.h"
 
 #include <algorithm>
 
similarity index 99%
rename from lib/jxl/optimize.h
rename to lib/jxl/enc_optimize.h
index 0a60198..9da523f 100644 (file)
@@ -8,8 +8,6 @@
 #ifndef LIB_JXL_OPTIMIZE_H_
 #define LIB_JXL_OPTIMIZE_H_
 
-#include <stdio.h>
-
 #include <cmath>
 #include <cstdio>
 #include <functional>
similarity index 97%
rename from lib/jxl/optimize_test.cc
rename to lib/jxl/enc_optimize_test.cc
index c606a03..cc65bf1 100644 (file)
@@ -3,11 +3,9 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/optimize.h"
+#include "lib/jxl/enc_optimize.h"
 
-#include <stdio.h>
-
-#include "gtest/gtest.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace optimize {
index 2e16fae..bce640b 100644 (file)
@@ -8,6 +8,7 @@
 
 // Parameters and flags that govern JXL compression.
 
+#include <jxl/encode.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -22,6 +23,9 @@
 namespace jxl {
 
 enum class SpeedTier {
+  // Try multiple combinations of Tortoise flags for modular mode. Otherwise
+  // like kTortoise.
+  kGlacier = 0,
   // Turns on FindBestQuantizationHQ loop. Equivalent to "guetzli" mode.
   kTortoise = 1,
   // Turns on FindBestQuantization butteraugli loop.
@@ -48,79 +52,13 @@ enum class SpeedTier {
   kLightning = 9
 };
 
-inline bool ParseSpeedTier(const std::string& s, SpeedTier* out) {
-  if (s == "lightning") {
-    *out = SpeedTier::kLightning;
-    return true;
-  } else if (s == "thunder") {
-    *out = SpeedTier::kThunder;
-    return true;
-  } else if (s == "falcon") {
-    *out = SpeedTier::kFalcon;
-    return true;
-  } else if (s == "cheetah") {
-    *out = SpeedTier::kCheetah;
-    return true;
-  } else if (s == "hare") {
-    *out = SpeedTier::kHare;
-    return true;
-  } else if (s == "fast" || s == "wombat") {
-    *out = SpeedTier::kWombat;
-    return true;
-  } else if (s == "squirrel") {
-    *out = SpeedTier::kSquirrel;
-    return true;
-  } else if (s == "kitten") {
-    *out = SpeedTier::kKitten;
-    return true;
-  } else if (s == "guetzli" || s == "tortoise") {
-    *out = SpeedTier::kTortoise;
-    return true;
-  }
-  size_t st = 10 - static_cast<size_t>(strtoull(s.c_str(), nullptr, 0));
-  if (st <= static_cast<size_t>(SpeedTier::kLightning) &&
-      st >= static_cast<size_t>(SpeedTier::kTortoise)) {
-    *out = SpeedTier(st);
-    return true;
-  }
-  return false;
-}
-
-inline const char* SpeedTierName(SpeedTier speed_tier) {
-  switch (speed_tier) {
-    case SpeedTier::kLightning:
-      return "lightning";
-    case SpeedTier::kThunder:
-      return "thunder";
-    case SpeedTier::kFalcon:
-      return "falcon";
-    case SpeedTier::kCheetah:
-      return "cheetah";
-    case SpeedTier::kHare:
-      return "hare";
-    case SpeedTier::kWombat:
-      return "wombat";
-    case SpeedTier::kSquirrel:
-      return "squirrel";
-    case SpeedTier::kKitten:
-      return "kitten";
-    case SpeedTier::kTortoise:
-      return "tortoise";
-  }
-  return "INVALID";
-}
-
 // NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding)
 struct CompressParams {
   float butteraugli_distance = 1.0f;
-  size_t target_size = 0;
-  float target_bitrate = 0.0f;
 
-  // 0.0 means search for the adaptive quantization map that matches the
-  // butteraugli distance, positive values mean quantize everywhere with that
-  // value.
-  float uniform_quant = 0.0f;
-  float quant_border_bias = 0.0f;
+  // explicit distances for extra channels (defaults to butteraugli_distance
+  // when not set; value of -1 can be used to represent 'default')
+  std::vector<float> ec_distance;
 
   // Try to achieve a maximum pixel-by-pixel error on each channel.
   bool max_error_mode = false;
@@ -132,21 +70,16 @@ struct CompressParams {
   // 0 = default.
   // 1 = slightly worse quality.
   // 4 = fastest speed, lowest quality
-  // TODO(veluca): hook this up to the C API.
   size_t decoding_speed_tier = 0;
 
-  int max_butteraugli_iters = 4;
-
-  int max_butteraugli_iters_guetzli_mode = 100;
-
   ColorTransform color_transform = ColorTransform::kXYB;
-  YCbCrChromaSubsampling chroma_subsampling;
 
   // If true, the "modular mode options" members below are used.
   bool modular_mode = false;
 
-  // Change group size in modular mode (0=128, 1=256, 2=512, 3=1024).
-  size_t modular_group_size_shift = 1;
+  // Change group size in modular mode (0=128, 1=256, 2=512, 3=1024, -1=encoder
+  // chooses).
+  int modular_group_size_shift = -1;
 
   Override preview = Override::kDefault;
   Override noise = Override::kDefault;
@@ -174,43 +107,26 @@ struct CompressParams {
   // Default: on for lossless, off for lossy
   Override keep_invisible = Override::kDefault;
 
-  // Progressive-mode saliency.
-  //
-  // How many progressive saliency-encoding steps to perform.
-  // - 1: Encode only DC and lowest-frequency AC. Does not need a saliency-map.
-  // - 2: Encode only DC+LF, dropping all HF AC data.
-  //      Does not need a saliency-map.
-  // - 3: Encode DC+LF+{salient HF}, dropping all non-salient HF data.
-  // - 4: Encode DC+LF+{salient HF}+{other HF}.
-  // - 5: Encode DC+LF+{quantized HF}+{low HF bits}.
-  size_t saliency_num_progressive_steps = 3;
-  // Every saliency-heatmap cell with saliency >= threshold will be considered
-  // as 'salient'. The default value of 0.0 will consider every AC-block
-  // as salient, hence not require a saliency-map, and not actually generate
-  // a 4th progressive step.
-  float saliency_threshold = 0.0f;
-  // Saliency-map (owned by caller).
-  ImageF* saliency_map = nullptr;
-
-  // Input and output file name. Will be used to provide pluggable saliency
-  // extractor with paths.
-  const char* file_in = nullptr;
-  const char* file_out = nullptr;
-
-  // Currently unused as of 2020-01.
-  bool clear_metadata = false;
-
-  // Prints extra information during/after encoding.
-  bool verbose = false;
-  bool log_search_state = false;
-
-  ButteraugliParams ba_params;
+  JxlCmsInterface cms;
+  bool cms_set = false;
+  void SetCms(const JxlCmsInterface& cms) {
+    this->cms = cms;
+    cms_set = true;
+  }
 
   // Force usage of CfL when doing JPEG recompression. This can have unexpected
   // effects on the decoded pixels, while still being JPEG-compliant and
   // allowing reconstruction of the original JPEG.
   bool force_cfl_jpeg_recompression = true;
 
+  // Use brotli compression for any boxes derived from a JPEG frame.
+  bool jpeg_compress_boxes = true;
+
+  // Preserve this metadata when doing JPEG recompression.
+  bool jpeg_keep_exif = true;
+  bool jpeg_keep_xmp = true;
+  bool jpeg_keep_jumbf = true;
+
   // Set the noise to what it would approximately be if shooting at the nominal
   // exposure for a given ISO setting on a 35mm camera.
   float photon_noise_iso = 0;
@@ -229,17 +145,32 @@ struct CompressParams {
   bool lossy_palette = false;
 
   // Returns whether these params are lossless as defined by SetLossless();
-  bool IsLossless() const {
-    // YCbCr is also considered lossless here since it's intended for
-    // source material that is already YCbCr (we don't do the fwd transform)
-    return modular_mode && butteraugli_distance == 0.0f &&
-           color_transform != jxl::ColorTransform::kXYB;
+  bool IsLossless() const { return modular_mode && ModularPartIsLossless(); }
+
+  bool ModularPartIsLossless() const {
+    if (modular_mode) {
+      // YCbCr is also considered lossless here since it's intended for
+      // source material that is already YCbCr (we don't do the fwd transform)
+      if (butteraugli_distance != 0 ||
+          color_transform == jxl::ColorTransform::kXYB)
+        return false;
+    }
+    for (float f : ec_distance) {
+      if (f > 0) return false;
+      if (f < 0 && butteraugli_distance != 0) return false;
+    }
+    // if no explicit ec_distance given, and using vardct, then the modular part
+    // is empty or not lossless
+    if (!modular_mode && ec_distance.empty()) return false;
+    // all modular channels are encoded at distance 0
+    return true;
   }
 
   // Sets the parameters required to make the codec lossless.
   void SetLossless() {
     modular_mode = true;
     butteraugli_distance = 0.0f;
+    for (float &f : ec_distance) f = 0.0f;
     color_transform = jxl::ColorTransform::kNone;
   }
 
@@ -264,18 +195,20 @@ struct CompressParams {
 
   std::vector<float> manual_noise;
   std::vector<float> manual_xyb_factors;
+
+  JxlDebugImageCallback debug_image = nullptr;
+  void* debug_image_opaque;
 };
 
 static constexpr float kMinButteraugliForDynamicAR = 0.5f;
 static constexpr float kMinButteraugliForDots = 3.0f;
 static constexpr float kMinButteraugliToSubtractOriginalPatches = 3.0f;
-static constexpr float kMinButteraugliDistanceForProgressiveDc = 4.5f;
 
 // Always off
 static constexpr float kMinButteraugliForNoise = 99.0f;
 
 // Minimum butteraugli distance the encoder accepts.
-static constexpr float kMinButteraugliDistance = 0.01f;
+static constexpr float kMinButteraugliDistance = 0.001f;
 
 // Tile size for encoder-side processing. Must be equal to color tile dim in the
 // current implementation.
index ff57ff0..d26a86d 100644 (file)
 #include <vector>
 
 #include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/override.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/dec_frame.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_debug_image.h"
 #include "lib/jxl/enc_dot_dictionary.h"
 #include "lib/jxl/enc_frame.h"
 #include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
+#include "lib/jxl/pack_signed.h"
 #include "lib/jxl/patch_dictionary_internal.h"
 
 namespace jxl {
 
+static constexpr size_t kPatchFrameReferenceId = 3;
+
 // static
 void PatchDictionaryEncoder::Encode(const PatchDictionary& pdic,
                                     BitWriter* writer, size_t layer,
@@ -135,13 +139,13 @@ void PatchDictionaryEncoder::SubtractFrom(const PatchDictionary& pdic,
       size_t iy = y - by;
       size_t ref = ref_pos.ref;
       const float* JXL_RESTRICT ref_rows[3] = {
-          pdic.shared_->reference_frames[ref].frame->color()->ConstPlaneRow(
+          pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow(
               0, ref_pos.y0 + iy) +
               ref_pos.x0,
-          pdic.shared_->reference_frames[ref].frame->color()->ConstPlaneRow(
+          pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow(
               1, ref_pos.y0 + iy) +
               ref_pos.x0,
-          pdic.shared_->reference_frames[ref].frame->color()->ConstPlaneRow(
+          pdic.shared_->reference_frames[ref].frame.color().ConstPlaneRow(
               2, ref_pos.y0 + iy) +
               ref_pos.x0,
       };
@@ -154,7 +158,8 @@ void PatchDictionaryEncoder::SubtractFrom(const PatchDictionary& pdic,
           } else if (mode == PatchBlendMode::kNone) {
             // Nothing to do.
           } else {
-            JXL_ABORT("Blending mode %u not yet implemented", (uint32_t)mode);
+            JXL_UNREACHABLE("Blending mode %u not yet implemented",
+                            (uint32_t)mode);
           }
         }
       }
@@ -204,9 +209,11 @@ struct PatchColorspaceInfo {
 };
 
 std::vector<PatchInfo> FindTextLikePatches(
-    const Image3F& opsin, const PassesEncoderState* JXL_RESTRICT state,
-    ThreadPool* pool, AuxOut* aux_out, bool is_xyb) {
+    const CompressParams& cparams, const Image3F& opsin,
+    const PassesEncoderState* JXL_RESTRICT state, ThreadPool* pool,
+    AuxOut* aux_out, bool is_xyb) {
   if (state->cparams.patches == Override::kOff) return {};
+  const auto& frame_dim = state->shared.frame_dim;
 
   PatchColorspaceInfo pci(is_xyb);
   float kSimilarThreshold = 0.8f;
@@ -251,13 +258,13 @@ std::vector<PatchInfo> FindTextLikePatches(
 
   // Look for kPatchSide size squares, naturally aligned, that all have the same
   // pixel values.
-  ImageB is_screenshot_like(DivCeil(opsin.xsize(), kPatchSide),
-                            DivCeil(opsin.ysize(), kPatchSide));
+  ImageB is_screenshot_like(DivCeil(frame_dim.xsize, kPatchSide),
+                            DivCeil(frame_dim.ysize, kPatchSide));
   ZeroFillImage(&is_screenshot_like);
   uint8_t* JXL_RESTRICT screenshot_row = is_screenshot_like.Row(0);
   const size_t screenshot_stride = is_screenshot_like.PixelsPerRow();
   const auto process_row = [&](const uint32_t y, size_t /* thread */) {
-    for (uint64_t x = 0; x < opsin.xsize() / kPatchSide; x++) {
+    for (uint64_t x = 0; x < frame_dim.xsize / kPatchSide; x++) {
       bool all_same = true;
       for (size_t iy = 0; iy < static_cast<size_t>(kPatchSide); iy++) {
         for (size_t ix = 0; ix < static_cast<size_t>(kPatchSide); ix++) {
@@ -276,8 +283,8 @@ std::vector<PatchInfo> FindTextLikePatches(
         for (int64_t ix = -kExtraSide; ix < kExtraSide + kPatchSide; ix++) {
           int64_t cx = x * kPatchSide + ix;
           int64_t cy = y * kPatchSide + iy;
-          if (cx < 0 || static_cast<uint64_t>(cx) >= opsin.xsize() ||  //
-              cy < 0 || static_cast<uint64_t>(cy) >= opsin.ysize()) {
+          if (cx < 0 || static_cast<uint64_t>(cx) >= frame_dim.xsize ||  //
+              cy < 0 || static_cast<uint64_t>(cy) >= frame_dim.ysize) {
             continue;
           }
           num++;
@@ -290,12 +297,12 @@ std::vector<PatchInfo> FindTextLikePatches(
       has_screenshot_areas = true;
     }
   };
-  JXL_CHECK(RunOnPool(pool, 0, opsin.ysize() / kPatchSide, ThreadPool::NoInit,
+  JXL_CHECK(RunOnPool(pool, 0, frame_dim.ysize / kPatchSide, ThreadPool::NoInit,
                       process_row, "IsScreenshotLike"));
 
   // TODO(veluca): also parallelize the rest of this function.
-  if (WantDebugOutput(aux_out)) {
-    aux_out->DumpPlaneNormalized("screenshot_like", is_screenshot_like);
+  if (WantDebugOutput(cparams)) {
+    DumpPlaneNormalized(cparams, "screenshot_like", is_screenshot_like);
   }
 
   constexpr int kSearchRadius = 1;
@@ -305,9 +312,9 @@ std::vector<PatchInfo> FindTextLikePatches(
   }
 
   // Search for "similar enough" pixels near the screenshot-like areas.
-  ImageB is_background(opsin.xsize(), opsin.ysize());
+  ImageB is_background(frame_dim.xsize, frame_dim.ysize);
   ZeroFillImage(&is_background);
-  Image3F background(opsin.xsize(), opsin.ysize());
+  Image3F background(frame_dim.xsize, frame_dim.ysize);
   ZeroFillImage(&background);
   constexpr size_t kDistanceLimit = 50;
   float* JXL_RESTRICT background_rows[3] = {
@@ -322,8 +329,8 @@ std::vector<PatchInfo> FindTextLikePatches(
       std::pair<std::pair<uint32_t, uint32_t>, std::pair<uint32_t, uint32_t>>>
       queue;
   size_t queue_front = 0;
-  for (size_t y = 0; y < opsin.ysize(); y++) {
-    for (size_t x = 0; x < opsin.xsize(); x++) {
+  for (size_t y = 0; y < frame_dim.ysize; y++) {
+    for (size_t x = 0; x < frame_dim.xsize; x++) {
       if (!screenshot_row[screenshot_stride * (y / kPatchSide) +
                           (x / kPatchSide)])
         continue;
@@ -347,8 +354,8 @@ std::vector<PatchInfo> FindTextLikePatches(
         int next_first = cur.first + dx;
         int next_second = cur.second + dy;
         if (next_first < 0 || next_second < 0 ||
-            static_cast<uint32_t>(next_first) >= opsin.xsize() ||
-            static_cast<uint32_t>(next_second) >= opsin.ysize()) {
+            static_cast<uint32_t>(next_first) >= frame_dim.xsize ||
+            static_cast<uint32_t>(next_second) >= frame_dim.ysize) {
           continue;
         }
         if (static_cast<uint32_t>(
@@ -375,14 +382,14 @@ std::vector<PatchInfo> FindTextLikePatches(
   ImageF ccs;
   Rng rng(0);
   bool paint_ccs = false;
-  if (WantDebugOutput(aux_out)) {
-    aux_out->DumpPlaneNormalized("is_background", is_background);
+  if (WantDebugOutput(cparams)) {
+    DumpPlaneNormalized(cparams, "is_background", is_background);
     if (is_xyb) {
-      aux_out->DumpXybImage("background", background);
+      DumpXybImage(cparams, "background", background);
     } else {
-      aux_out->DumpImage("background", background);
+      DumpImage(cparams, "background", background);
     }
-    ccs = ImageF(opsin.xsize(), opsin.ysize());
+    ccs = ImageF(frame_dim.xsize, frame_dim.ysize);
     ZeroFillImage(&ccs);
     paint_ccs = true;
   }
@@ -404,14 +411,14 @@ std::vector<PatchInfo> FindTextLikePatches(
 
   // Find small CC outside the "similar enough" areas, compute bounding boxes,
   // and run heuristics to exclude some patches.
-  ImageB visited(opsin.xsize(), opsin.ysize());
+  ImageB visited(frame_dim.xsize, frame_dim.ysize);
   ZeroFillImage(&visited);
   uint8_t* JXL_RESTRICT visited_row = visited.Row(0);
   const size_t visited_stride = visited.PixelsPerRow();
   std::vector<std::pair<uint32_t, uint32_t>> cc;
   std::vector<std::pair<uint32_t, uint32_t>> stack;
-  for (size_t y = 0; y < opsin.ysize(); y++) {
-    for (size_t x = 0; x < opsin.xsize(); x++) {
+  for (size_t y = 0; y < frame_dim.ysize; y++) {
+    for (size_t x = 0; x < frame_dim.xsize; x++) {
       if (is_background_row[y * is_background_stride + x]) continue;
       cc.clear();
       stack.clear();
@@ -441,8 +448,8 @@ std::vector<PatchInfo> FindTextLikePatches(
             int next_first = static_cast<int32_t>(cur.first) + dx;
             int next_second = static_cast<int32_t>(cur.second) + dy;
             if (next_first < 0 || next_second < 0 ||
-                static_cast<uint32_t>(next_first) >= opsin.xsize() ||
-                static_cast<uint32_t>(next_second) >= opsin.ysize()) {
+                static_cast<uint32_t>(next_first) >= frame_dim.xsize ||
+                static_cast<uint32_t>(next_second) >= frame_dim.ysize) {
               continue;
             }
             std::pair<uint32_t, uint32_t> next{next_first, next_second};
@@ -470,10 +477,11 @@ std::vector<PatchInfo> FindTextLikePatches(
       bool has_similar = false;
       for (size_t iy = std::max<int>(
                static_cast<int32_t>(min_y) - kHasSimilarRadius, 0);
-           iy < std::min(max_y + kHasSimilarRadius + 1, opsin.ysize()); iy++) {
+           iy < std::min(max_y + kHasSimilarRadius + 1, frame_dim.ysize);
+           iy++) {
         for (size_t ix = std::max<int>(
                  static_cast<int32_t>(min_x) - kHasSimilarRadius, 0);
-             ix < std::min(max_x + kHasSimilarRadius + 1, opsin.xsize());
+             ix < std::min(max_x + kHasSimilarRadius + 1, frame_dim.xsize);
              ix++) {
           size_t opos = opsin_stride * iy + ix;
           float px[3] = {opsin_rows[0][opos], opsin_rows[1][opos],
@@ -516,15 +524,15 @@ std::vector<PatchInfo> FindTextLikePatches(
   }
 
   if (paint_ccs) {
-    JXL_ASSERT(WantDebugOutput(aux_out));
-    aux_out->DumpPlaneNormalized("ccs", ccs);
+    JXL_ASSERT(WantDebugOutput(cparams));
+    DumpPlaneNormalized(cparams, "ccs", ccs);
   }
   if (info.empty()) {
     return {};
   }
 
   // Remove duplicates.
-  constexpr size_t kMinPatchOccurences = 2;
+  constexpr size_t kMinPatchOccurrences = 2;
   std::sort(info.begin(), info.end());
   size_t unique = 0;
   for (size_t i = 1; i < info.size(); i++) {
@@ -532,13 +540,13 @@ std::vector<PatchInfo> FindTextLikePatches(
       info[unique].second.insert(info[unique].second.end(),
                                  info[i].second.begin(), info[i].second.end());
     } else {
-      if (info[unique].second.size() >= kMinPatchOccurences) {
+      if (info[unique].second.size() >= kMinPatchOccurrences) {
         unique++;
       }
       info[unique] = info[i];
     }
   }
-  if (info[unique].second.size() >= kMinPatchOccurences) {
+  if (info[unique].second.size() >= kMinPatchOccurrences) {
     unique++;
   }
   info.resize(unique);
@@ -564,7 +572,7 @@ void FindBestPatchDictionary(const Image3F& opsin,
                              const JxlCmsInterface& cms, ThreadPool* pool,
                              AuxOut* aux_out, bool is_xyb) {
   std::vector<PatchInfo> info =
-      FindTextLikePatches(opsin, state, pool, aux_out, is_xyb);
+      FindTextLikePatches(state->cparams, opsin, state, pool, aux_out, is_xyb);
 
   // TODO(veluca): this doesn't work if both dots and patches are enabled.
   // For now, since dots and patches are not likely to occur in the same kind of
@@ -692,7 +700,7 @@ void FindBestPatchDictionary(const Image3F& opsin,
     ref_pos.ysize = info[i].first.ysize;
     ref_pos.x0 = ref_positions[i].first;
     ref_pos.y0 = ref_positions[i].second;
-    ref_pos.ref = 0;
+    ref_pos.ref = kPatchFrameReferenceId;
     for (size_t y = 0; y < ref_pos.ysize; y++) {
       for (size_t x = 0; x < ref_pos.xsize; x++) {
         for (size_t c = 0; c < 3; c++) {
@@ -717,8 +725,8 @@ void FindBestPatchDictionary(const Image3F& opsin,
   // Recursive application of patches could create very weird issues.
   cparams.patches = Override::kOff;
 
-  RoundtripPatchFrame(&reference_frame, state, 0, cparams, cms, pool, aux_out,
-                      /*subtract=*/true);
+  RoundtripPatchFrame(&reference_frame, state, kPatchFrameReferenceId, cparams,
+                      cms, pool, aux_out, /*subtract=*/true);
 
   // TODO(veluca): this assumes that applying patches is commutative, which is
   // not true for all blending modes. This code only produces kAdd patches, so
@@ -754,9 +762,9 @@ void RoundtripPatchFrame(Image3F* reference_frame,
   ib.SetFromImage(std::move(*reference_frame),
                   state->shared.metadata->m.color_encoding);
   if (!ib.metadata()->extra_channel_info.empty()) {
-    // Add dummy extra channels to the patch image: patch encoding does not yet
-    // support extra channels, but the codec expects that the amount of extra
-    // channels in frames matches that in the metadata of the codestream.
+    // Add placeholder extra channels to the patch image: patch encoding does
+    // not yet support extra channels, but the codec expects that the amount of
+    // extra channels in frames matches that in the metadata of the codestream.
     std::vector<ImageF> extra_channels;
     extra_channels.reserve(ib.metadata()->extra_channel_info.size());
     for (size_t i = 0; i < ib.metadata()->extra_channel_info.size(); i++) {
@@ -793,7 +801,7 @@ void RoundtripPatchFrame(Image3F* reference_frame,
     frame_start += decoded.decoded_bytes();
     encoded_size -= decoded.decoded_bytes();
     size_t ref_xsize =
-        dec_state.shared_storage.reference_frames[idx].storage.color()->xsize();
+        dec_state.shared_storage.reference_frames[idx].frame.color()->xsize();
     // if the frame itself uses patches, we need to decode another frame
     if (!ref_xsize) {
       JXL_CHECK(DecodeFrame(&dec_state, pool, frame_start, encoded_size,
@@ -803,10 +811,8 @@ void RoundtripPatchFrame(Image3F* reference_frame,
     state->shared.reference_frames[idx] =
         std::move(dec_state.shared_storage.reference_frames[idx]);
   } else {
-    state->shared.reference_frames[idx].storage = std::move(ib);
+    state->shared.reference_frames[idx].frame = std::move(ib);
   }
-  state->shared.reference_frames[idx].frame =
-      &state->shared.reference_frames[idx].storage;
 }
 
 }  // namespace jxl
index 090827f..e17bfe4 100644 (file)
 #include <tuple>
 #include <vector>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_patch_dictionary.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/opsin_params.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 constexpr size_t kMaxPatchSize = 32;
 
 struct QuantizedPatch {
index 3786ef5..1933435 100644 (file)
@@ -5,6 +5,10 @@
 
 #include "lib/jxl/enc_photon_noise.h"
 
+#include <algorithm>
+
+#include "lib/jxl/cms/opsin_params.h"
+
 namespace jxl {
 
 namespace {
@@ -38,7 +42,8 @@ inline constexpr T Cube(const T x) {
 
 NoiseParams SimulatePhotonNoise(const size_t xsize, const size_t ysize,
                                 const float iso) {
-  const float kOpsinAbsorbanceBiasCbrt = std::cbrt(kOpsinAbsorbanceBias[1]);
+  const float kOpsinAbsorbanceBiasCbrt =
+      std::cbrt(jxl::cms::kOpsinAbsorbanceBias[1]);
 
   // Focal plane exposure for 18% of kDefaultIntensityTarget, in lx·s.
   // (ISO = 10 lx·s ÷ H)
@@ -57,8 +62,8 @@ NoiseParams SimulatePhotonNoise(const size_t xsize, const size_t ysize,
     // scaled_index is used for XYB = (0, 2·scaled_index, 2·scaled_index)
     const float y = 2 * scaled_index;
     // 1 = default intensity target
-    const float linear = std::max(
-        0.f, Cube(y - kOpsinAbsorbanceBiasCbrt) + kOpsinAbsorbanceBias[1]);
+    const float linear = std::max(0.f, Cube(y - kOpsinAbsorbanceBiasCbrt) +
+                                           jxl::cms::kOpsinAbsorbanceBias[1]);
     const float electrons_per_pixel = electrons_per_pixel_18 * (linear / 0.18f);
     // Quadrature sum of read noise, photon shot noise (sqrt(S) so simply not
     // squared here) and photo response non-uniformity.
@@ -69,7 +74,8 @@ NoiseParams SimulatePhotonNoise(const size_t xsize, const size_t ysize,
                   Square(kPhotoResponseNonUniformity * electrons_per_pixel));
     const float linear_noise = noise * (0.18f / electrons_per_pixel_18);
     const float opsin_derivative =
-        (1.f / 3) / Square(std::cbrt(linear - kOpsinAbsorbanceBias[1]));
+        (1.f / 3) /
+        Square(std::cbrt(linear - jxl::cms::kOpsinAbsorbanceBias[1]));
     const float opsin_noise = linear_noise * opsin_derivative;
 
     // TODO(sboukortt): verify more thoroughly whether the denominator is
index 8370725..be11b46 100644 (file)
@@ -6,6 +6,7 @@
 #include "lib/jxl/enc_photon_noise.h"
 
 #include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
diff --git a/lib/jxl/enc_progressive_split.cc b/lib/jxl/enc_progressive_split.cc
new file mode 100644 (file)
index 0000000..811c945
--- /dev/null
@@ -0,0 +1,82 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/enc_progressive_split.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <memory>
+
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
+#include "lib/jxl/image.h"
+
+namespace jxl {
+
+template <typename T>
+void ProgressiveSplitter::SplitACCoefficients(
+    const T* JXL_RESTRICT block, const AcStrategy& acs, size_t bx, size_t by,
+    T* JXL_RESTRICT output[kMaxNumPasses]) {
+  size_t size = acs.covered_blocks_x() * acs.covered_blocks_y() * kDCTBlockSize;
+  auto shift_right_round0 = [&](T v, int shift) {
+    T one_if_negative = static_cast<uint32_t>(v) >> 31;
+    T add = (one_if_negative << shift) - one_if_negative;
+    return (v + add) >> shift;
+  };
+  // Early quit for the simple case of only one pass.
+  if (mode_.num_passes == 1) {
+    memcpy(output[0], block, sizeof(T) * size);
+    return;
+  }
+  size_t ncoeffs_all_done_from_earlier_passes = 1;
+
+  int previous_pass_shift = 0;
+  for (size_t num_pass = 0; num_pass < mode_.num_passes; num_pass++) {  // pass
+    // Zero out output block.
+    memset(output[num_pass], 0, size * sizeof(T));
+    const int pass_shift = mode_.passes[num_pass].shift;
+    size_t frame_ncoeffs = mode_.passes[num_pass].num_coefficients;
+    size_t xsize = acs.covered_blocks_x();
+    size_t ysize = acs.covered_blocks_y();
+    CoefficientLayout(&ysize, &xsize);
+    for (size_t y = 0; y < ysize * frame_ncoeffs; y++) {    // superblk-y
+      for (size_t x = 0; x < xsize * frame_ncoeffs; x++) {  // superblk-x
+        size_t pos = y * xsize * kBlockDim + x;
+        if (x < xsize * ncoeffs_all_done_from_earlier_passes &&
+            y < ysize * ncoeffs_all_done_from_earlier_passes) {
+          // This coefficient was already included in an earlier pass,
+          // which included a genuinely smaller set of coefficients.
+          continue;
+        }
+        T v = block[pos];
+        // Previous pass discarded some bits: do not encode them again.
+        if (previous_pass_shift != 0) {
+          T previous_v = shift_right_round0(v, previous_pass_shift) *
+                         (1 << previous_pass_shift);
+          v -= previous_v;
+        }
+        output[num_pass][pos] = shift_right_round0(v, pass_shift);
+      }  // superblk-x
+    }    // superblk-y
+    // We just finished a pass.
+    // Hence, we are now guaranteed to have included all coeffs up to
+    // frame_ncoeffs in every block, unless the current pass is shifted.
+    if (mode_.passes[num_pass].shift == 0) {
+      ncoeffs_all_done_from_earlier_passes = frame_ncoeffs;
+    }
+    previous_pass_shift = mode_.passes[num_pass].shift;
+  }  // num_pass
+}
+
+template void ProgressiveSplitter::SplitACCoefficients<int32_t>(
+    const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int32_t* JXL_RESTRICT[kMaxNumPasses]);
+
+template void ProgressiveSplitter::SplitACCoefficients<int16_t>(
+    const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int16_t* JXL_RESTRICT[kMaxNumPasses]);
+
+}  // namespace jxl
similarity index 73%
rename from lib/jxl/progressive_split.h
rename to lib/jxl/enc_progressive_split.h
index aeae980..06584fa 100644 (file)
@@ -17,8 +17,7 @@
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
@@ -41,10 +40,6 @@ struct PassDefinition {
   // How much to shift the encoded values by, with rounding.
   size_t shift;
 
-  // Whether or not we should include only salient blocks.
-  // TODO(veluca): ignored for now.
-  bool salient_only;
-
   // If specified, this indicates that if the requested downsampling factor is
   // sufficiently high, then it is fine to stop decoding after this pass.
   // By default, passes are not marked as being suitable for any downsampling.
@@ -53,9 +48,9 @@ struct PassDefinition {
 
 struct ProgressiveMode {
   size_t num_passes = 1;
-  PassDefinition passes[kMaxNumPasses] = {PassDefinition{
-      /*num_coefficients=*/8, /*shift=*/0, /*salient_only=*/false,
-      /*suitable_for_downsampling_of_at_least=*/1}};
+  PassDefinition passes[kMaxNumPasses] = {
+      PassDefinition{/*num_coefficients=*/8, /*shift=*/0,
+                     /*suitable_for_downsampling_of_at_least=*/1}};
 
   ProgressiveMode() = default;
 
@@ -65,19 +60,18 @@ struct ProgressiveMode {
     num_passes = nump;
     PassDefinition previous_pass{
         /*num_coefficients=*/1, /*shift=*/0,
-        /*salient_only=*/false,
         /*suitable_for_downsampling_of_at_least=*/kNoDownsamplingFactor};
     size_t last_downsampling_factor = kNoDownsamplingFactor;
     for (size_t i = 0; i < nump; i++) {
       JXL_ASSERT(p[i].num_coefficients > previous_pass.num_coefficients ||
                  (p[i].num_coefficients == previous_pass.num_coefficients &&
-                  !p[i].salient_only && previous_pass.salient_only) ||
-                 (p[i].num_coefficients == previous_pass.num_coefficients &&
                   p[i].shift < previous_pass.shift));
       JXL_ASSERT(p[i].suitable_for_downsampling_of_at_least ==
                      kNoDownsamplingFactor ||
                  p[i].suitable_for_downsampling_of_at_least <=
                      last_downsampling_factor);
+      // Only used inside assert.
+      (void)last_downsampling_factor;
       if (p[i].suitable_for_downsampling_of_at_least != kNoDownsamplingFactor) {
         last_downsampling_factor = p[i].suitable_for_downsampling_of_at_least;
       }
@@ -90,14 +84,6 @@ class ProgressiveSplitter {
  public:
   void SetProgressiveMode(ProgressiveMode mode) { mode_ = mode; }
 
-  void SetSaliencyMap(const ImageF* saliency_map) {
-    saliency_map_ = saliency_map;
-  }
-
-  void SetSaliencyThreshold(float threshold) {
-    saliency_threshold_ = threshold;
-  }
-
   size_t GetNumPasses() const { return mode_.num_passes; }
 
   void InitPasses(Passes* JXL_RESTRICT passes) const {
@@ -124,28 +110,21 @@ class ProgressiveSplitter {
   }
 
   template <typename T>
-  void SplitACCoefficients(const T* JXL_RESTRICT block, size_t size,
-                           const AcStrategy& acs, size_t bx, size_t by,
-                           size_t offset,
-                           T* JXL_RESTRICT output[kMaxNumPasses][3]);
+  void SplitACCoefficients(const T* JXL_RESTRICT block, const AcStrategy& acs,
+                           size_t bx, size_t by,
+                           T* JXL_RESTRICT output[kMaxNumPasses]);
 
  private:
-  bool SuperblockIsSalient(size_t row_start, size_t col_start, size_t num_rows,
-                           size_t num_cols) const;
   ProgressiveMode mode_;
-
-  // Not owned, must remain valid.
-  const ImageF* saliency_map_ = nullptr;
-  float saliency_threshold_ = 0.0;
 };
 
 extern template void ProgressiveSplitter::SplitACCoefficients<int32_t>(
-    const int32_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t,
-    size_t, int32_t* JXL_RESTRICT[kMaxNumPasses][3]);
+    const int32_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int32_t* JXL_RESTRICT[kMaxNumPasses]);
 
 extern template void ProgressiveSplitter::SplitACCoefficients<int16_t>(
-    const int16_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t,
-    size_t, int16_t* JXL_RESTRICT[kMaxNumPasses][3]);
+    const int16_t* JXL_RESTRICT, const AcStrategy&, size_t, size_t,
+    int16_t* JXL_RESTRICT[kMaxNumPasses]);
 
 }  // namespace jxl
 
index d8a9931..0d3f628 100644 (file)
@@ -5,7 +5,6 @@
 
 #include "lib/jxl/enc_quant_weights.h"
 
-#include <stdio.h>
 #include <stdlib.h>
 
 #include <algorithm>
 #include <limits>
 #include <utility>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_scales.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/fields.h"
@@ -28,6 +26,8 @@
 
 namespace jxl {
 
+struct AuxOut;
+
 namespace {
 
 Status EncodeDctParams(const DctQuantWeightParams& params, BitWriter* writer) {
@@ -138,7 +138,7 @@ Status DequantMatricesEncode(const DequantMatrices* matrices, BitWriter* writer,
           DequantMatrices::required_size_y[i], writer, modular_frame_encoder));
     }
   }
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
   return true;
 }
 
@@ -159,7 +159,7 @@ Status DequantMatricesEncodeDC(const DequantMatrices* matrices,
       JXL_RETURN_IF_ERROR(F16Coder::Write(dc_quant[c] * 128.0f, writer));
     }
   }
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
   return true;
 }
 
@@ -183,6 +183,18 @@ void DequantMatricesScaleDC(DequantMatrices* matrices, const float scale) {
   DequantMatricesSetCustomDC(matrices, dc);
 }
 
+void DequantMatricesRoundtrip(DequantMatrices* matrices) {
+  // Do not pass modular en/decoder, as they only change entropy and not
+  // values.
+  BitWriter writer;
+  JXL_CHECK(DequantMatricesEncode(matrices, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  BitReader br(writer.GetSpan());
+  // Called only in the encoder: should fail only for programmer errors.
+  JXL_CHECK(matrices->Decode(&br));
+  JXL_CHECK(br.Close());
+}
+
 void DequantMatricesSetCustom(DequantMatrices* matrices,
                               const std::vector<QuantEncoding>& encodings,
                               ModularFrameEncoder* encoder) {
@@ -195,16 +207,7 @@ void DequantMatricesSetCustom(DequantMatrices* matrices,
                              encodings[i], i);
     }
   }
-  // Roundtrip encode/decode the matrices to ensure same values as decoder.
-  // Do not pass modular en/decoder, as they only change entropy and not
-  // values.
-  BitWriter writer;
-  JXL_CHECK(DequantMatricesEncode(matrices, &writer, 0, nullptr));
-  writer.ZeroPadToByte();
-  BitReader br(writer.GetSpan());
-  // Called only in the encoder: should fail only for programmer errors.
-  JXL_CHECK(matrices->Decode(&br));
-  JXL_CHECK(br.Close());
+  DequantMatricesRoundtrip(matrices);
 }
 
 }  // namespace jxl
index fe5273c..e0a387f 100644 (file)
@@ -10,6 +10,9 @@
 
 namespace jxl {
 
+struct AuxOut;
+struct BitWriter;
+
 Status DequantMatricesEncode(
     const DequantMatrices* matrices, BitWriter* writer, size_t layer,
     AuxOut* aux_out, ModularFrameEncoder* modular_frame_encoder = nullptr);
@@ -20,12 +23,15 @@ Status DequantMatricesEncodeDC(const DequantMatrices* matrices,
 // precision.
 void DequantMatricesSetCustomDC(DequantMatrices* matrices, const float* dc);
 
-void DequantMatricesScaleDC(DequantMatrices* matrices, const float scale);
+void DequantMatricesScaleDC(DequantMatrices* matrices, float scale);
 
 void DequantMatricesSetCustom(DequantMatrices* matrices,
                               const std::vector<QuantEncoding>& encodings,
                               ModularFrameEncoder* encoder);
 
+// Roundtrip encode/decode the matrices to ensure same values as decoder.
+void DequantMatricesRoundtrip(DequantMatrices* matrices);
+
 }  // namespace jxl
 
 #endif  // LIB_JXL_ENC_QUANT_WEIGHTS_H_
index cdb797d..6c98c8d 100644 (file)
@@ -8,15 +8,16 @@
 #include "lib/jxl/ans_params.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_scales.h"
 #include "lib/jxl/enc_ans.h"
 #include "lib/jxl/entropy_coder.h"
-#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/pack_signed.h"
 #include "lib/jxl/splines.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 class QuantizedSplineEncoder {
  public:
   // Only call if HasAny().
@@ -89,7 +90,7 @@ void EncodeSplines(const Splines& splines, BitWriter* writer,
 }
 
 Splines FindSplines(const Image3F& opsin) {
-  // TODO: implement spline detection.
+  // TODO(user): implement spline detection.
   return {};
 }
 
index 732d77a..3f6ecc7 100644 (file)
@@ -6,31 +6,20 @@
 #ifndef LIB_JXL_ENC_SPLINES_H_
 #define LIB_JXL_ENC_SPLINES_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
-#include <utility>
-#include <vector>
-
-#include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/dec_ans.h"
-#include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_ans.h"
+#include <cstddef>
+
+#include "lib/jxl/enc_ans_params.h"
 #include "lib/jxl/enc_bit_writer.h"
-#include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/splines.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 // Only call if splines.HasAny().
-void EncodeSplines(const Splines& splines, BitWriter* writer,
-                   const size_t layer, const HistogramParams& histogram_params,
-                   AuxOut* aux_out);
+void EncodeSplines(const Splines& splines, BitWriter* writer, size_t layer,
+                   const HistogramParams& histogram_params, AuxOut* aux_out);
 
 Splines FindSplines(const Image3F& opsin);
 
index c877b0c..8ec659d 100644 (file)
@@ -7,10 +7,9 @@
 
 #include <stdint.h>
 
-#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/coeff_order.h"
-#include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_coeff_order.h"
 #include "lib/jxl/field_encodings.h"
 #include "lib/jxl/fields.h"
@@ -39,7 +38,7 @@ Status WriteGroupOffsets(const std::vector<BitWriter>& group_codes,
     JXL_RETURN_IF_ERROR(U32Coder::Write(kTocDist, group_size, writer));
   }
   writer->ZeroPadToByte();  // before first group
-  ReclaimAndCharge(writer, &allotment, kLayerTOC, aux_out);
+  allotment.ReclaimAndCharge(writer, kLayerTOC, aux_out);
   return true;
 }
 
index dc81a5d..242b3ef 100644 (file)
 
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/enc_bit_writer.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 // Writes the group offsets. If the permutation vector is nullptr, the identity
 // permutation will be used.
 Status WriteGroupOffsets(const std::vector<BitWriter>& group_codes,
index ef6dc2b..5b939dc 100644 (file)
@@ -49,7 +49,7 @@ HWY_INLINE void ReinterpretingIDCT(const float* input,
   }
 
   // ROWS, COLS <= 8, so we can put scratch space on the stack.
-  HWY_ALIGN float scratch_space[ROWS * COLS];
+  HWY_ALIGN float scratch_space[ROWS * COLS * 3];
   ComputeScaledIDCT<ROWS, COLS>()(block, DCTTo(output, output_stride),
                                   scratch_space);
 }
@@ -399,7 +399,7 @@ template <size_t afv_kind>
 void AFVTransformFromPixels(const float* JXL_RESTRICT pixels,
                             size_t pixels_stride,
                             float* JXL_RESTRICT coefficients) {
-  HWY_ALIGN float scratch_space[4 * 8 * 2];
+  HWY_ALIGN float scratch_space[4 * 8 * 5];
   size_t afv_x = afv_kind & 1;
   size_t afv_y = afv_kind / 2;
   HWY_ALIGN float block[4 * 8];
@@ -453,7 +453,6 @@ HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy,
   using Type = AcStrategy::Type;
   switch (strategy) {
     case Type::IDENTITY: {
-      PROFILER_ZONE("DCT Identity");
       for (size_t y = 0; y < 2; y++) {
         for (size_t x = 0; x < 2; x++) {
           float block_dc = 0;
@@ -486,7 +485,6 @@ HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT8X4: {
-      PROFILER_ZONE("DCT 8x4");
       for (size_t x = 0; x < 2; x++) {
         HWY_ALIGN float block[4 * 8];
         ComputeScaledDCT<8, 4>()(DCTFrom(pixels + x * 4, pixels_stride), block,
@@ -505,7 +503,6 @@ HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT4X8: {
-      PROFILER_ZONE("DCT 4x8");
       for (size_t y = 0; y < 2; y++) {
         HWY_ALIGN float block[4 * 8];
         ComputeScaledDCT<4, 8>()(
@@ -524,7 +521,6 @@ HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT4X4: {
-      PROFILER_ZONE("DCT 4");
       for (size_t y = 0; y < 2; y++) {
         for (size_t x = 0; x < 2; x++) {
           HWY_ALIGN float block[4 * 4];
@@ -549,142 +545,119 @@ HWY_MAYBE_UNUSED void TransformFromPixels(const AcStrategy::Type strategy,
       break;
     }
     case Type::DCT2X2: {
-      PROFILER_ZONE("DCT 2");
       DCT2TopBlock<8>(pixels, pixels_stride, coefficients);
       DCT2TopBlock<4>(coefficients, kBlockDim, coefficients);
       DCT2TopBlock<2>(coefficients, kBlockDim, coefficients);
       break;
     }
     case Type::DCT16X16: {
-      PROFILER_ZONE("DCT 16");
       ComputeScaledDCT<16, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
                                  scratch_space);
       break;
     }
     case Type::DCT16X8: {
-      PROFILER_ZONE("DCT 16x8");
       ComputeScaledDCT<16, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
                                 scratch_space);
       break;
     }
     case Type::DCT8X16: {
-      PROFILER_ZONE("DCT 8x16");
       ComputeScaledDCT<8, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
                                 scratch_space);
       break;
     }
     case Type::DCT32X8: {
-      PROFILER_ZONE("DCT 32x8");
       ComputeScaledDCT<32, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
                                 scratch_space);
       break;
     }
     case Type::DCT8X32: {
-      PROFILER_ZONE("DCT 8x32");
       ComputeScaledDCT<8, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
                                 scratch_space);
       break;
     }
     case Type::DCT32X16: {
-      PROFILER_ZONE("DCT 32x16");
       ComputeScaledDCT<32, 16>()(DCTFrom(pixels, pixels_stride), coefficients,
                                  scratch_space);
       break;
     }
     case Type::DCT16X32: {
-      PROFILER_ZONE("DCT 16x32");
       ComputeScaledDCT<16, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
                                  scratch_space);
       break;
     }
     case Type::DCT32X32: {
-      PROFILER_ZONE("DCT 32");
       ComputeScaledDCT<32, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
                                  scratch_space);
       break;
     }
     case Type::DCT: {
-      PROFILER_ZONE("DCT 8");
       ComputeScaledDCT<8, 8>()(DCTFrom(pixels, pixels_stride), coefficients,
                                scratch_space);
       break;
     }
     case Type::AFV0: {
-      PROFILER_ZONE("AFV0");
       AFVTransformFromPixels<0>(pixels, pixels_stride, coefficients);
       break;
     }
     case Type::AFV1: {
-      PROFILER_ZONE("AFV1");
       AFVTransformFromPixels<1>(pixels, pixels_stride, coefficients);
       break;
     }
     case Type::AFV2: {
-      PROFILER_ZONE("AFV2");
       AFVTransformFromPixels<2>(pixels, pixels_stride, coefficients);
       break;
     }
     case Type::AFV3: {
-      PROFILER_ZONE("AFV3");
       AFVTransformFromPixels<3>(pixels, pixels_stride, coefficients);
       break;
     }
     case Type::DCT64X64: {
-      PROFILER_ZONE("DCT 64x64");
       ComputeScaledDCT<64, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
                                  scratch_space);
       break;
     }
     case Type::DCT64X32: {
-      PROFILER_ZONE("DCT 64x32");
       ComputeScaledDCT<64, 32>()(DCTFrom(pixels, pixels_stride), coefficients,
                                  scratch_space);
       break;
     }
     case Type::DCT32X64: {
-      PROFILER_ZONE("DCT 32x64");
       ComputeScaledDCT<32, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
                                  scratch_space);
       break;
     }
     case Type::DCT128X128: {
-      PROFILER_ZONE("DCT 128x128");
       ComputeScaledDCT<128, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
                                    scratch_space);
       break;
     }
     case Type::DCT128X64: {
-      PROFILER_ZONE("DCT 128x64");
       ComputeScaledDCT<128, 64>()(DCTFrom(pixels, pixels_stride), coefficients,
                                   scratch_space);
       break;
     }
     case Type::DCT64X128: {
-      PROFILER_ZONE("DCT 64x128");
       ComputeScaledDCT<64, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
                                   scratch_space);
       break;
     }
     case Type::DCT256X256: {
-      PROFILER_ZONE("DCT 256x256");
       ComputeScaledDCT<256, 256>()(DCTFrom(pixels, pixels_stride), coefficients,
                                    scratch_space);
       break;
     }
     case Type::DCT256X128: {
-      PROFILER_ZONE("DCT 256x128");
       ComputeScaledDCT<256, 128>()(DCTFrom(pixels, pixels_stride), coefficients,
                                    scratch_space);
       break;
     }
     case Type::DCT128X256: {
-      PROFILER_ZONE("DCT 128x256");
       ComputeScaledDCT<128, 256>()(DCTFrom(pixels, pixels_stride), coefficients,
                                    scratch_space);
       break;
     }
     case Type::kNumValidStrategies:
-      JXL_ABORT("Invalid strategy");
+      JXL_UNREACHABLE("Invalid strategy");
   }
 }
 
@@ -814,7 +787,7 @@ HWY_MAYBE_UNUSED void DCFromLowestFrequencies(const AcStrategy::Type strategy,
       dc[0] = block[0];
       break;
     case Type::kNumValidStrategies:
-      JXL_ABORT("Invalid strategy");
+      JXL_UNREACHABLE("Invalid strategy");
   }
 }
 
index 577e296..9a52a85 100644 (file)
@@ -6,6 +6,7 @@
 #include "lib/jxl/enc_xyb.h"
 
 #include <algorithm>
+#include <atomic>
 #include <cstdlib>
 
 #undef HWY_TARGET_INCLUDE
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/fast_math-inl.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/cms/opsin_params.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_image_bundle.h"
-#include "lib/jxl/fast_math-inl.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
-#include "lib/jxl/opsin_params.h"
-#include "lib/jxl/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -46,7 +44,7 @@ JXL_INLINE void OpsinAbsorbance(const V r, const V g, const V b,
                                 const float* JXL_RESTRICT premul_absorb,
                                 V* JXL_RESTRICT mixed0, V* JXL_RESTRICT mixed1,
                                 V* JXL_RESTRICT mixed2) {
-  const float* bias = &kOpsinAbsorbanceBias[0];
+  const float* bias = &jxl::cms::kOpsinAbsorbanceBias[0];
   const HWY_FULL(float) d;
   const size_t N = Lanes(d);
   const auto m0 = Load(d, premul_absorb + 0 * N);
@@ -97,6 +95,18 @@ void LinearRGBToXYB(const V r, const V g, const V b,
   // For wide-gamut inputs, r/g/b and valx (but not y/z) are often negative.
 }
 
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize) {
+  const HWY_FULL(float) d;
+  for (size_t x = 0; x < xsize; x += Lanes(d)) {
+    const auto r = Load(d, row0 + x);
+    const auto g = Load(d, row1 + x);
+    const auto b = Load(d, row2 + x);
+    LinearRGBToXYB(r, g, b, premul_absorb, row0 + x, row1 + x, row2 + x);
+  }
+}
+
 // Input/output uses the codec.h scaling: nominally 0-1 if in-gamut.
 template <class V>
 V LinearFromSRGB(V encoded) {
@@ -197,13 +207,108 @@ Status SRGBToXYBAndLinear(const Image3F& srgb,
       "SRGBToXYBAndLinear");
 }
 
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {
+  const HWY_FULL(float) d;
+  const size_t N = Lanes(d);
+  const float mul = intensity_target / 255.0f;
+  for (size_t i = 0; i < 9; ++i) {
+    const auto absorb = Set(d, jxl::cms::kOpsinAbsorbanceMatrix[i] * mul);
+    Store(absorb, d, premul_absorb + i * N);
+  }
+  for (size_t i = 0; i < 3; ++i) {
+    const auto neg_bias_cbrt =
+        Set(d, -cbrtf(jxl::cms::kOpsinAbsorbanceBias[i]));
+    Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N);
+  }
+}
+
+Image3F TransformToLinearRGB(const Image3F& in,
+                             const ColorEncoding& color_encoding,
+                             float intensity_target, const JxlCmsInterface& cms,
+                             ThreadPool* pool) {
+  ColorSpaceTransform c_transform(cms);
+  bool is_gray = color_encoding.IsGray();
+  const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(is_gray);
+  Image3F out(in.xsize(), in.ysize());
+  std::atomic<bool> ok{true};
+  JXL_CHECK(RunOnPool(
+      pool, 0, in.ysize(),
+      [&](const size_t num_threads) {
+        return c_transform.Init(color_encoding, c_desired, intensity_target,
+                                in.xsize(), num_threads);
+      },
+      [&](const uint32_t y, const size_t thread) {
+        float* mutable_src_buf = c_transform.BufSrc(thread);
+        const float* src_buf = mutable_src_buf;
+        // Interleave input.
+        if (is_gray) {
+          src_buf = in.ConstPlaneRow(0, y);
+        } else {
+          const float* JXL_RESTRICT row_in0 = in.ConstPlaneRow(0, y);
+          const float* JXL_RESTRICT row_in1 = in.ConstPlaneRow(1, y);
+          const float* JXL_RESTRICT row_in2 = in.ConstPlaneRow(2, y);
+          for (size_t x = 0; x < in.xsize(); x++) {
+            mutable_src_buf[3 * x + 0] = row_in0[x];
+            mutable_src_buf[3 * x + 1] = row_in1[x];
+            mutable_src_buf[3 * x + 2] = row_in2[x];
+          }
+        }
+        float* JXL_RESTRICT dst_buf = c_transform.BufDst(thread);
+        if (!c_transform.Run(thread, src_buf, dst_buf)) {
+          ok.store(false);
+          return;
+        }
+        float* JXL_RESTRICT row_out0 = out.PlaneRow(0, y);
+        float* JXL_RESTRICT row_out1 = out.PlaneRow(1, y);
+        float* JXL_RESTRICT row_out2 = out.PlaneRow(2, y);
+        // De-interleave output and convert type.
+        if (is_gray) {
+          for (size_t x = 0; x < in.xsize(); x++) {
+            row_out0[x] = dst_buf[x];
+            row_out1[x] = dst_buf[x];
+            row_out2[x] = dst_buf[x];
+          }
+        } else {
+          for (size_t x = 0; x < in.xsize(); x++) {
+            row_out0[x] = dst_buf[3 * x + 0];
+            row_out1[x] = dst_buf[3 * x + 1];
+            row_out2[x] = dst_buf[3 * x + 2];
+          }
+        }
+      },
+      "Colorspace transform"));
+  JXL_CHECK(ok.load());
+  return out;
+}
+
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
+  JXL_ASSERT(SameSize(in, *xyb));
+
+  const HWY_FULL(float) d;
+  // Pre-broadcasted constants
+  HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
+  ComputePremulAbsorb(intensity_target, premul_absorb);
+
+  bool is_gray = color_encoding.IsGray();
+  const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(is_gray);
+  if (c_linear_srgb.SameColorEncoding(color_encoding)) {
+    JXL_CHECK(LinearSRGBToXYB(in, premul_absorb, pool, xyb));
+  } else if (color_encoding.IsSRGB()) {
+    JXL_CHECK(SRGBToXYB(in, premul_absorb, pool, xyb));
+  } else {
+    Image3F linear =
+        TransformToLinearRGB(in, color_encoding, intensity_target, cms, pool);
+    JXL_CHECK(LinearSRGBToXYB(linear, premul_absorb, pool, xyb));
+  }
+}
+
 // This is different from Butteraugli's OpsinDynamicsImage() in the sense that
 // it does not contain a sensitivity multiplier based on the blurred image.
 const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
                          Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
                          ImageBundle* const JXL_RESTRICT linear) {
-  PROFILER_FUNC;
-
   const size_t xsize = in.xsize();
   const size_t ysize = in.ysize();
   JXL_ASSERT(SameSize(in, *xyb));
@@ -211,16 +316,7 @@ const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
   const HWY_FULL(float) d;
   // Pre-broadcasted constants
   HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
-  const size_t N = Lanes(d);
-  for (size_t i = 0; i < 9; ++i) {
-    const auto absorb = Set(d, kOpsinAbsorbanceMatrix[i] *
-                                   (in.metadata()->IntensityTarget() / 255.0f));
-    Store(absorb, d, premul_absorb + i * N);
-  }
-  for (size_t i = 0; i < 3; ++i) {
-    const auto neg_bias_cbrt = Set(d, -cbrtf(kOpsinAbsorbanceBias[i]));
-    Store(neg_bias_cbrt, d, premul_absorb + (9 + i) * N);
-  }
+  ComputePremulAbsorb(in.metadata()->IntensityTarget(), premul_absorb);
 
   const bool want_linear = linear != nullptr;
 
@@ -353,6 +449,48 @@ const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
   return HWY_DYNAMIC_DISPATCH(ToXYB)(in, pool, xyb, cms, linear_storage);
 }
 
+HWY_EXPORT(LinearRGBRowToXYB);
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize) {
+  HWY_DYNAMIC_DISPATCH(LinearRGBRowToXYB)
+  (row0, row1, row2, premul_absorb, xsize);
+}
+
+HWY_EXPORT(ComputePremulAbsorb);
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb) {
+  HWY_DYNAMIC_DISPATCH(ComputePremulAbsorb)(intensity_target, premul_absorb);
+}
+
+void ScaleXYBRow(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                 float* JXL_RESTRICT row2, size_t xsize) {
+  for (size_t x = 0; x < xsize; x++) {
+    row2[x] = (row2[x] - row1[x] + jxl::cms::kScaledXYBOffset[2]) *
+              jxl::cms::kScaledXYBScale[2];
+    row0[x] = (row0[x] + jxl::cms::kScaledXYBOffset[0]) *
+              jxl::cms::kScaledXYBScale[0];
+    row1[x] = (row1[x] + jxl::cms::kScaledXYBOffset[1]) *
+              jxl::cms::kScaledXYBScale[1];
+  }
+}
+
+void ScaleXYB(Image3F* opsin) {
+  for (size_t y = 0; y < opsin->ysize(); y++) {
+    float* row0 = opsin->PlaneRow(0, y);
+    float* row1 = opsin->PlaneRow(1, y);
+    float* row2 = opsin->PlaneRow(2, y);
+    ScaleXYBRow(row0, row1, row2, opsin->xsize());
+  }
+}
+
+HWY_EXPORT(Image3FToXYB);
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
+  return HWY_DYNAMIC_DISPATCH(Image3FToXYB)(in, color_encoding,
+                                            intensity_target, pool, xyb, cms);
+}
+
 HWY_EXPORT(RgbToYcbcr);
 Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
                   const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
@@ -361,21 +499,5 @@ Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
                                           cb_plane, cr_plane, pool);
 }
 
-// DEPRECATED
-Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms) {
-  ImageMetadata metadata;
-  metadata.SetUintSamples(8);
-  metadata.color_encoding = ColorEncoding::SRGB();
-  ImageBundle ib(&metadata);
-  ib.SetFromImage(ConvertToFloat(srgb8), metadata.color_encoding);
-  JXL_CHECK(ib.TransformTo(ColorEncoding::LinearSRGB(ib.IsGray()), cms));
-  ThreadPool* null_pool = nullptr;
-  Image3F xyb(srgb8.xsize(), srgb8.ysize());
-
-  ImageBundle linear_storage(&metadata);
-  (void)ToXYB(ib, null_pool, &xyb, cms, &linear_storage);
-  return xyb;
-}
-
 }  // namespace jxl
 #endif  // HWY_ONCE
index de8f2e3..22a073b 100644 (file)
@@ -8,7 +8,6 @@
 
 // Converts to XYB color space.
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
@@ -27,6 +26,21 @@ const ImageBundle* ToXYB(const ImageBundle& in, ThreadPool* pool,
                          Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
                          ImageBundle* JXL_RESTRICT linear = nullptr);
 
+void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
+                  float intensity_target, ThreadPool* pool,
+                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms);
+
+void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
+                       float* JXL_RESTRICT row2,
+                       const float* JXL_RESTRICT premul_absorb, size_t xsize);
+
+void ComputePremulAbsorb(float intensity_target, float* premul_absorb);
+
+// Transforms each color component of the given XYB image into the [0.0, 1.0]
+// interval with an affine transform.
+void ScaleXYB(Image3F* opsin);
+void ScaleXYBRow(float* row0, float* row1, float* row2, size_t xsize);
+
 // Bt.601 to match JPEG/JFIF. Outputs _signed_ YCbCr values suitable for DCT,
 // see F.1.1.3 of T.81 (because our data type is float, there is no need to add
 // a bias to make the values unsigned).
@@ -34,9 +48,6 @@ Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
                   const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
                   ImageF* cr_plane, ThreadPool* pool);
 
-// DEPRECATED, used by opsin_image_wrapper.
-Image3F OpsinDynamicsImage(const Image3B& srgb8, const JxlCmsInterface& cms);
-
 }  // namespace jxl
 
 #endif  // LIB_JXL_ENC_XYB_H_
index 8e02dd6..b469969 100644 (file)
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "jxl/encode.h"
-
 #include <brotli/encode.h>
+#include <jxl/cms.h>
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
+#include <jxl/types.h>
 
 #include <algorithm>
 #include <cstddef>
+#include <cstdint>
 #include <cstring>
 
-#include "jxl/codestream_header.h"
-#include "jxl/types.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_external_image.h"
-#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_fast_lossless.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_params.h"
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/exif.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
+#include "lib/jxl/luminance.h"
+#include "lib/jxl/memory_manager_internal.h"
+#include "lib/jxl/padded_bytes.h"
 #include "lib/jxl/sanitizers.h"
 
+struct JxlErrorOrStatus {
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator jxl::Status() const {
+    switch (error_) {
+      case JXL_ENC_SUCCESS:
+        return jxl::OkStatus();
+      case JXL_ENC_NEED_MORE_OUTPUT:
+        return jxl::StatusCode::kNotEnoughBytes;
+      default:
+        return jxl::StatusCode::kGenericError;
+    }
+  }
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator JxlEncoderStatus() const { return error_; }
+
+  static JxlErrorOrStatus Success() {
+    return JxlErrorOrStatus(JXL_ENC_SUCCESS);
+  }
+
+  static JxlErrorOrStatus MoreOutput() {
+    return JxlErrorOrStatus(JXL_ENC_NEED_MORE_OUTPUT);
+  }
+
+  static JxlErrorOrStatus Error() { return JxlErrorOrStatus(JXL_ENC_ERROR); }
+
+ private:
+  explicit JxlErrorOrStatus(JxlEncoderStatus error) : error_(error) {}
+  JxlEncoderStatus error_;
+};
+
 // Debug-printing failure macro similar to JXL_FAILURE, but for the status code
 // JXL_ENC_ERROR
 #ifdef JXL_CRASH_ON_ERROR
 #define JXL_API_ERROR(enc, error_code, format, ...)                          \
   (enc->error = error_code,                                                  \
    ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
-   ::jxl::Abort(), JXL_ENC_ERROR)
+   ::jxl::Abort(), JxlErrorOrStatus::Error())
 #define JXL_API_ERROR_NOSET(format, ...)                                     \
   (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
-   ::jxl::Abort(), JXL_ENC_ERROR)
+   ::jxl::Abort(), JxlErrorOrStatus::Error())
 #else  // JXL_CRASH_ON_ERROR
 #define JXL_API_ERROR(enc, error_code, format, ...)                            \
   (enc->error = error_code,                                                    \
    ((JXL_DEBUG_ON_ERROR) &&                                                    \
     ::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__)), \
-   JXL_ENC_ERROR)
+   JxlErrorOrStatus::Error())
 #define JXL_API_ERROR_NOSET(format, ...)                                     \
   (::jxl::Debug(("%s:%d: " format "\n"), __FILE__, __LINE__, ##__VA_ARGS__), \
-   JXL_ENC_ERROR)
+   JxlErrorOrStatus::Error())
 #endif  // JXL_CRASH_ON_ERROR
 
-namespace jxl {}  // namespace jxl
+jxl::StatusOr<JxlOutputProcessorBuffer>
+JxlEncoderOutputProcessorWrapper::GetBuffer(size_t min_size,
+                                            size_t requested_size) {
+  JXL_ASSERT(min_size > 0);
+  JXL_ASSERT(!has_buffer_);
+  if (stop_requested_) return jxl::StatusCode::kNotEnoughBytes;
+  requested_size = std::max(min_size, requested_size);
+
+  // If we support seeking, output_position_ == position_.
+  if (external_output_processor_ && external_output_processor_->seek) {
+    JXL_ASSERT(output_position_ == position_);
+  }
+  // Otherwise, output_position_ <= position_.
+  JXL_ASSERT(output_position_ <= position_);
+  size_t additional_size = position_ - output_position_;
+
+  if (external_output_processor_) {
+    // TODO(veluca): here, we cannot just ask for a larger buffer, as it will be
+    // released with a prefix of the buffer that has not been written yet.
+    // Figure out if there is a good way to do this more efficiently.
+    if (additional_size == 0) {
+      size_t size = requested_size;
+      uint8_t* user_buffer =
+          static_cast<uint8_t*>(external_output_processor_->get_buffer(
+              external_output_processor_->opaque, &size));
+      if (size == 0 || user_buffer == nullptr) {
+        stop_requested_ = true;
+        return jxl::StatusCode::kNotEnoughBytes;
+      }
+      if (size < min_size) {
+        external_output_processor_->release_buffer(
+            external_output_processor_->opaque, 0);
+      } else {
+        internal_buffers_.emplace(position_, InternalBuffer());
+        has_buffer_ = true;
+        return JxlOutputProcessorBuffer(user_buffer, size, 0, this);
+      }
+    }
+  } else {
+    if (min_size + additional_size < *avail_out_) {
+      internal_buffers_.emplace(position_, InternalBuffer());
+      has_buffer_ = true;
+      return JxlOutputProcessorBuffer(*next_out_ + additional_size,
+                                      *avail_out_ - additional_size, 0, this);
+    }
+  }
+
+  // Otherwise, we need to allocate our own buffer.
+  auto it = internal_buffers_.emplace(position_, InternalBuffer()).first;
+  InternalBuffer& buffer = it->second;
+  size_t alloc_size = requested_size;
+  it++;
+  if (it != internal_buffers_.end()) {
+    alloc_size = std::min(alloc_size, it->first - position_);
+    JXL_ASSERT(alloc_size >= min_size);
+  }
+  buffer.owned_data.resize(alloc_size);
+  has_buffer_ = true;
+  return JxlOutputProcessorBuffer(buffer.owned_data.data(), alloc_size, 0,
+                                  this);
+}
+
+void JxlEncoderOutputProcessorWrapper::Seek(size_t pos) {
+  JXL_ASSERT(!has_buffer_);
+  if (external_output_processor_ && external_output_processor_->seek) {
+    external_output_processor_->seek(external_output_processor_->opaque, pos);
+    output_position_ = pos;
+  }
+  JXL_ASSERT(pos >= finalized_position_);
+  position_ = pos;
+}
+
+void JxlEncoderOutputProcessorWrapper::SetFinalizedPosition() {
+  JXL_ASSERT(!has_buffer_);
+  if (external_output_processor_ && external_output_processor_->seek) {
+    external_output_processor_->set_finalized_position(
+        external_output_processor_->opaque, position_);
+  }
+  finalized_position_ = position_;
+  FlushOutput();
+}
+
+bool JxlEncoderOutputProcessorWrapper::SetAvailOut(uint8_t** next_out,
+                                                   size_t* avail_out) {
+  if (external_output_processor_) return false;
+  avail_out_ = avail_out;
+  next_out_ = next_out;
+  FlushOutput();
+  return true;
+}
+
+void JxlEncoderOutputProcessorWrapper::ReleaseBuffer(size_t bytes_used) {
+  JXL_ASSERT(has_buffer_);
+  has_buffer_ = false;
+  auto it = internal_buffers_.find(position_);
+  JXL_ASSERT(it != internal_buffers_.end());
+  if (bytes_used == 0) {
+    if (external_output_processor_) {
+      external_output_processor_->release_buffer(
+          external_output_processor_->opaque, bytes_used);
+    }
+    internal_buffers_.erase(it);
+    return;
+  }
+  it->second.written_bytes = bytes_used;
+  position_ += bytes_used;
+
+  auto it_to_next = it;
+  it_to_next++;
+  if (it_to_next != internal_buffers_.end()) {
+    JXL_ASSERT(it_to_next->first >= position_);
+  }
+
+  if (external_output_processor_) {
+    // If the buffer was given by the user, tell the user it is not needed
+    // anymore.
+    if (it->second.owned_data.empty()) {
+      external_output_processor_->release_buffer(
+          external_output_processor_->opaque, bytes_used);
+      // If we don't support seeking, this implies we will never modify again
+      // the bytes that were written so far. Advance the finalized position and
+      // flush the output to clean up the internal buffers.
+      if (!external_output_processor_->seek) {
+        SetFinalizedPosition();
+        JXL_ASSERT(output_position_ == finalized_position_);
+        JXL_ASSERT(output_position_ == position_);
+      } else {
+        // Otherwise, advance the output position accordingly.
+        output_position_ += bytes_used;
+        JXL_ASSERT(output_position_ >= finalized_position_);
+        JXL_ASSERT(output_position_ == position_);
+      }
+    } else if (external_output_processor_->seek) {
+      // If we had buffered the data internally, flush it out to the external
+      // processor if we can.
+      external_output_processor_->seek(external_output_processor_->opaque,
+                                       position_ - bytes_used);
+      output_position_ = position_ - bytes_used;
+      while (output_position_ < position_) {
+        size_t num_to_write = position_ - output_position_;
+        if (!AppendBufferToExternalProcessor(it->second.owned_data.data() +
+                                                 output_position_ - position_ +
+                                                 bytes_used,
+                                             num_to_write)) {
+          return;
+        }
+      }
+      it->second.owned_data.clear();
+    }
+  }
+}
+
+// Tries to write all the bytes up to the finalized position.
+void JxlEncoderOutputProcessorWrapper::FlushOutput() {
+  JXL_ASSERT(!has_buffer_);
+  while (output_position_ < finalized_position_ &&
+         (avail_out_ == nullptr || *avail_out_ > 0)) {
+    JXL_ASSERT(!internal_buffers_.empty());
+    auto it = internal_buffers_.begin();
+    // If this fails, we are trying to move the finalized position past data
+    // that was not written yet. This is a library programming error.
+    JXL_ASSERT(output_position_ >= it->first);
+    JXL_ASSERT(it->second.written_bytes != 0);
+    size_t buffer_last_byte = it->first + it->second.written_bytes;
+    if (!it->second.owned_data.empty()) {
+      size_t start_in_buffer = output_position_ - it->first;
+      // Guaranteed by the invariant on `internal_buffers_`.
+      JXL_ASSERT(buffer_last_byte > output_position_);
+      size_t num_to_write =
+          std::min(buffer_last_byte, finalized_position_) - output_position_;
+      if (avail_out_ != nullptr) {
+        size_t n = std::min(num_to_write, *avail_out_);
+        memcpy(*next_out_, it->second.owned_data.data() + start_in_buffer, n);
+        *avail_out_ -= n;
+        *next_out_ += n;
+        output_position_ += n;
+      } else {
+        if (!AppendBufferToExternalProcessor(
+                it->second.owned_data.data() + start_in_buffer, num_to_write)) {
+          return;
+        }
+      }
+    } else {
+      size_t advance =
+          std::min(buffer_last_byte, finalized_position_) - output_position_;
+      output_position_ += advance;
+      if (avail_out_ != nullptr) {
+        *next_out_ += advance;
+        *avail_out_ -= advance;
+      }
+    }
+    if (buffer_last_byte == output_position_) {
+      internal_buffers_.erase(it);
+    }
+    if (external_output_processor_ && !external_output_processor_->seek) {
+      external_output_processor_->set_finalized_position(
+          external_output_processor_->opaque, output_position_);
+    }
+  }
+}
+
+bool JxlEncoderOutputProcessorWrapper::AppendBufferToExternalProcessor(
+    void* data, size_t count) {
+  JXL_ASSERT(external_output_processor_);
+  size_t n = count;
+  void* user_buffer = external_output_processor_->get_buffer(
+      external_output_processor_->opaque, &n);
+  if (user_buffer == nullptr || n == 0) {
+    stop_requested_ = true;
+    return false;
+  }
+  n = std::min(n, count);
+  memcpy(user_buffer, data, n);
+  external_output_processor_->release_buffer(external_output_processor_->opaque,
+                                             n);
+  output_position_ += n;
+  return true;
+}
+
+namespace jxl {
+
+size_t WriteBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded,
+                      bool force_large_box, uint8_t* output) {
+  uint64_t box_size = 0;
+  bool large_size = false;
+  if (!unbounded) {
+    if (box_size >= kLargeBoxContentSizeThreshold || force_large_box) {
+      large_size = true;
+      // TODO(firsching): send a separate CL for this (+ test),
+      // quick fix in the old code: box_size += 8
+      box_size = size + kLargeBoxHeaderSize;
+    } else {
+      box_size = size + kSmallBoxHeaderSize;
+    }
+  }
+
+  size_t idx = 0;
+  {
+    const uint64_t store = large_size ? 1 : box_size;
+    for (size_t i = 0; i < 4; i++) {
+      output[idx++] = store >> (8 * (3 - i)) & 0xff;
+    }
+  }
+  for (size_t i = 0; i < 4; i++) {
+    output[idx++] = type[i];
+  }
+
+  if (large_size) {
+    for (size_t i = 0; i < 8; i++) {
+      output[idx++] = box_size >> (8 * (7 - i)) & 0xff;
+    }
+  }
+  return idx;
+}
+}  // namespace jxl
+
+template <typename WriteBox>
+jxl::Status JxlEncoderStruct::AppendBox(const jxl::BoxType& type,
+                                        bool unbounded, size_t box_max_size,
+                                        const WriteBox& write_box) {
+  size_t current_position = output_processor.CurrentPosition();
+  bool large_box = false;
+  if (box_max_size >= jxl::kLargeBoxContentSizeThreshold && !unbounded) {
+    output_processor.Seek(current_position + jxl::kLargeBoxHeaderSize);
+    large_box = true;
+  } else {
+    output_processor.Seek(current_position + jxl::kSmallBoxHeaderSize);
+  }
+  size_t box_contents_start = output_processor.CurrentPosition();
+  JXL_RETURN_IF_ERROR(write_box());
+  size_t box_contents_end = output_processor.CurrentPosition();
+  output_processor.Seek(current_position);
+  JXL_ASSERT(box_contents_end >= box_contents_start);
+  if (box_contents_end - box_contents_start > box_max_size) {
+    return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                         "Internal error: upper bound on box size was "
+                         "violated, upper bound: %" PRIuS ", actual: %" PRIuS,
+                         box_max_size, box_contents_end - box_contents_start);
+  }
+  // We need to release the buffer before Seek.
+  {
+    JXL_ASSIGN_OR_RETURN(
+        auto buffer,
+        output_processor.GetBuffer(box_contents_start - current_position));
+    buffer.advance(jxl::WriteBoxHeader(type,
+                                       box_contents_end - box_contents_start,
+                                       unbounded, large_box, buffer.data()));
+  }
+  output_processor.Seek(box_contents_end);
+  output_processor.SetFinalizedPosition();
+  return jxl::OkStatus();
+}
+
+template <typename T>
+jxl::Status AppendData(JxlEncoderOutputProcessorWrapper& output_processor,
+                       const T& data) {
+  size_t size = std::end(data) - std::begin(data);
+  size_t written = 0;
+  while (written < size) {
+    JXL_ASSIGN_OR_RETURN(auto buffer,
+                         output_processor.GetBuffer(1, size - written));
+    size_t n = std::min(size - written, buffer.size());
+    buffer.append(data.data() + written, n);
+    written += n;
+  }
+  return jxl::OkStatus();
+}
+
+template <typename BoxContents>
+jxl::Status JxlEncoderStruct::AppendBoxWithContents(
+    const jxl::BoxType& type, const BoxContents& contents) {
+  size_t size = std::end(contents) - std::begin(contents);
+  return AppendBox(type, /*unbounded=*/false, size,
+                   [&]() { return AppendData(output_processor, contents); });
+}
 
 uint32_t JxlEncoderVersion(void) {
   return JPEGXL_MAJOR_VERSION * 1000000 + JPEGXL_MINOR_VERSION * 1000 +
@@ -55,12 +409,14 @@ uint32_t JxlEncoderVersion(void) {
 }
 
 namespace {
-template <typename T>
-void AppendJxlpBoxCounter(uint32_t counter, bool last, T* output) {
+void WriteJxlpBoxCounter(uint32_t counter, bool last,
+                         JxlOutputProcessorBuffer& buffer) {
   if (last) counter |= 0x80000000;
+  uint8_t buf[4];
   for (size_t i = 0; i < 4; i++) {
-    output->push_back(counter >> (8 * (3 - i)) & 0xff);
+    buf[i] = counter >> (8 * (3 - i)) & 0xff;
   }
+  buffer.append(buf, 4);
 }
 
 void QueueFrame(
@@ -76,6 +432,14 @@ void QueueFrame(
   frame_settings->enc->num_queued_frames++;
 }
 
+void QueueFastLosslessFrame(const JxlEncoderFrameSettings* frame_settings,
+                            JxlFastLosslessFrameState* fast_lossless_frame) {
+  jxl::JxlEncoderQueuedInput queued_input(frame_settings->enc->memory_manager);
+  queued_input.fast_lossless_frame.reset(fast_lossless_frame);
+  frame_settings->enc->input_queue.emplace_back(std::move(queued_input));
+  frame_settings->enc->num_queued_frames++;
+}
+
 void QueueBox(JxlEncoder* enc,
               jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedBox>& box) {
   jxl::JxlEncoderQueuedInput queued_input(enc->memory_manager);
@@ -119,7 +483,7 @@ JxlEncoderStatus BrotliCompress(int quality, const uint8_t* in, size_t in_size,
     if (BrotliEncoderIsFinished(enc.get())) break;
   }
 
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 // The JXL codestream can have level 5 or level 10. Levels have certain
@@ -197,6 +561,36 @@ int VerifyLevelSettings(const JxlEncoder* enc, std::string* debug_string) {
   // All level 5 checks passes, so can return the more compatible level 5
   return 5;
 }
+
+size_t BitsPerChannel(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    default:
+      return 0;  // signals unhandled JxlDataType
+  }
+}
+
+template <typename T>
+uint32_t GetBitDepth(JxlBitDepth bit_depth, const T& metadata,
+                     JxlPixelFormat format) {
+  if (bit_depth.type == JXL_BIT_DEPTH_FROM_PIXEL_FORMAT) {
+    return BitsPerChannel(format.data_type);
+  } else if (bit_depth.type == JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    return metadata.bit_depth.bits_per_sample;
+  } else if (bit_depth.type == JXL_BIT_DEPTH_CUSTOM) {
+    return bit_depth.bits_per_sample;
+  } else {
+    return 0;
+  }
+}
+
 JxlEncoderStatus CheckValidBitdepth(uint32_t bits_per_sample,
                                     uint32_t exponent_bits_per_sample) {
   if (!exponent_bits_per_sample) {
@@ -210,11 +604,34 @@ JxlEncoderStatus CheckValidBitdepth(uint32_t bits_per_sample,
              (bits_per_sample < 3 + exponent_bits_per_sample)) {
     return JXL_API_ERROR_NOSET("Invalid float description");
   }
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
+}
+
+JxlEncoderStatus VerifyInputBitDepth(JxlBitDepth bit_depth,
+                                     JxlPixelFormat format) {
+  return JxlErrorOrStatus::Success();
+}
+
+static inline bool EncodeVarInt(uint64_t value, size_t output_size,
+                                size_t* output_pos, uint8_t* output) {
+  // While more than 7 bits of data are left,
+  // store 7 bits and set the next byte flag
+  while (value > 127) {
+    // TODO(eustas): should it be `>=` ?
+    if (*output_pos > output_size) return false;
+    // |128: Set the next byte flag
+    output[(*output_pos)++] = ((uint8_t)(value & 127)) | 128;
+    // Remove the seven bits we just wrote
+    value >>= 7;
+  }
+  // TODO(eustas): should it be `>=` ?
+  if (*output_pos > output_size) return false;
+  output[(*output_pos)++] = ((uint8_t)value) & 127;
+  return true;
 }
 
 bool EncodeFrameIndexBox(const jxl::JxlEncoderFrameIndexBox& frame_index_box,
-                         jxl::BitWriter& writer) {
+                         std::vector<uint8_t>& buffer_vec) {
   bool ok = true;
   int NF = 0;
   for (size_t i = 0; i < frame_index_box.entries.size(); ++i) {
@@ -231,10 +648,10 @@ bool EncodeFrameIndexBox(const jxl::JxlEncoderFrameIndexBox& frame_index_box,
   static const int kFrameIndexBoxElementLength = 3 * kVarintMaxLength;
   const int buffer_size =
       kFrameIndexBoxHeaderLength + NF * kFrameIndexBoxElementLength;
-  std::vector<uint8_t> buffer_vec(buffer_size);
+  buffer_vec.resize(buffer_size);
   uint8_t* buffer = buffer_vec.data();
   size_t output_pos = 0;
-  ok &= jxl::EncodeVarInt(NF, buffer_vec.size(), &output_pos, buffer);
+  ok &= EncodeVarInt(NF, buffer_vec.size(), &output_pos, buffer);
   StoreBE32(frame_index_box.TNUM, &buffer[output_pos]);
   output_pos += 4;
   StoreBE32(frame_index_box.TDEN, &buffer[output_pos]);
@@ -262,9 +679,9 @@ bool EncodeFrameIndexBox(const jxl::JxlEncoderFrameIndexBox& frame_index_box,
       }
       int32_t Ti = T_prev;
       int32_t Fi = i - prev_ix;
-      ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
-      ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
-      ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
+      ok &= EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
+      ok &= EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
+      ok &= EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
       prev_prev_ix = prev_ix;
       prev_ix = i;
       T_prev = T;
@@ -280,19 +697,20 @@ bool EncodeFrameIndexBox(const jxl::JxlEncoderFrameIndexBox& frame_index_box,
     }
     int32_t Ti = T_prev;
     int32_t Fi = i - prev_ix;
-    ok &= jxl::EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
-    ok &= jxl::EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
-    ok &= jxl::EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
+    ok &= EncodeVarInt(OFFi, buffer_vec.size(), &output_pos, buffer);
+    ok &= EncodeVarInt(Ti, buffer_vec.size(), &output_pos, buffer);
+    ok &= EncodeVarInt(Fi, buffer_vec.size(), &output_pos, buffer);
   }
   // Enough buffer has been allocated, this function should never fail in
   // writing.
   JXL_ASSERT(ok);
+  buffer_vec.resize(output_pos);
   return ok;
 }
 
 }  // namespace
 
-JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
+jxl::Status JxlEncoderStruct::ProcessOneEnqueuedInput() {
   jxl::PaddedBytes bytes;
 
   jxl::JxlEncoderQueuedInput& input = input_queue[0];
@@ -327,23 +745,26 @@ JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
            level_message)
               .c_str());
     }
-
+    jxl::AuxOut* aux_out =
+        input.frame ? input.frame->option_values.aux_out : nullptr;
     jxl::BitWriter writer;
-    if (!WriteHeaders(&metadata, &writer, nullptr)) {
+    if (!WriteCodestreamHeaders(&metadata, &writer, aux_out)) {
       return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
                            "Failed to write codestream header");
     }
     // Only send ICC (at least several hundred bytes) if fields aren't enough.
     if (metadata.m.color_encoding.WantICC()) {
       if (!jxl::WriteICC(metadata.m.color_encoding.ICC(), &writer,
-                         jxl::kLayerHeader, nullptr)) {
+                         jxl::kLayerHeader, aux_out)) {
         return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
                              "Failed to write ICC profile");
       }
     }
     // TODO(lode): preview should be added here if a preview image is added
 
+    jxl::BitWriter::Allotment allotment(&writer, 8);
     writer.ZeroPadToByte();
+    allotment.ReclaimAndCharge(&writer, jxl::kLayerHeader, aux_out);
 
     // Not actually the end of frame, but the end of metadata/ICC, but helps
     // the next frame to start here for indexing purposes.
@@ -354,16 +775,19 @@ JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
 
     if (MustUseContainer()) {
       // Add "JXL " and ftyp box.
-      output_byte_queue.insert(
-          output_byte_queue.end(), jxl::kContainerHeader,
-          jxl::kContainerHeader + sizeof(jxl::kContainerHeader));
+      {
+        JXL_ASSIGN_OR_RETURN(auto buffer, output_processor.GetBuffer(
+                                              jxl::kContainerHeader.size()));
+        buffer.append(jxl::kContainerHeader);
+      }
       if (codestream_level != 5) {
         // Add jxll box directly after the ftyp box to indicate the codestream
         // level.
-        output_byte_queue.insert(
-            output_byte_queue.end(), jxl::kLevelBoxHeader,
-            jxl::kLevelBoxHeader + sizeof(jxl::kLevelBoxHeader));
-        output_byte_queue.push_back(codestream_level);
+        JXL_ASSIGN_OR_RETURN(auto buffer, output_processor.GetBuffer(
+                                              jxl::kLevelBoxHeader.size() + 1));
+        buffer.append(jxl::kLevelBoxHeader);
+        uint8_t cl = codestream_level;
+        buffer.append(&cl, 1);
       }
 
       // Whether to write the basic info and color profile header of the
@@ -372,23 +796,26 @@ JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
       // always be done, but there's no reason to add an extra box with box
       // header overhead if the codestream will already come immediately after
       // the signature and level boxes.
-      bool partial_header = store_jpeg_metadata || (use_boxes && !input.frame);
+      bool partial_header =
+          store_jpeg_metadata ||
+          (use_boxes && (!input.frame && !input.fast_lossless_frame));
 
       if (partial_header) {
-        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), bytes.size() + 4,
-                             /*unbounded=*/false, &output_byte_queue);
-        AppendJxlpBoxCounter(jxlp_counter++, /*last=*/false,
-                             &output_byte_queue);
-        output_byte_queue.insert(output_byte_queue.end(), bytes.data(),
-                                 bytes.data() + bytes.size());
+        JXL_RETURN_IF_ERROR(AppendBox(
+            jxl::MakeBoxType("jxlp"), /*unbounded=*/false, bytes.size() + 4,
+            [&]() {
+              JXL_ASSIGN_OR_RETURN(
+                  auto buffer, output_processor.GetBuffer(bytes.size() + 4));
+              WriteJxlpBoxCounter(jxlp_counter++, /*last=*/false, buffer);
+              buffer.append(bytes);
+              return jxl::OkStatus();
+            }));
         bytes.clear();
       }
 
       if (store_jpeg_metadata && !jpeg_metadata.empty()) {
-        jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_metadata.size(),
-                             false, &output_byte_queue);
-        output_byte_queue.insert(output_byte_queue.end(), jpeg_metadata.begin(),
-                                 jpeg_metadata.end());
+        JXL_RETURN_IF_ERROR(
+            AppendBoxWithContents(jxl::MakeBoxType("jbrd"), jpeg_metadata));
       }
     }
     wrote_bytes = true;
@@ -396,136 +823,199 @@ JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
 
   // Choose frame or box processing: exactly one of the two unique pointers (box
   // or frame) in the input queue item is non-null.
-  if (input.frame) {
+  if (input.frame || input.fast_lossless_frame) {
     jxl::MemoryManagerUniquePtr<jxl::JxlEncoderQueuedFrame> input_frame =
         std::move(input.frame);
+    jxl::FJXLFrameUniquePtr fast_lossless_frame =
+        std::move(input.fast_lossless_frame);
     input_queue.erase(input_queue.begin());
     num_queued_frames--;
-    for (unsigned idx = 0; idx < input_frame->ec_initialized.size(); idx++) {
-      if (!input_frame->ec_initialized[idx]) {
-        return JXL_API_ERROR(this, JXL_ENC_ERR_API_USAGE,
-                             "Extra channel %u is not initialized", idx);
+    if (input_frame) {
+      for (unsigned idx = 0; idx < input_frame->ec_initialized.size(); idx++) {
+        if (!input_frame->ec_initialized[idx]) {
+          return JXL_API_ERROR(this, JXL_ENC_ERR_API_USAGE,
+                               "Extra channel %u is not initialized", idx);
+        }
       }
-    }
 
-    // TODO(zond): If the input queue is empty and the frames_closed is true,
-    // then mark this frame as the last.
+      // TODO(zond): If the input queue is empty and the frames_closed is true,
+      // then mark this frame as the last.
 
-    // TODO(zond): Handle progressive mode like EncodeFile does it.
-    // TODO(zond): Handle animation like EncodeFile does it, by checking if
-    //             JxlEncoderCloseFrames has been called and if the frame queue
-    //             is empty (to see if it's the last animation frame).
+      // TODO(zond): Handle progressive mode like EncodeFile does it.
+      // TODO(zond): Handle animation like EncodeFile does it, by checking if
+      //             JxlEncoderCloseFrames has been called and if the frame
+      //             queue is empty (to see if it's the last animation frame).
 
-    if (metadata.m.xyb_encoded) {
-      input_frame->option_values.cparams.color_transform =
-          jxl::ColorTransform::kXYB;
-    } else {
-      // TODO(zond): Figure out when to use kYCbCr instead.
-      input_frame->option_values.cparams.color_transform =
-          jxl::ColorTransform::kNone;
+      if (metadata.m.xyb_encoded) {
+        input_frame->option_values.cparams.color_transform =
+            jxl::ColorTransform::kXYB;
+      } else {
+        // TODO(zond): Figure out when to use kYCbCr instead.
+        input_frame->option_values.cparams.color_transform =
+            jxl::ColorTransform::kNone;
+      }
     }
 
-    jxl::BitWriter writer;
-    jxl::PassesEncoderState enc_state;
-
-    // EncodeFrame creates jxl::FrameHeader object internally based on the
-    // FrameInfo, imagebundle, cparams and metadata. Copy the information to
-    // these.
-    jxl::ImageBundle& ib = input_frame->frame;
-    ib.name = input_frame->option_values.frame_name;
-    if (metadata.m.have_animation) {
-      ib.duration = input_frame->option_values.header.duration;
-      ib.timecode = input_frame->option_values.header.timecode;
+    uint32_t duration;
+    uint32_t timecode;
+    if (input_frame && metadata.m.have_animation) {
+      duration = input_frame->option_values.header.duration;
+      timecode = input_frame->option_values.header.timecode;
     } else {
       // If have_animation is false, the encoder should ignore the duration and
       // timecode values. However, assigning them to ib will cause the encoder
       // to write an invalid frame header that can't be decoded so ensure
       // they're the default value of 0 here.
-      ib.duration = 0;
-      ib.timecode = 0;
+      duration = 0;
+      timecode = 0;
     }
-    frame_index_box.AddFrame(codestream_bytes_written_end_of_frame, ib.duration,
-                             input_frame->option_values.frame_index_box);
-    ib.blendmode = static_cast<jxl::BlendMode>(
-        input_frame->option_values.header.layer_info.blend_info.blendmode);
-    ib.blend =
-        input_frame->option_values.header.layer_info.blend_info.blendmode !=
-        JXL_BLEND_REPLACE;
-
-    size_t save_as_reference =
-        input_frame->option_values.header.layer_info.save_as_reference;
-    ib.use_for_next_frame = !!save_as_reference;
-
-    jxl::FrameInfo frame_info;
+
     bool last_frame = frames_closed && !num_queued_frames;
-    frame_info.is_last = last_frame;
-    frame_info.save_as_reference = save_as_reference;
-    frame_info.source =
-        input_frame->option_values.header.layer_info.blend_info.source;
-    frame_info.clamp =
-        input_frame->option_values.header.layer_info.blend_info.clamp;
-    frame_info.alpha_channel =
-        input_frame->option_values.header.layer_info.blend_info.alpha;
-    frame_info.extra_channel_blending_info.resize(
-        metadata.m.num_extra_channels);
-    // If extra channel blend info has not been set, use the blend mode from the
-    // layer_info.
-    JxlBlendInfo default_blend_info =
-        input_frame->option_values.header.layer_info.blend_info;
-    for (size_t i = 0; i < metadata.m.num_extra_channels; ++i) {
-      auto& to = frame_info.extra_channel_blending_info[i];
-      const auto& from =
-          i < input_frame->option_values.extra_channel_blend_info.size()
-              ? input_frame->option_values.extra_channel_blend_info[i]
-              : default_blend_info;
-      to.mode = static_cast<jxl::BlendMode>(from.blendmode);
-      to.source = from.source;
-      to.alpha_channel = from.alpha;
-      to.clamp = (from.clamp != 0);
-    }
 
-    if (input_frame->option_values.header.layer_info.have_crop) {
-      ib.origin.x0 = input_frame->option_values.header.layer_info.crop_x0;
-      ib.origin.y0 = input_frame->option_values.header.layer_info.crop_y0;
-    }
-    JXL_ASSERT(writer.BitsWritten() == 0);
-    if (!jxl::EncodeFrame(input_frame->option_values.cparams, frame_info,
-                          &metadata, input_frame->frame, &enc_state, cms,
-                          thread_pool.get(), &writer,
-                          /*aux_out=*/nullptr)) {
-      return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC, "Failed to encode frame");
+    jxl::BitWriter writer;
+
+    std::function<jxl::Status()> append_frame_codestream;
+    size_t codestream_upper_bound = 0;
+
+    if (input_frame) {
+      jxl::PassesEncoderState enc_state;
+
+      frame_index_box.AddFrame(codestream_bytes_written_end_of_frame, duration,
+                               input_frame->option_values.frame_index_box);
+
+      // EncodeFrame creates jxl::FrameHeader object internally based on the
+      // FrameInfo, imagebundle, cparams and metadata. Copy the information to
+      // these.
+      jxl::ImageBundle& ib = input_frame->frame;
+      ib.duration = duration;
+      ib.timecode = timecode;
+      ib.name = input_frame->option_values.frame_name;
+      ib.blendmode = static_cast<jxl::BlendMode>(
+          input_frame->option_values.header.layer_info.blend_info.blendmode);
+      ib.blend =
+          input_frame->option_values.header.layer_info.blend_info.blendmode !=
+          JXL_BLEND_REPLACE;
+
+      size_t save_as_reference =
+          input_frame->option_values.header.layer_info.save_as_reference;
+      if (save_as_reference >= 3) {
+        return JXL_API_ERROR(
+            this, JXL_ENC_ERR_API_USAGE,
+            "Cannot use save_as_reference values >=3 (found: %d)",
+            (int)save_as_reference);
+      }
+      ib.use_for_next_frame = !!save_as_reference;
+
+      jxl::FrameInfo frame_info;
+      frame_info.is_last = last_frame;
+      frame_info.save_as_reference = save_as_reference;
+      frame_info.source =
+          input_frame->option_values.header.layer_info.blend_info.source;
+      frame_info.clamp =
+          input_frame->option_values.header.layer_info.blend_info.clamp;
+      frame_info.alpha_channel =
+          input_frame->option_values.header.layer_info.blend_info.alpha;
+      frame_info.extra_channel_blending_info.resize(
+          metadata.m.num_extra_channels);
+      // If extra channel blend info has not been set, use the blend mode from
+      // the layer_info.
+      JxlBlendInfo default_blend_info =
+          input_frame->option_values.header.layer_info.blend_info;
+      for (size_t i = 0; i < metadata.m.num_extra_channels; ++i) {
+        auto& to = frame_info.extra_channel_blending_info[i];
+        const auto& from =
+            i < input_frame->option_values.extra_channel_blend_info.size()
+                ? input_frame->option_values.extra_channel_blend_info[i]
+                : default_blend_info;
+        to.mode = static_cast<jxl::BlendMode>(from.blendmode);
+        to.source = from.source;
+        to.alpha_channel = from.alpha;
+        to.clamp = (from.clamp != 0);
+      }
+
+      if (input_frame->option_values.header.layer_info.have_crop) {
+        ib.origin.x0 = input_frame->option_values.header.layer_info.crop_x0;
+        ib.origin.y0 = input_frame->option_values.header.layer_info.crop_y0;
+      }
+      JXL_ASSERT(writer.BitsWritten() == 0);
+      if (!jxl::EncodeFrame(input_frame->option_values.cparams, frame_info,
+                            &metadata, input_frame->frame, &enc_state, cms,
+                            thread_pool.get(), &writer,
+                            input_frame->option_values.aux_out)) {
+        return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
+                             "Failed to encode frame");
+      }
+      codestream_bytes_written_beginning_of_frame =
+          codestream_bytes_written_end_of_frame;
+      codestream_bytes_written_end_of_frame +=
+          jxl::DivCeil(writer.BitsWritten(), 8);
+
+      // Possibly bytes already contains the codestream header: in case this is
+      // the first frame, and the codestream header was not encoded as jxlp
+      // above.
+      bytes.append(std::move(writer).TakeBytes());
+      codestream_upper_bound = bytes.size();
+      append_frame_codestream = [&bytes, this]() {
+        return AppendData(output_processor, bytes);
+      };
+    } else {
+      JXL_CHECK(fast_lossless_frame);
+      JxlFastLosslessPrepareHeader(fast_lossless_frame.get(),
+                                   /*add_image_header=*/0, last_frame);
+      size_t fl_size = JxlFastLosslessOutputSize(fast_lossless_frame.get());
+      codestream_upper_bound = fl_size + bytes.size();
+      append_frame_codestream = [&bytes, &fast_lossless_frame, fl_size,
+                                 this]() {
+        if (!bytes.empty()) {
+          JXL_RETURN_IF_ERROR(AppendData(output_processor, bytes));
+        }
+        size_t written = 0;
+        while (true) {
+          JXL_ASSIGN_OR_RETURN(
+              auto buffer, output_processor.GetBuffer(32, fl_size - written));
+          size_t n = JxlFastLosslessWriteOutput(fast_lossless_frame.get(),
+                                                buffer.data(), buffer.size());
+          if (n == 0) break;
+          buffer.advance(n);
+          written += n;
+        };
+        return jxl::OkStatus();
+      };
     }
-    codestream_bytes_written_beginning_of_frame =
-        codestream_bytes_written_end_of_frame;
-    codestream_bytes_written_end_of_frame +=
-        jxl::DivCeil(writer.BitsWritten(), 8);
 
-    // Possibly bytes already contains the codestream header: in case this is
-    // the first frame, and the codestream header was not encoded as jxlp above.
-    bytes.append(std::move(writer).TakeBytes());
     if (MustUseContainer()) {
       if (last_frame && jxlp_counter == 0) {
         // If this is the last frame and no jxlp boxes were used yet, it's
-        // slighly more efficient to write a jxlc box since it has 4 bytes less
-        // overhead.
-        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), bytes.size(),
-                             /*unbounded=*/false, &output_byte_queue);
+        // slightly more efficient to write a jxlc box since it has 4 bytes
+        // less overhead.
+        JXL_RETURN_IF_ERROR(
+            AppendBox(jxl::MakeBoxType("jxlc"), /*unbounded=*/false,
+                      codestream_upper_bound, append_frame_codestream));
       } else {
-        jxl::AppendBoxHeader(jxl::MakeBoxType("jxlp"), bytes.size() + 4,
-                             /*unbounded=*/false, &output_byte_queue);
-        AppendJxlpBoxCounter(jxlp_counter++, last_frame, &output_byte_queue);
+        JXL_RETURN_IF_ERROR(AppendBox(
+            jxl::MakeBoxType("jxlp"), /*unbounded=*/false,
+            codestream_upper_bound + 4, [&]() {
+              {
+                JXL_ASSIGN_OR_RETURN(auto buffer,
+                                     output_processor.GetBuffer(4));
+                WriteJxlpBoxCounter(jxlp_counter++, last_frame, buffer);
+              }
+              return append_frame_codestream();
+            }));
       }
+    } else {
+      JXL_RETURN_IF_ERROR(append_frame_codestream());
+      output_processor.SetFinalizedPosition();
     }
 
-    output_byte_queue.insert(output_byte_queue.end(), bytes.data(),
-                             bytes.data() + bytes.size());
-
-    last_used_cparams = input_frame->option_values.cparams;
+    if (input_frame) {
+      last_used_cparams = input_frame->option_values.cparams;
+    }
     if (last_frame && frame_index_box.StoreFrameIndexBox()) {
-      bytes.clear();
-      EncodeFrameIndexBox(frame_index_box, writer);
-      jxl::AppendBoxHeader(jxl::MakeBoxType("jxli"), bytes.size(),
-                           /*unbounded=*/false, &output_byte_queue);
+      std::vector<uint8_t> index_box_content;
+      EncodeFrameIndexBox(frame_index_box, index_box_content);
+      JXL_RETURN_IF_ERROR(AppendBoxWithContents(jxl::MakeBoxType("jxli"),
+                                                jxl::Bytes(index_box_content)));
     }
   } else {
     // Not a frame, so is a box instead
@@ -547,19 +1037,15 @@ JxlEncoderStatus JxlEncoderStruct::RefillOutputByteQueue() {
         return JXL_API_ERROR(this, JXL_ENC_ERR_GENERIC,
                              "Brotli compression for brob box failed");
       }
-      jxl::AppendBoxHeader(jxl::MakeBoxType("brob"), compressed.size(), false,
-                           &output_byte_queue);
-      output_byte_queue.insert(output_byte_queue.end(), compressed.data(),
-                               compressed.data() + compressed.size());
+
+      JXL_RETURN_IF_ERROR(
+          AppendBoxWithContents(jxl::MakeBoxType("brob"), compressed));
     } else {
-      jxl::AppendBoxHeader(box->type, box->contents.size(), false,
-                           &output_byte_queue);
-      output_byte_queue.insert(output_byte_queue.end(), box->contents.data(),
-                               box->contents.data() + box->contents.size());
+      JXL_RETURN_IF_ERROR(AppendBoxWithContents(box->type, box->contents));
     }
   }
 
-  return JXL_ENC_SUCCESS;
+  return jxl::OkStatus();
 }
 
 JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc,
@@ -571,8 +1057,7 @@ JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc,
     return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
                          "Color encoding is already set");
   }
-  if (!jxl::ConvertExternalToInternalColorEncoding(
-          *color, &enc->metadata.m.color_encoding)) {
+  if (!enc->metadata.m.color_encoding.FromExternal(*color)) {
     return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC, "Error in color conversion");
   }
   if (enc->metadata.m.color_encoding.GetColorSpace() ==
@@ -591,7 +1076,7 @@ JxlEncoderStatus JxlEncoderSetColorEncoding(JxlEncoder* enc,
   if (!enc->intensity_target_set) {
     jxl::SetIntensityTarget(&enc->metadata.m);
   }
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc,
@@ -604,11 +1089,18 @@ JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc,
     return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
                          "ICC profile is already set");
   }
-  jxl::PaddedBytes icc;
+  if (size == 0) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_BAD_INPUT, "Empty ICC profile");
+  }
+  jxl::IccBytes icc;
   icc.assign(icc_profile, icc_profile + size);
-  if (!enc->metadata.m.color_encoding.SetICC(std::move(icc))) {
-    return JXL_API_ERROR(enc, JXL_ENC_ERR_BAD_INPUT,
-                         "ICC profile could not be set");
+  if (enc->cms_set) {
+    if (!enc->metadata.m.color_encoding.SetICC(std::move(icc), &enc->cms)) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_BAD_INPUT,
+                           "ICC profile could not be set");
+    }
+  } else {
+    enc->metadata.m.color_encoding.SetICCRaw(std::move(icc));
   }
   if (enc->metadata.m.color_encoding.GetColorSpace() ==
       jxl::ColorSpace::kGray) {
@@ -629,11 +1121,11 @@ JxlEncoderStatus JxlEncoderSetICCProfile(JxlEncoder* enc,
     jxl::SetIntensityTarget(&enc->metadata.m);
   }
 
-  if (!enc->basic_info.uses_original_profile) {
-    enc->metadata.m.color_encoding.DecideIfWantICC();
+  if (!enc->basic_info.uses_original_profile && enc->cms_set) {
+    enc->metadata.m.color_encoding.DecideIfWantICC(enc->cms);
   }
 
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 void JxlEncoderInitBasicInfo(JxlBasicInfo* info) {
@@ -667,9 +1159,9 @@ void JxlEncoderInitBasicInfo(JxlBasicInfo* info) {
 
 void JxlEncoderInitFrameHeader(JxlFrameHeader* frame_header) {
   // For each field, the default value of the specification is used. Depending
-  // on wheter an animation frame, or a composite still blending frame, is used,
-  // different fields have to be set up by the user after initing the frame
-  // header.
+  // on whether an animation frame, or a composite still blending frame,
+  // is used, different fields have to be set up by the user after initing
+  // the frame header.
   frame_header->duration = 0;
   frame_header->timecode = 0;
   frame_header->name_length = 0;
@@ -712,6 +1204,7 @@ JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
                                             info->exponent_bits_per_sample)) {
     return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid bit depth");
   }
+
   enc->metadata.m.bit_depth.bits_per_sample = info->bits_per_sample;
   enc->metadata.m.bit_depth.exponent_bits_per_sample =
       info->exponent_bits_per_sample;
@@ -720,6 +1213,18 @@ JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
   enc->metadata.m.modular_16_bit_buffer_sufficient =
       (!info->uses_original_profile || info->bits_per_sample <= 12) &&
       info->alpha_bits <= 12;
+  if ((info->intrinsic_xsize > 0 || info->intrinsic_ysize > 0) &&
+      (info->intrinsic_xsize != info->xsize ||
+       info->intrinsic_ysize != info->ysize)) {
+    if (info->intrinsic_xsize > (1ull << 30ull) ||
+        info->intrinsic_ysize > (1ull << 30ull) ||
+        !enc->metadata.m.intrinsic_size.Set(info->intrinsic_xsize,
+                                            info->intrinsic_ysize)) {
+      return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                           "Invalid intrinsic dimensions");
+    }
+    enc->metadata.m.have_intrinsic_size = true;
+  }
 
   // The number of extra channels includes the alpha channel, so for example and
   // RGBA with no other extra channels, has exactly num_extra_channels == 1
@@ -758,10 +1263,10 @@ JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
   if (info->intensity_target != 0) {
     enc->metadata.m.SetIntensityTarget(info->intensity_target);
     enc->intensity_target_set = true;
-  } else if (enc->color_encoding_set || enc->metadata.m.xyb_encoded) {
-    // If both conditions are false, JxlEncoderSetColorEncoding will be called
-    // later and we will get one more chance to call jxl::SetIntensityTarget,
-    // after the color encoding is indeed set.
+  } else if (enc->color_encoding_set) {
+    // If this is false, JxlEncoderSetColorEncoding will be called later and we
+    // will get one more chance to call jxl::SetIntensityTarget, after the color
+    // encoding is indeed set.
     jxl::SetIntensityTarget(&enc->metadata.m);
     enc->intensity_target_set = true;
   }
@@ -799,7 +1304,7 @@ JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
          std::to_string(enc->codestream_level) + " failed: " + level_message)
             .c_str());
   }
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 void JxlEncoderInitExtraChannelInfo(JxlExtraChannelType type,
@@ -817,6 +1322,55 @@ void JxlEncoderInitExtraChannelInfo(JxlExtraChannelType type,
   info->cfa_channel = 0;
 }
 
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetUpsamplingMode(JxlEncoder* enc,
+                                                        const int64_t factor,
+                                                        const int64_t mode) {
+  // for convenience, allow calling this with factor 1 and just make it a no-op
+  if (factor == 1) return JxlErrorOrStatus::Success();
+  if (factor != 2 && factor != 4 && factor != 8)
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid upsampling factor");
+  if (mode < -1)
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE, "Invalid upsampling mode");
+  if (mode > 1)
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                         "Unsupported upsampling mode");
+
+  const size_t count = (factor == 2 ? 15 : (factor == 4 ? 55 : 210));
+  auto& td = enc->metadata.transform_data;
+  float* weights = (factor == 2 ? td.upsampling2_weights
+                                : (factor == 4 ? td.upsampling4_weights
+                                               : td.upsampling8_weights));
+  if (mode == -1) {
+    // Default fancy upsampling: don't signal custom weights
+    enc->metadata.transform_data.custom_weights_mask &= ~(factor >> 1);
+  } else if (mode == 0) {
+    // Nearest neighbor upsampling
+    enc->metadata.transform_data.custom_weights_mask |= (factor >> 1);
+    memset(weights, 0, sizeof(float) * count);
+    if (factor == 2) {
+      weights[9] = 1.f;
+    } else if (factor == 4) {
+      for (int i : {19, 24, 49}) weights[i] = 1.f;
+    } else if (factor == 8) {
+      for (int i : {39, 44, 49, 54, 119, 124, 129, 174, 179, 204}) {
+        weights[i] = 1.f;
+      }
+    }
+  } else if (mode == 1) {
+    // 'Pixel dots' upsampling (nearest-neighbor with cut corners)
+    JxlEncoderSetUpsamplingMode(enc, factor, 0);
+    if (factor == 4) {
+      weights[19] = 0.f;
+      weights[24] = 0.5f;
+    } else if (factor == 8) {
+      for (int i : {39, 44, 49, 119}) weights[i] = 0.f;
+      for (int i : {54, 124}) weights[i] = 0.5f;
+    }
+  }
+  return JxlErrorOrStatus::Success();
+}
+
 JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo(
     JxlEncoder* enc, size_t index, const JxlExtraChannelInfo* info) {
   if (index >= enc->metadata.m.num_extra_channels) {
@@ -854,7 +1408,7 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelInfo(
          std::to_string(enc->codestream_level) + " failed: " + level_message)
             .c_str());
   }
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
@@ -867,7 +1421,7 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelName(JxlEncoder* enc,
   }
   enc->metadata.m.extra_channel_info[index].name =
       std::string(name, name + size);
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderFrameSettings* JxlEncoderFrameSettingsCreate(
@@ -882,63 +1436,72 @@ JxlEncoderFrameSettings* JxlEncoderFrameSettingsCreate(
     opts->values.lossless = false;
   }
   opts->values.cparams.level = enc->codestream_level;
+  opts->values.cparams.ec_distance.resize(enc->metadata.m.num_extra_channels,
+                                          -1);
+
   JxlEncoderFrameSettings* ret = opts.get();
   enc->encoder_options.emplace_back(std::move(opts));
   return ret;
 }
 
-JxlEncoderFrameSettings* JxlEncoderOptionsCreate(
-    JxlEncoder* enc, const JxlEncoderFrameSettings* source) {
-  // Deprecated function name, call the non-deprecated function
-  return JxlEncoderFrameSettingsCreate(enc, source);
-}
-
 JxlEncoderStatus JxlEncoderSetFrameLossless(
     JxlEncoderFrameSettings* frame_settings, const JXL_BOOL lossless) {
   if (lossless && frame_settings->enc->basic_info_set &&
       frame_settings->enc->metadata.m.xyb_encoded) {
-    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
-                         "Set use_original_profile=true for lossless encoding");
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Set uses_original_profile=true for lossless encoding");
   }
   frame_settings->values.lossless = lossless;
-  return JXL_ENC_SUCCESS;
-}
-
-JxlEncoderStatus JxlEncoderOptionsSetLossless(
-    JxlEncoderFrameSettings* frame_settings, JXL_BOOL lossless) {
-  // Deprecated function name, call the non-deprecated function
-  return JxlEncoderSetFrameLossless(frame_settings, lossless);
-}
-
-JxlEncoderStatus JxlEncoderOptionsSetEffort(
-    JxlEncoderFrameSettings* frame_settings, const int effort) {
-  return JxlEncoderFrameSettingsSetOption(frame_settings,
-                                          JXL_ENC_FRAME_SETTING_EFFORT, effort);
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderSetFrameDistance(
     JxlEncoderFrameSettings* frame_settings, float distance) {
   if (distance < 0.f || distance > 25.f) {
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
-                         "Distance has to be in [0.0..25.0]");
+                         "Distance has to be in [0.0..25.0] (corresponding to "
+                         "quality in [0.0..100.0])");
   }
   if (distance > 0.f && distance < 0.01f) {
     distance = 0.01f;
   }
   frame_settings->values.cparams.butteraugli_distance = distance;
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
-JxlEncoderStatus JxlEncoderOptionsSetDistance(
-    JxlEncoderFrameSettings* frame_settings, float distance) {
-  // Deprecated function name, call the non-deprecated function
-  return JxlEncoderSetFrameDistance(frame_settings, distance);
+JxlEncoderStatus JxlEncoderSetExtraChannelDistance(
+    JxlEncoderFrameSettings* frame_settings, size_t index, float distance) {
+  if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                         "Invalid value for the index of extra channel");
+  }
+  if (distance != -1.f && (distance < 0.f || distance > 25.f)) {
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Distance has to be -1 or in [0.0..25.0] (corresponding to "
+        "quality in [0.0..100.0])");
+  }
+  if (distance > 0.f && distance < 0.01f) {
+    distance = 0.01f;
+  }
+
+  if (index >= frame_settings->values.cparams.ec_distance.size()) {
+    // This can only happen if JxlEncoderFrameSettingsCreate() was called before
+    // JxlEncoderSetBasicInfo().
+    frame_settings->values.cparams.ec_distance.resize(
+        frame_settings->enc->metadata.m.num_extra_channels, -1);
+  }
+
+  frame_settings->values.cparams.ec_distance[index] = distance;
+  return JxlErrorOrStatus::Success();
 }
 
-JxlEncoderStatus JxlEncoderOptionsSetDecodingSpeed(
-    JxlEncoderFrameSettings* frame_settings, int tier) {
-  return JxlEncoderFrameSettingsSetOption(
-      frame_settings, JXL_ENC_FRAME_SETTING_DECODING_SPEED, tier);
+float JxlEncoderDistanceFromQuality(float quality) {
+  return quality >= 100.0 ? 0.0
+         : quality >= 30
+             ? 0.1 + (100 - quality) * 0.09
+             : 53.0 / 3000.0 * quality * quality - 23.0 / 20.0 * quality + 25.0;
 }
 
 JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
@@ -958,6 +1521,10 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
     case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
     case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
     case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF:
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP:
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF:
       if (value < -1 || value > 1) {
         return JXL_API_ERROR(
             frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -970,13 +1537,20 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
 
   switch (option) {
     case JXL_ENC_FRAME_SETTING_EFFORT:
-      if (value < 1 || value > 9) {
-        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
-                             "Encode effort has to be in [1..9]");
+      if (frame_settings->enc->allow_expert_options) {
+        if (value < 1 || value > 10) {
+          return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                               "Encode effort has to be in [1..10]");
+        }
+      } else {
+        if (value < 1 || value > 9) {
+          return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                               "Encode effort has to be in [1..9]");
+        }
       }
       frame_settings->values.cparams.speed_tier =
           static_cast<jxl::SpeedTier>(10 - value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT:
       if (value < -1 || value > 11) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -986,23 +1560,23 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
       frame_settings->values.cparams.brotli_effort = value;
       // set enc option for brotli use in brob boxes
       frame_settings->enc->brotli_effort = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_DECODING_SPEED:
       if (value < 0 || value > 4) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
                              "Decoding speed has to be in [0..4]");
       }
       frame_settings->values.cparams.decoding_speed_tier = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_RESAMPLING:
       if (value != -1 && value != 1 && value != 2 && value != 4 && value != 8) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                              "Resampling factor has to be 1, 2, 4 or 8");
       }
       frame_settings->values.cparams.resampling = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING:
-      // TOOD(lode): the jxl codestream allows choosing a different resampling
+      // TODO(lode): the jxl codestream allows choosing a different resampling
       // factor for each extra channel, independently per frame. Move this
       // option to a JxlEncoderFrameSettings-option that can be set per extra
       // channel, so needs its own function rather than
@@ -1013,74 +1587,74 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
                              "Resampling factor has to be 1, 2, 4 or 8");
       }
       frame_settings->values.cparams.ec_resampling = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED:
       if (value < 0 || value > 1) {
-        return JXL_ENC_ERROR;
+        return JxlErrorOrStatus::Error();
       }
       frame_settings->values.cparams.already_downsampled = (value == 1);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_NOISE:
       frame_settings->values.cparams.noise = static_cast<jxl::Override>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_DOTS:
       frame_settings->values.cparams.dots = static_cast<jxl::Override>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_PATCHES:
       frame_settings->values.cparams.patches =
           static_cast<jxl::Override>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_EPF:
       if (value < -1 || value > 3) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                              "EPF value has to be in [-1..3]");
       }
       frame_settings->values.cparams.epf = static_cast<int>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_GABORISH:
       frame_settings->values.cparams.gaborish =
           static_cast<jxl::Override>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_MODULAR:
       frame_settings->values.cparams.modular_mode = (value == 1);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE:
       frame_settings->values.cparams.keep_invisible =
           static_cast<jxl::Override>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_GROUP_ORDER:
       frame_settings->values.cparams.centerfirst = (value == 1);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X:
       if (value < -1) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                              "Center x coordinate has to be -1 or positive");
       }
       frame_settings->values.cparams.center_x = static_cast<size_t>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y:
       if (value < -1) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                              "Center y coordinate has to be -1 or positive");
       }
       frame_settings->values.cparams.center_y = static_cast<size_t>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_RESPONSIVE:
       frame_settings->values.cparams.responsive = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
       frame_settings->values.cparams.progressive_mode = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
       frame_settings->values.cparams.qprogressive_mode = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC:
       if (value < -1 || value > 2) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                              "Progressive DC has to be in [-1..2]");
       }
       frame_settings->values.cparams.progressive_dc = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_PALETTE_COLORS:
       if (value < -1 || value > 70913) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -1091,13 +1665,14 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
       } else {
         frame_settings->values.cparams.palette_colors = value;
       }
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_LOSSY_PALETTE:
       // TODO(lode): the defaults of some palette settings depend on others.
       // See the logic in cjxl. Similar for other settings. This should be
       // handled in the encoder during JxlEncoderProcessOutput (or,
       // alternatively, in the cjxl binary like now)
       frame_settings->values.cparams.lossy_palette = (value == 1);
+      break;
       return JXL_ENC_SUCCESS;
     case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM:
       if (value < -1 || value > 2) {
@@ -1111,29 +1686,21 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
         frame_settings->values.cparams.color_transform =
             static_cast<jxl::ColorTransform>(value);
       }
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE:
       if (value < -1 || value > 41) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                              "Option value has to be in [-1..41]");
       }
       frame_settings->values.cparams.colorspace = value;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE:
       if (value < -1 || value > 3) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                              "Option value has to be in [-1..3]");
       }
-      // TODO(lode): the default behavior of this parameter for cjxl is
-      // to choose 1 or 2 depending on the situation. This behavior needs to be
-      // implemented either in the C++ library by allowing to set this to -1, or
-      // kept in cjxl and set it to 1 or 2 using this API.
-      if (value == -1) {
-        frame_settings->values.cparams.modular_group_size_shift = 1;
-      } else {
-        frame_settings->values.cparams.modular_group_size_shift = value;
-      }
-      return JXL_ENC_SUCCESS;
+      frame_settings->values.cparams.modular_group_size_shift = value;
+      break;
     case JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR:
       if (value < -1 || value > 15) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -1141,7 +1708,7 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
       }
       frame_settings->values.cparams.options.predictor =
           static_cast<jxl::Predictor>(value);
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS:
       // The max allowed value can in theory be higher. However, it depends on
       // the effort setting. 11 is the highest safe value that doesn't cause
@@ -1157,25 +1724,49 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
       } else {
         frame_settings->values.cparams.options.max_properties = value;
       }
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL:
       if (value == -1) {
         frame_settings->values.cparams.force_cfl_jpeg_recompression = true;
       } else {
         frame_settings->values.cparams.force_cfl_jpeg_recompression = value;
       }
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_INDEX_BOX:
+      if (value < 0 || value > 1) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                             "Option value has to be 0 or 1");
+      }
       frame_settings->values.frame_index_box = true;
-      return JXL_ENC_SUCCESS;
+      break;
     case JXL_ENC_FRAME_SETTING_PHOTON_NOISE:
       return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
                            "Float option, try setting it with "
                            "JxlEncoderFrameSettingsSetFloatOption");
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+      frame_settings->values.cparams.jpeg_compress_boxes = value;
+      break;
+    case JXL_ENC_FRAME_SETTING_BUFFERING:
+      if (value < 0 || value > 3) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                             "Buffering has to be in [0..3]");
+      }
+      break;
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF:
+      frame_settings->values.cparams.jpeg_keep_exif = value;
+      break;
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP:
+      frame_settings->values.cparams.jpeg_keep_xmp = value;
+      break;
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF:
+      frame_settings->values.cparams.jpeg_keep_jumbf = value;
+      break;
+
     default:
       return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
                            "Unknown option");
   }
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
@@ -1187,7 +1778,7 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
       // TODO(lode): add encoder setting to set the 8 floating point values of
       // the noise synthesis parameters per frame for more fine grained control.
       frame_settings->values.cparams.photon_noise_iso = value;
-      return JXL_ENC_SUCCESS;
+      return JxlErrorOrStatus::Success();
     case JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT:
       if (value < -1.f || value > 100.f) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -1205,7 +1796,7 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
       } else {
         frame_settings->values.cparams.options.nb_repeats = value * 0.01f;
       }
-      return JXL_ENC_SUCCESS;
+      return JxlErrorOrStatus::Success();
     case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT:
       if (value < -1.f || value > 100.f) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -1218,7 +1809,7 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
         frame_settings->values.cparams.channel_colors_pre_transform_percent =
             value;
       }
-      return JXL_ENC_SUCCESS;
+      return JxlErrorOrStatus::Success();
     case JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT:
       if (value < -1.f || value > 100.f) {
         return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -1229,7 +1820,7 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
       } else {
         frame_settings->values.cparams.channel_colors_percent = value;
       }
-      return JXL_ENC_SUCCESS;
+      return JxlErrorOrStatus::Success();
     case JXL_ENC_FRAME_SETTING_EFFORT:
     case JXL_ENC_FRAME_SETTING_DECODING_SPEED:
     case JXL_ENC_FRAME_SETTING_RESAMPLING:
@@ -1260,6 +1851,11 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
     case JXL_ENC_FRAME_INDEX_BOX:
     case JXL_ENC_FRAME_SETTING_BROTLI_EFFORT:
     case JXL_ENC_FRAME_SETTING_FILL_ENUM:
+    case JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES:
+    case JXL_ENC_FRAME_SETTING_BUFFERING:
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF:
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP:
+    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF:
       return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
                            "Int option, try setting it with "
                            "JxlEncoderFrameSettingsSetOption");
@@ -1280,7 +1876,8 @@ JxlEncoder* JxlEncoderCreate(const JxlMemoryManager* memory_manager) {
   JxlEncoder* enc = new (alloc) JxlEncoder();
   enc->memory_manager = local_memory_manager;
   // TODO(sboukortt): add an API function to set this.
-  enc->cms = jxl::GetJxlCms();
+  enc->cms = *JxlGetDefaultCms();
+  enc->cms_set = true;
 
   // Initialize all the field values.
   JxlEncoderReset(enc);
@@ -1294,7 +1891,6 @@ void JxlEncoderReset(JxlEncoder* enc) {
   enc->num_queued_frames = 0;
   enc->num_queued_boxes = 0;
   enc->encoder_options.clear();
-  enc->output_byte_queue.clear();
   enc->codestream_bytes_written_beginning_of_frame = 0;
   enc->codestream_bytes_written_end_of_frame = 0;
   enc->wrote_bytes = false;
@@ -1309,6 +1905,7 @@ void JxlEncoderReset(JxlEncoder* enc) {
   enc->use_container = false;
   enc->use_boxes = false;
   enc->codestream_level = -1;
+  enc->output_processor = JxlEncoderOutputProcessorWrapper();
   JxlEncoderInitBasicInfo(&enc->basic_info);
 }
 
@@ -1330,7 +1927,7 @@ JxlEncoderStatus JxlEncoderUseContainer(JxlEncoder* enc,
                          "this setting can only be set at the beginning");
   }
   enc->use_container = static_cast<bool>(use_container);
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderStoreJPEGMetadata(JxlEncoder* enc,
@@ -1340,7 +1937,7 @@ JxlEncoderStatus JxlEncoderStoreJPEGMetadata(JxlEncoder* enc,
                          "this setting can only be set at the beginning");
   }
   enc->store_jpeg_metadata = static_cast<bool>(store_jpeg_metadata);
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc, int level) {
@@ -1352,7 +1949,7 @@ JxlEncoderStatus JxlEncoderSetCodestreamLevel(JxlEncoder* enc, int level) {
                          "this setting can only be set at the beginning");
   }
   enc->codestream_level = level;
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 int JxlEncoderGetRequiredCodestreamLevel(const JxlEncoder* enc) {
@@ -1362,6 +1959,7 @@ int JxlEncoderGetRequiredCodestreamLevel(const JxlEncoder* enc) {
 void JxlEncoderSetCms(JxlEncoder* enc, JxlCmsInterface cms) {
   jxl::msan::MemoryIsInitialized(&cms, sizeof(cms));
   enc->cms = cms;
+  enc->cms_set = true;
 }
 
 JxlEncoderStatus JxlEncoderSetParallelRunner(JxlEncoder* enc,
@@ -1377,7 +1975,7 @@ JxlEncoderStatus JxlEncoderSetParallelRunner(JxlEncoder* enc,
     return JXL_API_ERROR(enc, JXL_ENC_ERR_GENERIC,
                          "error setting parallel runner");
   }
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 namespace {
@@ -1399,7 +1997,7 @@ JxlEncoderStatus GetCurrentDimensions(
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                          "zero-sized frame is not allowed");
   }
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 }  // namespace
 
@@ -1412,18 +2010,14 @@ JxlEncoderStatus JxlEncoderAddJPEGFrame(
   }
 
   jxl::CodecInOut io;
-  if (!jxl::jpeg::DecodeImageJPG(jxl::Span<const uint8_t>(buffer, size), &io)) {
+  if (!jxl::jpeg::DecodeImageJPG(jxl::Bytes(buffer, size), &io)) {
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT,
                          "Error during decode of input JPEG");
   }
 
   if (!frame_settings->enc->color_encoding_set) {
-    if (!SetColorEncodingFromJpegData(
-            *io.Main().jpeg_data,
-            &frame_settings->enc->metadata.m.color_encoding)) {
-      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_BAD_INPUT,
-                           "Error in input JPEG color space");
-    }
+    SetColorEncodingFromJpegData(
+        *io.Main().jpeg_data, &frame_settings->enc->metadata.m.color_encoding);
   }
 
   if (!frame_settings->enc->basic_info_set) {
@@ -1448,7 +2042,8 @@ JxlEncoderStatus JxlEncoderAddJPEGFrame(
         frame_settings->enc->metadata.m.orientation);
     jxl::InterpretExif(io.blobs.exif, &orientation);
     frame_settings->enc->metadata.m.orientation = orientation;
-
+  }
+  if (!io.blobs.exif.empty() && frame_settings->values.cparams.jpeg_keep_exif) {
     size_t exif_size = io.blobs.exif.size();
     // Exif data in JPEG is limited to 64k
     if (exif_size > 0xFFFF) {
@@ -1460,29 +2055,37 @@ JxlEncoderStatus JxlEncoderAddJPEGFrame(
     memcpy(exif.data() + 4, io.blobs.exif.data(), io.blobs.exif.size());
     JxlEncoderUseBoxes(frame_settings->enc);
     JxlEncoderAddBox(frame_settings->enc, "Exif", exif.data(), exif_size,
-                     /*compress_box=*/JXL_TRUE);
+                     frame_settings->values.cparams.jpeg_compress_boxes);
   }
-  if (!io.blobs.xmp.empty()) {
+  if (!io.blobs.xmp.empty() && frame_settings->values.cparams.jpeg_keep_xmp) {
     JxlEncoderUseBoxes(frame_settings->enc);
     JxlEncoderAddBox(frame_settings->enc, "xml ", io.blobs.xmp.data(),
-                     io.blobs.xmp.size(), /*compress_box=*/JXL_TRUE);
+                     io.blobs.xmp.size(),
+                     frame_settings->values.cparams.jpeg_compress_boxes);
   }
-  if (!io.blobs.jumbf.empty()) {
+  if (!io.blobs.jumbf.empty() &&
+      frame_settings->values.cparams.jpeg_keep_jumbf) {
     JxlEncoderUseBoxes(frame_settings->enc);
     JxlEncoderAddBox(frame_settings->enc, "jumb", io.blobs.jumbf.data(),
-                     io.blobs.jumbf.size(), /*compress_box=*/JXL_TRUE);
+                     io.blobs.jumbf.size(),
+                     frame_settings->values.cparams.jpeg_compress_boxes);
   }
   if (frame_settings->enc->store_jpeg_metadata) {
+    if (!frame_settings->values.cparams.jpeg_keep_exif ||
+        !frame_settings->values.cparams.jpeg_keep_xmp) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "Need to preserve EXIF and XMP to allow JPEG "
+                           "bitstream reconstruction");
+    }
     jxl::jpeg::JPEGData data_in = *io.Main().jpeg_data;
-    jxl::PaddedBytes jpeg_data;
+    std::vector<uint8_t> jpeg_data;
     if (!jxl::jpeg::EncodeJPEGData(data_in, &jpeg_data,
                                    frame_settings->values.cparams)) {
       return JXL_API_ERROR(
           frame_settings->enc, JXL_ENC_ERR_JBRD,
           "JPEG bitstream reconstruction data cannot be encoded");
     }
-    frame_settings->enc->jpeg_metadata = std::vector<uint8_t>(
-        jpeg_data.data(), jpeg_data.data() + jpeg_data.size());
+    frame_settings->enc->jpeg_metadata = jpeg_data;
   }
 
   auto queued_frame = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedFrame>(
@@ -1522,18 +2125,77 @@ JxlEncoderStatus JxlEncoderAddJPEGFrame(
   queued_frame->frame.chroma_subsampling = io.Main().chroma_subsampling;
 
   QueueFrame(frame_settings, queued_frame);
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
+}
+
+static bool CanDoFastLossless(const JxlEncoderFrameSettings* frame_settings,
+                              const JxlPixelFormat* pixel_format,
+                              bool has_alpha) {
+  if (!frame_settings->values.lossless) {
+    return false;
+  }
+  // TODO(veluca): many of the following options could be made to work, but are
+  // just not implemented in FJXL's frame header handling yet.
+  if (frame_settings->values.frame_index_box) {
+    return false;
+  }
+  if (frame_settings->values.header.layer_info.have_crop) {
+    return false;
+  }
+  if (frame_settings->enc->metadata.m.have_animation) {
+    return false;
+  }
+  if (frame_settings->values.cparams.speed_tier != jxl::SpeedTier::kLightning) {
+    return false;
+  }
+  if (frame_settings->values.image_bit_depth.type ==
+          JxlBitDepthType::JXL_BIT_DEPTH_CUSTOM &&
+      frame_settings->values.image_bit_depth.bits_per_sample !=
+          frame_settings->enc->metadata.m.bit_depth.bits_per_sample) {
+    return false;
+  }
+  // TODO(veluca): implement support for LSB-padded input in fast_lossless.
+  if (frame_settings->values.image_bit_depth.type ==
+          JxlBitDepthType::JXL_BIT_DEPTH_FROM_PIXEL_FORMAT &&
+      frame_settings->values.image_bit_depth.bits_per_sample % 8 != 0) {
+    return false;
+  }
+  if (!frame_settings->values.frame_name.empty()) {
+    return false;
+  }
+  // No extra channels other than alpha.
+  if (!(has_alpha && frame_settings->enc->metadata.m.num_extra_channels == 1) &&
+      frame_settings->enc->metadata.m.num_extra_channels != 0) {
+    return false;
+  }
+  if (frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 16) {
+    return false;
+  }
+  if (pixel_format->data_type != JxlDataType::JXL_TYPE_FLOAT16 &&
+      pixel_format->data_type != JxlDataType::JXL_TYPE_UINT16 &&
+      pixel_format->data_type != JxlDataType::JXL_TYPE_UINT8) {
+    return false;
+  }
+  if ((frame_settings->enc->metadata.m.bit_depth.bits_per_sample > 8) !=
+      (pixel_format->data_type == JxlDataType::JXL_TYPE_UINT16 ||
+       pixel_format->data_type == JxlDataType::JXL_TYPE_FLOAT16)) {
+    return false;
+  }
+  if (!((pixel_format->num_channels == 1 || pixel_format->num_channels == 3) &&
+        !has_alpha) &&
+      !((pixel_format->num_channels == 2 || pixel_format->num_channels == 4) &&
+        has_alpha)) {
+    return false;
+  }
+
+  return true;
 }
 
 JxlEncoderStatus JxlEncoderAddImageFrame(
     const JxlEncoderFrameSettings* frame_settings,
     const JxlPixelFormat* pixel_format, const void* buffer, size_t size) {
-  if (!frame_settings->enc->basic_info_set ||
-      (!frame_settings->enc->color_encoding_set &&
-       !frame_settings->enc->metadata.m.xyb_encoded)) {
-    // Basic Info must be set, and color encoding must be set directly,
-    // or set to XYB via JxlBasicInfo.uses_original_profile = JXL_FALSE
-    // Otherwise, this is an API misuse.
+  if (!frame_settings->enc->basic_info_set) {
+    // Basic Info must be set. Otherwise, this is an API misuse.
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                          "Basic info or color encoding not set yet");
   }
@@ -1554,6 +2216,52 @@ JxlEncoderStatus JxlEncoderAddImageFrame(
     }
   }
 
+  bool has_alpha = frame_settings->enc->metadata.m.HasAlpha();
+
+  size_t xsize, ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+
+  // All required conditions to do fast-lossless.
+  if (CanDoFastLossless(frame_settings, pixel_format, has_alpha)) {
+    const size_t bytes_per_pixel =
+        pixel_format->data_type == JxlDataType::JXL_TYPE_UINT8
+            ? pixel_format->num_channels
+            : pixel_format->num_channels * 2;
+    const size_t last_row_size = xsize * bytes_per_pixel;
+    const size_t align = pixel_format->align;
+    const size_t row_size =
+        (align > 1 ? jxl::DivCeil(last_row_size, align) * align
+                   : last_row_size);
+    const size_t bytes_to_read = row_size * (ysize - 1) + last_row_size;
+    if (bytes_to_read > size) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+                           "provided image buffer too small");
+    }
+    const bool big_endian =
+        pixel_format->endianness == JXL_BIG_ENDIAN ||
+        (pixel_format->endianness == JXL_NATIVE_ENDIAN && !IsLittleEndian());
+
+    auto runner = +[](void* void_pool, void* opaque, void fun(void*, size_t),
+                      size_t count) {
+      auto* pool = reinterpret_cast<jxl::ThreadPool*>(void_pool);
+      JXL_CHECK(jxl::RunOnPool(
+          pool, 0, count, jxl::ThreadPool::NoInit,
+          [&](size_t i, size_t) { fun(opaque, i); }, "Encode fast lossless"));
+    };
+    QueueFastLosslessFrame(
+        frame_settings,
+        JxlFastLosslessPrepareFrame(
+            reinterpret_cast<const unsigned char*>(buffer), xsize, row_size,
+            ysize, pixel_format->num_channels,
+            frame_settings->enc->metadata.m.bit_depth.bits_per_sample,
+            big_endian, /*effort=*/2, frame_settings->enc->thread_pool.get(),
+            runner));
+    return JxlErrorOrStatus::Success();
+  }
+
   auto queued_frame = jxl::MemoryManagerMakeUnique<jxl::JxlEncoderQueuedFrame>(
       &frame_settings->enc->memory_manager,
       // JxlEncoderQueuedFrame is a struct with no constructors, so we use the
@@ -1590,11 +2298,6 @@ JxlEncoderStatus JxlEncoderAddImageFrame(
         frame_settings->enc, JXL_ENC_ERR_API_USAGE,
         "number of extra channels mismatch (need 1 extra channel for alpha)");
   }
-  size_t xsize, ysize;
-  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
-    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
-                         "bad dimensions");
-  }
   std::vector<jxl::ImageF> extra_channels(
       frame_settings->enc->metadata.m.num_extra_channels);
   for (auto& extra_channel : extra_channels) {
@@ -1623,22 +2326,116 @@ JxlEncoderStatus JxlEncoderAddImageFrame(
   queued_frame->frame.blend =
       frame_settings->values.header.layer_info.blend_info.source > 0;
 
-  if (!jxl::BufferToImageBundle(*pixel_format, xsize, ysize, buffer, size,
+  if (JXL_ENC_SUCCESS !=
+      VerifyInputBitDepth(frame_settings->values.image_bit_depth,
+                          *pixel_format)) {
+    return JXL_API_ERROR_NOSET("Invalid input bit depth");
+  }
+  size_t bits_per_sample =
+      GetBitDepth(frame_settings->values.image_bit_depth,
+                  frame_settings->enc->metadata.m, *pixel_format);
+  const uint8_t* uint8_buffer = reinterpret_cast<const uint8_t*>(buffer);
+  if (!jxl::ConvertFromExternal(jxl::Bytes(uint8_buffer, size), xsize, ysize,
+                                c_current, bits_per_sample, *pixel_format,
                                 frame_settings->enc->thread_pool.get(),
-                                c_current, &(queued_frame->frame))) {
+                                &(queued_frame->frame))) {
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                          "Invalid input buffer");
   }
   if (frame_settings->values.lossless &&
       frame_settings->enc->metadata.m.xyb_encoded) {
-    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
-                         "Set use_original_profile=true for lossless encoding");
+    return JXL_API_ERROR(
+        frame_settings->enc, JXL_ENC_ERR_API_USAGE,
+        "Set uses_original_profile=true for lossless encoding");
   }
   queued_frame->option_values.cparams.level =
       frame_settings->enc->codestream_level;
 
   QueueFrame(frame_settings, queued_frame);
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
+}
+
+JxlEncoderStatus JxlEncoderAddChunkedFrame(
+    const JxlEncoderFrameSettings* frame_settings, JXL_BOOL is_last_frame,
+    JxlChunkedFrameInputSource chunked_frame_input) {
+  size_t xsize;
+  size_t ysize;
+  if (GetCurrentDimensions(frame_settings, xsize, ysize) != JXL_ENC_SUCCESS) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "bad dimensions");
+  }
+  // TODO(veluca): implement this without immediately buffering the whole frame.
+  // In the next line,`color_pixel_format` gets overwritten
+  JxlPixelFormat color_pixel_format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+  chunked_frame_input.get_color_channels_pixel_format(
+      chunked_frame_input.opaque, &color_pixel_format);
+  size_t bytes_per_pixel = color_pixel_format.num_channels *
+                           BitsPerChannel(color_pixel_format.data_type) / 8;
+  std::vector<uint8_t> color_data(bytes_per_pixel * xsize * ysize);
+  size_t row_offset = 0;
+  const void* buffer = chunked_frame_input.get_color_channel_data_at(
+      chunked_frame_input.opaque, 0, 0, xsize, ysize, &row_offset);
+  if (!buffer) {
+    return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                         "no buffer for color channels given");
+  }
+  auto stride = xsize * bytes_per_pixel;
+  for (size_t y = 0; y < ysize; y++) {
+    memcpy(color_data.data() + y * stride,
+           static_cast<const uint8_t*>(buffer) + y * row_offset,
+           bytes_per_pixel * xsize);
+  }
+  chunked_frame_input.release_buffer(chunked_frame_input.opaque, buffer);
+
+  auto status = JxlEncoderAddImageFrame(frame_settings, &color_pixel_format,
+                                        color_data.data(), color_data.size());
+  if (status != JXL_ENC_SUCCESS) return status;
+  bool already_have_alpha = color_pixel_format.num_channels == 2 ||
+                            color_pixel_format.num_channels == 4;
+  for (size_t ec = 0; ec < frame_settings->enc->metadata.m.num_extra_channels;
+       ec++) {
+    if (frame_settings->enc->metadata.m.extra_channel_info[ec].type ==
+        jxl::ExtraChannel::kAlpha) {
+      if (already_have_alpha) {
+        // Skip this alpha channel, but still request additional alpha channels
+        // if they exist.
+        already_have_alpha = false;
+        continue;
+      }
+    }
+
+    // In the next line,`pixel_format` gets overwritten
+    JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0};
+    chunked_frame_input.get_extra_channel_pixel_format(
+        chunked_frame_input.opaque, ec, &pixel_format);
+    size_t bytes_per_pixel =
+        pixel_format.num_channels * BitsPerChannel(pixel_format.data_type) / 8;
+    std::vector<uint8_t> data(bytes_per_pixel * xsize * ysize);
+    auto stride = xsize * bytes_per_pixel;
+    const void* buffer = chunked_frame_input.get_extra_channel_data_at(
+        chunked_frame_input.opaque, ec, 0, 0, xsize, ysize, &row_offset);
+    if (!buffer) {
+      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
+                           "no buffer for extra channel given");
+    }
+    for (size_t y = 0; y < ysize; y++) {
+      memcpy(data.data() + y * stride,
+             static_cast<const uint8_t*>(buffer) + y * row_offset,
+             bytes_per_pixel * xsize);
+    }
+    chunked_frame_input.release_buffer(chunked_frame_input.opaque, buffer);
+    auto status = JxlEncoderSetExtraChannelBuffer(frame_settings, &pixel_format,
+                                                  data.data(), data.size(), ec);
+    if (status != JXL_ENC_SUCCESS) return status;
+  }
+
+  if (is_last_frame) {
+    JxlEncoderCloseInput(frame_settings->enc);
+  }
+  if (frame_settings->enc->output_processor.OutputProcessorSet()) {
+    return JxlEncoderFlushInput(frame_settings->enc);
+  }
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc) {
@@ -1647,7 +2444,7 @@ JxlEncoderStatus JxlEncoderUseBoxes(JxlEncoder* enc) {
                          "this setting can only be set at the beginning");
   }
   enc->use_boxes = true;
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, const JxlBoxType type,
@@ -1658,6 +2455,10 @@ JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, const JxlBoxType type,
         enc, JXL_ENC_ERR_API_USAGE,
         "must set JxlEncoderUseBoxes at the beginning to add boxes");
   }
+  if (enc->boxes_closed) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Box input already closed");
+  }
   if (compress_box) {
     if (memcmp("jxl", type, 3) == 0) {
       return JXL_API_ERROR(
@@ -1684,12 +2485,13 @@ JxlEncoderStatus JxlEncoderAddBox(JxlEncoder* enc, const JxlBoxType type,
   box->contents.assign(contents, contents + size);
   box->compress_box = !!compress_box;
   QueueBox(enc, box);
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
-    const JxlEncoderOptions* frame_settings, const JxlPixelFormat* pixel_format,
-    const void* buffer, size_t size, uint32_t index) {
+    const JxlEncoderFrameSettings* frame_settings,
+    const JxlPixelFormat* pixel_format, const void* buffer, size_t size,
+    uint32_t index) {
   if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                          "Invalid value for the index of extra channel");
@@ -1712,16 +2514,27 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetExtraChannelBuffer(
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_GENERIC,
                          "bad dimensions");
   }
-  if (!jxl::BufferToImageF(*pixel_format, xsize, ysize, buffer, size,
-                           frame_settings->enc->thread_pool.get(),
-                           &frame_settings->enc->input_queue.back()
-                                .frame->frame.extra_channels()[index])) {
+  JxlPixelFormat ec_format = *pixel_format;
+  ec_format.num_channels = 1;
+  if (JXL_ENC_SUCCESS !=
+      VerifyInputBitDepth(frame_settings->values.image_bit_depth, ec_format)) {
+    return JXL_API_ERROR_NOSET("Invalid input bit depth");
+  }
+  size_t bits_per_sample = GetBitDepth(
+      frame_settings->values.image_bit_depth,
+      frame_settings->enc->metadata.m.extra_channel_info[index], ec_format);
+  const uint8_t* uint8_buffer = reinterpret_cast<const uint8_t*>(buffer);
+  auto queued_frame = frame_settings->enc->input_queue.back().frame.get();
+  if (!jxl::ConvertFromExternal(jxl::Bytes(uint8_buffer, size), xsize, ysize,
+                                bits_per_sample, ec_format, 0,
+                                frame_settings->enc->thread_pool.get(),
+                                &queued_frame->frame.extra_channels()[index])) {
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                          "Failed to set buffer for extra channel");
   }
-  frame_settings->enc->input_queue.back().frame->ec_initialized[index] = 1;
+  queued_frame->ec_initialized[index] = 1;
 
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 void JxlEncoderCloseFrames(JxlEncoder* enc) { enc->frames_closed = true; }
@@ -1732,32 +2545,59 @@ void JxlEncoderCloseInput(JxlEncoder* enc) {
   JxlEncoderCloseFrames(enc);
   JxlEncoderCloseBoxes(enc);
 }
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderFlushInput(JxlEncoder* enc) {
+  if (!enc->output_processor.OutputProcessorSet()) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Cannot flush input without setting output "
+                         "processor with JxlEncoderSetOutputProcessor");
+  }
+  while (!enc->input_queue.empty()) {
+    if (!enc->ProcessOneEnqueuedInput()) {
+      return JxlErrorOrStatus::Error();
+    }
+  }
+  return JxlErrorOrStatus::Success();
+}
+
+JXL_EXPORT JxlEncoderStatus JxlEncoderSetOutputProcessor(
+    JxlEncoder* enc, JxlEncoderOutputProcessor output_processor) {
+  if (enc->output_processor.HasAvailOut()) {
+    return JXL_API_ERROR(
+        enc, JXL_ENC_ERR_API_USAGE,
+        "Cannot set an output processor when some output was already produced");
+  }
+  if (!output_processor.set_finalized_position ||
+      !output_processor.get_buffer || !output_processor.release_buffer) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Missing output processor functions");
+  }
+  enc->output_processor = JxlEncoderOutputProcessorWrapper(output_processor);
+  return JxlErrorOrStatus::Success();
+}
+
 JxlEncoderStatus JxlEncoderProcessOutput(JxlEncoder* enc, uint8_t** next_out,
                                          size_t* avail_out) {
-  while (*avail_out > 0 &&
-         (!enc->output_byte_queue.empty() || !enc->input_queue.empty())) {
-    if (!enc->output_byte_queue.empty()) {
-      size_t to_copy = std::min(*avail_out, enc->output_byte_queue.size());
-      std::copy_n(enc->output_byte_queue.begin(), to_copy, *next_out);
-      *next_out += to_copy;
-      *avail_out -= to_copy;
-      enc->output_byte_queue.erase(enc->output_byte_queue.begin(),
-                                   enc->output_byte_queue.begin() + to_copy);
-    } else if (!enc->input_queue.empty()) {
-      if (enc->RefillOutputByteQueue() != JXL_ENC_SUCCESS) {
-        return JXL_ENC_ERROR;
-      }
+  if (!enc->output_processor.SetAvailOut(next_out, avail_out)) {
+    return JXL_API_ERROR(enc, JXL_ENC_ERR_API_USAGE,
+                         "Cannot call JxlEncoderProcessOutput after calling "
+                         "JxlEncoderSetOutputProcessor");
+  }
+  while (*avail_out != 0 && !enc->input_queue.empty()) {
+    if (!enc->ProcessOneEnqueuedInput()) {
+      return JxlErrorOrStatus::Error();
     }
   }
 
-  if (!enc->output_byte_queue.empty() || !enc->input_queue.empty()) {
-    return JXL_ENC_NEED_MORE_OUTPUT;
+  if (!enc->input_queue.empty() || enc->output_processor.HasOutputToWrite()) {
+    return JxlErrorOrStatus::MoreOutput();
   }
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
-JxlEncoderStatus JxlEncoderSetFrameHeader(JxlEncoderOptions* frame_settings,
-                                          const JxlFrameHeader* frame_header) {
+JxlEncoderStatus JxlEncoderSetFrameHeader(
+    JxlEncoderFrameSettings* frame_settings,
+    const JxlFrameHeader* frame_header) {
   if (frame_header->layer_info.blend_info.source > 3) {
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
                          "invalid blending source index");
@@ -1775,11 +2615,11 @@ JxlEncoderStatus JxlEncoderSetFrameHeader(JxlEncoderOptions* frame_settings,
   // JxlEncoderSetFrameName if desired.
   frame_settings->values.frame_name = "";
 
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo(
-    JxlEncoderOptions* frame_settings, size_t index,
+    JxlEncoderFrameSettings* frame_settings, size_t index,
     const JxlBlendInfo* blend_info) {
   if (index >= frame_settings->enc->metadata.m.num_extra_channels) {
     return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@@ -1794,7 +2634,7 @@ JxlEncoderStatus JxlEncoderSetExtraChannelBlendInfo(
         frame_settings->enc->metadata.m.num_extra_channels, default_blend_info);
   }
   frame_settings->values.extra_channel_blend_info[index] = *blend_info;
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
 }
 
 JxlEncoderStatus JxlEncoderSetFrameName(JxlEncoderFrameSettings* frame_settings,
@@ -1806,17 +2646,121 @@ JxlEncoderStatus JxlEncoderSetFrameName(JxlEncoderFrameSettings* frame_settings,
   }
   frame_settings->values.frame_name = str;
   frame_settings->values.header.name_length = str.size();
-  return JXL_ENC_SUCCESS;
+  return JxlErrorOrStatus::Success();
+}
+
+JxlEncoderStatus JxlEncoderSetFrameBitDepth(
+    JxlEncoderFrameSettings* frame_settings, const JxlBitDepth* bit_depth) {
+  if (bit_depth->type != JXL_BIT_DEPTH_FROM_PIXEL_FORMAT &&
+      bit_depth->type != JXL_BIT_DEPTH_FROM_CODESTREAM) {
+    return JXL_API_ERROR_NOSET(
+        "Only JXL_BIT_DEPTH_FROM_PIXEL_FORMAT and "
+        "JXL_BIT_DEPTH_FROM_CODESTREAM is implemented "
+        "for input buffers.");
+  }
+  frame_settings->values.image_bit_depth = *bit_depth;
+  return JxlErrorOrStatus::Success();
 }
 
 void JxlColorEncodingSetToSRGB(JxlColorEncoding* color_encoding,
                                JXL_BOOL is_gray) {
-  ConvertInternalToExternalColorEncoding(jxl::ColorEncoding::SRGB(is_gray),
-                                         color_encoding);
+  *color_encoding = jxl::ColorEncoding::SRGB(is_gray).ToExternal();
 }
 
 void JxlColorEncodingSetToLinearSRGB(JxlColorEncoding* color_encoding,
                                      JXL_BOOL is_gray) {
-  ConvertInternalToExternalColorEncoding(
-      jxl::ColorEncoding::LinearSRGB(is_gray), color_encoding);
+  *color_encoding = jxl::ColorEncoding::LinearSRGB(is_gray).ToExternal();
+}
+
+void JxlEncoderAllowExpertOptions(JxlEncoder* enc) {
+  enc->allow_expert_options = true;
+}
+
+JXL_EXPORT void JxlEncoderSetDebugImageCallback(
+    JxlEncoderFrameSettings* frame_settings, JxlDebugImageCallback callback,
+    void* opaque) {
+  frame_settings->values.cparams.debug_image = callback;
+  frame_settings->values.cparams.debug_image_opaque = opaque;
+}
+
+JXL_EXPORT JxlEncoderStats* JxlEncoderStatsCreate() {
+  return new JxlEncoderStats();
+}
+
+JXL_EXPORT void JxlEncoderStatsDestroy(JxlEncoderStats* stats) {
+  if (stats) delete stats;
+}
+
+JXL_EXPORT void JxlEncoderCollectStats(JxlEncoderFrameSettings* frame_settings,
+                                       JxlEncoderStats* stats) {
+  if (!stats) return;
+  frame_settings->values.aux_out = &stats->aux_out;
+}
+
+JXL_EXPORT size_t JxlEncoderStatsGet(const JxlEncoderStats* stats,
+                                     JxlEncoderStatsKey key) {
+  if (!stats) return 0;
+  const jxl::AuxOut& aux_out = stats->aux_out;
+  switch (key) {
+    case JXL_ENC_STAT_HEADER_BITS:
+      return aux_out.layers[jxl::kLayerHeader].total_bits;
+    case JXL_ENC_STAT_TOC_BITS:
+      return aux_out.layers[jxl::kLayerTOC].total_bits;
+    case JXL_ENC_STAT_DICTIONARY_BITS:
+      return aux_out.layers[jxl::kLayerDictionary].total_bits;
+    case JXL_ENC_STAT_SPLINES_BITS:
+      return aux_out.layers[jxl::kLayerSplines].total_bits;
+    case JXL_ENC_STAT_NOISE_BITS:
+      return aux_out.layers[jxl::kLayerNoise].total_bits;
+    case JXL_ENC_STAT_QUANT_BITS:
+      return aux_out.layers[jxl::kLayerQuant].total_bits;
+    case JXL_ENC_STAT_MODULAR_TREE_BITS:
+      return aux_out.layers[jxl::kLayerModularTree].total_bits;
+    case JXL_ENC_STAT_MODULAR_GLOBAL_BITS:
+      return aux_out.layers[jxl::kLayerModularGlobal].total_bits;
+    case JXL_ENC_STAT_DC_BITS:
+      return aux_out.layers[jxl::kLayerDC].total_bits;
+    case JXL_ENC_STAT_MODULAR_DC_GROUP_BITS:
+      return aux_out.layers[jxl::kLayerModularDcGroup].total_bits;
+    case JXL_ENC_STAT_CONTROL_FIELDS_BITS:
+      return aux_out.layers[jxl::kLayerControlFields].total_bits;
+    case JXL_ENC_STAT_COEF_ORDER_BITS:
+      return aux_out.layers[jxl::kLayerOrder].total_bits;
+    case JXL_ENC_STAT_AC_HISTOGRAM_BITS:
+      return aux_out.layers[jxl::kLayerAC].total_bits;
+    case JXL_ENC_STAT_AC_BITS:
+      return aux_out.layers[jxl::kLayerACTokens].total_bits;
+    case JXL_ENC_STAT_MODULAR_AC_GROUP_BITS:
+      return aux_out.layers[jxl::kLayerModularAcGroup].total_bits;
+    case JXL_ENC_STAT_NUM_SMALL_BLOCKS:
+      return aux_out.num_small_blocks;
+    case JXL_ENC_STAT_NUM_DCT4X8_BLOCKS:
+      return aux_out.num_dct4x8_blocks;
+    case JXL_ENC_STAT_NUM_AFV_BLOCKS:
+      return aux_out.num_afv_blocks;
+    case JXL_ENC_STAT_NUM_DCT8_BLOCKS:
+      return aux_out.num_dct8_blocks;
+    case JXL_ENC_STAT_NUM_DCT8X32_BLOCKS:
+      return aux_out.num_dct16_blocks;
+    case JXL_ENC_STAT_NUM_DCT16_BLOCKS:
+      return aux_out.num_dct16x32_blocks;
+    case JXL_ENC_STAT_NUM_DCT16X32_BLOCKS:
+      return aux_out.num_dct32_blocks;
+    case JXL_ENC_STAT_NUM_DCT32_BLOCKS:
+      return aux_out.num_dct32x64_blocks;
+    case JXL_ENC_STAT_NUM_DCT32X64_BLOCKS:
+      return aux_out.num_dct32x64_blocks;
+    case JXL_ENC_STAT_NUM_DCT64_BLOCKS:
+      return aux_out.num_dct64_blocks;
+    case JXL_ENC_STAT_NUM_BUTTERAUGLI_ITERS:
+      return aux_out.num_butteraugli_iters;
+    default:
+      return 0;
+  }
+}
+
+JXL_EXPORT void JxlEncoderStatsMerge(JxlEncoderStats* stats,
+                                     const JxlEncoderStats* other) {
+  if (!stats || !other) return;
+  stats->aux_out.Assimilate(other->aux_out);
 }
index 9f82546..7fb4fd1 100644 (file)
@@ -7,16 +7,26 @@
 #ifndef LIB_JXL_ENCODE_INTERNAL_H_
 #define LIB_JXL_ENCODE_INTERNAL_H_
 
-#include <deque>
+#include <jxl/encode.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/types.h>
+#include <sys/types.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
 #include <vector>
 
-#include "jxl/encode.h"
-#include "jxl/memory_manager.h"
-#include "jxl/parallel_runner.h"
-#include "jxl/types.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_fast_lossless.h"
 #include "lib/jxl/enc_frame.h"
 #include "lib/jxl/memory_manager_internal.h"
+#include "lib/jxl/padded_bytes.h"
 
 namespace jxl {
 
@@ -108,7 +118,9 @@ typedef struct JxlEncoderFrameSettingsValuesStruct {
   JxlFrameHeader header;
   std::vector<JxlBlendInfo> extra_channel_blend_info;
   std::string frame_name;
+  JxlBitDepth image_bit_depth;
   bool frame_index_box = false;
+  jxl::AuxOut* aux_out = nullptr;
 } JxlEncoderFrameSettingsValues;
 
 typedef std::array<uint8_t, 4> BoxType;
@@ -121,12 +133,13 @@ constexpr BoxType MakeBoxType(const char* type) {
         static_cast<uint8_t>(type[2]), static_cast<uint8_t>(type[3])}});
 }
 
-constexpr unsigned char kContainerHeader[] = {
+constexpr std::array<unsigned char, 32> kContainerHeader = {
     0,   0,   0, 0xc, 'J',  'X', 'L', ' ', 0xd, 0xa, 0x87,
     0xa, 0,   0, 0,   0x14, 'f', 't', 'y', 'p', 'j', 'x',
     'l', ' ', 0, 0,   0,    0,   'j', 'x', 'l', ' '};
 
-constexpr unsigned char kLevelBoxHeader[] = {0, 0, 0, 0x9, 'j', 'x', 'l', 'l'};
+constexpr std::array<unsigned char, 8> kLevelBoxHeader = {0,   0,   0,   0x9,
+                                                          'j', 'x', 'l', 'l'};
 
 struct JxlEncoderQueuedFrame {
   JxlEncoderFrameSettingsValues option_values;
@@ -140,47 +153,178 @@ struct JxlEncoderQueuedBox {
   bool compress_box;
 };
 
+using FJXLFrameUniquePtr =
+    std::unique_ptr<JxlFastLosslessFrameState,
+                    decltype(&JxlFastLosslessFreeFrameState)>;
+
 // Either a frame, or a box, not both.
+// Can also be a FJXL frame.
 struct JxlEncoderQueuedInput {
   explicit JxlEncoderQueuedInput(const JxlMemoryManager& memory_manager)
       : frame(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)),
         box(nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)) {}
   MemoryManagerUniquePtr<JxlEncoderQueuedFrame> frame;
   MemoryManagerUniquePtr<JxlEncoderQueuedBox> box;
+  FJXLFrameUniquePtr fast_lossless_frame = {nullptr,
+                                            JxlFastLosslessFreeFrameState};
 };
 
+static constexpr size_t kSmallBoxHeaderSize = 8;
+static constexpr size_t kLargeBoxHeaderSize = 16;
+static constexpr size_t kLargeBoxContentSizeThreshold =
+    0x100000000ull - kSmallBoxHeaderSize;
+
+size_t WriteBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded,
+                      bool force_large_box, uint8_t* output);
+
 // Appends a JXL container box header with given type, size, and unbounded
 // properties to output.
 template <typename T>
 void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded,
                      T* output) {
-  uint64_t box_size = 0;
-  bool large_size = false;
-  if (!unbounded) {
-    box_size = size + 8;
-    if (box_size >= 0x100000000ull) {
-      large_size = true;
-    }
+  size_t current_size = output->size();
+  output->resize(current_size + kLargeBoxHeaderSize);
+  size_t header_size =
+      WriteBoxHeader(type, size, unbounded, /*force_large_box=*/false,
+                     output->data() + current_size);
+  output->resize(current_size + header_size);
+}
+
+}  // namespace jxl
+
+class JxlOutputProcessorBuffer;
+
+class JxlEncoderOutputProcessorWrapper {
+  friend class JxlOutputProcessorBuffer;
+
+ public:
+  JxlEncoderOutputProcessorWrapper() = default;
+  explicit JxlEncoderOutputProcessorWrapper(JxlEncoderOutputProcessor processor)
+      : external_output_processor_(
+            jxl::make_unique<JxlEncoderOutputProcessor>(processor)) {}
+
+  bool HasAvailOut() const { return avail_out_ != nullptr; }
+
+  // Caller can never overwrite a previously-written buffer. Asking for a buffer
+  // with `min_size` such that `position + min_size` overlaps with a
+  // previously-written buffer is invalid.
+  jxl::StatusOr<JxlOutputProcessorBuffer> GetBuffer(size_t min_size,
+                                                    size_t requested_size = 0);
+
+  void Seek(size_t pos);
+
+  void SetFinalizedPosition();
+
+  size_t CurrentPosition() const { return position_; }
+
+  bool SetAvailOut(uint8_t** next_out, size_t* avail_out);
+
+  bool WasStopRequested() const { return stop_requested_; }
+  bool OutputProcessorSet() const {
+    return external_output_processor_ != nullptr;
+  }
+  bool HasOutputToWrite() const {
+    return output_position_ < finalized_position_;
   }
 
-  {
-    const uint64_t store = large_size ? 1 : box_size;
-    for (size_t i = 0; i < 4; i++) {
-      output->push_back(store >> (8 * (3 - i)) & 0xff);
-    }
+ private:
+  void ReleaseBuffer(size_t bytes_used);
+
+  // Tries to write all the bytes up to the finalized position.
+  void FlushOutput();
+
+  bool AppendBufferToExternalProcessor(void* data, size_t count);
+
+  struct InternalBuffer {
+    // Bytes in the range `[output_position_ - start_of_the_buffer,
+    // written_bytes)` need to be flushed out.
+    size_t written_bytes = 0;
+    // If data has been buffered, it is stored in `owned_data`.
+    jxl::PaddedBytes owned_data;
+  };
+
+  // Invariant: `internal_buffers_` does not contain chunks that are entirely
+  // below the output position.
+  std::map<size_t, InternalBuffer> internal_buffers_;
+
+  uint8_t** next_out_ = nullptr;
+  size_t* avail_out_ = nullptr;
+  // Where the next GetBuffer call will write bytes to.
+  size_t position_ = 0;
+  // The position of the last SetFinalizedPosition call.
+  size_t finalized_position_ = 0;
+  // Either the position of the `external_output_processor_` or the position
+  // `next_out_` points to.
+  size_t output_position_ = 0;
+
+  bool stop_requested_ = false;
+  bool has_buffer_ = false;
+
+  std::unique_ptr<JxlEncoderOutputProcessor> external_output_processor_;
+};
+
+class JxlOutputProcessorBuffer {
+ public:
+  size_t size() const { return size_; };
+  uint8_t* data() { return data_; }
+
+  JxlOutputProcessorBuffer(uint8_t* buffer, size_t size, size_t bytes_used,
+                           JxlEncoderOutputProcessorWrapper* wrapper)
+      : data_(buffer),
+        size_(size),
+        bytes_used_(bytes_used),
+        wrapper_(wrapper) {}
+  ~JxlOutputProcessorBuffer() { release(); }
+
+  JxlOutputProcessorBuffer(const JxlOutputProcessorBuffer&) = delete;
+  JxlOutputProcessorBuffer(JxlOutputProcessorBuffer&& other) noexcept
+      : JxlOutputProcessorBuffer(other.data_, other.size_, other.bytes_used_,
+                                 other.wrapper_) {
+    other.data_ = nullptr;
+    other.size_ = 0;
   }
-  for (size_t i = 0; i < 4; i++) {
-    output->push_back(type[i]);
+
+  void advance(size_t count) {
+    JXL_ASSERT(count <= size_);
+    data_ += count;
+    size_ -= count;
+    bytes_used_ += count;
   }
 
-  if (large_size) {
-    for (size_t i = 0; i < 8; i++) {
-      output->push_back(box_size >> (8 * (7 - i)) & 0xff);
+  void release() {
+    if (this->data_) {
+      wrapper_->ReleaseBuffer(bytes_used_);
     }
+    data_ = nullptr;
+    size_ = 0;
   }
-}
 
-}  // namespace jxl
+  void append(const void* data, size_t count) {
+    memcpy(data_, data, count);
+    advance(count);
+  }
+
+  template <typename T>
+  void append(const T& data) {
+    static_assert(sizeof(*std::begin(data)) == 1, "Cannot append non-bytes");
+    append(&*std::begin(data), std::end(data) - std::begin(data));
+  }
+
+  JxlOutputProcessorBuffer& operator=(const JxlOutputProcessorBuffer&) = delete;
+  JxlOutputProcessorBuffer& operator=(
+      JxlOutputProcessorBuffer&& other) noexcept {
+    data_ = other.data_;
+    size_ = other.size_;
+    wrapper_ = other.wrapper_;
+    return *this;
+  }
+
+ private:
+  uint8_t* data_;
+  size_t size_;
+  size_t bytes_used_;
+  JxlEncoderOutputProcessorWrapper* wrapper_;
+};
 
 // Internal use only struct, can only be initialized correctly by
 // JxlEncoderCreate.
@@ -190,20 +334,21 @@ struct JxlEncoderStruct {
   jxl::MemoryManagerUniquePtr<jxl::ThreadPool> thread_pool{
       nullptr, jxl::MemoryManagerDeleteHelper(&memory_manager)};
   JxlCmsInterface cms;
+  bool cms_set;
   std::vector<jxl::MemoryManagerUniquePtr<JxlEncoderFrameSettings>>
       encoder_options;
 
   size_t num_queued_frames;
   size_t num_queued_boxes;
   std::vector<jxl::JxlEncoderQueuedInput> input_queue;
-  std::deque<uint8_t> output_byte_queue;
+  JxlEncoderOutputProcessorWrapper output_processor;
 
   // How many codestream bytes have been written, i.e.,
   // content of jxlc and jxlp boxes. Frame index box jxli
   // requires position indices to point to codestream bytes,
   // so we need to keep track of the total of flushed or queue
   // codestream bytes. These bytes may be in a single jxlc box
-  // or accross multiple jxlp boxes.
+  // or across multiple jxlp boxes.
   size_t codestream_bytes_written_beginning_of_frame;
   size_t codestream_bytes_written_end_of_frame;
   jxl::JxlEncoderFrameIndexBox frame_index_box;
@@ -238,21 +383,28 @@ struct JxlEncoderStruct {
   bool basic_info_set;
   bool color_encoding_set;
   bool intensity_target_set;
+  bool allow_expert_options = false;
   int brotli_effort = -1;
 
   // Takes the first frame in the input_queue, encodes it, and appends
   // the bytes to the output_byte_queue.
-  JxlEncoderStatus RefillOutputByteQueue();
+  jxl::Status ProcessOneEnqueuedInput();
 
   bool MustUseContainer() const {
-    return use_container || codestream_level != 5 || store_jpeg_metadata ||
-           use_boxes;
+    return use_container || (codestream_level != 5 && codestream_level != -1) ||
+           store_jpeg_metadata || use_boxes;
   }
 
-  // Appends the bytes of a JXL box header with the provided type and size to
-  // the end of the output_byte_queue. If unbounded is true, the size won't be
-  // added to the header and the box will be assumed to continue until EOF.
-  void AppendBoxHeader(const jxl::BoxType& type, size_t size, bool unbounded);
+  // `write_box` must never seek before the position the output wrapper was at
+  // the moment of the call, and must leave the output wrapper such that its
+  // position is one byte past the end of the written box.
+  template <typename WriteBox>
+  jxl::Status AppendBox(const jxl::BoxType& type, bool unbounded,
+                        size_t box_max_size, const WriteBox& write_box);
+
+  template <typename BoxContents>
+  jxl::Status AppendBoxWithContents(const jxl::BoxType& type,
+                                    const BoxContents& contents);
 };
 
 struct JxlEncoderFrameSettingsStruct {
@@ -260,4 +412,8 @@ struct JxlEncoderFrameSettingsStruct {
   jxl::JxlEncoderFrameSettingsValues values;
 };
 
+struct JxlEncoderStatsStruct {
+  jxl::AuxOut aux_out;
+};
+
 #endif  // LIB_JXL_ENCODE_INTERNAL_H_
index 4f1ef0b..ceb974f 100644 (file)
@@ -3,21 +3,36 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "jxl/encode.h"
-
-#include "enc_color_management.h"
-#include "gtest/gtest.h"
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/encode_cxx.h"
+#include <jxl/cms.h>
+#include <jxl/cms_interface.h>
+#include <jxl/codestream_header.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ostream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "jxl/types.h"
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/jxl.h"
-#include "lib/jxl/enc_butteraugli_pnorm.h"
+#include "lib/extras/metrics.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/jpeg/dec_jpeg_data.h"
 #include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#include "lib/jxl/test_image.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 TEST(EncodeTest, AddFrameAfterCloseInputTest) {
   JxlEncoderPtr enc = JxlEncoderMake(nullptr);
@@ -59,7 +74,7 @@ TEST(EncodeTest, AddJPEGAfterCloseTest) {
   JxlEncoderCloseInput(enc.get());
 
   const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
-  const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path);
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(jpeg_path);
 
   JxlEncoderFrameSettings* frame_settings =
       JxlEncoderFrameSettingsCreate(enc.get(), NULL);
@@ -68,32 +83,6 @@ TEST(EncodeTest, AddJPEGAfterCloseTest) {
             JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
 }
 
-TEST(EncodeTest, AddFrameBeforeColorEncodingTest) {
-  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
-  EXPECT_NE(nullptr, enc.get());
-
-  size_t xsize = 64;
-  size_t ysize = 64;
-  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
-  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-
-  jxl::CodecInOut input_io =
-      jxl::test::SomeTestImageToCodecInOut(pixels, 4, xsize, ysize);
-
-  JxlBasicInfo basic_info;
-  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
-  basic_info.xsize = xsize;
-  basic_info.ysize = ysize;
-  basic_info.uses_original_profile = true;
-  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
-  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
-  JxlEncoderFrameSettings* frame_settings =
-      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-  EXPECT_EQ(JXL_ENC_ERROR,
-            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
-                                    pixels.data(), pixels.size()));
-}
-
 TEST(EncodeTest, AddFrameBeforeBasicInfoTest) {
   JxlEncoderPtr enc = JxlEncoderMake(nullptr);
   EXPECT_NE(nullptr, enc.get());
@@ -212,21 +201,20 @@ void VerifyFrameEncoding(size_t xsize, size_t ysize, JxlEncoder* enc,
   EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
   jxl::CodecInOut decoded_io;
   EXPECT_TRUE(jxl::test::DecodeFile(
-      {}, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-      &decoded_io, /*pool=*/nullptr));
+      {}, jxl::Bytes(compressed.data(), compressed.size()), &decoded_io));
 
   EXPECT_LE(
-      ComputeDistance2(input_io.Main(), decoded_io.Main(), jxl::GetJxlCms()),
+      ComputeDistance2(input_io.Main(), decoded_io.Main(), *JxlGetDefaultCms()),
 #if JXL_HIGH_PRECISION
-      1.8);
+      1.84);
 #else
-      8.0);
+      8.7);
 #endif
 }
 
 void VerifyFrameEncoding(JxlEncoder* enc,
                          const JxlEncoderFrameSettings* frame_settings) {
-  VerifyFrameEncoding(63, 129, enc, frame_settings, 2600,
+  VerifyFrameEncoding(63, 129, enc, frame_settings, 2700,
                       /*lossy_use_original_profile=*/false);
 }
 
@@ -255,7 +243,7 @@ TEST(EncodeTest, CmsTest) {
   JxlEncoderPtr enc = JxlEncoderMake(nullptr);
   EXPECT_NE(nullptr, enc.get());
   bool cms_called = false;
-  JxlCmsInterface cms = jxl::GetJxlCms();
+  JxlCmsInterface cms = *JxlGetDefaultCms();
   struct InitData {
     void* original_init_data;
     jpegxl_cms_init_func original_init;
@@ -304,14 +292,126 @@ TEST(EncodeTest, frame_settingsTest) {
     EXPECT_NE(nullptr, enc.get());
     JxlEncoderFrameSettings* frame_settings =
         JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    // Lower than currently supported values
+    const size_t nb_options = 23;
+    const JxlEncoderFrameSettingId options[nb_options] = {
+        JXL_ENC_FRAME_SETTING_EFFORT,
+        JXL_ENC_FRAME_SETTING_BROTLI_EFFORT,
+        JXL_ENC_FRAME_SETTING_DECODING_SPEED,
+        JXL_ENC_FRAME_SETTING_RESAMPLING,
+        JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING,
+        JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED,
+        JXL_ENC_FRAME_SETTING_EPF,
+        JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X,
+        JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y,
+        JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC,
+        JXL_ENC_FRAME_SETTING_PALETTE_COLORS,
+        JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM,
+        JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE,
+        JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE,
+        JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR,
+        JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS,
+        JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL,
+        JXL_ENC_FRAME_INDEX_BOX,
+        JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES,
+        JXL_ENC_FRAME_SETTING_BUFFERING,
+        JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF,
+        JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP,
+        JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF};
+    const int too_low[nb_options] = {0,  -2, -2, 3,  -2, -2, -2, -2,
+                                     -2, -2, -2, -2, -2, -2, -2, -2,
+                                     -2, -1, -2, -1, -2, -2, -2};
+    const int too_high[nb_options] = {11, 12, 5,     16, 6,  2, 4,  -3,
+                                      -3, 3,  70914, 3,  42, 4, 16, 12,
+                                      2,  2,  2,     4,  2,  2, 2};
+    const int in_range[nb_options] = {5,  5, 3,  1,  1,  1,  3,  -1,
+                                      0,  1, -1, -1, 3,  2,  15, -1,
+                                      -1, 1, 0,  0,  -1, -1, -1};
+    for (size_t i = 0; i < nb_options; i++) {
+      // Lower than currently supported values
+      EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderFrameSettingsSetOption(
+                                   frame_settings, options[i], too_low[i]));
+      // Higher than currently supported values
+      EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderFrameSettingsSetOption(
+                                   frame_settings, options[i], too_high[i]));
+      // Using SetFloatOption on integer options
+      EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderFrameSettingsSetFloatOption(
+                                   frame_settings, options[i], 1.0f));
+      // Within range of the currently supported values
+      EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderFrameSettingsSetOption(
+                                     frame_settings, options[i], in_range[i]));
+    }
+    // Effort 10 should only work when expert options are allowed
     EXPECT_EQ(JXL_ENC_ERROR,
               JxlEncoderFrameSettingsSetOption(
-                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 0));
-    // Higher than currently supported values
-    EXPECT_EQ(JXL_ENC_ERROR,
+                  frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 10));
+    JxlEncoderAllowExpertOptions(enc.get());
+    EXPECT_EQ(JXL_ENC_SUCCESS,
               JxlEncoderFrameSettingsSetOption(
                   frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 10));
+
+    // Non-existing option
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_FILL_ENUM, 0));
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_FILL_ENUM, 0.f));
+
+    // Float options
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PHOTON_NOISE, -1.0f));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PHOTON_NOISE, 100.0f));
+    EXPECT_EQ(
+        JXL_ENC_ERROR,
+        JxlEncoderFrameSettingsSetFloatOption(
+            frame_settings,
+            JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, 101.0f));
+    EXPECT_EQ(
+        JXL_ENC_ERROR,
+        JxlEncoderFrameSettingsSetFloatOption(
+            frame_settings,
+            JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, -2.0f));
+    EXPECT_EQ(
+        JXL_ENC_SUCCESS,
+        JxlEncoderFrameSettingsSetFloatOption(
+            frame_settings,
+            JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, -1.0f));
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, 101.0f));
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, -2.0f));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, -1.0f));
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, 101.0f));
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, -2.0f));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetFloatOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, -1.0f));
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, 50.0f));
+    EXPECT_EQ(JXL_ENC_ERROR,
+              JxlEncoderFrameSettingsSetOption(
+                  frame_settings, JXL_ENC_FRAME_SETTING_PHOTON_NOISE, 50.0f));
+
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 2500, false);
   }
 
   {
@@ -321,7 +421,7 @@ TEST(EncodeTest, frame_settingsTest) {
         JxlEncoderFrameSettingsCreate(enc.get(), NULL);
     EXPECT_EQ(JXL_ENC_SUCCESS,
               JxlEncoderSetFrameLossless(frame_settings, JXL_TRUE));
-    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3600, false);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3000, false);
     EXPECT_EQ(true, enc->last_used_cparams.IsLossless());
   }
 
@@ -331,7 +431,7 @@ TEST(EncodeTest, frame_settingsTest) {
     JxlEncoderFrameSettings* frame_settings =
         JxlEncoderFrameSettingsCreate(enc.get(), NULL);
     EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetFrameDistance(frame_settings, 0.5));
-    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3000, false);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3030, false);
     EXPECT_EQ(0.5, enc->last_used_cparams.butteraugli_distance);
   }
 
@@ -392,7 +492,8 @@ TEST(EncodeTest, frame_settingsTest) {
     EXPECT_EQ(JXL_ENC_SUCCESS,
               JxlEncoderFrameSettingsSetOption(
                   frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2));
-    VerifyFrameEncoding(enc.get(), frame_settings);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 2830,
+                        /*lossy_use_original_profile=*/false);
     EXPECT_EQ(false, enc->last_used_cparams.responsive);
     EXPECT_EQ(true, enc->last_used_cparams.progressive_mode);
     EXPECT_EQ(2, enc->last_used_cparams.progressive_dc);
@@ -408,7 +509,7 @@ TEST(EncodeTest, frame_settingsTest) {
         JxlEncoderFrameSettingsSetFloatOption(
             frame_settings, JXL_ENC_FRAME_SETTING_PHOTON_NOISE, 1777.777));
     VerifyFrameEncoding(enc.get(), frame_settings);
-    EXPECT_NEAR(1777.777f, enc->last_used_cparams.photon_noise_iso, 1E-6);
+    EXPECT_NEAR(1777.777f, enc->last_used_cparams.photon_noise_iso, 1E-4);
   }
 
   {
@@ -502,7 +603,7 @@ TEST(EncodeTest, LossyEncoderUseOriginalProfileTest) {
     ASSERT_NE(nullptr, enc.get());
     JxlEncoderFrameSettings* frame_settings =
         JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 4100, true);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7897, true);
   }
   {
     JxlEncoderPtr enc = JxlEncoderMake(nullptr);
@@ -512,7 +613,7 @@ TEST(EncodeTest, LossyEncoderUseOriginalProfileTest) {
     EXPECT_EQ(JXL_ENC_SUCCESS,
               JxlEncoderFrameSettingsSetOption(
                   frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2));
-    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 4500, true);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 8310, true);
   }
   {
     JxlEncoderPtr enc = JxlEncoderMake(nullptr);
@@ -522,7 +623,7 @@ TEST(EncodeTest, LossyEncoderUseOriginalProfileTest) {
     ASSERT_EQ(JXL_ENC_SUCCESS,
               JxlEncoderFrameSettingsSetOption(
                   frame_settings, JXL_ENC_FRAME_SETTING_EFFORT, 8));
-    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 3700, true);
+    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 7228, true);
   }
 }
 
@@ -530,8 +631,8 @@ namespace {
 // Returns a copy of buf from offset to offset+size, or a new zeroed vector if
 // the result would have been out of bounds taking integer overflow into
 // account.
-const std::vector<uint8_t> SliceSpan(const jxl::Span<const uint8_t>& buf,
-                                     size_t offset, size_t size) {
+std::vector<uint8_t> SliceSpan(const jxl::Span<const uint8_t>& buf,
+                               size_t offset, size_t size) {
   if (offset + size >= buf.size()) {
     return std::vector<uint8_t>(size, 0);
   }
@@ -552,7 +653,7 @@ struct Box {
   char extended_type[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
 
   // Box data.
-  jxl::Span<const uint8_t> data = jxl::Span<const uint8_t>(nullptr, 0);
+  jxl::Span<const uint8_t> data = jxl::Bytes(nullptr, 0);
 
   // If the size is not given, the datasize extends to the end of the file.
   // If this field is false, the size field is not encoded when the box is
@@ -596,16 +697,14 @@ struct Box {
         return true;
       }
       data_size_given = true;
-      data = jxl::Span<const uint8_t>(in->data() + header_size,
-                                      box_size - header_size);
+      data = jxl::Bytes(in->data() + header_size, box_size - header_size);
     } else {
       data_size_given = false;
-      data = jxl::Span<const uint8_t>(in->data() + header_size,
-                                      in->size() - header_size);
+      data = jxl::Bytes(in->data() + header_size, in->size() - header_size);
     }
 
-    *in = jxl::Span<const uint8_t>(in->data() + header_size + data.size(),
-                                   in->size() - header_size - data.size());
+    *in = jxl::Bytes(in->data() + header_size + data.size(),
+                     in->size() - header_size - data.size());
     return true;
   }
 };
@@ -703,7 +802,7 @@ TEST(EncodeTest, SingleFrameBoundedJXLCTest) {
 
   Container container = {};
   jxl::Span<const uint8_t> encoded_span =
-      jxl::Span<const uint8_t>(compressed.data(), compressed.size());
+      jxl::Bytes(compressed.data(), compressed.size());
   EXPECT_TRUE(container.Decode(&encoded_span));
   EXPECT_EQ(0u, encoded_span.size());
   bool found_jxlc = false;
@@ -776,7 +875,7 @@ TEST(EncodeTest, CodestreamLevelTest) {
 
   Container container = {};
   jxl::Span<const uint8_t> encoded_span =
-      jxl::Span<const uint8_t>(compressed.data(), compressed.size());
+      jxl::Bytes(compressed.data(), compressed.size());
   EXPECT_TRUE(container.Decode(&encoded_span));
   EXPECT_EQ(0u, encoded_span.size());
   EXPECT_EQ(0, memcmp("jxll", container.boxes[0].type, 4));
@@ -813,7 +912,7 @@ TEST(EncodeTest, CodestreamLevelVerificationTest) {
 
 TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) {
   const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
-  const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path);
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(jpeg_path);
 
   JxlEncoderPtr enc = JxlEncoderMake(nullptr);
   JxlEncoderFrameSettings* frame_settings =
@@ -841,6 +940,49 @@ TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGReconstructionTest)) {
   EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
 
   jxl::extras::JXLDecompressParams dparams;
+  jxl::test::DefaultAcceptedFormats(dparams);
+  std::vector<uint8_t> decoded_jpeg_bytes;
+  jxl::extras::PackedPixelFile ppf;
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, &ppf, &decoded_jpeg_bytes));
+
+  EXPECT_EQ(decoded_jpeg_bytes.size(), orig.size());
+  EXPECT_EQ(0, memcmp(decoded_jpeg_bytes.data(), orig.data(), orig.size()));
+}
+
+TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(ProgressiveJPEGReconstructionTest)) {
+  const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(jpeg_path);
+
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+
+  frame_settings->values.cparams.progressive_mode = true;
+
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddJPEGFrame(frame_settings, orig.data(), orig.size()));
+  JxlEncoderCloseInput(enc.get());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
+  JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
+  while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+    process_result = JxlEncoderProcessOutput(enc.get(), &next_out, &avail_out);
+    if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
+      size_t offset = next_out - compressed.data();
+      compressed.resize(compressed.size() * 2);
+      next_out = compressed.data() + offset;
+      avail_out = compressed.size() - offset;
+    }
+  }
+  compressed.resize(next_out - compressed.data());
+  EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
+
+  jxl::extras::JXLDecompressParams dparams;
+  jxl::test::DefaultAcceptedFormats(dparams);
   std::vector<uint8_t> decoded_jpeg_bytes;
   jxl::extras::PackedPixelFile ppf;
   EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
@@ -889,6 +1031,8 @@ TEST(EncodeTest, BasicInfoTest) {
   basic_info.min_nits = 5.0;
   basic_info.linear_below = 12.7;
   basic_info.orientation = JXL_ORIENT_ROTATE_90_CW;
+  basic_info.intrinsic_xsize = 88;
+  basic_info.intrinsic_ysize = 99;
   basic_info.animation.tps_numerator = 55;
   basic_info.animation.tps_denominator = 77;
   basic_info.animation.num_loops = 10;
@@ -944,6 +1088,8 @@ TEST(EncodeTest, BasicInfoTest) {
       EXPECT_EQ(basic_info.uses_original_profile,
                 basic_info2.uses_original_profile);
       EXPECT_EQ(basic_info.orientation, basic_info2.orientation);
+      EXPECT_EQ(basic_info.intrinsic_xsize, basic_info2.intrinsic_xsize);
+      EXPECT_EQ(basic_info.intrinsic_ysize, basic_info2.intrinsic_ysize);
       EXPECT_EQ(basic_info.num_color_channels, basic_info2.num_color_channels);
       // TODO(lode): also test num_extra_channels, but currently there may be a
       // mismatch between 0 and 1 if there is alpha, until encoder support for
@@ -1166,142 +1312,158 @@ TEST(EncodeTest, CroppedFrameTest) {
   EXPECT_EQ(true, seen_frame);
 }
 
-TEST(EncodeTest, BoxTest) {
+struct EncodeBoxTest : public testing::TestWithParam<std::tuple<bool, size_t>> {
+};
+
+TEST_P(EncodeBoxTest, JXL_BOXES_TEST(BoxTest)) {
   // Test with uncompressed boxes and with brob boxes
-  for (int compress_box = 0; compress_box <= 1; ++compress_box) {
-    // Tests adding two metadata boxes with the encoder: an exif box before the
-    // image frame, and an xml box after the image frame. Then verifies the
-    // decoder can decode them, they are in the expected place, and have the
-    // correct content after decoding.
-    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
-    EXPECT_NE(nullptr, enc.get());
+  bool compress_box = std::get<0>(GetParam());
+  size_t xml_box_size = std::get<1>(GetParam());
+  // TODO(firsching): use xml_box_size
+  (void)xml_box_size;
+  // Tests adding two metadata boxes with the encoder: an exif box before the
+  // image frame, and an xml box after the image frame. Then verifies the
+  // decoder can decode them, they are in the expected place, and have the
+  // correct content after decoding.
+  JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+  EXPECT_NE(nullptr, enc.get());
 
-    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseBoxes(enc.get()));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseBoxes(enc.get()));
 
-    JxlEncoderFrameSettings* frame_settings =
-        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    size_t xsize = 50;
-    size_t ysize = 17;
-    JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
-    std::vector<uint8_t> pixels =
-        jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
-    JxlBasicInfo basic_info;
-    jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
-    basic_info.xsize = xsize;
-    basic_info.ysize = ysize;
-    basic_info.uses_original_profile = false;
-    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
-    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
-    JxlColorEncoding color_encoding;
-    JxlColorEncodingSetToSRGB(&color_encoding,
-                              /*is_gray=*/false);
-    EXPECT_EQ(JXL_ENC_SUCCESS,
-              JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
+  JxlEncoderFrameSettings* frame_settings =
+      JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+  size_t xsize = 50;
+  size_t ysize = 17;
+  JxlPixelFormat pixel_format = {4, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+  std::vector<uint8_t> pixels = jxl::test::GetSomeTestImage(xsize, ysize, 4, 0);
+  JxlBasicInfo basic_info;
+  jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+  basic_info.xsize = xsize;
+  basic_info.ysize = ysize;
+  basic_info.uses_original_profile = false;
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc.get(), 10));
+  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc.get(), &basic_info));
+  JxlColorEncoding color_encoding;
+  JxlColorEncodingSetToSRGB(&color_encoding,
+                            /*is_gray=*/false);
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetColorEncoding(enc.get(), &color_encoding));
 
-    std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
-    uint8_t* next_out = compressed.data();
-    size_t avail_out = compressed.size() - (next_out - compressed.data());
-
-    // Add an early metadata box. Also add a valid 4-byte TIFF offset header
-    // before the fake exif data of these box contents.
-    constexpr const char* exif_test_string = "\0\0\0\0exif test data";
-    const uint8_t* exif_data =
-        reinterpret_cast<const uint8_t*>(exif_test_string);
-    // Skip the 4 zeroes for strlen
-    const size_t exif_size = 4 + strlen(exif_test_string + 4);
-    JxlEncoderAddBox(enc.get(), "Exif", exif_data, exif_size, compress_box);
-
-    // Write to output
-    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  uint8_t* next_out = compressed.data();
+  size_t avail_out = compressed.size() - (next_out - compressed.data());
 
-    // Add image frame
-    EXPECT_EQ(JXL_ENC_SUCCESS,
-              JxlEncoderAddImageFrame(frame_settings, &pixel_format,
-                                      pixels.data(), pixels.size()));
-    // Indicate this is the last frame
-    JxlEncoderCloseFrames(enc.get());
+  // Add an early metadata box. Also add a valid 4-byte TIFF offset header
+  // before the fake exif data of these box contents.
+  constexpr const char* exif_test_string = "\0\0\0\0exif test data";
+  const uint8_t* exif_data = reinterpret_cast<const uint8_t*>(exif_test_string);
+  // Skip the 4 zeroes for strlen
+  const size_t exif_size = 4 + strlen(exif_test_string + 4);
+  JxlEncoderAddBox(enc.get(), "Exif", exif_data, exif_size, compress_box);
 
-    // Write to output
-    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+  // Write to output
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
 
-    // Add a late metadata box
-    constexpr const char* xml_test_string = "<some random xml data>";
-    const uint8_t* xml_data = reinterpret_cast<const uint8_t*>(xml_test_string);
-    size_t xml_size = strlen(xml_test_string);
-    JxlEncoderAddBox(enc.get(), "XML ", xml_data, xml_size, compress_box);
+  // Add image frame
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                    pixels.data(), pixels.size()));
+  // Indicate this is the last frame
+  JxlEncoderCloseFrames(enc.get());
 
-    // Indicate this is the last box
-    JxlEncoderCloseBoxes(enc.get());
+  // Write to output
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
 
-    // Write to output
-    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+  // Add a late metadata box
+  constexpr const char* xml_test_string = "<some random xml data>";
+  const uint8_t* xml_data = reinterpret_cast<const uint8_t*>(xml_test_string);
+  size_t xml_size = strlen(xml_test_string);
+  JxlEncoderAddBox(enc.get(), "XML ", xml_data, xml_size, compress_box);
 
-    // Decode to verify the boxes, we don't decode to pixels, only the boxes.
-    JxlDecoderPtr dec = JxlDecoderMake(nullptr);
-    EXPECT_NE(nullptr, dec.get());
+  // Indicate this is the last box
+  JxlEncoderCloseBoxes(enc.get());
 
-    if (compress_box) {
-      EXPECT_EQ(JXL_DEC_SUCCESS,
-                JxlDecoderSetDecompressBoxes(dec.get(), JXL_TRUE));
-    }
+  // Write to output
+  ProcessEncoder(enc.get(), compressed, next_out, avail_out);
 
-    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderSubscribeEvents(
-                                   dec.get(), JXL_DEC_FRAME | JXL_DEC_BOX));
-
-    JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
-    JxlDecoderCloseInput(dec.get());
-
-    std::vector<uint8_t> dec_exif_box(exif_size);
-    std::vector<uint8_t> dec_xml_box(xml_size);
-
-    for (bool post_frame = false;;) {
-      JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
-      if (status == JXL_DEC_ERROR) {
-        FAIL();
-      } else if (status == JXL_DEC_SUCCESS) {
-        EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
-        break;
-      } else if (status == JXL_DEC_FRAME) {
-        post_frame = true;
-      } else if (status == JXL_DEC_BOX) {
-        // Since we gave the exif/xml box output buffer of the exact known
-        // correct size, 0 bytes should be released. Same when no buffer was
-        // set.
-        EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
-        JxlBoxType type;
-        EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec.get(), type, true));
-        if (!memcmp(type, "Exif", 4)) {
-          // This box should have been encoded before the image frame
-          EXPECT_EQ(false, post_frame);
-          JxlDecoderSetBoxBuffer(dec.get(), dec_exif_box.data(),
-                                 dec_exif_box.size());
-        } else if (!memcmp(type, "XML ", 4)) {
-          // This box should have been encoded after the image frame
-          EXPECT_EQ(true, post_frame);
-          JxlDecoderSetBoxBuffer(dec.get(), dec_xml_box.data(),
-                                 dec_xml_box.size());
-        }
-      } else {
-        FAIL();  // unexpected status
+  // Decode to verify the boxes, we don't decode to pixels, only the boxes.
+  JxlDecoderPtr dec = JxlDecoderMake(nullptr);
+  EXPECT_NE(nullptr, dec.get());
+
+  if (compress_box) {
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSetDecompressBoxes(dec.get(), JXL_TRUE));
+  }
+
+  EXPECT_EQ(JXL_DEC_SUCCESS,
+            JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_FRAME | JXL_DEC_BOX));
+
+  JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
+  JxlDecoderCloseInput(dec.get());
+
+  std::vector<uint8_t> dec_exif_box(exif_size);
+  std::vector<uint8_t> dec_xml_box(xml_size);
+
+  for (bool post_frame = false;;) {
+    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (status == JXL_DEC_ERROR) {
+      FAIL();
+    } else if (status == JXL_DEC_SUCCESS) {
+      EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
+      break;
+    } else if (status == JXL_DEC_FRAME) {
+      post_frame = true;
+    } else if (status == JXL_DEC_BOX) {
+      // Since we gave the exif/xml box output buffer of the exact known
+      // correct size, 0 bytes should be released. Same when no buffer was
+      // set.
+      EXPECT_EQ(0, JxlDecoderReleaseBoxBuffer(dec.get()));
+      JxlBoxType type;
+      EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBoxType(dec.get(), type, true));
+      if (!memcmp(type, "Exif", 4)) {
+        // This box should have been encoded before the image frame
+        EXPECT_EQ(false, post_frame);
+        JxlDecoderSetBoxBuffer(dec.get(), dec_exif_box.data(),
+                               dec_exif_box.size());
+      } else if (!memcmp(type, "XML ", 4)) {
+        // This box should have been encoded after the image frame
+        EXPECT_EQ(true, post_frame);
+        JxlDecoderSetBoxBuffer(dec.get(), dec_xml_box.data(),
+                               dec_xml_box.size());
       }
+    } else {
+      FAIL();  // unexpected status
     }
-
-    EXPECT_EQ(0, memcmp(exif_data, dec_exif_box.data(), exif_size));
-    EXPECT_EQ(0, memcmp(xml_data, dec_xml_box.data(), xml_size));
   }
+
+  EXPECT_EQ(0, memcmp(exif_data, dec_exif_box.data(), exif_size));
+  EXPECT_EQ(0, memcmp(xml_data, dec_xml_box.data(), xml_size));
 }
 
-#if JPEGXL_ENABLE_JPEG  // Loading .jpg files requires libjpeg support.
+std::string nameBoxTest(
+    const ::testing::TestParamInfo<std::tuple<bool, size_t>>& info) {
+  return (std::get<0>(info.param) ? "C" : "Unc") + std::string("ompressed") +
+         "_BoxSize_" + std::to_string((std::get<1>(info.param)));
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    EncodeBoxParamsTest, EncodeBoxTest,
+    testing::Combine(testing::Values(false, true),
+                     testing::Values(256,
+                                     jxl::kLargeBoxContentSizeThreshold + 77)),
+    nameBoxTest);
+
 TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGFrameTest)) {
+  TEST_LIBJPEG_SUPPORT();
   for (int skip_basic_info = 0; skip_basic_info < 2; skip_basic_info++) {
     for (int skip_color_encoding = 0; skip_color_encoding < 2;
          skip_color_encoding++) {
       // cannot set color encoding if basic info is not set
       if (skip_basic_info && !skip_color_encoding) continue;
       const std::string jpeg_path = "jxl/flower/flower_cropped.jpg";
-      const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path);
+      const std::vector<uint8_t> orig = jxl::test::ReadTestData(jpeg_path);
       jxl::CodecInOut orig_io;
-      ASSERT_TRUE(SetFromBytes(jxl::Span<const uint8_t>(orig), &orig_io,
+      ASSERT_TRUE(SetFromBytes(jxl::Bytes(orig), &orig_io,
                                /*pool=*/nullptr));
 
       JxlEncoderPtr enc = JxlEncoderMake(nullptr);
@@ -1347,13 +1509,441 @@ TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(JPEGFrameTest)) {
 
       jxl::CodecInOut decoded_io;
       EXPECT_TRUE(jxl::test::DecodeFile(
-          {}, jxl::Span<const uint8_t>(compressed.data(), compressed.size()),
-          &decoded_io, /*pool=*/nullptr));
+          {}, jxl::Bytes(compressed.data(), compressed.size()), &decoded_io));
+
+      EXPECT_LE(ComputeDistance2(orig_io.Main(), decoded_io.Main(),
+                                 *JxlGetDefaultCms()),
+                3.5);
+    }
+  }
+}
+
+namespace {
+class JxlStreamingAdapter {
+ public:
+  JxlStreamingAdapter(JxlEncoder* encoder, bool return_large_buffers,
+                      bool can_seek)
+      : return_large_buffers_(return_large_buffers) {
+    struct JxlEncoderOutputProcessor output_processor;
+    output_processor.opaque = this;
+    output_processor.get_buffer = [](void* opaque, size_t* size) {
+      return static_cast<JxlStreamingAdapter*>(opaque)->GetBuffer(size);
+    };
+    if (can_seek) {
+      output_processor.seek = [](void* opaque, uint64_t position) {
+        return static_cast<JxlStreamingAdapter*>(opaque)->Seek(position);
+      };
+    } else {
+      output_processor.seek = nullptr;
+    }
+    output_processor.set_finalized_position = [](void* opaque,
+                                                 uint64_t finalized_position) {
+      return static_cast<JxlStreamingAdapter*>(opaque)->SetFinalizedPosition(
+          finalized_position);
+    };
+    output_processor.release_buffer = [](void* opaque, size_t written_bytes) {
+      return static_cast<JxlStreamingAdapter*>(opaque)->ReleaseBuffer(
+          written_bytes);
+    };
+    EXPECT_EQ(JxlEncoderSetOutputProcessor(encoder, output_processor),
+              JXL_ENC_SUCCESS);
+  }
+
+  std::vector<uint8_t> output() && {
+    output_.resize(position_);
+    return std::move(output_);
+  }
+
+  void* GetBuffer(size_t* size) {
+    if (!return_large_buffers_) {
+      *size = 1;
+    }
+    if (position_ + *size > output_.size()) {
+      output_.resize(position_ + *size, 0xDA);
+    }
+    if (return_large_buffers_) {
+      *size = output_.size() - position_;
+    }
+    return output_.data() + position_;
+  }
+
+  void ReleaseBuffer(size_t written_bytes) {
+    // TODO(veluca): check no more bytes were written.
+    Seek(position_ + written_bytes);
+  }
+
+  void Seek(uint64_t position) {
+    EXPECT_GE(position, finalized_position_);
+    position_ = position;
+  }
+
+  void SetFinalizedPosition(uint64_t finalized_position) {
+    EXPECT_GE(finalized_position, finalized_position_);
+    finalized_position_ = finalized_position;
+    EXPECT_GE(position_, finalized_position_);
+  }
+
+  void CheckFinalWatermarkPosition() const {
+    EXPECT_EQ(finalized_position_, position_);
+  }
+
+ private:
+  std::vector<uint8_t> output_;
+  size_t position_ = 0;
+  size_t finalized_position_ = 0;
+  bool return_large_buffers_;
+};
+
+struct StreamingTestParam {
+  size_t bitmask;
+  bool use_container() const { return bitmask & 0x1; }
+  bool return_large_buffers() const { return bitmask & 0x2; }
+  bool multiple_frames() const { return bitmask & 0x4; }
+  bool fast_lossless() const { return bitmask & 0x8; }
+  bool can_seek() const { return bitmask & 0x10; }
+  bool with_extra_channels() const { return bitmask & 0x20; }
+  bool color_includes_alpha() const { return bitmask & 0x40; }
+
+  static std::vector<StreamingTestParam> All() {
+    std::vector<StreamingTestParam> params;
+    for (size_t bitmask = 0; bitmask < 128; bitmask++) {
+      params.push_back(StreamingTestParam{bitmask});
+    }
+    return params;
+  }
+};
+
+std::ostream& operator<<(std::ostream& out, StreamingTestParam p) {
+  if (p.use_container()) {
+    out << "WithContainer_";
+  } else {
+    out << "WithoutContainer_";
+  }
+  if (p.return_large_buffers()) {
+    out << "WithLargeBuffers_";
+  } else {
+    out << "WithSmallBuffers_";
+  }
+  if (p.multiple_frames()) out << "WithMultipleFrames_";
+  if (p.fast_lossless()) out << "FastLossless_";
+  if (!p.can_seek()) {
+    out << "CannotSeek_";
+  } else {
+    out << "CanSeek_";
+  }
+  if (p.with_extra_channels()) {
+    out << "WithExtraChannels_";
+  } else {
+    out << "WithoutExtraChannels_";
+  }
+  if (p.color_includes_alpha()) {
+    out << "ColorIncludesAlpha_";
+  } else {
+    out << "ColorWithoutAlpha_";
+  }
+  return out;
+}
+
+}  // namespace
+
+class EncoderStreamingTest : public testing::TestWithParam<StreamingTestParam> {
+ public:
+  static void SetupImage(const StreamingTestParam& p, size_t xsize,
+                         size_t ysize, size_t num_channels,
+                         size_t bits_per_sample, jxl::test::TestImage& image) {
+    image.SetDimensions(xsize, ysize)
+        .SetDataType(JXL_TYPE_UINT8)
+        .SetChannels(num_channels)
+        .SetAllBitDepths(bits_per_sample);
+    image.AddFrame().RandomFill();
+  }
+  static void SetUpBasicInfo(JxlBasicInfo& basic_info, size_t xsize,
+                             size_t ysize, size_t number_extra_channels,
+                             bool include_alpha) {
+    basic_info.xsize = xsize;
+    basic_info.ysize = ysize;
+    basic_info.num_extra_channels = number_extra_channels + include_alpha;
+  }
+
+  static void SetupEncoder(JxlEncoderStruct* enc, const StreamingTestParam& p,
+                           const JxlBasicInfo& basic_info,
+                           size_t number_extra_channels,
+                           const jxl::extras::PackedImage& frame,
+                           bool add_image_frames) {
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc, NULL);
+    if (p.fast_lossless()) {
+      JxlEncoderSetFrameLossless(frame_settings, JXL_TRUE);
+      JxlEncoderFrameSettingsSetOption(frame_settings,
+                                       JXL_ENC_FRAME_SETTING_EFFORT, 1);
+    }
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+    JxlColorEncoding color_encoding;
+    JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetColorEncoding(enc, &color_encoding));
+    if (p.use_container()) {
+      JxlEncoderSetCodestreamLevel(enc, 10);
+    }
+    for (size_t i = 0; i < number_extra_channels; i++) {
+      JxlExtraChannelInfo channel_info;
+      JxlExtraChannelType channel_type = JXL_CHANNEL_THERMAL;
+      JxlEncoderInitExtraChannelInfo(channel_type, &channel_info);
+      EXPECT_EQ(JXL_ENC_SUCCESS,
+                JxlEncoderSetExtraChannelInfo(enc, i, &channel_info));
+    }
+    size_t frame_count = static_cast<int>(add_image_frames) *
+                         (1 + static_cast<int>(p.multiple_frames()));
+    for (size_t i = 0; i < frame_count; i++) {
+      EXPECT_EQ(JXL_ENC_SUCCESS,
+                JxlEncoderAddImageFrame(frame_settings, &frame.format,
+                                        frame.pixels(), frame.pixels_size));
+    }
+    if (add_image_frames) {
+      JxlEncoderCloseInput(enc);
+    }
+  }
+};
+
+TEST_P(EncoderStreamingTest, OutputCallback) {
+  const StreamingTestParam p = GetParam();
+  size_t xsize = 257;
+  size_t ysize = 259;
+  jxl::test::TestImage image;
+  SetupImage(p, xsize, ysize, 3, p.use_container() ? 16 : 8, image);
+  const auto& frame = image.ppf().frames[0].color;
+  JxlBasicInfo basic_info = image.ppf().info;
+  SetUpBasicInfo(basic_info, xsize, ysize, 0, false);
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  // without sreaming
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    SetupEncoder(enc.get(), p, basic_info, 0, frame, true);
+    uint8_t* next_out = compressed.data();
+    size_t avail_out = compressed.size();
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+  }
+  // with streaming
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    JxlStreamingAdapter streaming_adapter(enc.get(), p.return_large_buffers(),
+                                          p.can_seek());
+    SetupEncoder(enc.get(), p, basic_info, 0, frame, true);
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderFlushInput(enc.get()));
+    streaming_adapter.CheckFinalWatermarkPosition();
+    EXPECT_EQ(std::move(streaming_adapter).output(), compressed);
+  }
+}
+
+class JxlChunkedFrameInputSourceAdapter {
+ private:
+  static const void* GetDataAt(const jxl::extras::PackedPixelFile& ppf,
+                               size_t xpos, size_t ypos, size_t* row_offset) {
+    JxlDataType data_type = ppf.frames[0].color.format.data_type;
+    size_t num_channels = ppf.frames[0].color.format.num_channels;
+    size_t bytes_per_pixel =
+        num_channels * jxl::extras::PackedImage::BitsPerChannel(data_type) / 8;
+    *row_offset = ppf.frames[0].color.stride;
+    return static_cast<uint8_t*>(ppf.frames[0].color.pixels()) +
+           bytes_per_pixel * xpos + ypos * ppf.frames[0].color.stride;
+  }
+
+ public:
+  // Constructor to wrap the image data or any other state
+  explicit JxlChunkedFrameInputSourceAdapter(
+      jxl::extras::PackedPixelFile color_channel,
+      jxl::extras::PackedPixelFile extra_channel)
+      : colorchannel_(std::move(color_channel)),
+        extra_channel_(std::move(extra_channel)) {}
+
+  static void GetColorChannelsPixelFormat(void* opaque,
+                                          JxlPixelFormat* pixel_format) {
+    JxlChunkedFrameInputSourceAdapter* self =
+        static_cast<JxlChunkedFrameInputSourceAdapter*>(opaque);
+    *pixel_format = self->colorchannel_.frames[0].color.format;
+  }
+
+  static const void* GetColorChannelDataAt(void* opaque, size_t xpos,
+                                           size_t ypos, size_t xsize,
+                                           size_t ysize, size_t* row_offset) {
+    JxlChunkedFrameInputSourceAdapter* self =
+        static_cast<JxlChunkedFrameInputSourceAdapter*>(opaque);
+    return GetDataAt(self->colorchannel_, xpos, ypos, row_offset);
+  }
+
+  static void GetExtraChannelPixelFormat(void* opaque, size_t ec_index,
+                                         JxlPixelFormat* pixel_format) {
+    // In this test, we we the same color channel data, so `ec_index` is never
+    // used
+    JxlChunkedFrameInputSourceAdapter* self =
+        static_cast<JxlChunkedFrameInputSourceAdapter*>(opaque);
+    *pixel_format = self->extra_channel_.frames[0].color.format;
+  }
+
+  static const void* GetExtraChannelDataAt(void* opaque, size_t ec_index,
+                                           size_t xpos, size_t ypos,
+                                           size_t xsize, size_t ysize,
+                                           size_t* row_offset) {
+    // In this test, we we the same color channel data, so `ec_index` is never
+    // used
+    JxlChunkedFrameInputSourceAdapter* self =
+        static_cast<JxlChunkedFrameInputSourceAdapter*>(opaque);
+    return GetDataAt(self->extra_channel_, xpos, ypos, row_offset);
+  }
+
+  static void ReleaseCurrentData(void* opaque, const void* buffer) {
+    // No dynamic memory is allocated in GetColorChannelDataAt or
+    // GetExtraChannelDataAt. Therefore, no cleanup is required here.
+  }
+
+  JxlChunkedFrameInputSource GetInputSource() {
+    return JxlChunkedFrameInputSource{
+        this,
+        JxlChunkedFrameInputSourceAdapter::GetColorChannelsPixelFormat,
+        JxlChunkedFrameInputSourceAdapter::GetColorChannelDataAt,
+        JxlChunkedFrameInputSourceAdapter::GetExtraChannelPixelFormat,
+        JxlChunkedFrameInputSourceAdapter::GetExtraChannelDataAt,
+        JxlChunkedFrameInputSourceAdapter::ReleaseCurrentData};
+  }
+
+ private:
+  const jxl::extras::PackedPixelFile colorchannel_;
+  const jxl::extras::PackedPixelFile extra_channel_;
+};
+
+TEST_P(EncoderStreamingTest, ChunkedFrame) {
+  const StreamingTestParam p = GetParam();
+  size_t xsize = 257;
+  size_t ysize = 259;
+  size_t number_extra_channels = p.with_extra_channels() ? 5 : 0;
+  jxl::test::TestImage image;
+  SetupImage(p, xsize, ysize, p.color_includes_alpha() ? 4 : 3,
+             p.use_container() ? 16 : 8, image);
+  jxl::test::TestImage ec_image;
+  SetupImage(p, xsize, ysize, 1, 8, ec_image);
+  const auto& frame = image.ppf().frames[0].color;
+  const auto& ec_frame = ec_image.ppf().frames[0].color;
+  JxlBasicInfo basic_info = image.ppf().info;
+  SetUpBasicInfo(basic_info, xsize, ysize, number_extra_channels,
+                 p.color_includes_alpha());
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+  std::vector<uint8_t> streaming_compressed = std::vector<uint8_t>(64);
+
+  // without streaming
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    SetupEncoder(enc.get(), p, basic_info, number_extra_channels, frame, false);
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderAddImageFrame(frame_settings, &frame.format,
+                                      frame.pixels(), frame.pixels_size));
+    for (size_t i = 0; i < number_extra_channels; i++) {
+      EXPECT_EQ(JXL_ENC_SUCCESS,
+                JxlEncoderSetExtraChannelBuffer(
+                    frame_settings, &ec_frame.format, ec_frame.pixels(),
+                    ec_frame.pixels_size, i));
+    }
+    JxlEncoderCloseInput(enc.get());
+    uint8_t* next_out = compressed.data();
+    size_t avail_out = compressed.size();
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+  }
+
+  // with streaming
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    SetupEncoder(enc.get(), p, basic_info, number_extra_channels, frame, false);
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    JxlChunkedFrameInputSourceAdapter chunked_frame_adapter(
+        std::move(image.ppf()), std::move(ec_image.ppf()));
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderAddChunkedFrame(
+                                   frame_settings, JXL_TRUE,
+                                   chunked_frame_adapter.GetInputSource()));
+    // JxlEncoderCloseInput(enc.get());
+    uint8_t* next_out = streaming_compressed.data();
+    size_t avail_out = streaming_compressed.size();
+    ProcessEncoder(enc.get(), streaming_compressed, next_out, avail_out);
+  }
+  EXPECT_EQ(streaming_compressed, compressed);
+}
+
+TEST_P(EncoderStreamingTest, ChunkedAndOutputCallback) {
+  const StreamingTestParam p = GetParam();
+  size_t xsize = 257;
+  size_t ysize = 259;
+  size_t number_extra_channels = p.with_extra_channels() ? 5 : 0;
+  jxl::test::TestImage image;
+  SetupImage(p, xsize, ysize, p.color_includes_alpha() ? 4 : 3,
+             p.use_container() ? 16 : 8, image);
+  jxl::test::TestImage ec_image;
+  SetupImage(p, xsize, ysize, 1, 8, ec_image);
+
+  const auto& frame = image.ppf().frames[0].color;
+  const auto& ec_frame = ec_image.ppf().frames[0].color;
+  JxlBasicInfo basic_info = image.ppf().info;
+  SetUpBasicInfo(basic_info, xsize, ysize, number_extra_channels,
+                 p.color_includes_alpha());
+
+  std::vector<uint8_t> compressed = std::vector<uint8_t>(64);
+
+  // without streaming
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    SetupEncoder(enc.get(), p, basic_info, number_extra_channels, frame, false);
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    size_t frame_count = static_cast<int>(p.multiple_frames()) + 1;
+    for (size_t i = 0; i < frame_count; i++) {
+      EXPECT_EQ(JXL_ENC_SUCCESS,
+                JxlEncoderAddImageFrame(frame_settings, &frame.format,
+                                        frame.pixels(), frame.pixels_size));
+      for (size_t i = 0; i < number_extra_channels; i++) {
+        EXPECT_EQ(JXL_ENC_SUCCESS,
+                  JxlEncoderSetExtraChannelBuffer(
+                      frame_settings, &ec_frame.format, ec_frame.pixels(),
+                      ec_frame.pixels_size, i));
+      }
+    }
+    JxlEncoderCloseInput(enc.get());
+    uint8_t* next_out = compressed.data();
+    size_t avail_out = compressed.size();
+    ProcessEncoder(enc.get(), compressed, next_out, avail_out);
+  }
 
-      EXPECT_LE(
-          ComputeDistance2(orig_io.Main(), decoded_io.Main(), jxl::GetJxlCms()),
-          3.5);
+  // with streaming
+  {
+    JxlEncoderPtr enc = JxlEncoderMake(nullptr);
+    ASSERT_NE(nullptr, enc.get());
+    SetupEncoder(enc.get(), p, basic_info, number_extra_channels, frame, false);
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
+    JxlStreamingAdapter streaming_adapter =
+        JxlStreamingAdapter(enc.get(), p.return_large_buffers(), p.can_seek());
+
+    JxlChunkedFrameInputSourceAdapter chunked_frame_adapter(
+        std::move(image.ppf()), std::move(ec_image.ppf()));
+    size_t frame_count = static_cast<int>(p.multiple_frames()) + 1;
+    for (size_t i = 0; i < frame_count; i++) {
+      EXPECT_EQ(JXL_ENC_SUCCESS,
+                JxlEncoderAddChunkedFrame(
+                    // should only set `JXL_TRUE` in the lass pass of the loop
+                    frame_settings, i + 1 == frame_count ? JXL_TRUE : JXL_FALSE,
+                    chunked_frame_adapter.GetInputSource()));
     }
+
+    streaming_adapter.CheckFinalWatermarkPosition();
+    EXPECT_EQ(std::move(streaming_adapter).output(), compressed);
   }
 }
-#endif  // JPEGXL_ENABLE_JPEG
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    EncoderStreamingTest, EncoderStreamingTest,
+    testing::ValuesIn(StreamingTestParam::All()));
index 0043c2d..a90ed02 100644 (file)
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_context_map.h"
 #include "lib/jxl/epf.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_ops.h"
+#include "lib/jxl/pack_signed.h"
 
 namespace jxl {
 
index 9b3ffa3..d32fe1b 100644 (file)
@@ -7,10 +7,10 @@
 
 #include <stdint.h>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/random.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_ans.h"
+#include "lib/jxl/pack_signed.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -24,7 +24,7 @@ TEST(EntropyCoderTest, PackUnpack) {
   }
 }
 
-struct DummyBitReader {
+struct MockBitReader {
   uint32_t nbits, bits;
   void Consume(uint32_t nbits) {}
   uint32_t PeekBits(uint32_t n) {
@@ -45,7 +45,7 @@ void HybridUintRoundtrip(HybridUintConfig config, size_t limit = 1 << 24) {
     config.Encode(integers[i], &token[i], &nbits[i], &bits[i]);
   }
   for (size_t i = 0; i < kNumIntegers; i++) {
-    DummyBitReader br{nbits[i], bits[i]};
+    MockBitReader br{nbits[i], bits[i]};
     EXPECT_EQ(integers[i],
               ANSSymbolReader::ReadHybridUintConfig(config, token[i], &br));
   }
index 7288ed9..930fa72 100644 (file)
@@ -9,7 +9,6 @@
 
 #include <math.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
@@ -22,7 +21,6 @@
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/convolve.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/image.h"
index 06652fc..8193ce7 100644 (file)
@@ -9,8 +9,8 @@
 // Basic parsing of Exif (just enough for the render-impacting things
 // like orientation)
 
-#include "jxl/codestream_header.h"
-#include "lib/jxl/base/padded_bytes.h"
+#include <jxl/codestream_header.h>
+
 #include "lib/jxl/image_metadata.h"
 
 namespace jxl {
@@ -38,9 +38,10 @@ inline size_t FindExifTagPosition(const std::vector<uint8_t>& exif,
   bool bigendian;
   if (!IsExif(exif, &bigendian)) return 0;
   const uint8_t* t = exif.data() + 4;
-  uint32_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t));
+  uint64_t offset = (bigendian ? LoadBE32(t) : LoadLE32(t));
   if (exif.size() < 12 + offset + 2 || offset < 8) return 0;
   t += offset - 4;
+  if (offset + 2 >= exif.size()) return 0;
   uint16_t nb_tags = (bigendian ? LoadBE16(t) : LoadLE16(t));
   t += 2;
   while (nb_tags > 0) {
@@ -54,9 +55,9 @@ inline size_t FindExifTagPosition(const std::vector<uint8_t>& exif,
   return 0;
 }
 
-// TODO (jon): tag 1 can be used to represent Adobe RGB 1998 if it has value
+// TODO(jon): tag 1 can be used to represent Adobe RGB 1998 if it has value
 // "R03"
-// TODO (jon): set intrinsic dimensions according to
+// TODO(jon): set intrinsic dimensions according to
 // https://discourse.wicg.io/t/proposal-exif-image-resolution-auto-and-from-image/4326/24
 // Parses the Exif data just enough to extract any render-impacting info.
 // If the Exif data is invalid or could not be parsed, then it is treated
index 3b5c16b..508d808 100644 (file)
@@ -6,11 +6,11 @@
 #ifndef LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
 #define LIB_JXL_FAKE_PARALLEL_RUNNER_TESTONLY_H_
 
+#include <jxl/parallel_runner.h>
 #include <stdint.h>
 
 #include <vector>
 
-#include "jxl/parallel_runner.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/random.h"
 
index defdfcd..de1f845 100644 (file)
@@ -10,6 +10,7 @@
 #define LIB_JXL_FAST_DCT_INL_H_
 #endif
 
+#include <cmath>
 #include <hwy/aligned_allocator.h>
 #include <hwy/highway.h>
 
@@ -157,7 +158,9 @@ HWY_NOINLINE void TestFastIDCT() {
   auto idct_mem = hwy::AllocateAligned<int16_t>(N * M);
   int16_t* idct = idct_mem.get();
 
-  auto scratch_space_mem = hwy::AllocateAligned<float>(N * M * 2);
+  const HWY_FULL(float) df;
+  auto scratch_space_mem = hwy::AllocateAligned<float>(
+      N * M * 2 + 3 * std::max(N, M) * MaxLanes(df));
   float* scratch_space = scratch_space_mem.get();
   auto scratch_space_i_mem = hwy::AllocateAligned<int16_t>(N * M * 2);
   int16_t* scratch_space_i = scratch_space_i_mem.get();
@@ -204,7 +207,7 @@ HWY_NOINLINE void TestFloatIDCT() {
   auto dct_in_mem = hwy::AllocateAligned<float>(N * M);
   float* dct_in = dct_mem.get();
 
-  auto scratch_space_mem = hwy::AllocateAligned<float>(N * M * 2);
+  auto scratch_space_mem = hwy::AllocateAligned<float>(N * M * 5);
   float* scratch_space = scratch_space_mem.get();
 
   Rng rng(0);
index d9d852f..a55b67a 100644 (file)
@@ -3,8 +3,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <stdio.h>
-
 #include <numeric>
 
 #undef HWY_TARGET_INCLUDE
 #include "lib/jxl/dct-inl.h"
 #include "lib/jxl/fast_dct-inl.h"
 #include "lib/jxl/fast_dct.h"
+#include "lib/jxl/testing.h"
 #include "lib/jxl/transpose-inl.h"
 
 // Test utils
 #include <hwy/highway.h>
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -198,6 +197,7 @@ HWY_NOINLINE void TestFloatIDCT64x64() {
 }
 HWY_NOINLINE void TestFastTranspose64x128() { TestFastTranspose<64, 128>(); }
 HWY_NOINLINE void TestFloatTranspose64x128() { TestFloatTranspose<64, 128>(); }
+/*
 HWY_NOINLINE void TestFastIDCT64x128() { TestFastIDCT<64, 128>(); }
 HWY_NOINLINE void TestFloatIDCT64x128() {
 #if HWY_TARGET == HWY_SCALAR && \
@@ -207,8 +207,10 @@ HWY_NOINLINE void TestFloatIDCT64x128() {
   TestFloatIDCT<64, 128>();
 #endif
 }
+*/
 HWY_NOINLINE void TestFastTranspose128x64() { TestFastTranspose<128, 64>(); }
 HWY_NOINLINE void TestFloatTranspose128x64() { TestFloatTranspose<128, 64>(); }
+/*
 HWY_NOINLINE void TestFastIDCT128x64() { TestFastIDCT<128, 64>(); }
 HWY_NOINLINE void TestFloatIDCT128x64() {
 #if HWY_TARGET == HWY_SCALAR && \
@@ -218,10 +220,12 @@ HWY_NOINLINE void TestFloatIDCT128x64() {
   TestFloatIDCT<128, 64>();
 #endif
 }
+*/
 HWY_NOINLINE void TestFastTranspose128x128() { TestFastTranspose<128, 128>(); }
 HWY_NOINLINE void TestFloatTranspose128x128() {
   TestFloatTranspose<128, 128>();
 }
+/*
 HWY_NOINLINE void TestFastIDCT128x128() { TestFastIDCT<128, 128>(); }
 HWY_NOINLINE void TestFloatIDCT128x128() {
 #if HWY_TARGET == HWY_SCALAR && \
@@ -231,10 +235,12 @@ HWY_NOINLINE void TestFloatIDCT128x128() {
   TestFloatIDCT<128, 128>();
 #endif
 }
+*/
 HWY_NOINLINE void TestFastTranspose128x256() { TestFastTranspose<128, 256>(); }
 HWY_NOINLINE void TestFloatTranspose128x256() {
   TestFloatTranspose<128, 256>();
 }
+/*
 HWY_NOINLINE void TestFastIDCT128x256() { TestFastIDCT<128, 256>(); }
 HWY_NOINLINE void TestFloatIDCT128x256() {
 #if HWY_TARGET == HWY_SCALAR && \
@@ -244,10 +250,12 @@ HWY_NOINLINE void TestFloatIDCT128x256() {
   TestFloatIDCT<128, 256>();
 #endif
 }
+*/
 HWY_NOINLINE void TestFastTranspose256x128() { TestFastTranspose<256, 128>(); }
 HWY_NOINLINE void TestFloatTranspose256x128() {
   TestFloatTranspose<256, 128>();
 }
+/*
 HWY_NOINLINE void TestFastIDCT256x128() { TestFastIDCT<256, 128>(); }
 HWY_NOINLINE void TestFloatIDCT256x128() {
 #if HWY_TARGET == HWY_SCALAR && \
@@ -257,10 +265,12 @@ HWY_NOINLINE void TestFloatIDCT256x128() {
   TestFloatIDCT<256, 128>();
 #endif
 }
+*/
 HWY_NOINLINE void TestFastTranspose256x256() { TestFastTranspose<256, 256>(); }
 HWY_NOINLINE void TestFloatTranspose256x256() {
   TestFloatTranspose<256, 256>();
 }
+/*
 HWY_NOINLINE void TestFastIDCT256x256() { TestFastIDCT<256, 256>(); }
 HWY_NOINLINE void TestFloatIDCT256x256() {
 #if HWY_TARGET == HWY_SCALAR && \
@@ -270,6 +280,7 @@ HWY_NOINLINE void TestFloatIDCT256x256() {
   TestFloatIDCT<256, 256>();
 #endif
 }
+*/
 
 }  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
index 897aadc..868e1b7 100644 (file)
@@ -3,21 +3,21 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <stdio.h>
-
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "lib/jxl/fast_math_test.cc"
+#include <jxl/cms.h>
+
 #include <hwy/foreach_target.h>
 
 #include "lib/jxl/base/random.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
 #include "lib/jxl/dec_xyb-inl.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_xyb.h"
-#include "lib/jxl/transfer_functions-inl.h"
+#include "lib/jxl/testing.h"
 
 // Test utils
 #include <hwy/highway.h>
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -139,38 +139,6 @@ HWY_NOINLINE void TestFastSRGB() {
   printf("max abs err %e\n", static_cast<double>(max_abs_err));
 }
 
-HWY_NOINLINE void TestFastPQEFD() {
-  constexpr size_t kNumTrials = 1 << 23;
-  Rng rng(1);
-  float max_abs_err = 0;
-  HWY_FULL(float) d;
-  for (size_t i = 0; i < kNumTrials; i++) {
-    const float f = rng.UniformF(0.0f, 1.0f);
-    const float actual = GetLane(TF_PQ().EncodedFromDisplay(d, Set(d, f)));
-    const float expected = TF_PQ().EncodedFromDisplay(f);
-    const float abs_err = std::abs(expected - actual);
-    EXPECT_LT(abs_err, 7e-7) << "f = " << f;
-    max_abs_err = std::max(max_abs_err, abs_err);
-  }
-  printf("max abs err %e\n", static_cast<double>(max_abs_err));
-}
-
-HWY_NOINLINE void TestFastHLGEFD() {
-  constexpr size_t kNumTrials = 1 << 23;
-  Rng rng(1);
-  float max_abs_err = 0;
-  HWY_FULL(float) d;
-  for (size_t i = 0; i < kNumTrials; i++) {
-    const float f = rng.UniformF(0.0f, 1.0f);
-    const float actual = GetLane(TF_HLG().EncodedFromDisplay(d, Set(d, f)));
-    const float expected = TF_HLG().EncodedFromDisplay(f);
-    const float abs_err = std::abs(expected - actual);
-    EXPECT_LT(abs_err, 5e-7) << "f = " << f;
-    max_abs_err = std::max(max_abs_err, abs_err);
-  }
-  printf("max abs err %e\n", static_cast<double>(max_abs_err));
-}
-
 HWY_NOINLINE void TestFast709EFD() {
   constexpr size_t kNumTrials = 1 << 23;
   Rng rng(1);
@@ -187,22 +155,6 @@ HWY_NOINLINE void TestFast709EFD() {
   printf("max abs err %e\n", static_cast<double>(max_abs_err));
 }
 
-HWY_NOINLINE void TestFastPQDFE() {
-  constexpr size_t kNumTrials = 1 << 23;
-  Rng rng(1);
-  float max_abs_err = 0;
-  HWY_FULL(float) d;
-  for (size_t i = 0; i < kNumTrials; i++) {
-    const float f = rng.UniformF(0.0f, 1.0f);
-    const float actual = GetLane(TF_PQ().DisplayFromEncoded(d, Set(d, f)));
-    const float expected = TF_PQ().DisplayFromEncoded(f);
-    const float abs_err = std::abs(expected - actual);
-    EXPECT_LT(abs_err, 3E-6) << "f = " << f;
-    max_abs_err = std::max(max_abs_err, abs_err);
-  }
-  printf("max abs err %e\n", static_cast<double>(max_abs_err));
-}
-
 HWY_NOINLINE void TestFastXYB() {
   if (!HasFastXYBTosRGB8()) return;
   ImageMetadata metadata;
@@ -231,7 +183,7 @@ HWY_NOINLINE void TestFastXYB() {
         ib.SetFromImage(std::move(chunk), ColorEncoding::SRGB());
         Image3F xyb(kChunk * kChunk, kChunk);
         std::vector<uint8_t> roundtrip(kChunk * kChunk * kChunk * 3);
-        ToXYB(ib, nullptr, &xyb, GetJxlCms());
+        ToXYB(ib, nullptr, &xyb, *JxlGetDefaultCms());
         for (int y = 0; y < kChunk; y++) {
           const float* xyba[4] = {xyb.PlaneRow(0, y), xyb.PlaneRow(1, y),
                                   xyb.PlaneRow(2, y), nullptr};
@@ -278,9 +230,6 @@ HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastCos);
 HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastErf);
 HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestCubeRoot);
 HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastSRGB);
-HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQDFE);
-HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastPQEFD);
-HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastHLGEFD);
 HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFast709EFD);
 HWY_EXPORT_AND_TEST_P(FastMathTargetTest, TestFastXYB);
 
index 5af749b..613e8fa 100644 (file)
@@ -11,9 +11,9 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <hwy/base.h>
 #include <vector>
 
-#include "hwy/base.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/status.h"
 
index e8d6025..47a7563 100644 (file)
@@ -9,8 +9,8 @@
 
 #include <algorithm>
 #include <cmath>
+#include <hwy/base.h>
 
-#include "hwy/base.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/printf_macros.h"
 
@@ -18,118 +18,7 @@ namespace jxl {
 
 namespace {
 
-// A bundle can be in one of three states concerning extensions: not-begun,
-// active, ended. Bundles may be nested, so we need a stack of states.
-class ExtensionStates {
- public:
-  void Push() {
-    // Initial state = not-begun.
-    begun_ <<= 1;
-    ended_ <<= 1;
-  }
-
-  // Clears current state; caller must check IsEnded beforehand.
-  void Pop() {
-    begun_ >>= 1;
-    ended_ >>= 1;
-  }
-
-  // Returns true if state == active || state == ended.
-  Status IsBegun() const { return (begun_ & 1) != 0; }
-  // Returns true if state != not-begun && state != active.
-  Status IsEnded() const { return (ended_ & 1) != 0; }
-
-  void Begin() {
-    JXL_ASSERT(!IsBegun());
-    JXL_ASSERT(!IsEnded());
-    begun_ += 1;
-  }
-
-  void End() {
-    JXL_ASSERT(IsBegun());
-    JXL_ASSERT(!IsEnded());
-    ended_ += 1;
-  }
-
- private:
-  // Current state := least-significant bit of begun_ and ended_.
-  uint64_t begun_ = 0;
-  uint64_t ended_ = 0;
-};
-
-// Visitors generate Init/AllDefault/Read/Write logic for all fields. Each
-// bundle's VisitFields member function calls visitor->U32 etc. We do not
-// overload operator() because a function name is easier to search for.
-
-class VisitorBase : public Visitor {
- public:
-  explicit VisitorBase() {}
-  ~VisitorBase() override { JXL_ASSERT(depth_ == 0); }
-
-  // This is the only call site of Fields::VisitFields.
-  // Ensures EndExtensions was called.
-  Status Visit(Fields* fields) override {
-    depth_ += 1;
-    JXL_ASSERT(depth_ <= Bundle::kMaxExtensions);
-    extension_states_.Push();
-
-    const Status ok = fields->VisitFields(this);
-
-    if (ok) {
-      // If VisitFields called BeginExtensions, must also call
-      // EndExtensions.
-      JXL_ASSERT(!extension_states_.IsBegun() || extension_states_.IsEnded());
-    } else {
-      // Failed, undefined state: don't care whether EndExtensions was
-      // called.
-    }
-
-    extension_states_.Pop();
-    JXL_ASSERT(depth_ != 0);
-    depth_ -= 1;
-
-    return ok;
-  }
-
-  // For visitors accepting a const Visitor, need to const-cast so we can call
-  // the non-const Visitor::VisitFields. NOTE: C is not modified except the
-  // `all_default` field by CanEncodeVisitor.
-  Status VisitConst(const Fields& t) { return Visit(const_cast<Fields*>(&t)); }
-
-  // Derived types (overridden by InitVisitor because it is unsafe to read
-  // from *value there)
-
-  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
-    uint32_t bits = *value ? 1 : 0;
-    JXL_RETURN_IF_ERROR(Bits(1, static_cast<uint32_t>(default_value), &bits));
-    JXL_DASSERT(bits <= 1);
-    *value = bits == 1;
-    return true;
-  }
-
-  // Overridden by ReadVisitor and WriteVisitor.
-  // Called before any conditional visit based on "extensions".
-  // Overridden by ReadVisitor, CanEncodeVisitor and WriteVisitor.
-  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
-    JXL_RETURN_IF_ERROR(U64(0, extensions));
-
-    extension_states_.Begin();
-    return true;
-  }
-
-  // Called after all extension fields (if any). Although non-extension
-  // fields could be visited afterward, we prefer the convention that
-  // extension fields are always the last to be visited. Overridden by
-  // ReadVisitor.
-  Status EndExtensions() override {
-    extension_states_.End();
-    return true;
-  }
-
- private:
-  size_t depth_ = 0;  // to check nesting
-  ExtensionStates extension_states_;
-};
+using ::jxl::fields_internal::VisitorBase;
 
 struct InitVisitor : public VisitorBase {
   Status Bits(const size_t /*unused*/, const uint32_t default_value,
@@ -336,7 +225,7 @@ class ReadVisitor : public VisitorBase {
     if (pos_after_ext_size_ == 0) return true;
 
     // Not enough bytes as set by BeginExtensions or earlier. Do not return
-    // this as an JXL_FAILURE or false (which can also propagate to error
+    // this as a JXL_FAILURE or false (which can also propagate to error
     // through e.g. JXL_RETURN_IF_ERROR), since this may be used while
     // silently checking whether there are enough bytes. If this case must be
     // treated as an error, reader_>Close() will do this, just like is already
@@ -377,6 +266,8 @@ class ReadVisitor : public VisitorBase {
   uint64_t extension_bits_[Bundle::kMaxExtensions] = {0};
   uint64_t total_extension_bits_ = 0;
   size_t pos_after_ext_size_ = 0;  // 0 iff extensions == 0.
+
+  friend Status jxl::CheckHasEnoughBits(Visitor*, size_t);
 };
 
 class MaxBitsVisitor : public VisitorBase {
@@ -517,82 +408,24 @@ class CanEncodeVisitor : public VisitorBase {
   // including the hidden extension sizes.
   uint64_t pos_after_ext_ = 0;
 };
-
-class WriteVisitor : public VisitorBase {
- public:
-  WriteVisitor(const size_t extension_bits, BitWriter* JXL_RESTRICT writer)
-      : extension_bits_(extension_bits), writer_(writer) {}
-
-  Status Bits(const size_t bits, const uint32_t /*default_value*/,
-              uint32_t* JXL_RESTRICT value) override {
-    ok_ &= BitsCoder::Write(bits, *value, writer_);
-    return true;
-  }
-  Status U32(const U32Enc enc, const uint32_t /*default_value*/,
-             uint32_t* JXL_RESTRICT value) override {
-    ok_ &= U32Coder::Write(enc, *value, writer_);
-    return true;
-  }
-
-  Status U64(const uint64_t /*default_value*/,
-             uint64_t* JXL_RESTRICT value) override {
-    ok_ &= U64Coder::Write(*value, writer_);
-    return true;
-  }
-
-  Status F16(const float /*default_value*/,
-             float* JXL_RESTRICT value) override {
-    ok_ &= F16Coder::Write(*value, writer_);
-    return true;
-  }
-
-  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
-    JXL_QUIET_RETURN_IF_ERROR(VisitorBase::BeginExtensions(extensions));
-    if (*extensions == 0) {
-      JXL_ASSERT(extension_bits_ == 0);
-      return true;
-    }
-    // TODO(janwas): extend API to pass in array of extension_bits, one per
-    // extension. We currently ascribe all bits to the first extension, but
-    // this is only an encoder limitation. NOTE: extension_bits_ can be zero
-    // if an extension does not require any additional fields.
-    ok_ &= U64Coder::Write(extension_bits_, writer_);
-    // For each nonzero bit except the lowest/first (already written):
-    for (uint64_t remaining_extensions = *extensions & (*extensions - 1);
-         remaining_extensions != 0;
-         remaining_extensions &= remaining_extensions - 1) {
-      ok_ &= U64Coder::Write(0, writer_);
-    }
-    return true;
-  }
-  // EndExtensions = default.
-
-  Status OK() const { return ok_; }
-
- private:
-  const size_t extension_bits_;
-  BitWriter* JXL_RESTRICT writer_;
-  bool ok_ = true;
-};
-
 }  // namespace
 
 void Bundle::Init(Fields* fields) {
   InitVisitor visitor;
   if (!visitor.Visit(fields)) {
-    JXL_ABORT("Init should never fail");
+    JXL_UNREACHABLE("Init should never fail");
   }
 }
 void Bundle::SetDefault(Fields* fields) {
   SetDefaultVisitor visitor;
   if (!visitor.Visit(fields)) {
-    JXL_ABORT("SetDefault should never fail");
+    JXL_UNREACHABLE("SetDefault should never fail");
   }
 }
 bool Bundle::AllDefault(const Fields& fields) {
   AllDefaultVisitor visitor;
   if (!visitor.VisitConst(fields)) {
-    JXL_ABORT("AllDefault should never fail");
+    JXL_UNREACHABLE("AllDefault should never fail");
   }
   return visitor.AllDefault();
 }
@@ -627,19 +460,23 @@ bool Bundle::CanRead(BitReader* reader, Fields* fields) {
   // there's an error. Use Read() to determine which error it is.
   return status.code() != StatusCode::kNotEnoughBytes;
 }
-Status Bundle::Write(const Fields& fields, BitWriter* writer, size_t layer,
-                     AuxOut* aux_out) {
-  size_t extension_bits, total_bits;
-  JXL_RETURN_IF_ERROR(CanEncode(fields, &extension_bits, &total_bits));
-
-  BitWriter::Allotment allotment(writer, total_bits);
-  WriteVisitor visitor(extension_bits, writer);
-  JXL_RETURN_IF_ERROR(visitor.VisitConst(fields));
-  JXL_RETURN_IF_ERROR(visitor.OK());
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+
+size_t BitsCoder::MaxEncodedBits(const size_t bits) { return bits; }
+
+Status BitsCoder::CanEncode(const size_t bits, const uint32_t value,
+                            size_t* JXL_RESTRICT encoded_bits) {
+  *encoded_bits = bits;
+  if (value >= (1ULL << bits)) {
+    return JXL_FAILURE("Value %u too large for %" PRIu64 " bits", value,
+                       static_cast<uint64_t>(bits));
+  }
   return true;
 }
 
+uint32_t BitsCoder::Read(const size_t bits, BitReader* JXL_RESTRICT reader) {
+  return reader->ReadBits(bits);
+}
+
 size_t U32Coder::MaxEncodedBits(const U32Enc enc) {
   size_t extra_bits = 0;
   for (uint32_t selector = 0; selector < 4; ++selector) {
@@ -672,25 +509,6 @@ uint32_t U32Coder::Read(const U32Enc enc, BitReader* JXL_RESTRICT reader) {
   }
 }
 
-// Returns false if the value is too large to encode.
-Status U32Coder::Write(const U32Enc enc, const uint32_t value,
-                       BitWriter* JXL_RESTRICT writer) {
-  uint32_t selector;
-  size_t total_bits;
-  JXL_RETURN_IF_ERROR(ChooseSelector(enc, value, &selector, &total_bits));
-
-  writer->Write(2, selector);
-
-  const U32Distr d = enc.GetDistr(selector);
-  if (!d.IsDirect()) {  // Nothing more to write for direct encoding
-    const uint32_t offset = d.Offset();
-    JXL_ASSERT(value >= offset);
-    writer->Write(total_bits - 2, value - offset);
-  }
-
-  return true;
-}
-
 Status U32Coder::ChooseSelector(const U32Enc enc, const uint32_t value,
                                 uint32_t* JXL_RESTRICT selector,
                                 size_t* JXL_RESTRICT total_bits) {
@@ -761,46 +579,6 @@ uint64_t U64Coder::Read(BitReader* JXL_RESTRICT reader) {
   return result;
 }
 
-// Returns false if the value is too large to encode.
-Status U64Coder::Write(uint64_t value, BitWriter* JXL_RESTRICT writer) {
-  if (value == 0) {
-    // Selector: use 0 bits, value 0
-    writer->Write(2, 0);
-  } else if (value <= 16) {
-    // Selector: use 4 bits, value 1..16
-    writer->Write(2, 1);
-    writer->Write(4, value - 1);
-  } else if (value <= 272) {
-    // Selector: use 8 bits, value 17..272
-    writer->Write(2, 2);
-    writer->Write(8, value - 17);
-  } else {
-    // Selector: varint, first a 12-bit group, after that per 8-bit group.
-    writer->Write(2, 3);
-    writer->Write(12, value & 4095);
-    value >>= 12;
-    int shift = 12;
-    while (value > 0 && shift < 60) {
-      // Indicate varint not done
-      writer->Write(1, 1);
-      writer->Write(8, value & 255);
-      value >>= 8;
-      shift += 8;
-    }
-    if (value > 0) {
-      // This only could happen if shift == N - 4.
-      writer->Write(1, 1);
-      writer->Write(4, value & 15);
-      // Implicitly closed sequence, no extra stop bit is required.
-    } else {
-      // Indicate end of varint
-      writer->Write(1, 0);
-    }
-  }
-
-  return true;
-}
-
 // Can always encode, but useful because it also returns bit size.
 Status U64Coder::CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits) {
   if (value == 0) {
@@ -855,46 +633,6 @@ Status F16Coder::Read(BitReader* JXL_RESTRICT reader,
   return true;
 }
 
-Status F16Coder::Write(float value, BitWriter* JXL_RESTRICT writer) {
-  uint32_t bits32;
-  memcpy(&bits32, &value, sizeof(bits32));
-  const uint32_t sign = bits32 >> 31;
-  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
-  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
-
-  const int32_t exp = static_cast<int32_t>(biased_exp32) - 127;
-  if (JXL_UNLIKELY(exp > 15)) {
-    return JXL_FAILURE("Too big to encode, CanEncode should return false");
-  }
-
-  // Tiny or zero => zero.
-  if (exp < -24) {
-    writer->Write(16, 0);
-    return true;
-  }
-
-  uint32_t biased_exp16, mantissa16;
-
-  // exp = [-24, -15] => subnormal
-  if (JXL_UNLIKELY(exp < -14)) {
-    biased_exp16 = 0;
-    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
-    JXL_ASSERT(1 <= sub_exp && sub_exp < 11);
-    mantissa16 = (1 << (10 - sub_exp)) + (mantissa32 >> (13 + sub_exp));
-  } else {
-    // exp = [-14, 15]
-    biased_exp16 = static_cast<uint32_t>(exp + 15);
-    JXL_ASSERT(1 <= biased_exp16 && biased_exp16 < 31);
-    mantissa16 = mantissa32 >> 13;
-  }
-
-  JXL_ASSERT(mantissa16 < 1024);
-  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
-  JXL_ASSERT(bits16 < 0x10000);
-  writer->Write(16, bits16);
-  return true;
-}
-
 Status F16Coder::CanEncode(float value, size_t* JXL_RESTRICT encoded_bits) {
   *encoded_bits = MaxEncodedBits();
   if (std::isnan(value) || std::isinf(value)) {
@@ -903,4 +641,16 @@ Status F16Coder::CanEncode(float value, size_t* JXL_RESTRICT encoded_bits) {
   return std::abs(value) <= 65504.0f;
 }
 
+Status CheckHasEnoughBits(Visitor* visitor, size_t bits) {
+  if (!visitor->IsReading()) return false;
+  ReadVisitor* rv = static_cast<ReadVisitor*>(visitor);
+  size_t have_bits = rv->reader_->TotalBytes() * kBitsPerByte;
+  size_t want_bits = bits + rv->reader_->TotalBitsConsumed();
+  if (have_bits < want_bits) {
+    return JXL_STATUS(StatusCode::kNotEnoughBytes,
+                      "Not enough bytes for header");
+  }
+  return true;
+}
+
 }  // namespace jxl
index 18a57cf..78c10f2 100644 (file)
@@ -11,7 +11,6 @@
 #include <inttypes.h>
 #include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include <cmath>  // abs
 #include <cstdarg>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/field_encodings.h"
 
 namespace jxl {
 
+struct AuxOut;
+struct BitWriter;
+
 // Integer coders: BitsCoder (raw), U32Coder (table), U64Coder (varint).
 
 // Reads/writes a given (fixed) number of bits <= 32.
-class BitsCoder {
- public:
-  static size_t MaxEncodedBits(const size_t bits) { return bits; }
-
-  static Status CanEncode(const size_t bits, const uint32_t value,
-                          size_t* JXL_RESTRICT encoded_bits) {
-    *encoded_bits = bits;
-    if (value >= (1ULL << bits)) {
-      return JXL_FAILURE("Value %u too large for %" PRIu64 " bits", value,
-                         static_cast<uint64_t>(bits));
-    }
-    return true;
-  }
+namespace BitsCoder {
+size_t MaxEncodedBits(size_t bits);
 
-  static uint32_t Read(const size_t bits, BitReader* JXL_RESTRICT reader) {
-    return reader->ReadBits(bits);
-  }
+Status CanEncode(size_t bits, uint32_t value,
+                 size_t* JXL_RESTRICT encoded_bits);
 
-  // Returns false if the value is too large to encode.
-  static Status Write(const size_t bits, const uint32_t value,
-                      BitWriter* JXL_RESTRICT writer) {
-    if (value >= (1ULL << bits)) {
-      return JXL_FAILURE("Value %d too large to encode in %" PRIu64 " bits",
-                         value, static_cast<uint64_t>(bits));
-    }
-    writer->Write(bits, value);
-    return true;
-  }
-};
+uint32_t Read(size_t bits, BitReader* JXL_RESTRICT reader);
+
+// Returns false if the value is too large to encode.
+Status Write(size_t bits, uint32_t value, BitWriter* JXL_RESTRICT writer);
+}  // namespace BitsCoder
 
 // Encodes u32 using a lookup table and/or extra bits, governed by a per-field
 // encoding `enc` which consists of four distributions `d` chosen via a 2-bit
@@ -79,54 +60,47 @@ class BitsCoder {
 //   01x -> 1..2
 //   10xx -> 3..7
 //   11xxxxxxxx -> 8..263
-class U32Coder {
- public:
-  static size_t MaxEncodedBits(U32Enc enc);
-  static Status CanEncode(U32Enc enc, uint32_t value,
-                          size_t* JXL_RESTRICT encoded_bits);
-  static uint32_t Read(U32Enc enc, BitReader* JXL_RESTRICT reader);
+namespace U32Coder {
+size_t MaxEncodedBits(U32Enc enc);
+Status CanEncode(U32Enc enc, uint32_t value, size_t* JXL_RESTRICT encoded_bits);
+uint32_t Read(U32Enc enc, BitReader* JXL_RESTRICT reader);
 
-  // Returns false if the value is too large to encode.
-  static Status Write(U32Enc enc, uint32_t value,
-                      BitWriter* JXL_RESTRICT writer);
+// Returns false if the value is too large to encode.
+Status Write(U32Enc enc, uint32_t value, BitWriter* JXL_RESTRICT writer);
 
- private:
-  static Status ChooseSelector(U32Enc enc, uint32_t value,
-                               uint32_t* JXL_RESTRICT selector,
-                               size_t* JXL_RESTRICT total_bits);
-};
+// "private"
+Status ChooseSelector(U32Enc enc, uint32_t value,
+                      uint32_t* JXL_RESTRICT selector,
+                      size_t* JXL_RESTRICT total_bits);
+}  // namespace U32Coder
 
 // Encodes 64-bit unsigned integers with a fixed distribution, taking 2 bits
 // to encode 0, 6 bits to encode 1 to 16, 10 bits to encode 17 to 272, 15 bits
 // to encode up to 4095, and on the order of log2(value) * 1.125 bits for
 // larger values.
-class U64Coder {
- public:
-  static constexpr size_t MaxEncodedBits() {
-    return 2 + 12 + 6 * (8 + 1) + (4 + 1);
-  }
+namespace U64Coder {
+constexpr size_t MaxEncodedBits() { return 2 + 12 + 6 * (8 + 1) + (4 + 1); }
 
-  static uint64_t Read(BitReader* JXL_RESTRICT reader);
+uint64_t Read(BitReader* JXL_RESTRICT reader);
 
-  // Returns false if the value is too large to encode.
-  static Status Write(uint64_t value, BitWriter* JXL_RESTRICT writer);
+// Returns false if the value is too large to encode.
+Status Write(uint64_t value, BitWriter* JXL_RESTRICT writer);
 
-  // Can always encode, but useful because it also returns bit size.
-  static Status CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits);
-};
+// Can always encode, but useful because it also returns bit size.
+Status CanEncode(uint64_t value, size_t* JXL_RESTRICT encoded_bits);
+}  // namespace U64Coder
 
 // IEEE 754 half-precision (binary16). Refuses to read/write NaN/Inf.
-class F16Coder {
- public:
-  static constexpr size_t MaxEncodedBits() { return 16; }
+namespace F16Coder {
+constexpr size_t MaxEncodedBits() { return 16; }
 
-  // Returns false if the bit representation is NaN or infinity
-  static Status Read(BitReader* JXL_RESTRICT reader, float* JXL_RESTRICT value);
+// Returns false if the bit representation is NaN or infinity
+Status Read(BitReader* JXL_RESTRICT reader, float* JXL_RESTRICT value);
 
-  // Returns false if the value is too large to encode.
-  static Status Write(float value, BitWriter* JXL_RESTRICT writer);
-  static Status CanEncode(float value, size_t* JXL_RESTRICT encoded_bits);
-};
+// Returns false if the value is too large to encode.
+Status Write(float value, BitWriter* JXL_RESTRICT writer);
+Status CanEncode(float value, size_t* JXL_RESTRICT encoded_bits);
+}  // namespace F16Coder
 
 // A "bundle" is a forward- and backward compatible collection of fields.
 // They are used for SizeHeader/FrameHeader/GroupHeader. Bundles can be
@@ -179,48 +153,44 @@ class F16Coder {
 //   }
 //   Note: if extensions are present, AllDefault() == false.
 
-class Bundle {
- public:
-  static constexpr size_t kMaxExtensions = 64;  // bits in u64
-
-  // Initializes fields to the default values. It is not recursive to nested
-  // fields, this function is intended to be called in the constructors so
-  // each nested field will already Init itself.
-  static void Init(Fields* JXL_RESTRICT fields);
+namespace Bundle {
+constexpr size_t kMaxExtensions = 64;  // bits in u64
 
-  // Similar to Init, but recursive to nested fields.
-  static void SetDefault(Fields* JXL_RESTRICT fields);
+// Initializes fields to the default values. It is not recursive to nested
+// fields, this function is intended to be called in the constructors so
+// each nested field will already Init itself.
+void Init(Fields* JXL_RESTRICT fields);
 
-  // Returns whether ALL fields (including `extensions`, if present) are equal
-  // to their default value.
-  static bool AllDefault(const Fields& fields);
+// Similar to Init, but recursive to nested fields.
+void SetDefault(Fields* JXL_RESTRICT fields);
 
-  // Returns max number of bits required to encode a T.
-  static size_t MaxBits(const Fields& fields);
+// Returns whether ALL fields (including `extensions`, if present) are equal
+// to their default value.
+bool AllDefault(const Fields& fields);
 
-  // Returns whether a header's fields can all be encoded, i.e. they have a
-  // valid representation. If so, "*total_bits" is the exact number of bits
-  // required. Called by Write.
-  static Status CanEncode(const Fields& fields,
-                          size_t* JXL_RESTRICT extension_bits,
-                          size_t* JXL_RESTRICT total_bits);
+// Returns max number of bits required to encode a T.
+size_t MaxBits(const Fields& fields);
 
-  static Status Read(BitReader* reader, Fields* JXL_RESTRICT fields);
+// Returns whether a header's fields can all be encoded, i.e. they have a
+// valid representation. If so, "*total_bits" is the exact number of bits
+// required. Called by Write.
+Status CanEncode(const Fields& fields, size_t* JXL_RESTRICT extension_bits,
+                 size_t* JXL_RESTRICT total_bits);
 
-  // Returns whether enough bits are available to fully read this bundle using
-  // Read. Also returns true in case of a codestream error (other than not being
-  // large enough): that means enough bits are available to determine there's an
-  // error, use Read to get such error status.
-  // NOTE: this advances the BitReader, a different one pointing back at the
-  // original bit position in the codestream must be created to use Read after
-  // this.
-  static bool CanRead(BitReader* reader, Fields* JXL_RESTRICT fields);
+Status Read(BitReader* reader, Fields* JXL_RESTRICT fields);
 
-  static Status Write(const Fields& fields, BitWriter* JXL_RESTRICT writer,
-                      size_t layer, AuxOut* aux_out);
+// Returns whether enough bits are available to fully read this bundle using
+// Read. Also returns true in case of a codestream error (other than not being
+// large enough): that means enough bits are available to determine there's an
+// error, use Read to get such error status.
+// NOTE: this advances the BitReader, a different one pointing back at the
+// original bit position in the codestream must be created to use Read after
+// this.
+bool CanRead(BitReader* reader, Fields* JXL_RESTRICT fields);
 
- private:
-};
+Status Write(const Fields& fields, BitWriter* JXL_RESTRICT writer, size_t layer,
+             AuxOut* aux_out);
+}  // namespace Bundle
 
 // Different subclasses of Visitor are passed to implementations of Fields
 // throughout their lifetime. Templates used to be used for this but dynamic
@@ -285,6 +255,123 @@ class Visitor {
   virtual Status EndExtensions() = 0;
 };
 
+namespace fields_internal {
+// A bundle can be in one of three states concerning extensions: not-begun,
+// active, ended. Bundles may be nested, so we need a stack of states.
+class ExtensionStates {
+ public:
+  void Push() {
+    // Initial state = not-begun.
+    begun_ <<= 1;
+    ended_ <<= 1;
+  }
+
+  // Clears current state; caller must check IsEnded beforehand.
+  void Pop() {
+    begun_ >>= 1;
+    ended_ >>= 1;
+  }
+
+  // Returns true if state == active || state == ended.
+  Status IsBegun() const { return (begun_ & 1) != 0; }
+  // Returns true if state != not-begun && state != active.
+  Status IsEnded() const { return (ended_ & 1) != 0; }
+
+  void Begin() {
+    JXL_ASSERT(!IsBegun());
+    JXL_ASSERT(!IsEnded());
+    begun_ += 1;
+  }
+
+  void End() {
+    JXL_ASSERT(IsBegun());
+    JXL_ASSERT(!IsEnded());
+    ended_ += 1;
+  }
+
+ private:
+  // Current state := least-significant bit of begun_ and ended_.
+  uint64_t begun_ = 0;
+  uint64_t ended_ = 0;
+};
+
+// Visitors generate Init/AllDefault/Read/Write logic for all fields. Each
+// bundle's VisitFields member function calls visitor->U32 etc. We do not
+// overload operator() because a function name is easier to search for.
+
+class VisitorBase : public Visitor {
+ public:
+  explicit VisitorBase() {}
+  ~VisitorBase() override { JXL_ASSERT(depth_ == 0); }
+
+  // This is the only call site of Fields::VisitFields.
+  // Ensures EndExtensions was called.
+  Status Visit(Fields* fields) override {
+    depth_ += 1;
+    JXL_ASSERT(depth_ <= Bundle::kMaxExtensions);
+    extension_states_.Push();
+
+    const Status ok = fields->VisitFields(this);
+
+    if (ok) {
+      // If VisitFields called BeginExtensions, must also call
+      // EndExtensions.
+      JXL_ASSERT(!extension_states_.IsBegun() || extension_states_.IsEnded());
+    } else {
+      // Failed, undefined state: don't care whether EndExtensions was
+      // called.
+    }
+
+    extension_states_.Pop();
+    JXL_ASSERT(depth_ != 0);
+    depth_ -= 1;
+
+    return ok;
+  }
+
+  // For visitors accepting a const Visitor, need to const-cast so we can call
+  // the non-const Visitor::VisitFields. NOTE: C is not modified except the
+  // `all_default` field by CanEncodeVisitor.
+  Status VisitConst(const Fields& t) { return Visit(const_cast<Fields*>(&t)); }
+
+  // Derived types (overridden by InitVisitor because it is unsafe to read
+  // from *value there)
+
+  Status Bool(bool default_value, bool* JXL_RESTRICT value) override {
+    uint32_t bits = *value ? 1 : 0;
+    JXL_RETURN_IF_ERROR(Bits(1, static_cast<uint32_t>(default_value), &bits));
+    JXL_DASSERT(bits <= 1);
+    *value = bits == 1;
+    return true;
+  }
+
+  // Overridden by ReadVisitor and WriteVisitor.
+  // Called before any conditional visit based on "extensions".
+  // Overridden by ReadVisitor, CanEncodeVisitor and WriteVisitor.
+  Status BeginExtensions(uint64_t* JXL_RESTRICT extensions) override {
+    JXL_RETURN_IF_ERROR(U64(0, extensions));
+
+    extension_states_.Begin();
+    return true;
+  }
+
+  // Called after all extension fields (if any). Although non-extension
+  // fields could be visited afterward, we prefer the convention that
+  // extension fields are always the last to be visited. Overridden by
+  // ReadVisitor.
+  Status EndExtensions() override {
+    extension_states_.End();
+    return true;
+  }
+
+ private:
+  size_t depth_ = 0;  // to check nesting
+  ExtensionStates extension_states_;
+};
+}  // namespace fields_internal
+
+Status CheckHasEnoughBits(Visitor* visitor, size_t bits);
+
 }  // namespace jxl
 
 #endif  // LIB_JXL_FIELDS_H_
index c11b052..b178a6b 100644 (file)
 #include <array>
 #include <utility>
 
-#include "gtest/gtest.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/headers.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
 
 // Ensures `value` round-trips and in exactly `expected_bits_written`.
 void TestU32Coder(const uint32_t value, const size_t expected_bits_written) {
-  U32Coder coder;
   const U32Enc enc(Val(0), Bits(4), Val(0x7FFFFFFF), Bits(32));
 
   BitWriter writer;
@@ -32,16 +31,16 @@ void TestU32Coder(const uint32_t value, const size_t expected_bits_written) {
       &writer, RoundUpBitsToByteMultiple(U32Coder::MaxEncodedBits(enc)));
 
   size_t precheck_pos;
-  EXPECT_TRUE(coder.CanEncode(enc, value, &precheck_pos));
+  EXPECT_TRUE(U32Coder::CanEncode(enc, value, &precheck_pos));
   EXPECT_EQ(expected_bits_written, precheck_pos);
 
-  EXPECT_TRUE(coder.Write(enc, value, &writer));
+  EXPECT_TRUE(U32Coder::Write(enc, value, &writer));
   EXPECT_EQ(expected_bits_written, writer.BitsWritten());
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
 
   BitReader reader(writer.GetSpan());
-  const uint32_t decoded_value = coder.Read(enc, &reader);
+  const uint32_t decoded_value = U32Coder::Read(enc, &reader);
   EXPECT_EQ(value, decoded_value);
   EXPECT_TRUE(reader.Close());
 }
@@ -58,24 +57,22 @@ TEST(FieldsTest, U32CoderTest) {
 }
 
 void TestU64Coder(const uint64_t value, const size_t expected_bits_written) {
-  U64Coder coder;
-
   BitWriter writer;
   BitWriter::Allotment allotment(
       &writer, RoundUpBitsToByteMultiple(U64Coder::MaxEncodedBits()));
 
   size_t precheck_pos;
-  EXPECT_TRUE(coder.CanEncode(value, &precheck_pos));
+  EXPECT_TRUE(U64Coder::CanEncode(value, &precheck_pos));
   EXPECT_EQ(expected_bits_written, precheck_pos);
 
-  EXPECT_TRUE(coder.Write(value, &writer));
+  EXPECT_TRUE(U64Coder::Write(value, &writer));
   EXPECT_EQ(expected_bits_written, writer.BitsWritten());
 
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
 
   BitReader reader(writer.GetSpan());
-  const uint64_t decoded_value = coder.Read(&reader);
+  const uint64_t decoded_value = U64Coder::Read(&reader);
   EXPECT_EQ(value, decoded_value);
   EXPECT_TRUE(reader.Close());
 }
@@ -160,25 +157,23 @@ TEST(FieldsTest, U64CoderTest) {
 }
 
 Status TestF16Coder(const float value) {
-  F16Coder coder;
-
   size_t max_encoded_bits;
   // It is not a fatal error if it can't be encoded.
-  if (!coder.CanEncode(value, &max_encoded_bits)) return false;
+  if (!F16Coder::CanEncode(value, &max_encoded_bits)) return false;
   EXPECT_EQ(F16Coder::MaxEncodedBits(), max_encoded_bits);
 
   BitWriter writer;
   BitWriter::Allotment allotment(&writer,
                                  RoundUpBitsToByteMultiple(max_encoded_bits));
 
-  EXPECT_TRUE(coder.Write(value, &writer));
+  EXPECT_TRUE(F16Coder::Write(value, &writer));
   EXPECT_EQ(F16Coder::MaxEncodedBits(), writer.BitsWritten());
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, 0, nullptr);
+  allotment.ReclaimAndCharge(&writer, 0, nullptr);
 
   BitReader reader(writer.GetSpan());
   float decoded_value;
-  EXPECT_TRUE(coder.Read(&reader, &decoded_value));
+  EXPECT_TRUE(F16Coder::Read(&reader, &decoded_value));
   // All values we test can be represented exactly.
   EXPECT_EQ(value, decoded_value);
   EXPECT_TRUE(reader.Close());
@@ -365,7 +360,7 @@ TEST(FieldsTest, TestNewDecoderOldData) {
                                  kMaxOutBytes * kBitsPerByte - total_bits);
   writer.Write(20, 0xA55A);  // sentinel
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, kLayerHeader, nullptr);
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr);
 
   ASSERT_LE(writer.GetSpan().size(), kMaxOutBytes);
   BitReader reader(writer.GetSpan());
@@ -412,7 +407,7 @@ TEST(FieldsTest, TestOldDecoderNewData) {
   // Ensure Read skips the additional fields
   writer.Write(20, 0xA55A);  // sentinel
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, kLayerHeader, nullptr);
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, nullptr);
 
   BitReader reader(writer.GetSpan());
   OldBundle old_bundle;
diff --git a/lib/jxl/frame_dimensions.h b/lib/jxl/frame_dimensions.h
new file mode 100644 (file)
index 0000000..4886be1
--- /dev/null
@@ -0,0 +1,92 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_FRAME_DIMENSIONS_H_
+#define LIB_JXL_FRAME_DIMENSIONS_H_
+
+// FrameDimensions struct, block and group dimensions constants.
+
+#include <cstddef>
+
+#include "lib/jxl/base/common.h"
+
+namespace jxl {
+// Some enums and typedefs used by more than one header file.
+
+// Block is the square grid of pixels to which an "energy compaction"
+// transformation (e.g. DCT) is applied. Each block has its own AC quantizer.
+constexpr size_t kBlockDim = 8;
+
+constexpr size_t kDCTBlockSize = kBlockDim * kBlockDim;
+
+constexpr size_t kGroupDim = 256;
+static_assert(kGroupDim % kBlockDim == 0,
+              "Group dim should be divisible by block dim");
+constexpr size_t kGroupDimInBlocks = kGroupDim / kBlockDim;
+
+// Dimensions of a frame, in pixels, and other derived dimensions.
+// Computed from FrameHeader.
+// TODO(veluca): add extra channels.
+struct FrameDimensions {
+  void Set(size_t xsize, size_t ysize, size_t group_size_shift,
+           size_t max_hshift, size_t max_vshift, bool modular_mode,
+           size_t upsampling) {
+    group_dim = (kGroupDim >> 1) << group_size_shift;
+    dc_group_dim = group_dim * kBlockDim;
+    xsize_upsampled = xsize;
+    ysize_upsampled = ysize;
+    this->xsize = DivCeil(xsize, upsampling);
+    this->ysize = DivCeil(ysize, upsampling);
+    xsize_blocks = DivCeil(this->xsize, kBlockDim << max_hshift) << max_hshift;
+    ysize_blocks = DivCeil(this->ysize, kBlockDim << max_vshift) << max_vshift;
+    xsize_padded = xsize_blocks * kBlockDim;
+    ysize_padded = ysize_blocks * kBlockDim;
+    if (modular_mode) {
+      // Modular mode doesn't have any padding.
+      xsize_padded = this->xsize;
+      ysize_padded = this->ysize;
+    }
+    xsize_upsampled_padded = xsize_padded * upsampling;
+    ysize_upsampled_padded = ysize_padded * upsampling;
+    xsize_groups = DivCeil(this->xsize, group_dim);
+    ysize_groups = DivCeil(this->ysize, group_dim);
+    xsize_dc_groups = DivCeil(xsize_blocks, group_dim);
+    ysize_dc_groups = DivCeil(ysize_blocks, group_dim);
+    num_groups = xsize_groups * ysize_groups;
+    num_dc_groups = xsize_dc_groups * ysize_dc_groups;
+  }
+
+  // Image size without any upsampling, i.e. original_size / upsampling.
+  size_t xsize;
+  size_t ysize;
+  // Original image size.
+  size_t xsize_upsampled;
+  size_t ysize_upsampled;
+  // Image size after upsampling the padded image.
+  size_t xsize_upsampled_padded;
+  size_t ysize_upsampled_padded;
+  // Image size after padding to a multiple of kBlockDim (if VarDCT mode).
+  size_t xsize_padded;
+  size_t ysize_padded;
+  // Image size in kBlockDim blocks.
+  size_t xsize_blocks;
+  size_t ysize_blocks;
+  // Image size in number of groups.
+  size_t xsize_groups;
+  size_t ysize_groups;
+  // Image size in number of DC groups.
+  size_t xsize_dc_groups;
+  size_t ysize_dc_groups;
+  // Number of AC or DC groups.
+  size_t num_groups;
+  size_t num_dc_groups;
+  // Size of a group.
+  size_t group_dim;
+  size_t dc_group_dim;
+};
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_FRAME_DIMENSIONS_H_
index e69a12c..824aa8e 100644 (file)
@@ -7,15 +7,16 @@
 
 #include <sstream>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/fields.h"
+#include "lib/jxl/pack_signed.h"
 
 namespace jxl {
 
-constexpr uint8_t YCbCrChromaSubsampling::kHShift[];
-constexpr uint8_t YCbCrChromaSubsampling::kVShift[];
+constexpr uint8_t YCbCrChromaSubsampling::kHShift[] = {0, 1, 1, 0};
+constexpr uint8_t YCbCrChromaSubsampling::kVShift[] = {0, 1, 0, 1};
 
 static Status VisitBlendMode(Visitor* JXL_RESTRICT visitor,
                              BlendMode default_value, BlendMode* blend_mode) {
@@ -79,6 +80,27 @@ Status BlendingInfo::VisitFields(Visitor* JXL_RESTRICT visitor) {
   return true;
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
+std::string BlendingInfo::DebugString() const {
+  std::ostringstream os;
+  os << (mode == BlendMode::kReplace            ? "Replace"
+         : mode == BlendMode::kAdd              ? "Add"
+         : mode == BlendMode::kBlend            ? "Blend"
+         : mode == BlendMode::kAlphaWeightedAdd ? "AlphaWeightedAdd"
+                                                : "Mul");
+  if (nonserialized_num_extra_channels > 0 &&
+      (mode == BlendMode::kBlend || mode == BlendMode::kAlphaWeightedAdd)) {
+    os << ",alpha=" << alpha_channel << ",clamp=" << clamp;
+  } else if (mode == BlendMode::kMul) {
+    os << ",clamp=" << clamp;
+  }
+  if (mode != BlendMode::kReplace || nonserialized_is_partial_frame) {
+    os << ",source=" << source;
+  }
+  return os.str();
+}
+#endif
+
 AnimationFrame::AnimationFrame(const CodecMetadata* metadata)
     : nonserialized_metadata(metadata) {
   Bundle::Init(this);
@@ -142,6 +164,7 @@ Status Passes::VisitFields(Visitor* JXL_RESTRICT visitor) {
   return true;
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
 std::string Passes::DebugString() const {
   std::ostringstream os;
   os << "p=" << num_passes;
@@ -165,6 +188,7 @@ std::string Passes::DebugString() const {
   }
   return os.str();
 }
+#endif
 
 FrameHeader::FrameHeader(const CodecMetadata* metadata)
     : animation_frame(metadata), nonserialized_metadata(metadata) {
@@ -176,11 +200,6 @@ Status ReadFrameHeader(BitReader* JXL_RESTRICT reader,
   return Bundle::Read(reader, frame);
 }
 
-Status WriteFrameHeader(const FrameHeader& frame,
-                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out) {
-  return Bundle::Write(frame, writer, kLayerHeader, aux_out);
-}
-
 Status FrameHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
   if (visitor->AllDefault(*this, &all_default)) {
     // Overwrite all serialized fields, but not any nonserialized_*.
@@ -406,6 +425,7 @@ Status FrameHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
   return visitor->EndExtensions();
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
 std::string FrameHeader::DebugString() const {
   std::ostringstream os;
   os << (encoding == FrameEncoding::kVarDCT ? "VarDCT" : "Modular");
@@ -462,9 +482,21 @@ std::string FrameHeader::DebugString() const {
   if (loop_filter.gab) os << ",Gaborish";
   if (loop_filter.epf_iters > 0) os << ",epf=" << loop_filter.epf_iters;
   if (animation_frame.duration > 0) os << ",dur=" << animation_frame.duration;
+  if (frame_type == FrameType::kRegularFrame ||
+      frame_type == FrameType::kSkipProgressive) {
+    os << ",";
+    os << blending_info.DebugString();
+    for (size_t i = 0; i < extra_channel_blending_info.size(); ++i) {
+      os << (i == 0 ? "[" : ";");
+      os << extra_channel_blending_info[i].DebugString();
+      if (i + 1 == extra_channel_blending_info.size()) os << "]";
+    }
+  }
   if (save_as_reference > 0) os << ",ref=" << save_as_reference;
+  os << "," << (save_before_color_transform ? "before" : "after") << "_ct";
   if (is_last) os << ",last";
   return os.str();
 }
+#endif
 
 }  // namespace jxl
index 7eb2f35..b246bf8 100644 (file)
 #include <stddef.h>
 #include <stdint.h>
 
+#include <algorithm>
 #include <string>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/image_metadata.h"
 #include "lib/jxl/loop_filter.h"
 
 namespace jxl {
 
+// TODO(eustas): move to proper place?
 // Also used by extra channel names.
 static inline Status VisitNameString(Visitor* JXL_RESTRICT visitor,
                                      std::string* name) {
@@ -116,37 +116,36 @@ struct YCbCrChromaSubsampling : public Fields {
   }
 
   bool Is444() const {
-    for (size_t c : {0, 2}) {
-      if (channel_mode_[c] != channel_mode_[1]) {
-        return false;
-      }
-    }
-    return true;
+    return HShift(0) == 0 && VShift(0) == 0 &&  // Cb
+           HShift(2) == 0 && VShift(2) == 0 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
   }
 
   bool Is420() const {
-    return channel_mode_[0] == 1 && channel_mode_[1] == 0 &&
-           channel_mode_[2] == 1;
+    return HShift(0) == 1 && VShift(0) == 1 &&  // Cb
+           HShift(2) == 1 && VShift(2) == 1 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
   }
 
   bool Is422() const {
-    for (size_t c : {0, 2}) {
-      if (kHShift[channel_mode_[c]] == kHShift[channel_mode_[1]] + 1 &&
-          kVShift[channel_mode_[c]] == kVShift[channel_mode_[1]]) {
-        return false;
-      }
-    }
-    return true;
+    return HShift(0) == 1 && VShift(0) == 0 &&  // Cb
+           HShift(2) == 1 && VShift(2) == 0 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
   }
 
   bool Is440() const {
-    for (size_t c : {0, 2}) {
-      if (kHShift[channel_mode_[c]] == kHShift[channel_mode_[1]] &&
-          kVShift[channel_mode_[c]] == kVShift[channel_mode_[1]] + 1) {
-        return false;
-      }
-    }
-    return true;
+    return HShift(0) == 0 && VShift(0) == 1 &&  // Cb
+           HShift(2) == 0 && VShift(2) == 1 &&  // Cr
+           HShift(1) == 0 && VShift(1) == 0;    // Y
+  }
+
+  std::string DebugString() const {
+    if (Is444()) return "444";
+    if (Is420()) return "420";
+    if (Is422()) return "422";
+    if (Is440()) return "440";
+    return "cs" + std::to_string(channel_mode_[0]) +
+           std::to_string(channel_mode_[1]) + std::to_string(channel_mode_[2]);
   }
 
  private:
@@ -158,8 +157,8 @@ struct YCbCrChromaSubsampling : public Fields {
       maxvs_ = std::max(maxvs_, kVShift[channel_mode_[i]]);
     }
   }
-  static constexpr uint8_t kHShift[4] = {0, 1, 1, 0};
-  static constexpr uint8_t kVShift[4] = {0, 1, 0, 1};
+  static const uint8_t kHShift[4];
+  static const uint8_t kVShift[4];
   uint32_t channel_mode_[3];
   uint8_t maxhs_;
   uint8_t maxvs_;
@@ -220,6 +219,8 @@ struct BlendingInfo : public Fields {
   // Frame ID to copy from (0-3). Only encoded if blend_mode is not kReplace.
   uint32_t source;
 
+  std::string DebugString() const;
+
   size_t nonserialized_num_extra_channels = 0;
   bool nonserialized_is_partial_frame = false;
 };
@@ -493,9 +494,6 @@ struct FrameHeader : public Fields {
 Status ReadFrameHeader(BitReader* JXL_RESTRICT reader,
                        FrameHeader* JXL_RESTRICT frame);
 
-Status WriteFrameHeader(const FrameHeader& frame,
-                        BitWriter* JXL_RESTRICT writer, AuxOut* aux_out);
-
 // Shared by enc/dec. 5F and 13 are by far the most common for d1/2/4/8, 0
 // ensures low overhead for small images.
 static constexpr U32Enc kOrderEnc =
diff --git a/lib/jxl/gaborish.cc b/lib/jxl/gaborish.cc
deleted file mode 100644 (file)
index 6a187c4..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/gaborish.h"
-
-#include <stddef.h>
-
-#include <hwy/base.h>
-
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/convolve.h"
-#include "lib/jxl/image_ops.h"
-
-namespace jxl {
-
-void GaborishInverse(Image3F* in_out, float mul, ThreadPool* pool) {
-  JXL_ASSERT(mul >= 0.0f);
-
-  // Only an approximation. One or even two 3x3, and rank-1 (separable) 5x5
-  // are insufficient.
-  constexpr float kGaborish[5] = {
-      -0.092359145662814029f,  -0.039253623634014627f, 0.016176494530216929f,
-      0.00083458437774987476f, 0.004512465323949319f,
-  };
-  /*
-    better would be:
-      1.0 - mul * (4 * (kGaborish[0] + kGaborish[1] +
-                        kGaborish[2] + kGaborish[4]) +
-                   8 * (kGaborish[3]));
-  */
-  WeightsSymmetric5 weights = {{HWY_REP4(1.0f)},
-                               {HWY_REP4(mul * kGaborish[0])},
-                               {HWY_REP4(mul * kGaborish[2])},
-                               {HWY_REP4(mul * kGaborish[1])},
-                               {HWY_REP4(mul * kGaborish[4])},
-                               {HWY_REP4(mul * kGaborish[3])}};
-  double sum = static_cast<double>(weights.c[0]);
-  sum += 4 * weights.r[0];
-  sum += 4 * weights.R[0];
-  sum += 4 * weights.d[0];
-  sum += 4 * weights.D[0];
-  sum += 8 * weights.L[0];
-  const float normalize = static_cast<float>(1.0 / sum);
-  for (size_t i = 0; i < 4; ++i) {
-    weights.c[i] *= normalize;
-    weights.r[i] *= normalize;
-    weights.R[i] *= normalize;
-    weights.d[i] *= normalize;
-    weights.D[i] *= normalize;
-    weights.L[i] *= normalize;
-  }
-
-  // Reduce memory footprint by only allocating a single plane and swapping it
-  // into the output Image3F. Better still would be tiling.
-  // Note that we cannot *allocate* a plane, as doing so might cause Image3F to
-  // have planes of different stride. Instead, we copy one plane in a temporary
-  // image and reuse the existing planes of the in/out image.
-  ImageF temp = CopyImage(in_out->Plane(2));
-  Symmetric5(in_out->Plane(0), Rect(*in_out), weights, pool, &in_out->Plane(2));
-  Symmetric5(in_out->Plane(1), Rect(*in_out), weights, pool, &in_out->Plane(0));
-  Symmetric5(temp, Rect(*in_out), weights, pool, &in_out->Plane(1));
-  // Now planes are 1, 2, 0.
-  in_out->Plane(0).Swap(in_out->Plane(1));
-  // 2 1 0
-  in_out->Plane(0).Swap(in_out->Plane(2));
-}
-
-}  // namespace jxl
index d17ce89..131ec4f 100644 (file)
@@ -7,8 +7,8 @@
 
 #include <algorithm>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/enc_gamma_correct.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
index 930ffb4..9edd995 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/matrix_ops.h"
 #include "lib/jxl/image_ops.h"
-#include "lib/jxl/linalg.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -217,7 +216,8 @@ struct OutputStore {
   void operator()(const V& out, float* JXL_RESTRICT pos,
                   ptrdiff_t offset) const {
     // Stream helps for large images but is slower for images that fit in cache.
-    Store(out, HWY_FULL(float)(), pos + offset);
+    const HWY_FULL(float) df;
+    Store(out, df, pos + offset);
   }
 };
 
@@ -227,7 +227,8 @@ class SingleInput {
  public:
   explicit SingleInput(const float* pos) : pos_(pos) {}
   Vec<HWY_FULL(float)> operator()(const size_t offset) const {
-    return Load(HWY_FULL(float)(), pos_ + offset);
+    const HWY_FULL(float) df;
+    return Load(df, pos_ + offset);
   }
   const float* pos_;
 };
@@ -238,8 +239,9 @@ class TwoInputs {
  public:
   TwoInputs(const float* pos1, const float* pos2) : pos1_(pos1), pos2_(pos2) {}
   Vec<HWY_FULL(float)> operator()(const size_t offset) const {
-    const auto in1 = Load(HWY_FULL(float)(), pos1_ + offset);
-    const auto in2 = Load(HWY_FULL(float)(), pos2_ + offset);
+    const HWY_FULL(float) df;
+    const auto in1 = Load(df, pos1_ + offset);
+    const auto in2 = Load(df, pos2_ + offset);
     return Add(in1, in2);
   }
 
@@ -377,11 +379,11 @@ void VerticalStrip(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
 void FastGaussianVertical(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                           const ImageF& in, ThreadPool* /*pool*/,
                           ImageF* JXL_RESTRICT out) {
-  PROFILER_FUNC;
   JXL_CHECK(SameSize(in, *out));
 
+  const HWY_FULL(float) df;
   constexpr size_t kCacheLineLanes = 64 / sizeof(float);
-  constexpr size_t kVN = MaxLanes(HWY_FULL(float)());
+  constexpr size_t kVN = MaxLanes(df);
   constexpr size_t kCacheLineVectors =
       (kVN < kCacheLineLanes) ? (kCacheLineLanes / kVN) : 4;
   constexpr size_t kFastPace = kCacheLineVectors * kVN;
@@ -497,7 +499,6 @@ ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
 // Implements "Recursive Implementation of the Gaussian Filter Using Truncated
 // Cosine Functions" by Charalampidis [2016].
 hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma) {
-  PROFILER_FUNC;
   auto rg = hwy::MakeUniqueAligned<RecursiveGaussian>();
   constexpr double kPi = 3.141592653589793238;
 
@@ -542,7 +543,7 @@ hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma) {
   const double gamma[3] = {1, radius * radius - sigma * sigma,  // (55)
                            zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2]};
   double beta[3];
-  MatMul(A, gamma, 3, 3, 1, beta);  // (53)
+  Mul3x3Vector(A, gamma, beta);  // (53)
 
   // Sanity check: correctly solved for beta (IIR filter weights are normalized)
   const double sum = beta[0] * p_1 + beta[1] * p_3 + beta[2] * p_5;  // (39)
@@ -595,7 +596,6 @@ namespace {
 void FastGaussianHorizontal(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
                             const ImageF& in, ThreadPool* pool,
                             ImageF* JXL_RESTRICT out) {
-  PROFILER_FUNC;
   JXL_CHECK(SameSize(in, *out));
 
   const intptr_t xsize = in.xsize();
index 2aa94f7..097c1aa 100644 (file)
@@ -9,12 +9,12 @@
 #include <hwy/targets.h>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/extras/time.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/convolve.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 
@@ -132,7 +132,8 @@ void TestDirac2D(size_t xsize, size_t ysize, double sigma) {
 
   const double max_l1 = sigma < 1.5 ? 5E-3 : 6E-4;
   const size_t border = 2 * sigma;
-  VerifyRelativeError(expected, out, max_l1, 1E-8, border);
+
+  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, 1E-8, _, border));
 }
 
 TEST(GaussBlurTest, Test2D) {
@@ -204,7 +205,7 @@ void TestRandom(size_t xsize, size_t ysize, float min, float max, double sigma,
       GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
   const ImageF expected = Convolve(in, kernel);
 
-  VerifyRelativeError(expected, out, max_l1, max_rel, border);
+  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, max_rel, _, border));
 }
 
 void TestRandomForSizes(float min, float max, double sigma) {
index 0351904..fe0943f 100644 (file)
@@ -3,6 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/cms.h>
 #include <math.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <array>
 #include <utility>
 
-#include "gtest/gtest.h"
-#include "lib/jxl/aux_out.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_file.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
+
+struct AuxOut;
+
 namespace {
 
 // Returns distance of point p to line p0..p1, the result is signed and is not
@@ -159,14 +157,12 @@ void TestGradient(ThreadPool* pool, uint32_t color0, uint32_t color1,
 
   CodecInOut io2;
 
-  PaddedBytes compressed;
-  AuxOut* aux_out = nullptr;
+  std::vector<uint8_t> compressed;
   PassesEncoderState enc_state;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
-  EXPECT_TRUE(
-      io2.Main().TransformTo(io2.metadata.m.color_encoding, GetJxlCms(), pool));
+  EXPECT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed, pool));
+  EXPECT_TRUE(test::DecodeFile({}, Bytes(compressed), &io2, pool));
+  EXPECT_TRUE(io2.Main().TransformTo(io2.metadata.m.color_encoding,
+                                     *JxlGetDefaultCms(), pool));
 
   if (use_gradient) {
     // Test that the gradient map worked. For that, we take a second derivative
@@ -190,13 +186,13 @@ void TestGradient(ThreadPool* pool, uint32_t color0, uint32_t color1,
 static constexpr bool fast_mode = true;
 
 TEST(GradientTest, SteepGradient) {
-  ThreadPoolInternal pool(8);
+  test::ThreadPoolForTests pool(8);
   // Relatively steep gradients, colors from the sky of stp.png
   TestGradient(&pool, 0xd99d58, 0x889ab1, 512, 512, 90, fast_mode, 3.0);
 }
 
 TEST(GradientTest, SubtleGradient) {
-  ThreadPoolInternal pool(8);
+  test::ThreadPoolForTests pool(8);
   // Very subtle gradient
   TestGradient(&pool, 0xb89b7b, 0xa89b8d, 512, 512, 90, fast_mode, 4.0);
 }
index 7c560e5..db88147 100644 (file)
@@ -5,9 +5,8 @@
 
 #include "lib/jxl/headers.h"
 
-#include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 
 namespace jxl {
 namespace {
@@ -192,9 +191,4 @@ Status ReadSizeHeader(BitReader* JXL_RESTRICT reader,
   return Bundle::Read(reader, size);
 }
 
-Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
-                       size_t layer, AuxOut* aux_out) {
-  return Bundle::Write(size, writer, layer, aux_out);
-}
-
 }  // namespace jxl
index a9be252..3cce84d 100644 (file)
@@ -6,16 +6,14 @@
 #ifndef LIB_JXL_HEADERS_H_
 #define LIB_JXL_HEADERS_H_
 
-// Codestream headers, also stored in CodecInOut.
+// Codestream headers.
 
 #include <stddef.h>
 #include <stdint.h>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/field_encodings.h"
 
 namespace jxl {
@@ -94,9 +92,6 @@ struct AnimationHeader : public Fields {
 Status ReadSizeHeader(BitReader* JXL_RESTRICT reader,
                       SizeHeader* JXL_RESTRICT size);
 
-Status WriteSizeHeader(const SizeHeader& size, BitWriter* JXL_RESTRICT writer,
-                       size_t layer, AuxOut* aux_out);
-
 }  // namespace jxl
 
 #endif  // LIB_JXL_HEADERS_H_
index 9b2e8ea..e25d931 100644 (file)
@@ -5,7 +5,7 @@
 
 #include "lib/jxl/base/iaca.h"
 
-#include "gtest/gtest.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
index dd83fbe..a1f118e 100644 (file)
 #include <string>
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/icc_codec_common.h"
+#include "lib/jxl/padded_bytes.h"
 
 namespace jxl {
 namespace {
@@ -53,6 +51,19 @@ void Shuffle(uint8_t* data, size_t size, size_t width) {
 //               63-bit values).
 constexpr const size_t kPreambleSize = 22;  // enough for reading 2 VarInts
 
+uint64_t DecodeVarInt(const uint8_t* input, size_t inputSize, size_t* pos) {
+  size_t i;
+  uint64_t ret = 0;
+  for (i = 0; *pos + i < inputSize && i < 10; ++i) {
+    ret |= uint64_t(input[*pos + i] & 127) << uint64_t(7 * i);
+    // If the next-byte flag is not set, stop
+    if ((input[*pos + i] & 128) == 0) break;
+  }
+  // TODO(user): Return a decoding error if i == 10.
+  *pos += i + 1;
+  return ret;
+}
+
 }  // namespace
 
 // Mimics the beginning of UnpredictICC for quick validity check.
@@ -98,7 +109,8 @@ Status UnpredictICC(const uint8_t* enc, size_t size, PaddedBytes* result) {
   pos = commands_end;  // pos in data stream
 
   // Header
-  PaddedBytes header = ICCInitialHeaderPrediction();
+  PaddedBytes header;
+  header.append(ICCInitialHeaderPrediction());
   EncodeUint32(0, osize, &header);
   for (size_t i = 0; i <= kICCHeaderSize; i++) {
     if (result->size() == osize) {
@@ -380,12 +392,4 @@ Status ICCReader::CheckEOI(BitReader* reader) {
                     "Not enough bytes for reading ICC profile");
 }
 
-Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc,
-               size_t output_limit) {
-  ICCReader icc_reader;
-  JXL_RETURN_IF_ERROR(icc_reader.Init(reader, output_limit));
-  JXL_RETURN_IF_ERROR(icc_reader.Process(reader, icc));
-  return true;
-}
-
 }  // namespace jxl
index d55b316..87e523a 100644 (file)
@@ -8,24 +8,17 @@
 
 // Compressed representation of ICC profiles.
 
-#include <stddef.h>
-#include <stdint.h>
+#include <cstddef>
+#include <cstdint>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/padded_bytes.h"
 
 namespace jxl {
 
-// Should still be called if `icc.empty()` - if so, writes only 1 bit.
-Status WriteICC(const PaddedBytes& icc, BitWriter* JXL_RESTRICT writer,
-                size_t layer, AuxOut* JXL_RESTRICT aux_out);
-
 struct ICCReader {
   Status Init(BitReader* reader, size_t output_limit);
   Status Process(BitReader* reader, PaddedBytes* icc);
@@ -46,13 +39,6 @@ struct ICCReader {
   PaddedBytes decompressed_;
 };
 
-// `icc` may be empty afterwards - if so, call CreateProfile. Does not append,
-// clears any original data that was in icc.
-// If `output_limit` is not 0, then returns error if resulting profile would be
-// longer than `output_limit`
-Status ReadICC(BitReader* JXL_RESTRICT reader, PaddedBytes* JXL_RESTRICT icc,
-               size_t output_limit = 0);
-
 // Exposed only for testing
 Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result);
 
index 3e60048..af993a0 100644 (file)
 #include <string>
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/padded_bytes.h"
 
 namespace jxl {
 namespace {
@@ -96,29 +94,19 @@ Status CheckIs32Bit(uint64_t v) {
   return true;
 }
 
-PaddedBytes ICCInitialHeaderPrediction() {
-  PaddedBytes result(kICCHeaderSize);
-  for (size_t i = 0; i < kICCHeaderSize; i++) {
-    result[i] = 0;
-  }
-  result[8] = 4;
-  EncodeKeyword(kMntrTag, result.data(), result.size(), 12);
-  EncodeKeyword(kRgb_Tag, result.data(), result.size(), 16);
-  EncodeKeyword(kXyz_Tag, result.data(), result.size(), 20);
-  EncodeKeyword(kAcspTag, result.data(), result.size(), 36);
-  result[68] = 0;
-  result[69] = 0;
-  result[70] = 246;
-  result[71] = 214;
-  result[72] = 0;
-  result[73] = 1;
-  result[74] = 0;
-  result[75] = 0;
-  result[76] = 0;
-  result[77] = 0;
-  result[78] = 211;
-  result[79] = 45;
-  return result;
+const uint8_t kIccInitialHeaderPrediction[kICCHeaderSize] = {
+    0,   0,   0,   0,   0,   0,   0,   0,   4, 0, 0, 0, 'm', 'n', 't', 'r',
+    'R', 'G', 'B', ' ', 'X', 'Y', 'Z', ' ', 0, 0, 0, 0, 0,   0,   0,   0,
+    0,   0,   0,   0,   'a', 'c', 's', 'p', 0, 0, 0, 0, 0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0, 0, 0, 0, 0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   246, 214, 0, 1, 0, 0, 0,   0,   211, 45,
+    0,   0,   0,   0,   0,   0,   0,   0,   0, 0, 0, 0, 0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0, 0, 0, 0, 0,   0,   0,   0,
+    0,   0,   0,   0,   0,   0,   0,   0,   0, 0, 0, 0, 0,   0,   0,   0,
+};
+
+const Span<const uint8_t> ICCInitialHeaderPrediction() {
+  return Bytes(kIccInitialHeaderPrediction);
 }
 
 void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header,
index e91e908..4a7d5e0 100644 (file)
@@ -8,16 +8,17 @@
 
 // Compressed representation of ICC profiles.
 
-#include <stddef.h>
-#include <stdint.h>
-
 #include <array>
+#include <cstddef>
+#include <cstdint>
 
-#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
 
 namespace jxl {
 
+class PaddedBytes;
+
 static constexpr size_t kICCHeaderSize = 128;
 
 typedef std::array<uint8_t, 4> Tag;
@@ -94,7 +95,7 @@ void AppendKeyword(const Tag& keyword, PaddedBytes* data);
 Status CheckOutOfBounds(size_t a, size_t b, size_t size);
 Status CheckIs32Bit(uint64_t v);
 
-PaddedBytes ICCInitialHeaderPrediction();
+const Span<const uint8_t> ICCInitialHeaderPrediction();
 void ICCPredictHeader(const uint8_t* icc, size_t size, uint8_t* header,
                       size_t pos);
 uint8_t LinearPredictICCValue(const uint8_t* data, size_t start, size_t i,
index d365471..743aa9a 100644 (file)
@@ -5,22 +5,26 @@
 
 #include "lib/jxl/icc_codec.h"
 
+#include <cstdint>
 #include <string>
+#include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/span.h"
+#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
 
-void TestProfile(const PaddedBytes& icc) {
+void TestProfile(const IccBytes& icc) {
   BitWriter writer;
   ASSERT_TRUE(WriteICC(icc, &writer, 0, nullptr));
   writer.ZeroPadToByte();
-  PaddedBytes dec;
+  std::vector<uint8_t> dec;
   BitReader reader(writer.GetSpan());
-  ASSERT_TRUE(ReadICC(&reader, &dec));
+  ASSERT_TRUE(test::ReadICC(&reader, &dec));
   ASSERT_TRUE(reader.Close());
   EXPECT_EQ(icc.size(), dec.size());
   if (icc.size() == dec.size()) {
@@ -32,11 +36,9 @@ void TestProfile(const PaddedBytes& icc) {
 }
 
 void TestProfile(const std::string& icc) {
-  PaddedBytes bytes(icc.size());
-  for (size_t i = 0; i < icc.size(); i++) {
-    bytes[i] = icc[i];
-  }
-  TestProfile(bytes);
+  IccBytes data;
+  Bytes(icc).AppendTo(&data);
+  TestProfile(data);
 }
 
 // Valid profile from one of the images output by the decoder.
@@ -128,7 +130,7 @@ TEST(IccCodecTest, Icc) {
 
   {
     // Exactly the ICC header size
-    PaddedBytes profile(128);
+    IccBytes profile(128);
     for (size_t i = 0; i < 128; i++) {
       profile[i] = 0;
     }
@@ -136,14 +138,14 @@ TEST(IccCodecTest, Icc) {
   }
 
   {
-    PaddedBytes profile;
-    profile.append(kTestProfile, kTestProfile + sizeof(kTestProfile));
+    IccBytes profile;
+    Bytes(kTestProfile, sizeof(kTestProfile)).AppendTo(&profile);
     TestProfile(profile);
   }
 
   // Test substrings of full profile
   {
-    PaddedBytes profile;
+    IccBytes profile;
     for (size_t i = 0; i <= 256; i++) {
       profile.push_back(kTestProfile[i]);
       TestProfile(profile);
@@ -190,10 +192,10 @@ static const unsigned char kEncodedTestProfile[] = {
 
 // Tests that the decoded kEncodedTestProfile matches kTestProfile.
 TEST(IccCodecTest, EncodedIccProfile) {
-  jxl::BitReader reader(jxl::Span<const uint8_t>(kEncodedTestProfile,
-                                                 sizeof(kEncodedTestProfile)));
-  jxl::PaddedBytes dec;
-  ASSERT_TRUE(ReadICC(&reader, &dec));
+  jxl::BitReader reader(
+      jxl::Bytes(kEncodedTestProfile, sizeof(kEncodedTestProfile)));
+  std::vector<uint8_t> dec;
+  ASSERT_TRUE(test::ReadICC(&reader, &dec));
   ASSERT_TRUE(reader.Close());
   EXPECT_EQ(sizeof(kTestProfile), dec.size());
   if (sizeof(kTestProfile) == dec.size()) {
index 34b315d..382c957 100644 (file)
@@ -12,8 +12,8 @@
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/sanitizers.h"
 
@@ -34,11 +34,6 @@ namespace {
 
 HWY_EXPORT(GetVectorSize);  // Local function.
 
-size_t VectorSize() {
-  static size_t bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
-  return bytes;
-}
-
 // Returns distance [bytes] between the start of two consecutive rows, a
 // multiple of vector/cache line size but NOT CacheAligned::kAlias - see below.
 size_t BytesPerRow(const size_t xsize, const size_t sizeof_t) {
@@ -71,15 +66,17 @@ size_t BytesPerRow(const size_t xsize, const size_t sizeof_t) {
 
 }  // namespace
 
+size_t VectorSize() {
+  static size_t bytes = HWY_DYNAMIC_DISPATCH(GetVectorSize)();
+  return bytes;
+}
+
 PlaneBase::PlaneBase(const size_t xsize, const size_t ysize,
                      const size_t sizeof_t)
     : xsize_(static_cast<uint32_t>(xsize)),
       ysize_(static_cast<uint32_t>(ysize)),
       orig_xsize_(static_cast<uint32_t>(xsize)),
       orig_ysize_(static_cast<uint32_t>(ysize)) {
-  // (Can't profile CacheAligned itself because it is used by profiler.h)
-  PROFILER_FUNC;
-
   JXL_CHECK(xsize == xsize_);
   JXL_CHECK(ysize == ysize_);
 
@@ -111,7 +108,10 @@ void PlaneBase::InitializePadding(const size_t sizeof_t, Padding padding) {
 
   for (size_t y = 0; y < ysize_; ++y) {
     uint8_t* JXL_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
-#if defined(__clang__) && (__clang_major__ <= 6)
+#if defined(__clang__) &&                                           \
+    ((!defined(__apple_build_version__) && __clang_major__ <= 6) || \
+     (defined(__apple_build_version__) &&                           \
+      __apple_build_version__ <= 10001145))
     // There's a bug in msan in clang-6 when handling AVX2 operations. This
     // workaround allows tests to pass on msan, although it is slower and
     // prevents msan warnings from uninitialized images.
@@ -133,54 +133,12 @@ void PlaneBase::Swap(PlaneBase& other) {
   std::swap(bytes_, other.bytes_);
 }
 
-Image3F PadImageMirror(const Image3F& in, const size_t xborder,
-                       const size_t yborder) {
-  size_t xsize = in.xsize();
-  size_t ysize = in.ysize();
-  Image3F out(xsize + 2 * xborder, ysize + 2 * yborder);
-  if (xborder > xsize || yborder > ysize) {
-    for (size_t c = 0; c < 3; c++) {
-      for (int32_t y = 0; y < static_cast<int32_t>(out.ysize()); y++) {
-        float* row_out = out.PlaneRow(c, y);
-        const float* row_in = in.PlaneRow(
-            c, Mirror(y - static_cast<int32_t>(yborder), in.ysize()));
-        for (int32_t x = 0; x < static_cast<int32_t>(out.xsize()); x++) {
-          int32_t xin = Mirror(x - static_cast<int32_t>(xborder), in.xsize());
-          row_out[x] = row_in[xin];
-        }
-      }
-    }
-    return out;
-  }
-  CopyImageTo(in, Rect(xborder, yborder, xsize, ysize), &out);
-  for (size_t c = 0; c < 3; c++) {
-    // Horizontal pad.
-    for (size_t y = 0; y < ysize; y++) {
-      for (size_t x = 0; x < xborder; x++) {
-        out.PlaneRow(c, y + yborder)[x] =
-            in.ConstPlaneRow(c, y)[xborder - x - 1];
-        out.PlaneRow(c, y + yborder)[x + xsize + xborder] =
-            in.ConstPlaneRow(c, y)[xsize - 1 - x];
-      }
-    }
-    // Vertical pad.
-    for (size_t y = 0; y < yborder; y++) {
-      memcpy(out.PlaneRow(c, y), out.ConstPlaneRow(c, 2 * yborder - 1 - y),
-             out.xsize() * sizeof(float));
-      memcpy(out.PlaneRow(c, y + ysize + yborder),
-             out.ConstPlaneRow(c, ysize + yborder - 1 - y),
-             out.xsize() * sizeof(float));
-    }
-  }
-  return out;
-}
-
-void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in) {
-  PROFILER_FUNC;
+void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in,
+                                    size_t block_dim) {
   const size_t xsize_orig = in->xsize();
   const size_t ysize_orig = in->ysize();
-  const size_t xsize = RoundUpToBlockDim(xsize_orig);
-  const size_t ysize = RoundUpToBlockDim(ysize_orig);
+  const size_t xsize = RoundUpTo(xsize_orig, block_dim);
+  const size_t ysize = RoundUpTo(ysize_orig, block_dim);
   // Expands image size to the originally-allocated size.
   in->ShrinkTo(xsize, ysize);
   for (size_t c = 0; c < 3; c++) {
index 5fe2c55..fef7f67 100644 (file)
 #include <sstream>
 #include <utility>  // std::move
 
-#include "lib/jxl/base/cache_aligned.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/cache_aligned.h"
 
 namespace jxl {
 
+// Helper function to create rows that are multiples of SIMD vector size.
+size_t VectorSize();
+
 // Type-independent parts of Plane<> - reduces code duplication and facilitates
 // moving member function implementations to cc file.
 struct PlaneBase {
index dfbc02d..fc6d153 100644 (file)
@@ -9,13 +9,8 @@
 #include <utility>
 
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/fields.h"
-#include "lib/jxl/luminance.h"
 
 namespace jxl {
 
@@ -41,8 +36,9 @@ void ImageBundle::VerifyMetadata() const {
   JXL_CHECK(metadata_->color_encoding.IsGray() == IsGray());
 
   if (metadata_->HasAlpha() && alpha().xsize() == 0) {
-    JXL_ABORT("MD alpha_bits %u IB alpha %" PRIuS " x %" PRIuS "\n",
-              metadata_->GetAlphaBits(), alpha().xsize(), alpha().ysize());
+    JXL_UNREACHABLE("MD alpha_bits %u IB alpha %" PRIuS " x %" PRIuS "\n",
+                    metadata_->GetAlphaBits(), alpha().xsize(),
+                    alpha().ysize());
   }
   const uint32_t alpha_bits = metadata_->GetAlphaBits();
   JXL_CHECK(alpha_bits <= 32);
@@ -99,12 +95,11 @@ ImageF* ImageBundle::alpha() {
   return &extra_channels_[ec];
 }
 
-void ImageBundle::SetAlpha(ImageF&& alpha, bool alpha_is_premultiplied) {
+void ImageBundle::SetAlpha(ImageF&& alpha) {
   const ExtraChannelInfo* eci = metadata_->Find(ExtraChannel::kAlpha);
   // Must call SetAlphaBits first, otherwise we don't know which channel index
   JXL_CHECK(eci != nullptr);
   JXL_CHECK(alpha.xsize() != 0 && alpha.ysize() != 0);
-  JXL_CHECK(eci->alpha_associated == alpha_is_premultiplied);
   if (extra_channels_.size() < metadata_->extra_channel_info.size()) {
     // TODO(jon): get rid of this case
     extra_channels_.insert(
index d233abb..e1940d8 100644 (file)
@@ -8,28 +8,25 @@
 
 // The main image or frame consists of a bundle of associated images.
 
+#include <jxl/cms_interface.h>
 #include <stddef.h>
 #include <stdint.h>
 
 #include <vector>
 
-#include "jxl/cms_interface.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_xyb.h"
-#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/field_encodings.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/headers.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_metadata.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
-#include "lib/jxl/opsin_params.h"
 #include "lib/jxl/quantizer.h"
 
 namespace jxl {
@@ -48,11 +45,14 @@ class ImageBundle {
 
   ImageBundle Copy() const {
     ImageBundle copy(metadata_);
-    copy.color_ = CopyImage(color_);
+    copy.color_ = Image3F(color_.xsize(), color_.ysize());
+    CopyImageTo(color_, &copy.color_);
     copy.c_current_ = c_current_;
     copy.extra_channels_.reserve(extra_channels_.size());
     for (const ImageF& plane : extra_channels_) {
-      copy.extra_channels_.emplace_back(CopyImage(plane));
+      ImageF ec(plane.xsize(), plane.ysize());
+      CopyImageTo(plane, &ec);
+      copy.extra_channels_.emplace_back(std::move(ec));
     }
 
     copy.jpeg_data =
@@ -132,10 +132,7 @@ class ImageBundle {
   bool IsGray() const { return c_current_.IsGray(); }
 
   bool IsSRGB() const { return c_current_.IsSRGB(); }
-  bool IsLinearSRGB() const {
-    return c_current_.white_point == WhitePoint::kD65 &&
-           c_current_.primaries == Primaries::kSRGB && c_current_.tf.IsLinear();
-  }
+  bool IsLinearSRGB() const { return c_current_.IsLinearSRGB(); }
 
   // Set the c_current profile without doing any transformation, e.g. if the
   // transformation was already applied.
@@ -162,7 +159,7 @@ class ImageBundle {
 
   // -- ALPHA
 
-  void SetAlpha(ImageF&& alpha, bool alpha_is_premultiplied);
+  void SetAlpha(ImageF&& alpha);
   bool HasAlpha() const {
     return metadata_->Find(ExtraChannel::kAlpha) != nullptr;
   }
index 6de2e49..1a10598 100644 (file)
@@ -5,8 +5,9 @@
 
 #include "lib/jxl/image_bundle.h"
 
-#include "gtest/gtest.h"
-#include "lib/jxl/aux_out.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -23,7 +24,7 @@ TEST(ImageBundleTest, ExtraChannelName) {
   metadata.extra_channel_info.push_back(std::move(eci));
   ASSERT_TRUE(WriteImageMetadata(metadata, &writer, /*layer=*/0, &aux_out));
   writer.ZeroPadToByte();
-  ReclaimAndCharge(&writer, &allotment, /*layer=*/0, &aux_out);
+  allotment.ReclaimAndCharge(&writer, /*layer=*/0, &aux_out);
 
   BitReader reader(writer.GetSpan());
   ImageMetadata metadata_out;
index 7a1ee1c..4cca910 100644 (file)
 
 #include "lib/jxl/alpha.h"
 #include "lib/jxl/base/byte_order.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/color_management.h"
+#include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/quantizer.h"
 
 namespace jxl {
 BitDepth::BitDepth() { Bundle::Init(this); }
@@ -59,6 +58,7 @@ Status BitDepth::VisitFields(Visitor* JXL_RESTRICT visitor) {
   return true;
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
 std::string BitDepth::DebugString() const {
   std::ostringstream os;
   os << (floating_point_sample ? "F" : "U");
@@ -66,6 +66,7 @@ std::string BitDepth::DebugString() const {
   if (floating_point_sample) os << "." << exponent_bits_per_sample;
   return os.str();
 }
+#endif
 
 CustomTransformData::CustomTransformData() { Bundle::Init(this); }
 Status CustomTransformData::VisitFields(Visitor* JXL_RESTRICT visitor) {
@@ -252,6 +253,7 @@ Status ExtraChannelInfo::VisitFields(Visitor* JXL_RESTRICT visitor) {
   return true;
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
 std::string ExtraChannelInfo::DebugString() const {
   std::ostringstream os;
   os << (type == ExtraChannel::kAlpha           ? "Alpha"
@@ -267,6 +269,7 @@ std::string ExtraChannelInfo::DebugString() const {
   os << " shift: " << dim_shift;
   return os.str();
 }
+#endif
 
 ImageMetadata::ImageMetadata() { Bundle::Init(this); }
 Status ImageMetadata::VisitFields(Visitor* JXL_RESTRICT visitor) {
@@ -352,12 +355,13 @@ Status OpsinInverseMatrix::VisitFields(Visitor* JXL_RESTRICT visitor) {
     return true;
   }
   for (int i = 0; i < 9; ++i) {
-    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(
-        DefaultInverseOpsinAbsorbanceMatrix()[i], &inverse_matrix[i]));
+    JXL_QUIET_RETURN_IF_ERROR(
+        visitor->F16(jxl::cms::DefaultInverseOpsinAbsorbanceMatrix()[i],
+                     &inverse_matrix[i]));
   }
   for (int i = 0; i < 3; ++i) {
-    JXL_QUIET_RETURN_IF_ERROR(
-        visitor->F16(kNegOpsinAbsorbanceBiasRGB[i], &opsin_biases[i]));
+    JXL_QUIET_RETURN_IF_ERROR(visitor->F16(
+        jxl::cms::kNegOpsinAbsorbanceBiasRGB[i], &opsin_biases[i]));
   }
   for (int i = 0; i < 4; ++i) {
     JXL_QUIET_RETURN_IF_ERROR(
@@ -401,12 +405,6 @@ Status ReadImageMetadata(BitReader* JXL_RESTRICT reader,
   return Bundle::Read(reader, metadata);
 }
 
-Status WriteImageMetadata(const ImageMetadata& metadata,
-                          BitWriter* JXL_RESTRICT writer, size_t layer,
-                          AuxOut* aux_out) {
-  return Bundle::Write(metadata, writer, layer, aux_out);
-}
-
 void ImageMetadata::SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied) {
   std::vector<ExtraChannelInfo>& eciv = extra_channel_info;
   ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha);
@@ -443,6 +441,7 @@ void ImageMetadata::SetAlphaBits(uint32_t bits, bool alpha_is_premultiplied) {
   if (bits > 12) modular_16_bit_buffer_sufficient = false;
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
 std::string ImageMetadata::DebugString() const {
   std::ostringstream os;
   os << bit_depth.DebugString();
@@ -473,5 +472,6 @@ std::string CodecMetadata::DebugString() const {
   os << " " << m.DebugString();
   return os.str();
 }
+#endif
 
 }  // namespace jxl
index 9008e42..9a1e9d1 100644 (file)
@@ -9,21 +9,22 @@
 #ifndef LIB_JXL_IMAGE_METADATA_H_
 #define LIB_JXL_IMAGE_METADATA_H_
 
+#include <jxl/codestream_header.h>
 #include <stddef.h>
 #include <stdint.h>
 
 #include <string>
 #include <vector>
 
-#include "jxl/codestream_header.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/headers.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
-#include "lib/jxl/opsin_params.h"
 
 namespace jxl {
 
+struct AuxOut;
+
 // EXIF orientation of the image. This field overrides any field present in
 // actual EXIF metadata. The value tells which transformation the decoder must
 // apply after decoding to display the image with the correct orientation.
@@ -168,7 +169,7 @@ struct ToneMapping : public Fields {
   float linear_below;
 };
 
-// Contains weights to customize some trasnforms - in particular, XYB and
+// Contains weights to customize some transforms - in particular, XYB and
 // upsampling.
 struct CustomTransformData : public Fields {
   CustomTransformData();
@@ -200,7 +201,7 @@ struct ImageMetadata : public Fields {
 
   // Returns bit depth of the JPEG XL compressed alpha channel, or 0 if no alpha
   // channel present. In the theoretical case that there are multiple alpha
-  // channels, returns the bit depht of the first.
+  // channels, returns the bit depth of the first.
   uint32_t GetAlphaBits() const {
     const ExtraChannelInfo* alpha = Find(ExtraChannel::kAlpha);
     if (alpha == nullptr) return 0;
@@ -324,7 +325,7 @@ struct ImageMetadata : public Fields {
   // must still use kNone (or kYCbCr, which would mean applying the YCbCr
   // transform to the 3-channel XYB data), since with !xyb_encoded, the 3
   // channels are stored as-is, no matter what meaning the color profile assigns
-  // to them. To use ColorEncoding::kXYB, xyb_encoded must be true.
+  // to them. To use ColorSpace::kXYB, xyb_encoded must be true.
   //
   // This value is defined in image metadata because this is the global
   // codestream header. This value does not affect the image itself, so is not
index 63fc087..b2ce23f 100644 (file)
 #include <limits>
 #include <vector>
 
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/image.h"
 
 namespace jxl {
 
 template <typename T>
 void CopyImageTo(const Plane<T>& from, Plane<T>* JXL_RESTRICT to) {
-  PROFILER_ZONE("CopyImage1");
   JXL_ASSERT(SameSize(from, *to));
   if (from.ysize() == 0 || from.xsize() == 0) return;
   for (size_t y = 0; y < from.ysize(); ++y) {
@@ -32,19 +30,10 @@ void CopyImageTo(const Plane<T>& from, Plane<T>* JXL_RESTRICT to) {
   }
 }
 
-// DEPRECATED - prefer to preallocate result.
-template <typename T>
-Plane<T> CopyImage(const Plane<T>& from) {
-  Plane<T> to(from.xsize(), from.ysize());
-  CopyImageTo(from, &to);
-  return to;
-}
-
 // Copies `from:rect_from` to `to:rect_to`.
 template <typename T>
 void CopyImageTo(const Rect& rect_from, const Plane<T>& from,
                  const Rect& rect_to, Plane<T>* JXL_RESTRICT to) {
-  PROFILER_ZONE("CopyImageR");
   JXL_DASSERT(SameSize(rect_from, rect_to));
   JXL_DASSERT(rect_from.IsInside(from));
   JXL_DASSERT(rect_to.IsInside(*to));
@@ -56,19 +45,10 @@ void CopyImageTo(const Rect& rect_from, const Plane<T>& from,
   }
 }
 
-// DEPRECATED - Returns a copy of the "image" pixels that lie in "rect".
-template <typename T>
-Plane<T> CopyImage(const Rect& rect, const Plane<T>& image) {
-  Plane<T> copy(rect.xsize(), rect.ysize());
-  CopyImageTo(rect, image, &copy);
-  return copy;
-}
-
 // Copies `from:rect_from` to `to:rect_to`.
 template <typename T>
 void CopyImageTo(const Rect& rect_from, const Image3<T>& from,
                  const Rect& rect_to, Image3<T>* JXL_RESTRICT to) {
-  PROFILER_ZONE("CopyImageR");
   JXL_ASSERT(SameSize(rect_from, rect_to));
   for (size_t c = 0; c < 3; c++) {
     CopyImageTo(rect_from, from.Plane(c), rect_to, &to->Plane(c));
@@ -78,7 +58,6 @@ void CopyImageTo(const Rect& rect_from, const Image3<T>& from,
 template <typename T, typename U>
 void ConvertPlaneAndClamp(const Rect& rect_from, const Plane<T>& from,
                           const Rect& rect_to, Plane<U>* JXL_RESTRICT to) {
-  PROFILER_ZONE("ConvertPlane");
   JXL_ASSERT(SameSize(rect_from, rect_to));
   using M = decltype(T() + U());
   for (size_t y = 0; y < rect_to.ysize(); ++y) {
@@ -98,18 +77,6 @@ void CopyImageTo(const T& from, T* JXL_RESTRICT to) {
   return CopyImageTo(Rect(from), from, Rect(*to), to);
 }
 
-// Copies `from:rect_from` to `to`.
-template <typename T>
-void CopyImageTo(const Rect& rect_from, const T& from, T* JXL_RESTRICT to) {
-  return CopyImageTo(rect_from, from, Rect(*to), to);
-}
-
-// Copies `from` to `to:rect_to`.
-template <typename T>
-void CopyImageTo(const T& from, const Rect& rect_to, T* JXL_RESTRICT to) {
-  return CopyImageTo(Rect(from), from, rect_to, to);
-}
-
 // Copies `from:rect_from` to `to:rect_to`; also copies `padding` pixels of
 // border around `from:rect_from`, in all directions, whenever they are inside
 // the first image.
@@ -135,55 +102,6 @@ void CopyImageToWithPadding(const Rect& from_rect, const T& from,
                      to);
 }
 
-// DEPRECATED - prefer to preallocate result.
-template <typename T>
-Image3<T> CopyImage(const Image3<T>& from) {
-  Image3<T> copy(from.xsize(), from.ysize());
-  CopyImageTo(from, &copy);
-  return copy;
-}
-
-// DEPRECATED - prefer to preallocate result.
-template <typename T>
-Image3<T> CopyImage(const Rect& rect, const Image3<T>& from) {
-  Image3<T> to(rect.xsize(), rect.ysize());
-  CopyImageTo(rect, from.Plane(0), to.Plane(0));
-  CopyImageTo(rect, from.Plane(1), to.Plane(1));
-  CopyImageTo(rect, from.Plane(2), to.Plane(2));
-  return to;
-}
-
-// Sets "thickness" pixels on each border to "value". This is faster than
-// initializing the entire image and overwriting valid/interior pixels.
-template <typename T>
-void SetBorder(const size_t thickness, const T value, Image3<T>* image) {
-  const size_t xsize = image->xsize();
-  const size_t ysize = image->ysize();
-  // Top: fill entire row
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < std::min(thickness, ysize); ++y) {
-      T* JXL_RESTRICT row = image->PlaneRow(c, y);
-      std::fill(row, row + xsize, value);
-    }
-
-    // Bottom: fill entire row
-    for (size_t y = ysize - thickness; y < ysize; ++y) {
-      T* JXL_RESTRICT row = image->PlaneRow(c, y);
-      std::fill(row, row + xsize, value);
-    }
-
-    // Left/right: fill the 'columns' on either side, but only if the image is
-    // big enough that they don't already belong to the top/bottom rows.
-    if (ysize >= 2 * thickness) {
-      for (size_t y = thickness; y < ysize - thickness; ++y) {
-        T* JXL_RESTRICT row = image->PlaneRow(c, y);
-        std::fill(row, row + thickness, value);
-        std::fill(row + xsize - thickness, row + xsize, value);
-      }
-    }
-  }
-}
-
 template <class ImageIn, class ImageOut>
 void Subtract(const ImageIn& image1, const ImageIn& image2, ImageOut* out) {
   using T = typename ImageIn::T;
@@ -265,20 +183,6 @@ Plane<T> LinComb(const T lambda1, const Plane<T>& image1, const T lambda2,
   return out;
 }
 
-// Returns a pixel-by-pixel multiplication of image by lambda.
-template <typename T>
-Plane<T> ScaleImage(const T lambda, const Plane<T>& image) {
-  Plane<T> out(image.xsize(), image.ysize());
-  for (size_t y = 0; y < image.ysize(); ++y) {
-    const T* const JXL_RESTRICT row = image.Row(y);
-    T* const JXL_RESTRICT row_out = out.Row(y);
-    for (size_t x = 0; x < image.xsize(); ++x) {
-      row_out[x] = lambda * row[x];
-    }
-  }
-  return out;
-}
-
 // Multiplies image by lambda in-place
 template <typename T>
 void ScaleImage(const T lambda, Plane<T>* image) {
@@ -290,22 +194,14 @@ void ScaleImage(const T lambda, Plane<T>* image) {
   }
 }
 
+// Multiplies image by lambda in-place
 template <typename T>
-Plane<T> Product(const Plane<T>& a, const Plane<T>& b) {
-  Plane<T> c(a.xsize(), a.ysize());
-  for (size_t y = 0; y < a.ysize(); ++y) {
-    const T* const JXL_RESTRICT row_a = a.Row(y);
-    const T* const JXL_RESTRICT row_b = b.Row(y);
-    T* const JXL_RESTRICT row_c = c.Row(y);
-    for (size_t x = 0; x < a.xsize(); ++x) {
-      row_c[x] = row_a[x] * row_b[x];
-    }
+void ScaleImage(const T lambda, Image3<T>* image) {
+  for (size_t c = 0; c < 3; ++c) {
+    ScaleImage(lambda, &image->Plane(c));
   }
-  return c;
 }
 
-float DotProduct(const ImageF& a, const ImageF& b);
-
 template <typename T>
 void FillImage(const T value, Plane<T>* image) {
   for (size_t y = 0; y < image->ysize(); ++y) {
@@ -439,42 +335,6 @@ void ImageMinMax(const Plane<T>& image, T* const JXL_RESTRICT min,
   }
 }
 
-// Copies pixels, scaling their value relative to the "from" min/max by
-// "to_range". Example: U8 [0, 255] := [0.0, 1.0], to_range = 1.0 =>
-// outputs [0.0, 1.0].
-template <typename FromType, typename ToType>
-void ImageConvert(const Plane<FromType>& from, const float to_range,
-                  Plane<ToType>* const JXL_RESTRICT to) {
-  JXL_ASSERT(SameSize(from, *to));
-  FromType min_from, max_from;
-  ImageMinMax(from, &min_from, &max_from);
-  const float scale = to_range / (max_from - min_from);
-  for (size_t y = 0; y < from.ysize(); ++y) {
-    const FromType* const JXL_RESTRICT row_from = from.Row(y);
-    ToType* const JXL_RESTRICT row_to = to->Row(y);
-    for (size_t x = 0; x < from.xsize(); ++x) {
-      row_to[x] = static_cast<ToType>((row_from[x] - min_from) * scale);
-    }
-  }
-}
-
-template <typename From>
-Plane<float> ConvertToFloat(const Plane<From>& from) {
-  float factor = 1.0f / std::numeric_limits<From>::max();
-  if (std::is_same<From, double>::value || std::is_same<From, float>::value) {
-    factor = 1.0f;
-  }
-  Plane<float> to(from.xsize(), from.ysize());
-  for (size_t y = 0; y < from.ysize(); ++y) {
-    const From* const JXL_RESTRICT row_from = from.Row(y);
-    float* const JXL_RESTRICT row_to = to.Row(y);
-    for (size_t x = 0; x < from.xsize(); ++x) {
-      row_to[x] = row_from[x] * factor;
-    }
-  }
-  return to;
-}
-
 template <typename T>
 Plane<T> ImageFromPacked(const std::vector<T>& packed, const size_t xsize,
                          const size_t ysize) {
@@ -487,32 +347,6 @@ Plane<T> ImageFromPacked(const std::vector<T>& packed, const size_t xsize,
   return out;
 }
 
-// Computes independent minimum and maximum values for each plane.
-template <typename T>
-void Image3MinMax(const Image3<T>& image, const Rect& rect,
-                  std::array<T, 3>* out_min, std::array<T, 3>* out_max) {
-  for (size_t c = 0; c < 3; ++c) {
-    T min = std::numeric_limits<T>::max();
-    T max = std::numeric_limits<T>::min();
-    for (size_t y = 0; y < rect.ysize(); ++y) {
-      const T* JXL_RESTRICT row = rect.ConstPlaneRow(image, c, y);
-      for (size_t x = 0; x < rect.xsize(); ++x) {
-        min = std::min(min, row[x]);
-        max = std::max(max, row[x]);
-      }
-    }
-    (*out_min)[c] = min;
-    (*out_max)[c] = max;
-  }
-}
-
-// Computes independent minimum and maximum values for each plane.
-template <typename T>
-void Image3MinMax(const Image3<T>& image, std::array<T, 3>* out_min,
-                  std::array<T, 3>* out_max) {
-  Image3MinMax(image, Rect(image), out_min, out_max);
-}
-
 template <typename T>
 void Image3Max(const Image3<T>& image, std::array<T, 3>* out_max) {
   for (size_t c = 0; c < 3; ++c) {
@@ -527,38 +361,6 @@ void Image3Max(const Image3<T>& image, std::array<T, 3>* out_max) {
   }
 }
 
-// Computes the sum of the pixels in `rect`.
-template <typename T>
-T ImageSum(const Plane<T>& image, const Rect& rect) {
-  T result = 0;
-  for (size_t y = 0; y < rect.ysize(); ++y) {
-    const T* JXL_RESTRICT row = rect.ConstRow(image, y);
-    for (size_t x = 0; x < rect.xsize(); ++x) {
-      result += row[x];
-    }
-  }
-  return result;
-}
-
-template <typename T>
-T ImageSum(const Plane<T>& image) {
-  return ImageSum(image, Rect(image));
-}
-
-template <typename T>
-std::array<T, 3> Image3Sum(const Image3<T>& image, const Rect& rect) {
-  std::array<T, 3> out_sum = 0;
-  for (size_t c = 0; c < 3; ++c) {
-    (out_sum)[c] = ImageSum(image.Plane(c), rect);
-  }
-  return out_sum;
-}
-
-template <typename T>
-std::array<T, 3> Image3Sum(const Image3<T>& image) {
-  return Image3Sum(image, Rect(image));
-}
-
 template <typename T>
 std::vector<T> PackedFromImage(const Plane<T>& image, const Rect& rect) {
   const size_t xsize = rect.xsize();
@@ -575,157 +377,6 @@ std::vector<T> PackedFromImage(const Plane<T>& image) {
   return PackedFromImage(image, Rect(image));
 }
 
-// Computes the median pixel value.
-template <typename T>
-T ImageMedian(const Plane<T>& image, const Rect& rect) {
-  std::vector<T> pixels = PackedFromImage(image, rect);
-  return Median(&pixels);
-}
-
-template <typename T>
-T ImageMedian(const Plane<T>& image) {
-  return ImageMedian(image, Rect(image));
-}
-
-template <typename T>
-std::array<T, 3> Image3Median(const Image3<T>& image, const Rect& rect) {
-  std::array<T, 3> out_median;
-  for (size_t c = 0; c < 3; ++c) {
-    (out_median)[c] = ImageMedian(image.Plane(c), rect);
-  }
-  return out_median;
-}
-
-template <typename T>
-std::array<T, 3> Image3Median(const Image3<T>& image) {
-  return Image3Median(image, Rect(image));
-}
-
-template <typename FromType, typename ToType>
-void Image3Convert(const Image3<FromType>& from, const float to_range,
-                   Image3<ToType>* const JXL_RESTRICT to) {
-  JXL_ASSERT(SameSize(from, *to));
-  std::array<FromType, 3> min_from, max_from;
-  Image3MinMax(from, &min_from, &max_from);
-  float scales[3];
-  for (size_t c = 0; c < 3; ++c) {
-    scales[c] = to_range / (max_from[c] - min_from[c]);
-  }
-  float scale = std::min(scales[0], std::min(scales[1], scales[2]));
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < from.ysize(); ++y) {
-      const FromType* JXL_RESTRICT row_from = from.ConstPlaneRow(c, y);
-      ToType* JXL_RESTRICT row_to = to->PlaneRow(c, y);
-      for (size_t x = 0; x < from.xsize(); ++x) {
-        const float to = (row_from[x] - min_from[c]) * scale;
-        row_to[x] = static_cast<ToType>(to);
-      }
-    }
-  }
-}
-
-template <typename From>
-Image3F ConvertToFloat(const Image3<From>& from) {
-  return Image3F(ConvertToFloat(from.Plane(0)), ConvertToFloat(from.Plane(1)),
-                 ConvertToFloat(from.Plane(2)));
-}
-
-template <typename Tin, typename Tout>
-void Subtract(const Image3<Tin>& image1, const Image3<Tin>& image2,
-              Image3<Tout>* out) {
-  const size_t xsize = image1.xsize();
-  const size_t ysize = image1.ysize();
-  JXL_CHECK(xsize == image2.xsize());
-  JXL_CHECK(ysize == image2.ysize());
-
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < ysize; ++y) {
-      const Tin* const JXL_RESTRICT row1 = image1.ConstPlaneRow(c, y);
-      const Tin* const JXL_RESTRICT row2 = image2.ConstPlaneRow(c, y);
-      Tout* const JXL_RESTRICT row_out = out->PlaneRow(c, y);
-      for (size_t x = 0; x < xsize; ++x) {
-        row_out[x] = row1[x] - row2[x];
-      }
-    }
-  }
-}
-
-template <typename Tin, typename Tout>
-void SubtractFrom(const Image3<Tin>& what, Image3<Tout>* to) {
-  const size_t xsize = what.xsize();
-  const size_t ysize = what.ysize();
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < ysize; ++y) {
-      const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y);
-      Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y);
-      for (size_t x = 0; x < xsize; ++x) {
-        row_to[x] -= row_what[x];
-      }
-    }
-  }
-}
-
-template <typename Tin, typename Tout>
-void AddTo(const Image3<Tin>& what, Image3<Tout>* to) {
-  const size_t xsize = what.xsize();
-  const size_t ysize = what.ysize();
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < ysize; ++y) {
-      const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y);
-      Tout* JXL_RESTRICT row_to = to->PlaneRow(c, y);
-      for (size_t x = 0; x < xsize; ++x) {
-        row_to[x] += row_what[x];
-      }
-    }
-  }
-}
-
-// Adds `what` of the size of `rect` to `to` in the position of `rect`.
-template <typename Tin, typename Tout>
-void AddTo(const Rect& rect, const Image3<Tin>& what, Image3<Tout>* to) {
-  const size_t xsize = what.xsize();
-  const size_t ysize = what.ysize();
-  JXL_ASSERT(xsize == rect.xsize());
-  JXL_ASSERT(ysize == rect.ysize());
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < ysize; ++y) {
-      const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y);
-      Tout* JXL_RESTRICT row_to = rect.PlaneRow(to, c, y);
-      for (size_t x = 0; x < xsize; ++x) {
-        row_to[x] += row_what[x];
-      }
-    }
-  }
-}
-
-template <typename T>
-Image3<T> ScaleImage(const T lambda, const Image3<T>& image) {
-  Image3<T> out(image.xsize(), image.ysize());
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < image.ysize(); ++y) {
-      const T* JXL_RESTRICT row = image.ConstPlaneRow(c, y);
-      T* JXL_RESTRICT row_out = out.PlaneRow(c, y);
-      for (size_t x = 0; x < image.xsize(); ++x) {
-        row_out[x] = lambda * row[x];
-      }
-    }
-  }
-  return out;
-}
-
-// Multiplies image by lambda in-place
-template <typename T>
-void ScaleImage(const T lambda, Image3<T>* image) {
-  for (size_t c = 0; c < 3; ++c) {
-    for (size_t y = 0; y < image->ysize(); ++y) {
-      T* const JXL_RESTRICT row = image->PlaneRow(c, y);
-      for (size_t x = 0; x < image->xsize(); ++x) {
-        row[x] = lambda * row[x];
-      }
-    }
-  }
-}
-
 // Initializes all planes to the same "value".
 template <typename T>
 void FillImage(const T value, Image3<T>* image) {
@@ -789,13 +440,10 @@ void ZeroFillPlane(Plane<T>* image, Rect rect) {
   }
 }
 
-// Pad an image with xborder columns on each vertical side and yboder rows
-// above and below, mirroring the image.
-Image3F PadImageMirror(const Image3F& in, size_t xborder, size_t yborder);
-
 // Same as above, but operates in-place. Assumes that the `in` image was
 // allocated large enough.
-void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in);
+void PadImageToBlockMultipleInPlace(Image3F* JXL_RESTRICT in,
+                                    size_t block_dim = kBlockDim);
 
 // Downsamples an image by a given factor.
 void DownsampleImage(Image3F* opsin, size_t factor);
index 8937364..dfcb229 100644 (file)
@@ -6,15 +6,14 @@
 #include "lib/jxl/image_ops.h"
 
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 
 #include <utility>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -25,7 +24,7 @@ void TestPacked(const size_t xsize, const size_t ysize) {
   RandomFillImage(&image1);
   const std::vector<T>& packed = PackedFromImage(image1);
   const Plane<T>& image2 = ImageFromPacked(packed, xsize, ysize);
-  EXPECT_TRUE(SamePixels(image1, image2));
+  JXL_EXPECT_OK(SamePixels(image1, image2, _));
 }
 
 TEST(ImageTest, TestPacked) {
index 4549c19..e7d7228 100644 (file)
@@ -6,53 +6,38 @@
 #ifndef LIB_JXL_IMAGE_TEST_UTILS_H_
 #define LIB_JXL_IMAGE_TEST_UTILS_H_
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+#include <inttypes.h>
 #include <stddef.h>
+#include <stdint.h>
 
 #include <cmath>
 #include <limits>
+#include <sstream>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/random.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/image.h"
 
 namespace jxl {
 
 template <typename T>
-void VerifyEqual(const Plane<T>& expected, const Plane<T>& actual) {
-  JXL_CHECK(SameSize(expected, actual));
-  for (size_t y = 0; y < expected.ysize(); ++y) {
-    const T* const JXL_RESTRICT row_expected = expected.Row(y);
-    const T* const JXL_RESTRICT row_actual = actual.Row(y);
-    for (size_t x = 0; x < expected.xsize(); ++x) {
-      ASSERT_EQ(row_expected[x], row_actual[x]) << x << " " << y;
-    }
-  }
-}
-
-template <typename T>
-void VerifyEqual(const Image3<T>& expected, const Image3<T>& actual) {
-  for (size_t c = 0; c < 3; ++c) {
-    VerifyEqual(expected.Plane(c), actual.Plane(c));
-  }
-}
-
-template <typename T>
 bool SamePixels(const Plane<T>& image1, const Plane<T>& image2,
-                const Rect rect) {
-  if (!rect.IsInside(image1) || !rect.IsInside(image2)) {
-    ADD_FAILURE() << "requested rectangle is not fully inside the image";
-    return false;
-  }
+                std::stringstream& failures) {
+  const Rect rect(image1);
+  JXL_CHECK(SameSize(image1, image2));
   size_t mismatches = 0;
   for (size_t y = rect.y0(); y < rect.ysize(); ++y) {
     const T* const JXL_RESTRICT row1 = image1.Row(y);
     const T* const JXL_RESTRICT row2 = image2.Row(y);
     for (size_t x = rect.x0(); x < rect.xsize(); ++x) {
       if (row1[x] != row2[x]) {
-        ADD_FAILURE() << "pixel mismatch" << x << ", " << y << ": "
-                      << double(row1[x]) << " != " << double(row2[x]);
+        failures << "pixel mismatch" << x << ", " << y << ": "
+                 << double(row1[x]) << " != " << double(row2[x]) << "\n";
         if (++mismatches > 4) {
           return false;
         }
@@ -63,16 +48,11 @@ bool SamePixels(const Plane<T>& image1, const Plane<T>& image2,
 }
 
 template <typename T>
-bool SamePixels(const Plane<T>& image1, const Plane<T>& image2) {
-  JXL_CHECK(SameSize(image1, image2));
-  return SamePixels(image1, image2, Rect(image1));
-}
-
-template <typename T>
-bool SamePixels(const Image3<T>& image1, const Image3<T>& image2) {
+bool SamePixels(const Image3<T>& image1, const Image3<T>& image2,
+                std::stringstream& failures) {
   JXL_CHECK(SameSize(image1, image2));
   for (size_t c = 0; c < 3; ++c) {
-    if (!SamePixels(image1.Plane(c), image2.Plane(c))) {
+    if (!SamePixels(image1.Plane(c), image2.Plane(c), failures)) {
       return false;
     }
   }
@@ -82,10 +62,11 @@ bool SamePixels(const Image3<T>& image1, const Image3<T>& image2) {
 // Use for floating-point images with fairly large numbers; tolerates small
 // absolute errors and/or small relative errors.
 template <typename T>
-void VerifyRelativeError(const Plane<T>& expected, const Plane<T>& actual,
+bool VerifyRelativeError(const Plane<T>& expected, const Plane<T>& actual,
                          const double threshold_l1,
                          const double threshold_relative,
-                         const intptr_t border = 0, const size_t c = 0) {
+                         std::stringstream& failures, const intptr_t border = 0,
+                         const size_t c = 0) {
   JXL_CHECK(SameSize(expected, actual));
   const intptr_t xsize = expected.xsize();
   const intptr_t ysize = expected.ysize();
@@ -118,57 +99,36 @@ void VerifyRelativeError(const Plane<T>& expected, const Plane<T>& actual,
       }
     }
   }
-  if (any_bad) {
-    // Never had a valid relative value, don't print it.
-    if (max_relative < 0) {
-      fprintf(stderr, "c=%" PRIu64 ": max +/- %E exceeds +/- %.2E\n",
-              static_cast<uint64_t>(c), max_l1, threshold_l1);
-    } else {
-      fprintf(stderr,
-              "c=%" PRIu64 ": max +/- %E, x %E exceeds +/- %.2E, x %.2E\n",
-              static_cast<uint64_t>(c), max_l1, max_relative, threshold_l1,
-              threshold_relative);
-    }
-    // Dump the expected image and actual image if the region is small enough.
-    const intptr_t kMaxTestDumpSize = 16;
-    if (xsize <= kMaxTestDumpSize + 2 * border &&
-        ysize <= kMaxTestDumpSize + 2 * border) {
-      fprintf(stderr, "Expected image:\n");
-      for (intptr_t y = border; y < ysize - border; ++y) {
-        const T* const JXL_RESTRICT row_expected = expected.Row(y);
-        for (intptr_t x = border; x < xsize - border; ++x) {
-          fprintf(stderr, "%10lf ", static_cast<double>(row_expected[x]));
-        }
-        fprintf(stderr, "\n");
-      }
-
-      fprintf(stderr, "Actual image:\n");
-      for (intptr_t y = border; y < ysize - border; ++y) {
-        const T* const JXL_RESTRICT row_expected = expected.Row(y);
-        const T* const JXL_RESTRICT row_actual = actual.Row(y);
-        for (intptr_t x = border; x < xsize - border; ++x) {
-          const double l1 = std::abs(row_expected[x] - row_actual[x]);
-
-          bool bad = l1 > threshold_l1;
-          if (row_expected[x] > 1E-10) {
-            const double relative = l1 / std::abs(double(row_expected[x]));
-            bad &= relative > threshold_relative;
-          }
-          if (bad) {
-            fprintf(stderr, "%10lf ", static_cast<double>(row_actual[x]));
-          } else {
-            fprintf(stderr, "%10s ", "==");
-          }
-        }
-        fprintf(stderr, "\n");
+  if (!any_bad) {
+    return true;
+  }
+  // Never had a valid relative value, don't print it.
+  if (max_relative < 0) {
+    fprintf(stderr, "c=%" PRIu64 ": max +/- %E exceeds +/- %.2E\n",
+            static_cast<uint64_t>(c), max_l1, threshold_l1);
+  } else {
+    fprintf(stderr,
+            "c=%" PRIu64 ": max +/- %E, x %E exceeds +/- %.2E, x %.2E\n",
+            static_cast<uint64_t>(c), max_l1, max_relative, threshold_l1,
+            threshold_relative);
+  }
+  // Dump the expected image and actual image if the region is small enough.
+  const intptr_t kMaxTestDumpSize = 16;
+  if (xsize <= kMaxTestDumpSize + 2 * border &&
+      ysize <= kMaxTestDumpSize + 2 * border) {
+    fprintf(stderr, "Expected image:\n");
+    for (intptr_t y = border; y < ysize - border; ++y) {
+      const T* const JXL_RESTRICT row_expected = expected.Row(y);
+      for (intptr_t x = border; x < xsize - border; ++x) {
+        fprintf(stderr, "%10lf ", static_cast<double>(row_expected[x]));
       }
+      fprintf(stderr, "\n");
     }
 
-    // Find first failing x for further debugging.
+    fprintf(stderr, "Actual image:\n");
     for (intptr_t y = border; y < ysize - border; ++y) {
       const T* const JXL_RESTRICT row_expected = expected.Row(y);
       const T* const JXL_RESTRICT row_actual = actual.Row(y);
-
       for (intptr_t x = border; x < xsize - border; ++x) {
         const double l1 = std::abs(row_expected[x] - row_actual[x]);
 
@@ -178,26 +138,55 @@ void VerifyRelativeError(const Plane<T>& expected, const Plane<T>& actual,
           bad &= relative > threshold_relative;
         }
         if (bad) {
-          FAIL() << x << ", " << y << " (" << expected.xsize() << " x "
+          fprintf(stderr, "%10lf ", static_cast<double>(row_actual[x]));
+        } else {
+          fprintf(stderr, "%10s ", "==");
+        }
+      }
+      fprintf(stderr, "\n");
+    }
+  }
+
+  // Find first failing x for further debugging.
+  for (intptr_t y = border; y < ysize - border; ++y) {
+    const T* const JXL_RESTRICT row_expected = expected.Row(y);
+    const T* const JXL_RESTRICT row_actual = actual.Row(y);
+
+    for (intptr_t x = border; x < xsize - border; ++x) {
+      const double l1 = std::abs(row_expected[x] - row_actual[x]);
+
+      bool bad = l1 > threshold_l1;
+      if (row_expected[x] > 1E-10) {
+        const double relative = l1 / std::abs(double(row_expected[x]));
+        bad &= relative > threshold_relative;
+      }
+      if (bad) {
+        failures << x << ", " << y << " (" << expected.xsize() << " x "
                  << expected.ysize() << ") expected "
                  << static_cast<double>(row_expected[x]) << " actual "
                  << static_cast<double>(row_actual[x]);
-        }
+        return false;
       }
     }
-    return;  // if any_bad, we should have exited.
   }
+  return false;
 }
 
 template <typename T>
-void VerifyRelativeError(const Image3<T>& expected, const Image3<T>& actual,
+bool VerifyRelativeError(const Image3<T>& expected, const Image3<T>& actual,
                          const float threshold_l1,
                          const float threshold_relative,
+                         std::stringstream& failures,
                          const intptr_t border = 0) {
   for (size_t c = 0; c < 3; ++c) {
-    VerifyRelativeError(expected.Plane(c), actual.Plane(c), threshold_l1,
-                        threshold_relative, border, c);
+    bool ok =
+        VerifyRelativeError(expected.Plane(c), actual.Plane(c), threshold_l1,
+                            threshold_relative, failures, border, c);
+    if (!ok) {
+      return false;
+    }
   }
+  return true;
 }
 
 template <typename T, typename U = T>
diff --git a/lib/jxl/inverse_mtf-inl.h b/lib/jxl/inverse_mtf-inl.h
new file mode 100644 (file)
index 0000000..fcb01d7
--- /dev/null
@@ -0,0 +1,90 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// SIMDified inverse-move-to-front transform.
+
+#if defined(LIB_JXL_INVERSE_MTF_INL_H_) == defined(HWY_TARGET_TOGGLE)
+#ifdef LIB_JXL_INVERSE_MTF_INL_H_
+#undef LIB_JXL_INVERSE_MTF_INL_H_
+#else
+#define LIB_JXL_INVERSE_MTF_INL_H_
+#endif
+
+#include <hwy/highway.h>
+
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+// These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::FirstN;
+using hwy::HWY_NAMESPACE::IfThenElse;
+using hwy::HWY_NAMESPACE::Load;
+using hwy::HWY_NAMESPACE::LoadU;
+using hwy::HWY_NAMESPACE::StoreU;
+
+inline void MoveToFront(uint8_t* v, uint8_t index) {
+  uint8_t value = v[index];
+  uint8_t i = index;
+  if (i < 4) {
+    for (; i; --i) v[i] = v[i - 1];
+  } else {
+    const HWY_CAPPED(uint8_t, 64) d;
+    int tail = i & (Lanes(d) - 1);
+    if (tail) {
+      i -= tail;
+      const auto vec = Load(d, v + i);
+      const auto prev = LoadU(d, v + i + 1);
+      StoreU(IfThenElse(FirstN(d, tail), vec, prev), d, v + i + 1);
+    }
+    while (i) {
+      i -= Lanes(d);
+      const auto vec = Load(d, v + i);
+      StoreU(vec, d, v + i + 1);
+    }
+  }
+  v[0] = value;
+}
+
+inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) {
+  HWY_ALIGN uint8_t mtf[256 + 64];
+  int i;
+  for (i = 0; i < 256; ++i) {
+    mtf[i] = static_cast<uint8_t>(i);
+  }
+#if JXL_MEMORY_SANITIZER
+  const HWY_CAPPED(uint8_t, 64) d;
+  for (size_t j = 0; j < Lanes(d); ++j) {
+    mtf[256 + j] = 0;
+  }
+#endif  // JXL_MEMORY_SANITIZER
+  for (i = 0; i < v_len; ++i) {
+    uint8_t index = v[i];
+    v[i] = mtf[index];
+    if (index) MoveToFront(mtf, index);
+  }
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#endif  // LIB_JXL_INVERSE_MTF_INL_H_
+
+#if HWY_ONCE
+#ifndef INVERSE_MTF_ONCE
+#define INVERSE_MTF_ONCE
+
+namespace jxl {
+inline void InverseMoveToFrontTransform(uint8_t* v, int v_len) {
+  return HWY_STATIC_DISPATCH(InverseMoveToFrontTransform)(v, v_len);
+}
+}  // namespace jxl
+
+#endif  // INVERSE_MTF_ONCE
+#endif  // HWY_ONCE
index db49a1c..9763786 100644 (file)
@@ -120,8 +120,8 @@ Status DecodeJPEGData(Span<const uint8_t> encoded, JPEGData* jpeg_data) {
 
   // Check if there is more decompressed output.
   size_t available_out = 1;
-  uint64_t dummy;
-  uint8_t* next_out = reinterpret_cast<uint8_t*>(&dummy);
+  uint64_t sink;
+  uint8_t* next_out = reinterpret_cast<uint8_t*>(&sink);
   result = BrotliDecoderDecompressStream(brotli_dec, &available_in, &in,
                                          &available_out, &next_out, nullptr);
   if (available_out == 0 ||
index 5336e47..33b8d19 100644 (file)
@@ -8,12 +8,18 @@
 #include <stdlib.h>
 #include <string.h> /* for memset, memcpy */
 
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
 #include <deque>
 #include <string>
 #include <vector>
 
 #include "lib/jxl/base/bits.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/frame_dimensions.h"
+#include "lib/jxl/image_bundle.h"
 #include "lib/jxl/jpeg/dec_jpeg_serialization_state.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 
@@ -34,9 +40,6 @@ const int kJpegPrecision = 8;
 // JpegBitWriter: buffer size
 const size_t kJpegBitWriterChunkSize = 16384;
 
-// DCTCodingState: maximum number of correction bits to buffer
-const int kJPEGMaxCorrectionBits = 1u << 16;
-
 // Returns non-zero if and only if x has a zero byte, i.e. one of
 // x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero.
 static JXL_INLINE uint64_t HasZeroByte(uint64_t x) {
@@ -75,18 +78,19 @@ static JXL_INLINE void Reserve(JpegBitWriter* bw, size_t n_bytes) {
  * space in the output buffer. Emits up to 2 bytes to buffer.
  */
 static JXL_INLINE void EmitByte(JpegBitWriter* bw, int byte) {
-  bw->data[bw->pos++] = byte;
-  if (byte == 0xFF) bw->data[bw->pos++] = 0;
+  bw->data[bw->pos] = byte;
+  bw->data[bw->pos + 1] = 0;
+  bw->pos += (byte != 0xFF ? 1 : 2);
 }
 
-static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) {
-  // At this point we are ready to emit the most significant 6 bytes of
-  // put_buffer_ to the output.
+static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw, int nbits,
+                                          uint64_t bits) {
+  // At this point we are ready to emit the put_buffer to the output.
   // The JPEG format requires that after every 0xff byte in the entropy
   // coded section, there is a zero byte, therefore we first check if any of
-  // the 6 most significant bytes of put_buffer_ is 0xFF.
-  Reserve(bw, 12);
-  if (HasZeroByte(~bw->put_buffer | 0xFFFF)) {
+  // the 8 bytes of put_buffer is 0xFF.
+  bw->put_buffer |= (bits >> -bw->put_bits);
+  if (JXL_UNLIKELY(HasZeroByte(~bw->put_buffer))) {
     // We have a 0xFF byte somewhere, examine each byte and append a zero
     // byte if necessary.
     EmitByte(bw, (bw->put_buffer >> 56) & 0xFF);
@@ -95,32 +99,31 @@ static JXL_INLINE void DischargeBitBuffer(JpegBitWriter* bw) {
     EmitByte(bw, (bw->put_buffer >> 32) & 0xFF);
     EmitByte(bw, (bw->put_buffer >> 24) & 0xFF);
     EmitByte(bw, (bw->put_buffer >> 16) & 0xFF);
+    EmitByte(bw, (bw->put_buffer >> 8) & 0xFF);
+    EmitByte(bw, (bw->put_buffer) & 0xFF);
   } else {
-    // We don't have any 0xFF bytes, output all 6 bytes without checking.
-    bw->data[bw->pos] = (bw->put_buffer >> 56) & 0xFF;
-    bw->data[bw->pos + 1] = (bw->put_buffer >> 48) & 0xFF;
-    bw->data[bw->pos + 2] = (bw->put_buffer >> 40) & 0xFF;
-    bw->data[bw->pos + 3] = (bw->put_buffer >> 32) & 0xFF;
-    bw->data[bw->pos + 4] = (bw->put_buffer >> 24) & 0xFF;
-    bw->data[bw->pos + 5] = (bw->put_buffer >> 16) & 0xFF;
-    bw->pos += 6;
+    // We don't have any 0xFF bytes, output all 8 bytes without checking.
+    StoreBE64(bw->put_buffer, bw->data + bw->pos);
+    bw->pos += 8;
   }
-  bw->put_buffer <<= 48;
-  bw->put_bits += 48;
+
+  bw->put_bits += 64;
+  bw->put_buffer = bits << bw->put_bits;
 }
 
 static JXL_INLINE void WriteBits(JpegBitWriter* bw, int nbits, uint64_t bits) {
-  // This is an optimization; if everything goes well,
-  // then |nbits| is positive; if non-existing Huffman symbol is going to be
-  // encoded, its length should be zero; later encoder could check the
-  // "health" of JpegBitWriter.
-  if (nbits == 0) {
-    bw->healthy = false;
-    return;
-  }
+  JXL_DASSERT(nbits > 0);
   bw->put_bits -= nbits;
-  bw->put_buffer |= (bits << bw->put_bits);
-  if (bw->put_bits <= 16) DischargeBitBuffer(bw);
+  if (JXL_UNLIKELY(bw->put_bits < 0)) {
+    if (JXL_UNLIKELY(nbits > 64)) {
+      bw->put_bits += nbits;
+      bw->healthy = false;
+    } else {
+      DischargeBitBuffer(bw, nbits, bits);
+    }
+  } else {
+    bw->put_buffer |= (bits << bw->put_bits);
+  }
 }
 
 void EmitMarker(JpegBitWriter* bw, int marker) {
@@ -181,44 +184,91 @@ void DCTCodingStateInit(DCTCodingState* s) {
   s->eob_run_ = 0;
   s->cur_ac_huff_ = nullptr;
   s->refinement_bits_.clear();
-  s->refinement_bits_.reserve(kJPEGMaxCorrectionBits);
+  s->refinement_bits_.reserve(64);
+}
+
+static JXL_INLINE void WriteSymbol(int symbol, HuffmanCodeTable* table,
+                                   JpegBitWriter* bw) {
+  WriteBits(bw, table->depth[symbol], table->code[symbol]);
+}
+
+static JXL_INLINE void WriteSymbolBits(int symbol, HuffmanCodeTable* table,
+                                       JpegBitWriter* bw, int nbits,
+                                       uint64_t bits) {
+  WriteBits(bw, nbits + table->depth[symbol],
+            bits | (table->code[symbol] << nbits));
 }
 
 // Emit all buffered data to the bit stream using the given Huffman code and
 // bit writer.
 static JXL_INLINE void Flush(DCTCodingState* s, JpegBitWriter* bw) {
   if (s->eob_run_ > 0) {
+    Reserve(bw, 16);
     int nbits = FloorLog2Nonzero<uint32_t>(s->eob_run_);
     int symbol = nbits << 4u;
-    WriteBits(bw, s->cur_ac_huff_->depth[symbol],
-              s->cur_ac_huff_->code[symbol]);
+    WriteSymbol(symbol, s->cur_ac_huff_, bw);
     if (nbits > 0) {
       WriteBits(bw, nbits, s->eob_run_ & ((1 << nbits) - 1));
     }
     s->eob_run_ = 0;
   }
-  for (size_t i = 0; i < s->refinement_bits_.size(); ++i) {
-    WriteBits(bw, 1, s->refinement_bits_[i]);
+  const size_t kStride = 124;  // (515 - 16) / 2 / 2
+  size_t num_words = s->refinement_bits_count_ >> 4;
+  size_t i = 0;
+  while (i < num_words) {
+    size_t limit = std::min(i + kStride, num_words);
+    Reserve(bw, 512);
+    for (; i < limit; ++i) {
+      WriteBits(bw, 16, s->refinement_bits_[i]);
+    }
+  }
+  Reserve(bw, 16);
+  size_t tail = s->refinement_bits_count_ & 0xF;
+  if (tail) {
+    WriteBits(bw, tail, s->refinement_bits_.back());
   }
   s->refinement_bits_.clear();
+  s->refinement_bits_count_ = 0;
 }
 
 // Buffer some more data at the end-of-band (the last non-zero or newly
 // non-zero coefficient within the [Ss, Se] spectral band).
 static JXL_INLINE void BufferEndOfBand(DCTCodingState* s,
-                                       const HuffmanCodeTable* ac_huff,
-                                       const std::vector<int>* new_bits,
+                                       HuffmanCodeTable* ac_huff,
+                                       const int* new_bits_array,
+                                       size_t new_bits_count,
                                        JpegBitWriter* bw) {
   if (s->eob_run_ == 0) {
     s->cur_ac_huff_ = ac_huff;
   }
   ++s->eob_run_;
-  if (new_bits) {
-    s->refinement_bits_.insert(s->refinement_bits_.end(), new_bits->begin(),
-                               new_bits->end());
+  if (new_bits_count) {
+    uint64_t new_bits = 0;
+    for (size_t i = 0; i < new_bits_count; ++i) {
+      new_bits = (new_bits << 1) | new_bits_array[i];
+    }
+    size_t tail = s->refinement_bits_count_ & 0xF;
+    if (tail) {  // First stuff the tail item
+      size_t stuff_bits_count = std::min(16 - tail, new_bits_count);
+      uint16_t stuff_bits = new_bits >> (new_bits_count - stuff_bits_count);
+      stuff_bits &= ((1u << stuff_bits_count) - 1);
+      s->refinement_bits_.back() =
+          (s->refinement_bits_.back() << stuff_bits_count) | stuff_bits;
+      new_bits_count -= stuff_bits_count;
+      s->refinement_bits_count_ += stuff_bits_count;
+    }
+    while (new_bits_count >= 16) {
+      s->refinement_bits_.push_back(new_bits >> (new_bits_count - 16));
+      new_bits_count -= 16;
+      s->refinement_bits_count_ += 16;
+    }
+    if (new_bits_count) {
+      s->refinement_bits_.push_back(new_bits & ((1u << new_bits_count) - 1));
+      s->refinement_bits_count_ += new_bits_count;
+    }
   }
-  if (s->eob_run_ == 0x7FFF ||
-      s->refinement_bits_.size() > kJPEGMaxCorrectionBits - kDCTBlockSize + 1) {
+
+  if (s->eob_run_ == 0x7FFF) {
     Flush(s, bw);
   }
 }
@@ -362,10 +412,11 @@ bool EncodeDHT(const JPEGData& jpg, SerializationState* state) {
       huff_table = &state->dc_huff_table[index];
     }
     // TODO(eustas): cache
-    // TODO(eustas): set up non-existing symbols
+    huff_table->InitDepths(127);
     if (!BuildHuffmanCodeTable(huff, huff_table)) {
       return false;
     }
+    huff_table->initialized = true;
     size_t total_count = 0;
     size_t max_length = 0;
     for (size_t i = 0; i < huff.counts.size(); ++i) {
@@ -464,65 +515,74 @@ bool EncodeInterMarkerData(const JPEGData& jpg, SerializationState* state) {
   return true;
 }
 
-bool EncodeDCTBlockSequential(const coeff_t* coeffs,
-                              const HuffmanCodeTable& dc_huff,
-                              const HuffmanCodeTable& ac_huff,
-                              int num_zero_runs, coeff_t* last_dc_coeff,
-                              JpegBitWriter* bw) {
+bool EncodeDCTBlockSequential(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                              HuffmanCodeTable* ac_huff, int num_zero_runs,
+                              coeff_t* last_dc_coeff, JpegBitWriter* bw) {
   coeff_t temp2;
   coeff_t temp;
+  coeff_t litmus = 0;
   temp2 = coeffs[0];
   temp = temp2 - *last_dc_coeff;
   *last_dc_coeff = temp2;
-  temp2 = temp;
-  if (temp < 0) {
-    temp = -temp;
-    if (temp < 0) return false;
-    temp2--;
-  }
-  int dc_nbits = (temp == 0) ? 0 : (FloorLog2Nonzero<uint32_t>(temp) + 1);
-  WriteBits(bw, dc_huff.depth[dc_nbits], dc_huff.code[dc_nbits]);
+  temp2 = temp >> (8 * sizeof(coeff_t) - 1);
+  temp += temp2;
+  temp2 ^= temp;
+
+  int dc_nbits = (temp2 == 0) ? 0 : (FloorLog2Nonzero<uint32_t>(temp2) + 1);
+  WriteSymbol(dc_nbits, dc_huff, bw);
+#if false
+  // If the input is corrupt, this could be triggered. Checking is
+  // costly though, so it makes more sense to avoid this branch.
+  // (producing a corrupt JPEG when the input is corrupt, instead
+  // of catching it and returning error)
   if (dc_nbits >= 12) return false;
-  if (dc_nbits > 0) {
-    WriteBits(bw, dc_nbits, temp2 & ((1u << dc_nbits) - 1));
+#endif
+  if (dc_nbits) {
+    WriteBits(bw, dc_nbits, temp & ((1u << dc_nbits) - 1));
   }
-  int r = 0;
-  for (int k = 1; k < 64; ++k) {
-    if ((temp = coeffs[kJPEGNaturalOrder[k]]) == 0) {
+  int16_t r = 0;
+
+  for (size_t i = 1; i < 64; i++) {
+    if ((temp = coeffs[kJPEGNaturalOrder[i]]) == 0) {
       r++;
-      continue;
-    }
-    if (temp < 0) {
-      temp = -temp;
-      if (temp < 0) return false;
-      temp2 = ~temp;
     } else {
-      temp2 = temp;
-    }
-    while (r > 15) {
-      WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]);
-      r -= 16;
+      temp2 = temp >> (8 * sizeof(coeff_t) - 1);
+      temp += temp2;
+      temp2 ^= temp;
+      if (JXL_UNLIKELY(r > 15)) {
+        WriteSymbol(0xf0, ac_huff, bw);
+        r -= 16;
+        if (r > 15) {
+          WriteSymbol(0xf0, ac_huff, bw);
+          r -= 16;
+        }
+        if (r > 15) {
+          WriteSymbol(0xf0, ac_huff, bw);
+          r -= 16;
+        }
+      }
+      litmus |= temp2;
+      int ac_nbits =
+          FloorLog2Nonzero<uint32_t>(static_cast<uint16_t>(temp2)) + 1;
+      int symbol = (r << 4u) + ac_nbits;
+      WriteSymbolBits(symbol, ac_huff, bw, ac_nbits,
+                      temp & ((1 << ac_nbits) - 1));
+      r = 0;
     }
-    int ac_nbits = FloorLog2Nonzero<uint32_t>(temp) + 1;
-    if (ac_nbits >= 16) return false;
-    int symbol = (r << 4u) + ac_nbits;
-    WriteBits(bw, ac_huff.depth[symbol], ac_huff.code[symbol]);
-    WriteBits(bw, ac_nbits, temp2 & ((1 << ac_nbits) - 1));
-    r = 0;
   }
+
   for (int i = 0; i < num_zero_runs; ++i) {
-    WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]);
+    WriteSymbol(0xf0, ac_huff, bw);
     r -= 16;
   }
   if (r > 0) {
-    WriteBits(bw, ac_huff.depth[0], ac_huff.code[0]);
+    WriteSymbol(0, ac_huff, bw);
   }
-  return true;
+  return (litmus >= 0);
 }
 
-bool EncodeDCTBlockProgressive(const coeff_t* coeffs,
-                               const HuffmanCodeTable& dc_huff,
-                               const HuffmanCodeTable& ac_huff, int Ss, int Se,
+bool EncodeDCTBlockProgressive(const coeff_t* coeffs, HuffmanCodeTable* dc_huff,
+                               HuffmanCodeTable* ac_huff, int Ss, int Se,
                                int Al, int num_zero_runs,
                                DCTCodingState* coding_state,
                                coeff_t* last_dc_coeff, JpegBitWriter* bw) {
@@ -540,8 +600,8 @@ bool EncodeDCTBlockProgressive(const coeff_t* coeffs,
       temp2--;
     }
     int nbits = (temp == 0) ? 0 : (FloorLog2Nonzero<uint32_t>(temp) + 1);
-    WriteBits(bw, dc_huff.depth[nbits], dc_huff.code[nbits]);
-    if (nbits > 0) {
+    WriteSymbol(nbits, dc_huff, bw);
+    if (nbits) {
       WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
     }
     ++Ss;
@@ -570,24 +630,24 @@ bool EncodeDCTBlockProgressive(const coeff_t* coeffs,
     }
     Flush(coding_state, bw);
     while (r > 15) {
-      WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]);
+      WriteSymbol(0xf0, ac_huff, bw);
       r -= 16;
     }
     int nbits = FloorLog2Nonzero<uint32_t>(temp) + 1;
     int symbol = (r << 4u) + nbits;
-    WriteBits(bw, ac_huff.depth[symbol], ac_huff.code[symbol]);
+    WriteSymbol(symbol, ac_huff, bw);
     WriteBits(bw, nbits, temp2 & ((1 << nbits) - 1));
     r = 0;
   }
   if (num_zero_runs > 0) {
     Flush(coding_state, bw);
     for (int i = 0; i < num_zero_runs; ++i) {
-      WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]);
+      WriteSymbol(0xf0, ac_huff, bw);
       r -= 16;
     }
   }
   if (r > 0) {
-    BufferEndOfBand(coding_state, &ac_huff, nullptr, bw);
+    BufferEndOfBand(coding_state, ac_huff, nullptr, 0, bw);
     if (!eob_run_allowed) {
       Flush(coding_state, bw);
     }
@@ -595,9 +655,8 @@ bool EncodeDCTBlockProgressive(const coeff_t* coeffs,
   return true;
 }
 
-bool EncodeRefinementBits(const coeff_t* coeffs,
-                          const HuffmanCodeTable& ac_huff, int Ss, int Se,
-                          int Al, DCTCodingState* coding_state,
+bool EncodeRefinementBits(const coeff_t* coeffs, HuffmanCodeTable* ac_huff,
+                          int Ss, int Se, int Al, DCTCodingState* coding_state,
                           JpegBitWriter* bw) {
   bool eob_run_allowed = Ss > 0;
   if (Ss == 0) {
@@ -618,8 +677,8 @@ bool EncodeRefinementBits(const coeff_t* coeffs,
     }
   }
   int r = 0;
-  std::vector<int> refinement_bits;
-  refinement_bits.reserve(kDCTBlockSize);
+  int refinement_bits[kDCTBlockSize];
+  size_t refinement_bits_count = 0;
   for (int k = Ss; k <= Se; k++) {
     if (abs_values[k] == 0) {
       r++;
@@ -627,30 +686,31 @@ bool EncodeRefinementBits(const coeff_t* coeffs,
     }
     while (r > 15 && k <= eob) {
       Flush(coding_state, bw);
-      WriteBits(bw, ac_huff.depth[0xf0], ac_huff.code[0xf0]);
+      WriteSymbol(0xf0, ac_huff, bw);
       r -= 16;
-      for (int bit : refinement_bits) {
-        WriteBits(bw, 1, bit);
+      for (size_t i = 0; i < refinement_bits_count; ++i) {
+        WriteBits(bw, 1, refinement_bits[i]);
       }
-      refinement_bits.clear();
+      refinement_bits_count = 0;
     }
     if (abs_values[k] > 1) {
-      refinement_bits.push_back(abs_values[k] & 1u);
+      refinement_bits[refinement_bits_count++] = abs_values[k] & 1u;
       continue;
     }
     Flush(coding_state, bw);
     int symbol = (r << 4u) + 1;
     int new_non_zero_bit = (coeffs[kJPEGNaturalOrder[k]] < 0) ? 0 : 1;
-    WriteBits(bw, ac_huff.depth[symbol], ac_huff.code[symbol]);
+    WriteSymbol(symbol, ac_huff, bw);
     WriteBits(bw, 1, new_non_zero_bit);
-    for (int bit : refinement_bits) {
-      WriteBits(bw, 1, bit);
+    for (size_t i = 0; i < refinement_bits_count; ++i) {
+      WriteBits(bw, 1, refinement_bits[i]);
     }
-    refinement_bits.clear();
+    refinement_bits_count = 0;
     r = 0;
   }
-  if (r > 0 || !refinement_bits.empty()) {
-    BufferEndOfBand(coding_state, &ac_huff, &refinement_bits, bw);
+  if (r > 0 || refinement_bits_count) {
+    BufferEndOfBand(coding_state, ac_huff, refinement_bits,
+                    refinement_bits_count, bw);
     if (!eob_run_allowed) {
       Flush(coding_state, bw);
     }
@@ -658,6 +718,23 @@ bool EncodeRefinementBits(const coeff_t* coeffs,
   return true;
 }
 
+size_t NumHistograms(const JPEGData& jpg) {
+  size_t num = 0;
+  for (const auto& si : jpg.scan_info) {
+    num += si.num_components;
+  }
+  return num;
+}
+
+size_t HistogramIndex(const JPEGData& jpg, size_t scan_index,
+                      size_t component_index) {
+  size_t idx = 0;
+  for (size_t i = 0; i < scan_index; ++i) {
+    idx += jpg.scan_info[i].num_components;
+  }
+  return idx + component_index;
+}
+
 template <int kMode>
 SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg,
                                               SerializationState* state) {
@@ -716,7 +793,8 @@ SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg,
 
   // DC-only is defined by [0..0] spectral range.
   const bool want_ac = ((Ss != 0) || (Se != 0));
-  // TODO: support streaming decoding again.
+  const bool want_dc = (Ss == 0);
+  // TODO(user): support streaming decoding again.
   const bool complete_ac = true;
   const bool has_ac = true;
   if (want_ac && !has_ac) return SerializationStatus::NEEDS_MORE_INPUT;
@@ -750,12 +828,21 @@ SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg,
         ss.restarts_to_go = restart_interval;
         memset(ss.last_dc_coeff, 0, sizeof(ss.last_dc_coeff));
       }
+
       // Encode one MCU
       for (size_t i = 0; i < scan_info.num_components; ++i) {
         const JPEGComponentScanInfo& si = scan_info.components[i];
         const JPEGComponent& c = jpg.components[si.comp_idx];
-        const HuffmanCodeTable& dc_huff = state->dc_huff_table[si.dc_tbl_idx];
-        const HuffmanCodeTable& ac_huff = state->ac_huff_table[si.ac_tbl_idx];
+        size_t dc_tbl_idx = si.dc_tbl_idx;
+        size_t ac_tbl_idx = si.ac_tbl_idx;
+        HuffmanCodeTable* dc_huff = &state->dc_huff_table[dc_tbl_idx];
+        HuffmanCodeTable* ac_huff = &state->ac_huff_table[ac_tbl_idx];
+        if (want_dc && !dc_huff->initialized) {
+          return SerializationStatus::ERROR;
+        }
+        if (want_ac && !ac_huff->initialized) {
+          return SerializationStatus::ERROR;
+        }
         int n_blocks_y = is_interleaved ? c.v_samp_factor : 1;
         int n_blocks_x = is_interleaved ? c.h_samp_factor : 1;
         for (int iy = 0; iy < n_blocks_y; ++iy) {
@@ -776,6 +863,8 @@ SerializationStatus JXL_NOINLINE DoEncodeScan(const JPEGData& jpg,
             }
             const coeff_t* coeffs = &c.coeffs[block_idx << 6];
             bool ok;
+            // compressed size per block cannot be more than 512 bytes
+            Reserve(bw, 512);
             if (kMode == 0) {
               ok = EncodeDCTBlockSequential(coeffs, dc_huff, ac_huff,
                                             num_zero_runs,
@@ -899,26 +988,21 @@ SerializationStatus SerializeSection(uint8_t marker, SerializationState* state,
   }
 }
 
-}  // namespace
-
 // TODO(veluca): add streaming support again.
-Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) {
-  SerializationState ss;
-
-  size_t written = 0;
+Status WriteJpegInternal(const JPEGData& jpg, const JPEGOutput& out,
+                         SerializationState* ss) {
   const auto maybe_push_output = [&]() -> Status {
-    if (ss.stage != SerializationState::ERROR) {
-      while (!ss.output_queue.empty()) {
-        auto& chunk = ss.output_queue.front();
+    if (ss->stage != SerializationState::STAGE_ERROR) {
+      while (!ss->output_queue.empty()) {
+        auto& chunk = ss->output_queue.front();
         size_t num_written = out(chunk.next, chunk.len);
         if (num_written == 0 && chunk.len > 0) {
           return StatusMessage(Status(StatusCode::kNotEnoughBytes),
                                "Failed to write output");
         }
         chunk.len -= num_written;
-        written += num_written;
         if (chunk.len == 0) {
-          ss.output_queue.pop_front();
+          ss->output_queue.pop_front();
         }
       }
     }
@@ -926,39 +1010,38 @@ Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) {
   };
 
   while (true) {
-    switch (ss.stage) {
-      case SerializationState::INIT: {
+    switch (ss->stage) {
+      case SerializationState::STAGE_INIT: {
         // Valid Brunsli requires, at least, 0xD9 marker.
         // This might happen on corrupted stream, or on unconditioned JPEGData.
         // TODO(eustas): check D9 in the only one and is the last one.
         if (jpg.marker_order.empty()) {
-          ss.stage = SerializationState::ERROR;
+          ss->stage = SerializationState::STAGE_ERROR;
           break;
         }
-
-        ss.dc_huff_table.resize(kMaxHuffmanTables);
-        ss.ac_huff_table.resize(kMaxHuffmanTables);
+        ss->dc_huff_table.resize(kMaxHuffmanTables);
+        ss->ac_huff_table.resize(kMaxHuffmanTables);
         if (jpg.has_zero_padding_bit) {
-          ss.pad_bits = jpg.padding_bits.data();
-          ss.pad_bits_end = ss.pad_bits + jpg.padding_bits.size();
+          ss->pad_bits = jpg.padding_bits.data();
+          ss->pad_bits_end = ss->pad_bits + jpg.padding_bits.size();
         }
 
-        EncodeSOI(&ss);
+        EncodeSOI(ss);
         JXL_QUIET_RETURN_IF_ERROR(maybe_push_output());
-        ss.stage = SerializationState::SERIALIZE_SECTION;
+        ss->stage = SerializationState::STAGE_SERIALIZE_SECTION;
         break;
       }
 
-      case SerializationState::SERIALIZE_SECTION: {
-        if (ss.section_index >= jpg.marker_order.size()) {
-          ss.stage = SerializationState::DONE;
+      case SerializationState::STAGE_SERIALIZE_SECTION: {
+        if (ss->section_index >= jpg.marker_order.size()) {
+          ss->stage = SerializationState::STAGE_DONE;
           break;
         }
-        uint8_t marker = jpg.marker_order[ss.section_index];
-        SerializationStatus status = SerializeSection(marker, &ss, jpg);
+        uint8_t marker = jpg.marker_order[ss->section_index];
+        SerializationStatus status = SerializeSection(marker, ss, jpg);
         if (status == SerializationStatus::ERROR) {
           JXL_WARNING("Failed to encode marker 0x%.2x", marker);
-          ss.stage = SerializationState::ERROR;
+          ss->stage = SerializationState::STAGE_ERROR;
           break;
         }
         JXL_QUIET_RETURN_IF_ERROR(maybe_push_output());
@@ -966,29 +1049,31 @@ Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) {
           return JXL_FAILURE("Incomplete serialization data");
         } else if (status != SerializationStatus::DONE) {
           JXL_DASSERT(false);
-          ss.stage = SerializationState::ERROR;
+          ss->stage = SerializationState::STAGE_ERROR;
           break;
         }
-        ++ss.section_index;
+        ++ss->section_index;
         break;
       }
 
-      case SerializationState::DONE:
-        JXL_ASSERT(ss.output_queue.empty());
+      case SerializationState::STAGE_DONE:
+        JXL_ASSERT(ss->output_queue.empty());
+        if (ss->pad_bits != nullptr && ss->pad_bits != ss->pad_bits_end) {
+          return JXL_FAILURE("Invalid number of padding bits.");
+        }
         return true;
 
-      case SerializationState::ERROR:
+      case SerializationState::STAGE_ERROR:
         return JXL_FAILURE("JPEG serialization error");
     }
   }
 }
 
-Status EncodeImageJPGCoefficients(const CodecInOut* io, PaddedBytes* bytes) {
-  auto write = [&bytes](const uint8_t* buf, size_t len) {
-    bytes->append(buf, buf + len);
-    return len;
-  };
-  return WriteJpeg(*io->Main().jpeg_data, write);
+}  // namespace
+
+Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out) {
+  auto ss = jxl::make_unique<SerializationState>();
+  return WriteJpegInternal(jpg, out, ss.get());
 }
 
 }  // namespace jpeg
index f272ae7..c6f70ff 100644 (file)
@@ -13,7 +13,7 @@
 
 #include <functional>
 
-#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/jpeg/dec_jpeg_serialization_state.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 
 namespace jxl {
@@ -25,9 +25,6 @@ using JPEGOutput = std::function<size_t(const uint8_t* buf, size_t len)>;
 
 Status WriteJpeg(const JPEGData& jpg, const JPEGOutput& out);
 
-// Reconstructs the JPEG from the coefficients and metadata in CodecInOut.
-Status EncodeImageJPGCoefficients(const CodecInOut* io, PaddedBytes* bytes);
-
 }  // namespace jpeg
 }  // namespace jxl
 
index a25c335..9950dc1 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
 #define LIB_JXL_JPEG_DEC_JPEG_SERIALIZATION_STATE_H_
 
+#include <algorithm>
 #include <deque>
 #include <vector>
 
@@ -16,8 +17,12 @@ namespace jxl {
 namespace jpeg {
 
 struct HuffmanCodeTable {
-  int depth[256];
-  int code[256];
+  int8_t depth[256];
+  uint16_t code[256];
+  bool initialized = false;
+  void InitDepths(int8_t value = 0) {
+    std::fill(std::begin(depth), std::end(depth), value);
+  }
 };
 
 // Handles the packing of bits into output bytes.
@@ -36,10 +41,11 @@ struct DCTCodingState {
   // The run length of end-of-band symbols in a progressive scan.
   int eob_run_;
   // The huffman table to be used when flushing the state.
-  const HuffmanCodeTable* cur_ac_huff_;
+  HuffmanCodeTable* cur_ac_huff_;
   // The sequence of currently buffered refinement bits for a successive
   // approximation scan (one where Ah > 0).
-  std::vector<int> refinement_bits_;
+  std::vector<uint16_t> refinement_bits_;
+  size_t refinement_bits_count_ = 0;
 };
 
 struct EncodeScanState {
@@ -62,13 +68,13 @@ struct EncodeScanState {
 
 struct SerializationState {
   enum Stage {
-    INIT,
-    SERIALIZE_SECTION,
-    DONE,
-    ERROR,
+    STAGE_INIT,
+    STAGE_SERIALIZE_SECTION,
+    STAGE_DONE,
+    STAGE_ERROR,
   };
 
-  Stage stage = INIT;
+  Stage stage = STAGE_INIT;
 
   std::deque<OutputChunk> output_queue;
 
index 0f625d8..0b3a1c9 100644 (file)
@@ -6,9 +6,12 @@
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
 
 #include <brotli/encode.h>
-#include <stdio.h>
 
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/image_bundle.h"
 #include "lib/jxl/jpeg/enc_jpeg_data_reader.h"
+#include "lib/jxl/luminance.h"
 #include "lib/jxl/sanitizers.h"
 
 namespace jxl {
@@ -99,7 +102,7 @@ Status DetectBlobs(jpeg::JPEGData& jpeg_data) {
 }
 
 Status ParseChunkedMarker(const jpeg::JPEGData& src, uint8_t marker_type,
-                          const ByteSpan& tag, PaddedBytes* output,
+                          const ByteSpan& tag, IccBytes* output,
                           bool allow_permutations = false) {
   output->clear();
 
@@ -162,7 +165,7 @@ Status ParseChunkedMarker(const jpeg::JPEGData& src, uint8_t marker_type,
     if (!presence[index]) {
       return JXL_FAILURE("Missing chunk.");
     }
-    output->append(chunks[index]);
+    chunks[index].AppendTo(output);
   }
 
   return true;
@@ -213,9 +216,9 @@ static inline bool IsJPG(const Span<const uint8_t> bytes) {
 
 }  // namespace
 
-Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
-                                    ColorEncoding* color_encoding) {
-  PaddedBytes icc_profile;
+void SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
+                                  ColorEncoding* color_encoding) {
+  IccBytes icc_profile;
   if (!ParseChunkedMarker(jpg, kApp2, ByteSpan(kIccProfileTag), &icc_profile)) {
     JXL_WARNING("ReJPEG: corrupted ICC profile\n");
     icc_profile.clear();
@@ -224,27 +227,19 @@ Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
   if (icc_profile.empty()) {
     bool is_gray = (jpg.components.size() == 1);
     *color_encoding = ColorEncoding::SRGB(is_gray);
-    return true;
+  } else {
+    color_encoding->SetICCRaw(std::move(icc_profile));
   }
-
-  return color_encoding->SetICC(std::move(icc_profile));
 }
 
-Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes,
+Status EncodeJPEGData(JPEGData& jpeg_data, std::vector<uint8_t>* bytes,
                       const CompressParams& cparams) {
+  bytes->clear();
   jpeg_data.app_marker_type.resize(jpeg_data.app_data.size(),
                                    AppMarkerType::kUnknown);
   JXL_RETURN_IF_ERROR(DetectIccProfile(jpeg_data));
   JXL_RETURN_IF_ERROR(DetectBlobs(jpeg_data));
-  BitWriter writer;
-  JXL_RETURN_IF_ERROR(Bundle::Write(jpeg_data, &writer, 0, nullptr));
-  writer.ZeroPadToByte();
-  *bytes = std::move(writer).TakeBytes();
-  BrotliEncoderState* brotli_enc =
-      BrotliEncoderCreateInstance(nullptr, nullptr, nullptr);
-  int effort = cparams.brotli_effort;
-  if (effort < 0) effort = 11 - static_cast<int>(cparams.speed_tier);
-  BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_QUALITY, effort);
+
   size_t total_data = 0;
   for (size_t i = 0; i < jpeg_data.app_data.size(); i++) {
     if (jpeg_data.app_marker_type[i] != AppMarkerType::kUnknown) {
@@ -259,10 +254,25 @@ Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes,
     total_data += jpeg_data.inter_marker_data[i].size();
   }
   total_data += jpeg_data.tail_data.size();
-  size_t initial_size = bytes->size();
   size_t brotli_capacity = BrotliEncoderMaxCompressedSize(total_data);
+
+  BitWriter writer;
+  JXL_RETURN_IF_ERROR(Bundle::Write(jpeg_data, &writer, 0, nullptr));
+  writer.ZeroPadToByte();
+  {
+    PaddedBytes serialized_jpeg_data = std::move(writer).TakeBytes();
+    bytes->reserve(serialized_jpeg_data.size() + brotli_capacity);
+    Bytes(serialized_jpeg_data).AppendTo(bytes);
+  }
+
+  BrotliEncoderState* brotli_enc =
+      BrotliEncoderCreateInstance(nullptr, nullptr, nullptr);
+  int effort = cparams.brotli_effort;
+  if (effort < 0) effort = 11 - static_cast<int>(cparams.speed_tier);
+  BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_QUALITY, effort);
+  size_t initial_size = bytes->size();
   BrotliEncoderSetParameter(brotli_enc, BROTLI_PARAM_SIZE_HINT, total_data);
-  bytes->resize(bytes->size() + brotli_capacity);
+  bytes->resize(initial_size + brotli_capacity);
   size_t enc_size = 0;
   auto br_append = [&](const std::vector<uint8_t>& data, bool last) {
     size_t available_in = data.size();
@@ -307,8 +317,7 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes, CodecInOut* io) {
                       jpeg_data)) {
     return JXL_FAILURE("Error reading JPEG");
   }
-  JXL_RETURN_IF_ERROR(
-      SetColorEncodingFromJpegData(*jpeg_data, &io->metadata.m.color_encoding));
+  SetColorEncodingFromJpegData(*jpeg_data, &io->metadata.m.color_encoding);
   JXL_RETURN_IF_ERROR(SetBlobsFromJpegData(*jpeg_data, &io->blobs));
   size_t nbcomp = jpeg_data->components.size();
   if (nbcomp != 1 && nbcomp != 3) {
@@ -373,7 +382,7 @@ Status DecodeImageJPG(const Span<const uint8_t> bytes, CodecInOut* io) {
   io->metadata.m.SetUintSamples(BITS_IN_JSAMPLE);
   io->SetFromImage(Image3F(jpeg_data->width, jpeg_data->height),
                    io->metadata.m.color_encoding);
-  SetIntensityTarget(io);
+  SetIntensityTarget(&io->metadata.m);
   return true;
 }
 
index 806128c..595d640 100644 (file)
@@ -6,18 +6,23 @@
 #ifndef LIB_JXL_JPEG_ENC_JPEG_DATA_H_
 #define LIB_JXL_JPEG_ENC_JPEG_DATA_H_
 
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/codec_in_out.h"
+#include <cstdint>
+#include <vector>
+
+#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 
 namespace jxl {
+
+class CodecInOut;
+
 namespace jpeg {
-Status EncodeJPEGData(JPEGData& jpeg_data, PaddedBytes* bytes,
+Status EncodeJPEGData(JPEGData& jpeg_data, std::vector<uint8_t>* bytes,
                       const CompressParams& cparams);
 
-Status SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
-                                    ColorEncoding* color_encoding);
+void SetColorEncodingFromJpegData(const jpeg::JPEGData& jpg,
+                                  ColorEncoding* color_encoding);
 
 /**
  * Decodes bytes containing JPEG codestream into a CodecInOut as coefficients
index 4a6c1de..ce64dae 100644 (file)
 #include <string>
 #include <vector>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/jpeg/enc_jpeg_huffman_decode.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 
-// By default only print debug messages when JXL_DEBUG_ON_ERROR is enabled.
-#ifndef JXL_DEBUG_JPEG_DATA_READER
-#define JXL_DEBUG_JPEG_DATA_READER JXL_DEBUG_ON_ERROR
-#endif  // JXL_DEBUG_JPEG_DATA_READER
-
-#define JXL_JPEG_DEBUG(format, ...) \
-  JXL_DEBUG(JXL_DEBUG_JPEG_DATA_READER, format, ##__VA_ARGS__)
-
 namespace jxl {
 namespace jpeg {
 
 namespace {
 static const int kBrunsliMaxSampling = 15;
-static const size_t kBrunsliMaxNumBlocks = 1ull << 24;
 
 // Macros for commonly used error conditions.
 
-#define JXL_JPEG_VERIFY_LEN(n)                            \
-  if (*pos + (n) > len) {                                 \
-    JXL_JPEG_DEBUG("Unexpected end of input: pos=%" PRIuS \
-                   " need=%d len=%" PRIuS,                \
-                   *pos, static_cast<int>(n), len);       \
-    jpg->error = JPEGReadError::UNEXPECTED_EOF;           \
-    return false;                                         \
+#define JXL_JPEG_VERIFY_LEN(n)                                \
+  if (*pos + (n) > len) {                                     \
+    return JXL_FAILURE("Unexpected end of input: pos=%" PRIuS \
+                       " need=%d len=%" PRIuS,                \
+                       *pos, static_cast<int>(n), len);       \
   }
 
-#define JXL_JPEG_VERIFY_INPUT(var, low, high, code)                \
-  if ((var) < (low) || (var) > (high)) {                           \
-    JXL_JPEG_DEBUG("Invalid " #var ": %d", static_cast<int>(var)); \
-    jpg->error = JPEGReadError::INVALID_##code;                    \
-    return false;                                                  \
+#define JXL_JPEG_VERIFY_INPUT(var, low, high, code)                    \
+  if ((var) < (low) || (var) > (high)) {                               \
+    return JXL_FAILURE("Invalid " #var ": %d", static_cast<int>(var)); \
   }
 
-#define JXL_JPEG_VERIFY_MARKER_END()                         \
-  if (start_pos + marker_len != *pos) {                      \
-    JXL_JPEG_DEBUG("Invalid marker length: declared=%" PRIuS \
-                   " actual=%" PRIuS,                        \
-                   marker_len, (*pos - start_pos));          \
-    jpg->error = JPEGReadError::WRONG_MARKER_SIZE;           \
-    return false;                                            \
+#define JXL_JPEG_VERIFY_MARKER_END()                             \
+  if (start_pos + marker_len != *pos) {                          \
+    return JXL_FAILURE("Invalid marker length: declared=%" PRIuS \
+                       " actual=%" PRIuS,                        \
+                       marker_len, (*pos - start_pos));          \
   }
 
-#define JXL_JPEG_EXPECT_MARKER()                                            \
-  if (pos + 2 > len || data[pos] != 0xff) {                                 \
-    JXL_JPEG_DEBUG("Marker byte (0xff) expected, found: 0x%.2x pos=%" PRIuS \
-                   " len=%" PRIuS,                                          \
-                   (pos < len ? data[pos] : 0), pos, len);                  \
-    jpg->error = JPEGReadError::MARKER_BYTE_NOT_FOUND;                      \
-    return false;                                                           \
+#define JXL_JPEG_EXPECT_MARKER()                                 \
+  if (pos + 2 > len || data[pos] != 0xff) {                      \
+    return JXL_FAILURE(                                          \
+        "Marker byte (0xff) expected, found: 0x%.2x pos=%" PRIuS \
+        " len=%" PRIuS,                                          \
+        (pos < len ? data[pos] : 0), pos, len);                  \
   }
 
 inline int ReadUint8(const uint8_t* data, size_t* pos) {
@@ -84,9 +69,7 @@ inline int ReadUint16(const uint8_t* data, size_t* pos) {
 bool ProcessSOF(const uint8_t* data, const size_t len, JpegReadMode mode,
                 size_t* pos, JPEGData* jpg) {
   if (jpg->width != 0) {
-    JXL_JPEG_DEBUG("Duplicate SOF marker.");
-    jpg->error = JPEGReadError::DUPLICATE_SOF;
-    return false;
+    return JXL_FAILURE("Duplicate SOF marker.");
   }
   const size_t start_pos = *pos;
   JXL_JPEG_VERIFY_LEN(8);
@@ -112,9 +95,7 @@ bool ProcessSOF(const uint8_t* data, const size_t len, JpegReadMode mode,
   for (size_t i = 0; i < jpg->components.size(); ++i) {
     const int id = ReadUint8(data, pos);
     if (ids_seen[id]) {  // (cf. section B.2.2, syntax of Ci)
-      JXL_JPEG_DEBUG("Duplicate ID %d in SOF.", id);
-      jpg->error = JPEGReadError::DUPLICATE_COMPONENT_ID;
-      return false;
+      return JXL_FAILURE("Duplicate ID %d in SOF.", id);
     }
     ids_seen[id] = true;
     jpg->components[i].id = id;
@@ -139,19 +120,12 @@ bool ProcessSOF(const uint8_t* data, const size_t len, JpegReadMode mode,
     JPEGComponent* c = &jpg->components[i];
     if (max_h_samp_factor % c->h_samp_factor != 0 ||
         max_v_samp_factor % c->v_samp_factor != 0) {
-      JXL_JPEG_DEBUG("Non-integral subsampling ratios.");
-      jpg->error = JPEGReadError::INVALID_SAMPLING_FACTORS;
-      return false;
+      return JXL_FAILURE("Non-integral subsampling ratios.");
     }
     c->width_in_blocks = MCU_cols * c->h_samp_factor;
     c->height_in_blocks = MCU_rows * c->v_samp_factor;
     const uint64_t num_blocks =
         static_cast<uint64_t>(c->width_in_blocks) * c->height_in_blocks;
-    if (num_blocks > kBrunsliMaxNumBlocks) {
-      JXL_JPEG_DEBUG("Image too large.");
-      jpg->error = JPEGReadError::IMAGE_TOO_LARGE;
-      return false;
-    }
     if (mode == JpegReadMode::kReadAll) {
       c->coeffs.resize(num_blocks * kDCTBlockSize);
     }
@@ -178,9 +152,7 @@ bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos,
   for (size_t i = 0; i < comps_in_scan; ++i) {
     uint32_t id = ReadUint8(data, pos);
     if (ids_seen[id]) {  // (cf. section B.2.3, regarding CSj)
-      JXL_JPEG_DEBUG("Duplicate ID %d in SOS.", id);
-      jpg->error = JPEGReadError::DUPLICATE_COMPONENT_ID;
-      return false;
+      return JXL_FAILURE("Duplicate ID %d in SOS.", id);
     }
     ids_seen[id] = true;
     bool found_index = false;
@@ -191,9 +163,7 @@ bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos,
       }
     }
     if (!found_index) {
-      JXL_JPEG_DEBUG("SOS marker: Could not find component with id %d", id);
-      jpg->error = JPEGReadError::COMPONENT_NOT_FOUND;
-      return false;
+      return JXL_FAILURE("SOS marker: Could not find component with id %d", id);
     }
     int c = ReadUint8(data, pos);
     int dc_tbl_idx = c >> 4;
@@ -231,18 +201,14 @@ bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos,
       }
     }
     if (scan_info.Ss == 0 && !found_dc_table) {
-      JXL_JPEG_DEBUG(
+      return JXL_FAILURE(
           "SOS marker: Could not find DC Huffman table with index %d",
           scan_info.components[i].dc_tbl_idx);
-      jpg->error = JPEGReadError::HUFFMAN_TABLE_NOT_FOUND;
-      return false;
     }
     if (scan_info.Se > 0 && !found_ac_table) {
-      JXL_JPEG_DEBUG(
+      return JXL_FAILURE(
           "SOS marker: Could not find AC Huffman table with index %d",
           scan_info.components[i].ac_tbl_idx);
-      jpg->error = JPEGReadError::HUFFMAN_TABLE_NOT_FOUND;
-      return false;
     }
   }
   jpg->scan_info.push_back(scan_info);
@@ -261,9 +227,7 @@ bool ProcessDHT(const uint8_t* data, const size_t len, JpegReadMode mode,
   JXL_JPEG_VERIFY_LEN(2);
   size_t marker_len = ReadUint16(data, pos);
   if (marker_len == 2) {
-    JXL_JPEG_DEBUG("DHT marker: no Huffman table found");
-    jpg->error = JPEGReadError::EMPTY_DHT;
-    return false;
+    return JXL_FAILURE("DHT marker: no Huffman table found");
   }
   while (*pos < start_pos + marker_len) {
     JXL_JPEG_VERIFY_LEN(1 + kJpegHuffmanMaxBitLength);
@@ -307,9 +271,7 @@ bool ProcessDHT(const uint8_t* data, const size_t len, JpegReadMode mode,
         JXL_JPEG_VERIFY_INPUT(value, 0, kJpegDCAlphabetSize - 1, HUFFMAN_CODE);
       }
       if (values_seen[value]) {
-        JXL_JPEG_DEBUG("Duplicate Huffman code value %d", value);
-        jpg->error = JPEGReadError::INVALID_HUFFMAN_CODE;
-        return false;
+        return JXL_FAILURE("Duplicate Huffman code value %d", value);
       }
       values_seen[value] = true;
       huff.values[i] = value;
@@ -319,9 +281,7 @@ bool ProcessDHT(const uint8_t* data, const size_t len, JpegReadMode mode,
     huff.values[total_count] = kJpegHuffmanAlphabetSize;
     space -= (1 << (kJpegHuffmanMaxBitLength - max_depth));
     if (space < 0) {
-      JXL_JPEG_DEBUG("Invalid Huffman code lengths.");
-      jpg->error = JPEGReadError::INVALID_HUFFMAN_CODE;
-      return false;
+      return JXL_FAILURE("Invalid Huffman code lengths.");
     } else if (space > 0 && huff_lut[0].value != 0xffff) {
       // Re-initialize the values to an invalid symbol so that we can recognize
       // it when reading the bit stream using a Huffman code with space > 0.
@@ -348,9 +308,7 @@ bool ProcessDQT(const uint8_t* data, const size_t len, size_t* pos,
   JXL_JPEG_VERIFY_LEN(2);
   size_t marker_len = ReadUint16(data, pos);
   if (marker_len == 2) {
-    JXL_JPEG_DEBUG("DQT marker: no quantization table found");
-    jpg->error = JPEGReadError::EMPTY_DQT;
-    return false;
+    return JXL_FAILURE("DQT marker: no quantization table found");
   }
   while (*pos < start_pos + marker_len && jpg->quant.size() < kMaxQuantTables) {
     JXL_JPEG_VERIFY_LEN(1);
@@ -380,9 +338,7 @@ bool ProcessDQT(const uint8_t* data, const size_t len, size_t* pos,
 bool ProcessDRI(const uint8_t* data, const size_t len, size_t* pos,
                 bool* found_dri, JPEGData* jpg) {
   if (*found_dri) {
-    JXL_JPEG_DEBUG("Duplicate DRI marker.");
-    jpg->error = JPEGReadError::DUPLICATE_DRI;
-    return false;
+    return JXL_FAILURE("Duplicate DRI marker.");
   }
   *found_dri = true;
   const size_t start_pos = *pos;
@@ -505,8 +461,7 @@ struct BitReaderState {
     }
     if (pos_ > next_marker_pos_) {
       // Data ran out before the scan was complete.
-      JXL_JPEG_DEBUG("Unexpected end of scan.");
-      return false;
+      return JXL_FAILURE("Unexpected end of scan.");
     }
     *pos = pos_;
     return true;
@@ -590,9 +545,7 @@ bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
   if (Ss == 0) {
     int s = ReadSymbol(dc_huff, br);
     if (s >= kJpegDCAlphabetSize) {
-      JXL_JPEG_DEBUG("Invalid Huffman symbol %d  for DC coefficient.", s);
-      jpg->error = JPEGReadError::INVALID_SYMBOL;
-      return false;
+      return JXL_FAILURE("Invalid Huffman symbol %d  for DC coefficient.", s);
     }
     int diff = 0;
     if (s > 0) {
@@ -604,9 +557,7 @@ bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
     coeffs[0] = dc_coeff;
     // TODO(eustas): is there a more elegant / explicit way to check this?
     if (dc_coeff != coeffs[0]) {
-      JXL_JPEG_DEBUG("Invalid DC coefficient %d", dc_coeff);
-      jpg->error = JPEGReadError::NON_REPRESENTABLE_DC_COEFF;
-      return false;
+      return JXL_FAILURE("Invalid DC coefficient %d", dc_coeff);
     }
     *last_dc_coeff = coeff;
     ++Ss;
@@ -622,25 +573,21 @@ bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
   for (int k = Ss; k <= Se; k++) {
     int sr = ReadSymbol(ac_huff, br);
     if (sr >= kJpegHuffmanAlphabetSize) {
-      JXL_JPEG_DEBUG("Invalid Huffman symbol %d for AC coefficient %d", sr, k);
-      jpg->error = JPEGReadError::INVALID_SYMBOL;
-      return false;
+      return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", sr,
+                         k);
     }
     int r = sr >> 4;
     int s = sr & 15;
     if (s > 0) {
       k += r;
       if (k > Se) {
-        JXL_JPEG_DEBUG("Out-of-band coefficient %d band was %d-%d", k, Ss, Se);
-        jpg->error = JPEGReadError::OUT_OF_BAND_COEFF;
-        return false;
+        return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss,
+                           Se);
       }
       if (s + Al >= kJpegDCAlphabetSize) {
-        JXL_JPEG_DEBUG(
+        return JXL_FAILURE(
             "Out of range AC coefficient value: s = %d Al = %d k = %d", s, Al,
             k);
-        jpg->error = JPEGReadError::NON_REPRESENTABLE_AC_COEFF;
-        return false;
       }
       int bits = br->ReadBits(s);
       int coeff = HuffExtend(bits, s);
@@ -658,9 +605,7 @@ bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
       *eobrun = 1 << r;
       if (r > 0) {
         if (!eobrun_allowed) {
-          JXL_JPEG_DEBUG("End-of-block run crossing DC coeff.");
-          jpg->error = JPEGReadError::EOB_RUN_TOO_LONG;
-          return false;
+          return JXL_FAILURE("End-of-block run crossing DC coeff.");
         }
         *eobrun += br->ReadBits(r);
       }
@@ -697,18 +642,15 @@ bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
     for (; k <= Se; k++) {
       s = ReadSymbol(ac_huff, br);
       if (s >= kJpegHuffmanAlphabetSize) {
-        JXL_JPEG_DEBUG("Invalid Huffman symbol %d for AC coefficient %d", s, k);
-        jpg->error = JPEGReadError::INVALID_SYMBOL;
-        return false;
+        return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d", s,
+                           k);
       }
       r = s >> 4;
       s &= 15;
       if (s) {
         if (s != 1) {
-          JXL_JPEG_DEBUG("Invalid Huffman symbol %d for AC coefficient %d", s,
-                         k);
-          jpg->error = JPEGReadError::INVALID_SYMBOL;
-          return false;
+          return JXL_FAILURE("Invalid Huffman symbol %d for AC coefficient %d",
+                             s, k);
         }
         s = br->ReadBits(1) ? p1 : m1;
         in_zero_run = false;
@@ -722,9 +664,7 @@ bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
           *eobrun = 1 << r;
           if (r > 0) {
             if (!eobrun_allowed) {
-              JXL_JPEG_DEBUG("End-of-block run crossing DC coeff.");
-              jpg->error = JPEGReadError::EOB_RUN_TOO_LONG;
-              return false;
+              return JXL_FAILURE("End-of-block run crossing DC coeff.");
             }
             *eobrun += br->ReadBits(r);
           }
@@ -754,19 +694,15 @@ bool RefineDCTBlock(const HuffmanTableEntry* ac_huff, int Ss, int Se, int Al,
       } while (k <= Se);
       if (s) {
         if (k > Se) {
-          JXL_JPEG_DEBUG("Out-of-band coefficient %d band was %d-%d", k, Ss,
-                         Se);
-          jpg->error = JPEGReadError::OUT_OF_BAND_COEFF;
-          return false;
+          return JXL_FAILURE("Out-of-band coefficient %d band was %d-%d", k, Ss,
+                             Se);
         }
         coeffs[kJPEGNaturalOrder[k]] = s;
       }
     }
   }
   if (in_zero_run) {
-    JXL_JPEG_DEBUG("Extra zero run before end-of-block.");
-    jpg->error = JPEGReadError::EXTRA_ZERO_RUN;
-    return false;
+    return JXL_FAILURE("Extra zero run before end-of-block.");
   }
   if (*eobrun > 0) {
     for (; k <= Se; k++) {
@@ -794,17 +730,14 @@ bool ProcessRestart(const uint8_t* data, const size_t len,
                     JPEGData* jpg) {
   size_t pos = 0;
   if (!br->FinishStream(jpg, &pos)) {
-    jpg->error = JPEGReadError::INVALID_SCAN;
-    return false;
+    return JXL_FAILURE("Invalid scan");
   }
   int expected_marker = 0xd0 + *next_restart_marker;
   JXL_JPEG_EXPECT_MARKER();
   int marker = data[pos + 1];
   if (marker != expected_marker) {
-    JXL_JPEG_DEBUG("Did not find expected restart marker %d actual %d",
-                   expected_marker, marker);
-    jpg->error = JPEGReadError::WRONG_RESTART_MARKER;
-    return false;
+    return JXL_FAILURE("Did not find expected restart marker %d actual %d",
+                       expected_marker, marker);
   }
   br->Reset(pos + 2);
   *next_restart_marker += 1;
@@ -854,27 +787,21 @@ bool ProcessScan(const uint8_t* data, const size_t len,
     int comp_idx = scan_info->components[i].comp_idx;
     for (int k = Ss; k <= Se; ++k) {
       if (scan_progression[comp_idx][k] & scan_bitmask) {
-        JXL_JPEG_DEBUG(
+        return JXL_FAILURE(
             "Overlapping scans: component=%d k=%d prev_mask: %u cur_mask %u",
             comp_idx, k, scan_progression[i][k], scan_bitmask);
-        jpg->error = JPEGReadError::OVERLAPPING_SCANS;
-        return false;
       }
       if (scan_progression[comp_idx][k] & refinement_bitmask) {
-        JXL_JPEG_DEBUG(
+        return JXL_FAILURE(
             "Invalid scan order, a more refined scan was already done: "
             "component=%d k=%d prev_mask=%u cur_mask=%u",
             comp_idx, k, scan_progression[i][k], scan_bitmask);
-        jpg->error = JPEGReadError::INVALID_SCAN_ORDER;
-        return false;
       }
       scan_progression[comp_idx][k] |= scan_bitmask;
     }
   }
   if (Al > 10) {
-    JXL_JPEG_DEBUG("Scan parameter Al=%d is not supported.", Al);
-    jpg->error = JPEGReadError::NON_REPRESENTABLE_AC_COEFF;
-    return false;
+    return JXL_FAILURE("Scan parameter Al=%d is not supported.", Al);
   }
   for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
     for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
@@ -885,13 +812,11 @@ bool ProcessScan(const uint8_t* data, const size_t len,
             restarts_to_go = jpg->restart_interval;
             memset(static_cast<void*>(last_dc_coeff), 0, sizeof(last_dc_coeff));
             if (eobrun > 0) {
-              JXL_JPEG_DEBUG("End-of-block run too long.");
-              jpg->error = JPEGReadError::EOB_RUN_TOO_LONG;
-              return false;
+              return JXL_FAILURE("End-of-block run too long.");
             }
             eobrun = -1;  // fresh start
           } else {
-            return false;
+            return JXL_FAILURE("Could not process restart.");
           }
         }
         --restarts_to_go;
@@ -942,20 +867,15 @@ bool ProcessScan(const uint8_t* data, const size_t len,
     }
   }
   if (eobrun > 0) {
-    JXL_JPEG_DEBUG("End-of-block run too long.");
-    jpg->error = JPEGReadError::EOB_RUN_TOO_LONG;
-    return false;
+    return JXL_FAILURE("End-of-block run too long.");
   }
   if (!br.FinishStream(jpg, pos)) {
-    jpg->error = JPEGReadError::INVALID_SCAN;
-    return false;
+    return JXL_FAILURE("Invalid scan.");
   }
   if (*pos > len) {
-    JXL_JPEG_DEBUG("Unexpected end of file during scan. pos=%" PRIuS
-                   " len=%" PRIuS,
-                   *pos, len);
-    jpg->error = JPEGReadError::UNEXPECTED_EOF;
-    return false;
+    return JXL_FAILURE("Unexpected end of file during scan. pos=%" PRIuS
+                       " len=%" PRIuS,
+                       *pos, len);
   }
   return true;
 }
@@ -974,10 +894,8 @@ bool FixupIndexes(JPEGData* jpg) {
       }
     }
     if (!found_index) {
-      JXL_JPEG_DEBUG("Quantization table with index %u not found",
-                     c->quant_idx);
-      jpg->error = JPEGReadError::QUANT_TABLE_NOT_FOUND;
-      return false;
+      return JXL_FAILURE("Quantization table with index %u not found",
+                         c->quant_idx);
     }
   }
   return true;
@@ -1009,9 +927,7 @@ bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
   int marker = data[pos + 1];
   pos += 2;
   if (marker != 0xd8) {
-    JXL_JPEG_DEBUG("Did not find expected SOI marker, actual=%d", marker);
-    jpg->error = JPEGReadError::SOI_NOT_FOUND;
-    return false;
+    return JXL_FAILURE("Did not find expected SOI marker, actual=%d", marker);
   }
   int lut_size = kMaxHuffmanTables * kJpegHuffmanLutSize;
   std::vector<HuffmanTableEntry> dc_huff_lut(lut_size);
@@ -1097,11 +1013,8 @@ bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
         }
         break;
       default:
-        JXL_JPEG_DEBUG("Unsupported marker: %d pos=%" PRIuS " len=%" PRIuS,
-                       marker, pos, len);
-        jpg->error = JPEGReadError::UNSUPPORTED_MARKER;
-        ok = false;
-        break;
+        return JXL_FAILURE("Unsupported marker: %d pos=%" PRIuS " len=%" PRIuS,
+                           marker, pos, len);
     }
     if (!ok) {
       return false;
@@ -1113,9 +1026,7 @@ bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
   } while (marker != 0xd9);
 
   if (!found_sof) {
-    JXL_JPEG_DEBUG("Missing SOF marker.");
-    jpg->error = JPEGReadError::SOF_NOT_FOUND;
-    return false;
+    return JXL_FAILURE("Missing SOF marker.");
   }
 
   // Supplemental checks.
@@ -1130,14 +1041,10 @@ bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
       // Section B.2.4.2: "If a table has never been defined for a particular
       // destination, then when this destination is specified in a scan header,
       // the results are unpredictable."
-      JXL_JPEG_DEBUG("Need at least one Huffman code table.");
-      jpg->error = JPEGReadError::HUFFMAN_TABLE_ERROR;
-      return false;
+      return JXL_FAILURE("Need at least one Huffman code table.");
     }
     if (jpg->huffman_code.size() >= kMaxDHTMarkers) {
-      JXL_JPEG_DEBUG("Too many Huffman tables.");
-      jpg->error = JPEGReadError::HUFFMAN_TABLE_ERROR;
-      return false;
+      return JXL_FAILURE("Too many Huffman tables.");
     }
   }
   return true;
index a78d77c..6744e69 100644 (file)
@@ -7,6 +7,7 @@
 
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/common.h"  // kMaxNumPasses, JPEGXL_ENABLE_TRANSCODE_JPEG
 
 namespace jxl {
 namespace jpeg {
@@ -288,8 +289,6 @@ Status JPEGData::VisitFields(Visitor* visitor) {
     JXL_RETURN_IF_ERROR(visitor->Bits(16, 0, &restart_interval));
   }
 
-  uint64_t padding_spot_limit = scan_info.size();
-
   for (auto& scan : scan_info) {
     uint32_t num_reset_points = scan.reset_points.size();
     JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(2, 1), BitsOffset(4, 4),
@@ -304,16 +303,9 @@ Status JPEGData::VisitFields(Visitor* visitor) {
                                        BitsOffset(5, 9), BitsOffset(28, 41), 0,
                                        &block_idx));
       block_idx += last_block_idx + 1;
-      if (static_cast<int>(block_idx) < last_block_idx + 1) {
-        return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx,
-                           last_block_idx);
-      }
-      // TODO(eustas): better upper boundary could be given at this point; also
-      //               it could be applied during reset_points reading.
-      if (block_idx > (1u << 30)) {
-        // At most 8K x 8K x num_channels blocks are expected. That is,
-        // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane
-        // image.
+      if (block_idx >= (3u << 26)) {
+        // At most 8K x 8K x num_channels blocks are possible in a JPEG.
+        // So valid block indices are below 3 * 2^26.
         return JXL_FAILURE("Invalid block ID: %u", block_idx);
       }
       last_block_idx = block_idx;
@@ -337,25 +329,11 @@ Status JPEGData::VisitFields(Visitor* visitor) {
                                        BitsOffset(5, 9), BitsOffset(28, 41), 0,
                                        &block_idx));
       block_idx += last_block_idx + 1;
-      if (static_cast<int>(block_idx) < last_block_idx + 1) {
-        return JXL_FAILURE("Invalid block ID: %u, last block was %d", block_idx,
-                           last_block_idx);
-      }
-      if (block_idx > (1u << 30)) {
-        // At most 8K x 8K x num_channels blocks are expected. That is,
-        // typically, 1.5 * 2^27. 2^30 should be sufficient for any sane
-        // image.
+      if (block_idx > (3u << 26)) {
         return JXL_FAILURE("Invalid block ID: %u", block_idx);
       }
       last_block_idx = block_idx;
     }
-
-    if (restart_interval > 0) {
-      int MCUs_per_row = 0;
-      int MCU_rows = 0;
-      CalculateMcuSize(scan, &MCUs_per_row, &MCU_rows);
-      padding_spot_limit += DivCeil(MCU_rows * MCUs_per_row, restart_interval);
-    }
   }
   std::vector<uint32_t> inter_marker_data_sizes;
   inter_marker_data_sizes.reserve(info.num_intermarker);
@@ -366,8 +344,7 @@ Status JPEGData::VisitFields(Visitor* visitor) {
   }
   uint32_t tail_data_len = tail_data.size();
   if (!visitor->IsReading() && tail_data_len > 4260096) {
-    error = JPEGReadError::TAIL_DATA_TOO_LARGE;
-    return JXL_FAILURE("Tail data too large (max size = 4260096, size = %u).",
+    return JXL_FAILURE("Tail data too large (max size = 4260096, size = %u)",
                        tail_data_len);
   }
   JXL_RETURN_IF_ERROR(visitor->U32(Val(0), BitsOffset(8, 1),
@@ -378,18 +355,60 @@ Status JPEGData::VisitFields(Visitor* visitor) {
   if (has_zero_padding_bit) {
     uint32_t nbit = padding_bits.size();
     JXL_RETURN_IF_ERROR(visitor->Bits(24, 0, &nbit));
-    if (nbit > 7 * padding_spot_limit) {
-      return JXL_FAILURE("Number of padding bits does not correspond to image");
-    }
-    // TODO(eustas): check that that much bits of input are available.
     if (visitor->IsReading()) {
-      padding_bits.resize(nbit);
+      JXL_RETURN_IF_ERROR(CheckHasEnoughBits(visitor, nbit));
+      padding_bits.reserve(std::min<uint32_t>(1024u, nbit));
+      for (uint32_t i = 0; i < nbit; i++) {
+        bool bbit = false;
+        JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit));
+        padding_bits.push_back(bbit);
+      }
+    } else {
+      for (uint8_t& bit : padding_bits) {
+        bool bbit = bit;
+        JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit));
+        bit = bbit;
+      }
     }
-    // TODO(eustas): read in (8-64?) bit groups to reduce overhead.
-    for (uint8_t& bit : padding_bits) {
-      bool bbit = bit;
-      JXL_RETURN_IF_ERROR(visitor->Bool(false, &bbit));
-      bit = bbit;
+  }
+
+  {
+    size_t dht_index = 0;
+    size_t scan_index = 0;
+    bool is_progressive = false;
+    bool ac_ok[kMaxHuffmanTables] = {false};
+    bool dc_ok[kMaxHuffmanTables] = {false};
+    for (uint8_t marker : marker_order) {
+      if (marker == 0xC2) {
+        is_progressive = true;
+      } else if (marker == 0xC4) {
+        for (; dht_index < huffman_code.size();) {
+          const JPEGHuffmanCode& huff = huffman_code[dht_index++];
+          size_t index = huff.slot_id;
+          if (index & 0x10) {
+            index -= 0x10;
+            ac_ok[index] = true;
+          } else {
+            dc_ok[index] = true;
+          }
+          if (huff.is_last) break;
+        }
+      } else if (marker == 0xDA) {
+        const JPEGScanInfo& si = scan_info[scan_index++];
+        for (size_t i = 0; i < si.num_components; ++i) {
+          const JPEGComponentScanInfo& csi = si.components[i];
+          size_t dc_tbl_idx = csi.dc_tbl_idx;
+          size_t ac_tbl_idx = csi.ac_tbl_idx;
+          bool want_dc = !is_progressive || (si.Ss == 0);
+          if (want_dc && !dc_ok[dc_tbl_idx]) {
+            return JXL_FAILURE("DC Huffman table used before defined");
+          }
+          bool want_ac = !is_progressive || (si.Ss != 0) || (si.Se != 0);
+          if (want_ac && !ac_ok[ac_tbl_idx]) {
+            return JXL_FAILURE("AC Huffman table used before defined");
+          }
+        }
+      }
     }
   }
 
@@ -431,7 +450,8 @@ void JPEGData::CalculateMcuSize(const JPEGScanInfo& scan, int* MCUs_per_row,
 
 #if JPEGXL_ENABLE_TRANSCODE_JPEG
 
-Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data) {
+Status SetJPEGDataFromICC(const std::vector<uint8_t>& icc,
+                          jpeg::JPEGData* jpeg_data) {
   size_t icc_pos = 0;
   for (size_t i = 0; i < jpeg_data->app_data.size(); i++) {
     if (jpeg_data->app_marker_type[i] != jpeg::AppMarkerType::kICC) {
index 8fbc869..4387d20 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
 #include "lib/jxl/fields.h"
+#include "lib/jxl/frame_dimensions.h"
 
 namespace jxl {
 namespace jpeg {
@@ -61,53 +62,6 @@ constexpr uint32_t kJPEGZigZagOrder[64] = {
 };
 /* clang-format on */
 
-enum struct JPEGReadError {
-  OK = 0,
-  SOI_NOT_FOUND,
-  SOF_NOT_FOUND,
-  UNEXPECTED_EOF,
-  MARKER_BYTE_NOT_FOUND,
-  UNSUPPORTED_MARKER,
-  WRONG_MARKER_SIZE,
-  INVALID_PRECISION,
-  INVALID_WIDTH,
-  INVALID_HEIGHT,
-  INVALID_NUMCOMP,
-  INVALID_SAMP_FACTOR,
-  INVALID_START_OF_SCAN,
-  INVALID_END_OF_SCAN,
-  INVALID_SCAN_BIT_POSITION,
-  INVALID_COMPS_IN_SCAN,
-  INVALID_HUFFMAN_INDEX,
-  INVALID_QUANT_TBL_INDEX,
-  INVALID_QUANT_VAL,
-  INVALID_MARKER_LEN,
-  INVALID_SAMPLING_FACTORS,
-  INVALID_HUFFMAN_CODE,
-  INVALID_SYMBOL,
-  NON_REPRESENTABLE_DC_COEFF,
-  NON_REPRESENTABLE_AC_COEFF,
-  INVALID_SCAN,
-  OVERLAPPING_SCANS,
-  INVALID_SCAN_ORDER,
-  EXTRA_ZERO_RUN,
-  DUPLICATE_DRI,
-  DUPLICATE_SOF,
-  WRONG_RESTART_MARKER,
-  DUPLICATE_COMPONENT_ID,
-  COMPONENT_NOT_FOUND,
-  HUFFMAN_TABLE_NOT_FOUND,
-  HUFFMAN_TABLE_ERROR,
-  QUANT_TABLE_NOT_FOUND,
-  EMPTY_DHT,
-  EMPTY_DQT,
-  OUT_OF_BAND_COEFF,
-  EOB_RUN_TOO_LONG,
-  IMAGE_TOO_LARGE,
-  INVALID_QUANT_TBL_PRECISION,
-  TAIL_DATA_TOO_LARGE
-};
-
 // Quantization values for an 8x8 pixel block.
 struct JPEGQuantTable {
   std::array<int32_t, kDCTBlockSize> values;
@@ -211,11 +165,7 @@ enum class AppMarkerType : uint32_t {
 // Represents a parsed jpeg file.
 struct JPEGData : public Fields {
   JPEGData()
-      : width(0),
-        height(0),
-        restart_interval(0),
-        error(JPEGReadError::OK),
-        has_zero_padding_bit(false) {}
+      : width(0), height(0), restart_interval(0), has_zero_padding_bit(false) {}
 
   JXL_FIELDS_NAME(JPEGData)
 #if JPEGXL_ENABLE_TRANSCODE_JPEG
@@ -224,7 +174,7 @@ struct JPEGData : public Fields {
   Status VisitFields(Visitor* visitor) override;
 #else
   Status VisitFields(Visitor* /* visitor */) override {
-    JXL_ABORT("JPEG transcoding support not enabled");
+    JXL_UNREACHABLE("JPEG transcoding support not enabled");
   }
 #endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
 
@@ -244,7 +194,6 @@ struct JPEGData : public Fields {
   std::vector<uint8_t> marker_order;
   std::vector<std::vector<uint8_t>> inter_marker_data;
   std::vector<uint8_t> tail_data;
-  JPEGReadError error;
 
   // Extra information required for bit-precise JPEG file reconstruction.
 
@@ -254,11 +203,12 @@ struct JPEGData : public Fields {
 
 #if JPEGXL_ENABLE_TRANSCODE_JPEG
 // Set ICC profile in jpeg_data.
-Status SetJPEGDataFromICC(const PaddedBytes& icc, jpeg::JPEGData* jpeg_data);
+Status SetJPEGDataFromICC(const std::vector<uint8_t>& icc,
+                          jpeg::JPEGData* jpeg_data);
 #else
-static JXL_INLINE Status SetJPEGDataFromICC(const PaddedBytes& /* icc */,
-                                            jpeg::JPEGData* /* jpeg_data */) {
-  JXL_ABORT("JPEG transcoding support not enabled");
+static JXL_INLINE Status SetJPEGDataFromICC(
+    const std::vector<uint8_t>& /* icc */, jpeg::JPEGData* /* jpeg_data */) {
+  JXL_UNREACHABLE("JPEG transcoding support not enabled");
 }
 #endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
 
diff --git a/lib/jxl/jxl_inspection.h b/lib/jxl/jxl_inspection.h
deleted file mode 100644 (file)
index 0b70a58..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_JXL_INSPECTION_H_
-#define LIB_JXL_JXL_INSPECTION_H_
-
-#include <functional>
-
-#include "lib/jxl/image.h"
-
-namespace jxl {
-// Type of the inspection-callback which, if enabled, will be called on various
-// intermediate data during image processing, allowing inspection access.
-//
-// Returns false if processing can be stopped at that point, true otherwise.
-// This is only advisory - it is always OK to just continue processing.
-using InspectorImage3F = std::function<bool(const char*, const Image3F&)>;
-}  // namespace jxl
-
-#endif  // LIB_JXL_JXL_INSPECTION_H_
index 63ce612..4fad7e2 100644 (file)
@@ -5,33 +5,30 @@
 
 #include "lib/extras/dec/jxl.h"
 
-#include <stdint.h>
-#include <stdio.h>
+#include <jxl/cms.h>
 
 #include <array>
+#include <cstdint>
 #include <future>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/extras/codec.h"
-#include "lib/jxl/aux_out.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/alpha.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/codec_y4m_testonly.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_butteraugli_pnorm.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_file.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/fake_parallel_runner_testonly.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 #include "lib/jxl/modular/options.h"
+#include "lib/jxl/test_image.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
-#include "tools/box/box.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
+
+struct AuxOut;
+
 namespace {
+using extras::JXLCompressParams;
+using extras::JXLDecompressParams;
+using extras::PackedPixelFile;
+using test::ButteraugliDistance;
+using test::ComputeDistance2;
+using test::ReadTestData;
 using test::Roundtrip;
+using test::TestImage;
+using test::ThreadPoolForTests;
 
 #define JXL_TEST_NL 0  // Disabled in code
 
-void CreateImage1x1(CodecInOut* io) {
-  Image3F image(1, 1);
-  ZeroFillImage(&image);
-  io->metadata.m.SetUintSamples(8);
-  io->metadata.m.color_encoding = ColorEncoding::SRGB();
-  io->SetFromImage(std::move(image), io->metadata.m.color_encoding);
-}
-
-TEST(JxlTest, HeaderSize) {
-  CodecInOut io;
-  CreateImage1x1(&io);
-
-  CompressParams cparams;
-  cparams.butteraugli_distance = 1.5;
-  ThreadPool* pool = nullptr;
-
-  {
-    CodecInOut io2;
-    AuxOut aux_out;
-    Roundtrip(&io, cparams, {}, pool, &io2, &aux_out);
-    EXPECT_LE(aux_out.layers[kLayerHeader].total_bits, 41u);
-  }
-
-  {
-    CodecInOut io2;
-    io.metadata.m.SetAlphaBits(8);
-    ImageF alpha(1, 1);
-    alpha.Row(0)[0] = 1;
-    io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false);
-    AuxOut aux_out;
-    Roundtrip(&io, cparams, {}, pool, &io2, &aux_out);
-    EXPECT_LE(aux_out.layers[kLayerHeader].total_bits, 49u);
-  }
-}
-
 TEST(JxlTest, RoundtripSinglePixel) {
-  CodecInOut io;
-  CreateImage1x1(&io);
+  TestImage t;
+  t.SetDimensions(1, 1).AddFrame().ZeroFill();
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), {}, {}, nullptr, &ppf_out), 55);
+}
 
-  CompressParams cparams;
-  cparams.butteraugli_distance = 1.0;
-  ThreadPool* pool = nullptr;
-  CodecInOut io2;
-  Roundtrip(&io, cparams, {}, pool, &io2);
+TEST(JxlTest, RoundtripSinglePixelWithAlpha) {
+  TestImage t;
+  t.SetDimensions(1, 1).SetChannels(4).AddFrame().ZeroFill();
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), {}, {}, nullptr, &ppf_out), 59);
 }
 
 // Changing serialized signature causes Decode to fail.
 #ifndef JXL_CRASH_ON_ERROR
 TEST(JxlTest, RoundtripMarker) {
-  CodecInOut io;
-  CreateImage1x1(&io);
-
-  CompressParams cparams;
-  cparams.butteraugli_distance = 1.0;
-  AuxOut* aux_out = nullptr;
-  ThreadPool* pool = nullptr;
-
-  PassesEncoderState enc_state;
+  TestImage t;
+  t.SetDimensions(1, 1).AddFrame().ZeroFill();
   for (size_t i = 0; i < 2; ++i) {
-    PaddedBytes compressed;
-    EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                           aux_out, pool));
+    std::vector<uint8_t> compressed;
+    EXPECT_TRUE(extras::EncodeImageJXL({}, t.ppf(), /*jpeg_bytes=*/nullptr,
+                                       &compressed));
     compressed[i] ^= 0xFF;
-    CodecInOut io2;
-    EXPECT_FALSE(test::DecodeFile({}, compressed, &io2, pool));
+    PackedPixelFile ppf_out;
+    EXPECT_FALSE(extras::DecodeImageJXL(compressed.data(), compressed.size(),
+                                        {}, /*decodec_bytes=*/nullptr,
+                                        &ppf_out));
   }
 }
 #endif
 
 TEST(JxlTest, RoundtripTinyFast) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(32, 32);
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(32, 32);
 
-  CompressParams cparams;
-  cparams.speed_tier = SpeedTier::kSquirrel;
-  cparams.butteraugli_distance = 4.0f;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);
+  cparams.distance = 4.0f;
 
-  CodecInOut io2;
-  const size_t enc_bytes = Roundtrip(&io, cparams, {}, pool, &io2);
-  printf("32x32 image size %" PRIuS " bytes\n", enc_bytes);
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 181, 15);
 }
 
 TEST(JxlTest, RoundtripSmallD1) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CompressParams cparams;
-  cparams.butteraugli_distance = 1.0;
-
-  CodecInOut io_out;
-  size_t compressed_size;
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 8;
+  size_t ysize = t.ppf().info.ysize / 8;
+  t.SetDimensions(xsize, ysize);
 
   {
-    CodecInOut io;
-    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-    io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
-
-    compressed_size = Roundtrip(&io, cparams, {}, pool, &io_out);
-    EXPECT_LE(compressed_size, 1000u);
-    EXPECT_THAT(ButteraugliDistance(io, io_out, cparams.ba_params, GetJxlCms(),
-                                    /*distmap=*/nullptr, pool),
-                IsSlightlyBelow(1.0));
+    PackedPixelFile ppf_out;
+    EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 1027, 40);
+    EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.888));
   }
 
+  // With a lower intensity target than the default, the bitrate should be
+  // smaller.
+  t.ppf().info.intensity_target = 100.0f;
+
   {
-    // And then, with a lower intensity target than the default, the bitrate
-    // should be smaller.
-    CodecInOut io_dim;
-    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_dim, pool));
-    io_dim.metadata.m.SetIntensityTarget(100);
-    io_dim.ShrinkTo(io_dim.xsize() / 8, io_dim.ysize() / 8);
-    EXPECT_LT(Roundtrip(&io_dim, cparams, {}, pool, &io_out), compressed_size);
-    EXPECT_THAT(
-        ButteraugliDistance(io_dim, io_out, cparams.ba_params, GetJxlCms(),
-                            /*distmap=*/nullptr, pool),
-        IsSlightlyBelow(1.1));
-    EXPECT_EQ(io_dim.metadata.m.IntensityTarget(),
-              io_out.metadata.m.IntensityTarget());
+    PackedPixelFile ppf_out;
+    EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 745, 20);
+    EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.3));
+    EXPECT_EQ(ppf_out.info.intensity_target, t.ppf().info.intensity_target);
   }
 }
-
-TEST(JxlTest, RoundtripOtherTransforms) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
-      ReadTestData("external/wesaturate/64px/a2d1un_nkitzmiller_srgb8.png");
-  std::unique_ptr<CodecInOut> io = jxl::make_unique<CodecInOut>();
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), io.get(), pool));
-
-  CompressParams cparams;
-  // Slow modes access linear image for adaptive quant search
-  cparams.speed_tier = SpeedTier::kKitten;
-  cparams.color_transform = ColorTransform::kNone;
-  cparams.butteraugli_distance = 5.0f;
-
-  std::unique_ptr<CodecInOut> io2 = jxl::make_unique<CodecInOut>();
-  const size_t compressed_size =
-      Roundtrip(io.get(), cparams, {}, pool, io2.get());
-  EXPECT_LE(compressed_size, 23000u);
-  EXPECT_THAT(ButteraugliDistance(*io, *io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(3.0));
-
-  // Check the consistency when performing another roundtrip.
-  std::unique_ptr<CodecInOut> io3 = jxl::make_unique<CodecInOut>();
-  const size_t compressed_size2 =
-      Roundtrip(io.get(), cparams, {}, pool, io3.get());
-  EXPECT_LE(compressed_size2, 23000u);
-  EXPECT_THAT(ButteraugliDistance(*io, *io3, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(3.0));
-}
-
 TEST(JxlTest, RoundtripResample2) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(io.xsize(), io.ysize());
-  CompressParams cparams;
-  cparams.resampling = 2;
-  cparams.speed_tier = SpeedTier::kFalcon;
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 17000u);
-  EXPECT_THAT(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()),
-              IsSlightlyBelow(90));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 18500, 200);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(90));
 }
 
 TEST(JxlTest, RoundtripResample2Slow) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(io.xsize(), io.ysize());
-  CompressParams cparams;
-  cparams.resampling = 2;
-  cparams.butteraugli_distance = 10;
-  cparams.speed_tier = SpeedTier::kTortoise;
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 5000u);
-  EXPECT_THAT(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()),
-              IsSlightlyBelow(250));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9);  // kTortoise
+  cparams.distance = 10.0;
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 3888, 200);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(250));
 }
 
 TEST(JxlTest, RoundtripResample2MT) {
-  ThreadPoolInternal pool(4);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  ThreadPoolForTests pool(4);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   // image has to be large enough to have multiple groups after downsampling
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
-  CompressParams cparams;
-  cparams.resampling = 2;
-  cparams.speed_tier = SpeedTier::kFalcon;
-  CodecInOut io2;
-  // TODO(veluca): Figure out why msan and release produce different
-  // file size.
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 200000u);
-  EXPECT_THAT(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()),
-              IsSlightlyBelow(340));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 223310, 2000);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(340));
 }
 
 // Roundtrip the image using a parallel runner that executes single-threaded but
@@ -264,175 +189,142 @@ TEST(JxlTest, RoundtripResample2MT) {
 TEST(JxlTest, RoundtripOutOfOrderProcessing) {
   FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
   ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
   // Image size is selected so that the block border needed is larger than the
   // amount of pixels available on the next block.
-  io.ShrinkTo(513, 515);
+  t.SetDimensions(513, 515);
 
-  CompressParams cparams;
+  JXLCompressParams cparams;
   // Force epf so we end up needing a lot of border.
-  cparams.epf = 3;
-
-  CodecInOut io2;
-  Roundtrip(&io, cparams, {}, &pool, &io2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3);
 
-  EXPECT_GE(1.5, ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                     /*distmap=*/nullptr, &pool));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 27444, 400);
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 1.35);
 }
 
 TEST(JxlTest, RoundtripOutOfOrderProcessingBorder) {
   FakeParallelRunner fake_pool(/*order_seed=*/47, /*num_threads=*/8);
   ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
   // Image size is selected so that the block border needed is larger than the
   // amount of pixels available on the next block.
-  io.ShrinkTo(513, 515);
+  t.SetDimensions(513, 515);
 
-  CompressParams cparams;
+  JXLCompressParams cparams;
   // Force epf so we end up needing a lot of border.
-  cparams.epf = 3;
-  cparams.resampling = 2;
-
-  CodecInOut io2;
-  Roundtrip(&io, cparams, {}, &pool, &io2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 3);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
 
-  EXPECT_GE(2.8, ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                     /*distmap=*/nullptr, &pool));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 10065, 200);
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 2.9);
 }
 
 TEST(JxlTest, RoundtripResample4) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(io.xsize(), io.ysize());
-  CompressParams cparams;
-  cparams.resampling = 4;
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 6000u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(22));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 4);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 5758, 100);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(22));
 }
 
 TEST(JxlTest, RoundtripResample8) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(io.xsize(), io.ysize());
-  CompressParams cparams;
-  cparams.resampling = 8;
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 2100u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(50));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 8);
+
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 2036, 50);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(50));
 }
 
 TEST(JxlTest, RoundtripUnalignedD2) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(io.xsize() / 12, io.ysize() / 7);
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 12;
+  size_t ysize = t.ppf().info.ysize / 7;
+  t.SetDimensions(xsize, ysize);
 
-  CompressParams cparams;
-  cparams.butteraugli_distance = 2.0;
+  JXLCompressParams cparams;
+  cparams.distance = 2.0;
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 700u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.7));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 506, 30);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.72));
 }
 
-#if JXL_TEST_NL
-
-TEST(JxlTest, RoundtripMultiGroupNL) {
-  ThreadPoolInternal pool(4);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
-  io.ShrinkTo(600, 1024);  // partial X, full Y group
-
-  CompressParams cparams;
-
-  cparams.fast_mode = true;
-  cparams.butteraugli_distance = 1.0f;
-  CodecInOut io2;
-  Roundtrip(&io, cparams, {}, &pool, &io2);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, &pool),
-              IsSlightlyBelow(0.9f));
-
-  cparams.butteraugli_distance = 2.0f;
-  CodecInOut io3;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io3), 80000u);
-  EXPECT_THAT(ButteraugliDistance(io, io3, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, &pool),
-              IsSlightlyBelow(1.5f));
-}
-
-#endif
-
 TEST(JxlTest, RoundtripMultiGroup) {
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  {
-    ThreadPoolInternal pool(4);
-    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
-  }
-  io.ShrinkTo(600, 1024);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024);
 
   auto test = [&](jxl::SpeedTier speed_tier, float target_distance,
                   size_t expected_size, float expected_distance) {
-    ThreadPoolInternal pool(4);
-    CompressParams cparams;
-    cparams.butteraugli_distance = target_distance;
-    cparams.speed_tier = speed_tier;
-    CodecInOut io2;
-    EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), expected_size);
-    EXPECT_THAT(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()),
+    ThreadPoolForTests pool(4);
+    JXLCompressParams cparams;
+    int64_t effort = 10 - static_cast<int>(speed_tier);
+    cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, effort);
+    cparams.distance = target_distance;
+
+    PackedPixelFile ppf_out;
+    EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), expected_size,
+                700);
+    EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out),
                 IsSlightlyBelow(expected_distance));
   };
 
   auto run_kitten = std::async(std::launch::async, test, SpeedTier::kKitten,
-                               1.0f, 55000u, 11);
+                               1.0f, 63624u, 8.5);
   auto run_wombat = std::async(std::launch::async, test, SpeedTier::kWombat,
-                               2.0f, 34000u, 18);
+                               2.0f, 39620u, 15.5);
 }
 
 TEST(JxlTest, RoundtripRGBToGrayscale) {
-  ThreadPoolInternal pool(4);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  ThreadPoolForTests pool(4);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
   io.ShrinkTo(600, 1024);
 
   CompressParams cparams;
   cparams.butteraugli_distance = 1.0f;
   cparams.speed_tier = SpeedTier::kFalcon;
 
-  extras::JXLDecompressParams dparams;
+  JXLDecompressParams dparams;
   dparams.color_space = "Gra_D65_Rel_SRG";
 
   CodecInOut io2;
   EXPECT_FALSE(io.Main().IsGray());
-  EXPECT_LE(Roundtrip(&io, cparams, dparams, &pool, &io2), 55000u);
+  size_t compressed_size;
+  JXL_EXPECT_OK(
+      Roundtrip(&io, cparams, dparams, &io2, _, &compressed_size, &pool));
+  EXPECT_LE(compressed_size, 65000u);
   EXPECT_TRUE(io2.Main().IsGray());
 
   // Convert original to grayscale here, because TransformTo refuses to
   // convert between grayscale and RGB.
   ColorEncoding srgb_lin = ColorEncoding::LinearSRGB(/*is_gray=*/false);
-  ASSERT_TRUE(io.TransformTo(srgb_lin, GetJxlCms(), &pool));
+  ASSERT_TRUE(io.frames[0].TransformTo(srgb_lin, *JxlGetDefaultCms()));
   Image3F* color = io.Main().color();
   for (size_t y = 0; y < color->ysize(); ++y) {
     float* row_r = color->PlaneRow(0, y);
@@ -444,94 +336,94 @@ TEST(JxlTest, RoundtripRGBToGrayscale) {
     }
   }
   ColorEncoding srgb_gamma = ColorEncoding::SRGB(/*is_gray=*/false);
-  ASSERT_TRUE(io.TransformTo(srgb_gamma, GetJxlCms(), &pool));
+  ASSERT_TRUE(io.frames[0].TransformTo(srgb_gamma, *JxlGetDefaultCms()));
   io.metadata.m.color_encoding = io2.Main().c_current();
   io.Main().OverrideProfile(io2.Main().c_current());
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
+  EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
                                   /*distmap=*/nullptr, &pool),
-              IsSlightlyBelow(1.7));
+              IsSlightlyBelow(1.36));
 }
 
 TEST(JxlTest, RoundtripLargeFast) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams;
-  cparams.speed_tier = SpeedTier::kSquirrel;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 450800u);
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 505555, 5000);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(75));
 }
 
 TEST(JxlTest, RoundtripDotsForceEpf) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams;
-  cparams.epf = 2;
-  cparams.dots = Override::kOn;
-  cparams.speed_tier = SpeedTier::kSquirrel;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1);
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 450000u);
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 40777, 300);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(18));
 }
 
 // Checks for differing size/distance in two consecutive runs of distance 2,
 // which involves additional processing including adaptive reconstruction.
 // Failing this may be a sign of race conditions or invalid memory accesses.
 TEST(JxlTest, RoundtripD2Consistent) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams;
-  cparams.speed_tier = SpeedTier::kSquirrel;
-  cparams.butteraugli_distance = 2.0;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 2.0;
 
   // Try each xsize mod kBlockDim to verify right border handling.
   for (size_t xsize = 48; xsize > 40; --xsize) {
-    io.ShrinkTo(xsize, 15);
+    t.SetDimensions(xsize, 15);
 
-    CodecInOut io2;
-    const size_t size2 = Roundtrip(&io, cparams, {}, &pool, &io2);
+    PackedPixelFile ppf2;
+    const size_t size2 = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf2);
 
-    CodecInOut io3;
-    const size_t size3 = Roundtrip(&io, cparams, {}, &pool, &io3);
+    PackedPixelFile ppf3;
+    const size_t size3 = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf3);
 
     // Exact same compressed size.
     EXPECT_EQ(size2, size3);
 
     // Exact same distance.
-    const float dist2 = ComputeDistance2(io.Main(), io2.Main(), GetJxlCms());
-    const float dist3 = ComputeDistance2(io.Main(), io3.Main(), GetJxlCms());
+    const float dist2 = ComputeDistance2(t.ppf(), ppf2);
+    const float dist3 = ComputeDistance2(t.ppf(), ppf3);
     EXPECT_EQ(dist2, dist3);
   }
 }
 
 // Same as above, but for full image, testing multiple groups.
 TEST(JxlTest, RoundtripLargeConsistent) {
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  {
-    ThreadPoolInternal pool(8);
-    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
-  }
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams;
-  cparams.speed_tier = SpeedTier::kSquirrel;
-  cparams.butteraugli_distance = 2.0;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 2.0;
 
   auto roundtrip_and_compare = [&]() {
-    ThreadPoolInternal pool(8);
-    CodecInOut io2;
-    size_t size = Roundtrip(&io, cparams, {}, &pool, &io2);
-    double dist = ComputeDistance2(io.Main(), io2.Main(), GetJxlCms());
+    ThreadPoolForTests pool(8);
+    PackedPixelFile ppf2;
+    size_t size = Roundtrip(t.ppf(), cparams, {}, &pool, &ppf2);
+    double dist = ComputeDistance2(t.ppf(), ppf2);
     return std::tuple<size_t, double>(size, dist);
   };
 
@@ -549,128 +441,112 @@ TEST(JxlTest, RoundtripLargeConsistent) {
   EXPECT_EQ(std::get<1>(result2), std::get<1>(result3));
 }
 
-#if JXL_TEST_NL
-
 TEST(JxlTest, RoundtripSmallNL) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
-
-  CompressParams cparams;
-  cparams.butteraugli_distance = 1.0;
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 8;
+  size_t ysize = t.ppf().info.ysize / 8;
+  t.SetDimensions(xsize, ysize);
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 1500u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.7));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 1027, 45);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.82));
 }
 
-#endif
-
 TEST(JxlTest, RoundtripNoGaborishNoAR) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams;
-  cparams.gaborish = Override::kOff;
-  cparams.epf = 0;
-  cparams.butteraugli_distance = 1.0;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EPF, 0);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 40000u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(2.0));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 41769, 400);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.8));
 }
 
 TEST(JxlTest, RoundtripSmallNoGaborish) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  size_t xsize = t.ppf().info.xsize / 8;
+  size_t ysize = t.ppf().info.ysize / 8;
+  t.SetDimensions(xsize, ysize);
 
-  CompressParams cparams;
-  cparams.gaborish = Override::kOff;
-  cparams.butteraugli_distance = 1.0;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 900u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.2));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 1032, 20);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.1));
 }
 
 TEST(JxlTest, RoundtripSmallPatchesAlpha) {
   ThreadPool* pool = nullptr;
-  CodecInOut io;
-  io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
-  Image3F black_with_small_lines(256, 256);
-  ImageF alpha(black_with_small_lines.xsize(), black_with_small_lines.ysize());
-  ZeroFillImage(&black_with_small_lines);
+  TestImage t;
+  t.SetDimensions(256, 256).SetChannels(4);
+  t.SetColorEncoding("RGB_D65_SRG_Rel_Lin");
+  TestImage::Frame frame = t.AddFrame();
+  frame.ZeroFill();
   // This pattern should be picked up by the patch detection heuristics.
-  for (size_t y = 0; y < black_with_small_lines.ysize(); y++) {
-    float* JXL_RESTRICT row = black_with_small_lines.PlaneRow(1, y);
-    for (size_t x = 0; x < black_with_small_lines.xsize(); x++) {
-      if (x % 4 == 0 && (y / 32) % 4 == 0) row[x] = 127.0f;
+  for (size_t y = 0; y < t.ppf().info.ysize; ++y) {
+    for (size_t x = 0; x < t.ppf().info.xsize; ++x) {
+      if (x % 4 == 0 && (y / 32) % 4 == 0) {
+        frame.SetValue(y, x, 1, 127.0f / 255.0f);
+      }
+      frame.SetValue(y, x, 3, 1.0f);
     }
   }
-  io.metadata.m.SetAlphaBits(8);
-  io.SetFromImage(std::move(black_with_small_lines),
-                  ColorEncoding::LinearSRGB());
-  FillImage(1.0f, &alpha);
-  io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false);
 
-  CompressParams cparams;
-  cparams.speed_tier = SpeedTier::kSquirrel;
-  cparams.butteraugli_distance = 0.1f;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 0.1f;
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 2000u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(0.04f));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 597, 100);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.018f));
 }
 
 TEST(JxlTest, RoundtripSmallPatches) {
   ThreadPool* pool = nullptr;
-  CodecInOut io;
-  io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
-  Image3F black_with_small_lines(256, 256);
-  ZeroFillImage(&black_with_small_lines);
+  TestImage t;
+  t.SetDimensions(256, 256);
+  t.SetColorEncoding("RGB_D65_SRG_Rel_Lin");
+  TestImage::Frame frame = t.AddFrame();
+  frame.ZeroFill();
   // This pattern should be picked up by the patch detection heuristics.
-  for (size_t y = 0; y < black_with_small_lines.ysize(); y++) {
-    float* JXL_RESTRICT row = black_with_small_lines.PlaneRow(1, y);
-    for (size_t x = 0; x < black_with_small_lines.xsize(); x++) {
-      if (x % 4 == 0 && (y / 32) % 4 == 0) row[x] = 127.0f;
+  for (size_t y = 0; y < t.ppf().info.ysize; ++y) {
+    for (size_t x = 0; x < t.ppf().info.xsize; ++x) {
+      if (x % 4 == 0 && (y / 32) % 4 == 0) {
+        frame.SetValue(y, x, 1, 127.0f / 255.0f);
+      }
     }
   }
-  io.SetFromImage(std::move(black_with_small_lines),
-                  ColorEncoding::LinearSRGB());
 
-  CompressParams cparams;
-  cparams.speed_tier = SpeedTier::kSquirrel;
-  cparams.butteraugli_distance = 0.1f;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel
+  cparams.distance = 0.1f;
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 2000u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(0.04f));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 486, 100);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.018f));
 }
 
-// Test header encoding of original bits per sample
+// TODO(szabadka) Add encoder and decoder API functions that accept frame
+// buffers in arbitrary unsigned and floating point formats, and then roundtrip
+// test the lossless codepath to make sure the exact binary representations
+// are preserved.
+#if 0
 TEST(JxlTest, RoundtripImageBundleOriginalBits) {
-  ThreadPool* pool = nullptr;
-
   // Image does not matter, only io.metadata.m and io2.metadata.m are tested.
   Image3F image(1, 1);
   ZeroFillImage(&image);
@@ -691,7 +567,7 @@ TEST(JxlTest, RoundtripImageBundleOriginalBits) {
 
     io.metadata.m.SetUintSamples(bit_depth);
     CodecInOut io2;
-    Roundtrip(&io, cparams, {}, pool, &io2);
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
 
     EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample);
     EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample);
@@ -702,7 +578,7 @@ TEST(JxlTest, RoundtripImageBundleOriginalBits) {
   // Test various existing and non-existing floating point formats
   for (uint32_t bit_depth = 8; bit_depth <= 32; bit_depth++) {
     if (bit_depth != 32) {
-      // TODO: test other float types once they work
+      // TODO(user): test other float types once they work
       break;
     }
 
@@ -728,7 +604,7 @@ TEST(JxlTest, RoundtripImageBundleOriginalBits) {
     io.metadata.m.bit_depth.exponent_bits_per_sample = exponent_bit_depth;
 
     CodecInOut io2;
-    Roundtrip(&io, cparams, {}, pool, &io2);
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2));
 
     EXPECT_EQ(bit_depth, io2.metadata.m.bit_depth.bits_per_sample);
     EXPECT_TRUE(io2.metadata.m.bit_depth.floating_point_sample);
@@ -737,38 +613,37 @@ TEST(JxlTest, RoundtripImageBundleOriginalBits) {
     EXPECT_EQ(0u, io2.metadata.m.GetAlphaBits());
   }
 }
+#endif
 
 TEST(JxlTest, RoundtripGrayscale) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData(
+  const std::vector<uint8_t> orig = ReadTestData(
       "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   ASSERT_NE(io.xsize(), 0u);
   io.ShrinkTo(128, 128);
   EXPECT_TRUE(io.Main().IsGray());
   EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
   EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
   EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
+  EXPECT_TRUE(io.metadata.m.color_encoding.Tf().IsSRGB());
 
   PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
 
   {
     CompressParams cparams;
     cparams.butteraugli_distance = 1.0;
 
-    PaddedBytes compressed;
-    EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                           aux_out, pool));
+    std::vector<uint8_t> compressed;
+    EXPECT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed));
     CodecInOut io2;
-    EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
+    EXPECT_TRUE(test::DecodeFile({}, Bytes(compressed), &io2));
     EXPECT_TRUE(io2.Main().IsGray());
 
     EXPECT_LE(compressed.size(), 7000u);
-    EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                    /*distmap=*/nullptr, pool),
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                    *JxlGetDefaultCms(),
+                                    /*distmap=*/nullptr),
                 IsSlightlyBelow(1.6));
   }
 
@@ -778,46 +653,45 @@ TEST(JxlTest, RoundtripGrayscale) {
     CompressParams cparams;
     cparams.butteraugli_distance = 8.0;
 
-    PaddedBytes compressed;
-    EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                           aux_out, pool));
+    std::vector<uint8_t> compressed;
+    EXPECT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed));
     CodecInOut io2;
-    EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
+    EXPECT_TRUE(test::DecodeFile({}, Bytes(compressed), &io2));
     EXPECT_TRUE(io2.Main().IsGray());
 
     EXPECT_LE(compressed.size(), 1300u);
-    EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                    /*distmap=*/nullptr, pool),
-                IsSlightlyBelow(6.0));
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                    *JxlGetDefaultCms(),
+                                    /*distmap=*/nullptr),
+                IsSlightlyBelow(6.7));
   }
 
   {
     CompressParams cparams;
     cparams.butteraugli_distance = 1.0;
 
-    PaddedBytes compressed;
-    EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                           aux_out, pool));
+    std::vector<uint8_t> compressed;
+    EXPECT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed));
 
     CodecInOut io2;
-    extras::JXLDecompressParams dparams;
+    JXLDecompressParams dparams;
     dparams.color_space = "RGB_D65_SRG_Rel_SRG";
-    EXPECT_TRUE(test::DecodeFile(dparams, compressed, &io2, pool));
+    EXPECT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &io2));
     EXPECT_FALSE(io2.Main().IsGray());
 
     EXPECT_LE(compressed.size(), 7000u);
-    EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                    /*distmap=*/nullptr, pool),
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                    *JxlGetDefaultCms(),
+                                    /*distmap=*/nullptr),
                 IsSlightlyBelow(1.6));
   }
 }
 
 TEST(JxlTest, RoundtripAlpha) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
 
   ASSERT_NE(io.xsize(), 0u);
   ASSERT_TRUE(io.metadata.m.HasAlpha());
@@ -830,37 +704,89 @@ TEST(JxlTest, RoundtripAlpha) {
   EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
   EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
   EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
+  EXPECT_TRUE(io.metadata.m.color_encoding.Tf().IsSRGB());
   PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
+  std::vector<uint8_t> compressed;
+  EXPECT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed));
+
+  EXPECT_LE(compressed.size(), 10077u);
 
   for (bool use_image_callback : {false, true}) {
     for (bool unpremul_alpha : {false, true}) {
       CodecInOut io2;
-      extras::JXLDecompressParams dparams;
+      JXLDecompressParams dparams;
       dparams.use_image_callback = use_image_callback;
       dparams.unpremultiply_alpha = unpremul_alpha;
-      EXPECT_TRUE(test::DecodeFile(dparams, compressed, &io2, pool));
-
-      EXPECT_LE(compressed.size(), 10077u);
+      EXPECT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &io2));
+      EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames,
+                                      ButteraugliParams(), *JxlGetDefaultCms(),
+                                      /*distmap=*/nullptr),
+                  IsSlightlyBelow(1.15));
+    }
+  }
+}
 
-      EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                      /*distmap=*/nullptr, pool),
-                  IsSlightlyBelow(1.2));
+namespace {
+// Performs "PremultiplyAlpha" for each ImageBundle (preview/frames).
+bool PremultiplyAlpha(CodecInOut& io) {
+  const auto doPremultiplyAlpha = [](ImageBundle& bundle) {
+    if (!bundle.HasAlpha()) return;
+    if (!bundle.HasColor()) return;
+    auto* color = bundle.color();
+    const auto* alpha = bundle.alpha();
+    JXL_CHECK(color->ysize() == alpha->ysize());
+    JXL_CHECK(color->xsize() == alpha->xsize());
+    for (size_t y = 0; y < color->ysize(); y++) {
+      ::jxl::PremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y),
+                              color->PlaneRow(2, y), alpha->Row(y),
+                              color->xsize());
+    }
+  };
+  ExtraChannelInfo* eci = io.metadata.m.Find(ExtraChannel::kAlpha);
+  if (eci == nullptr || eci->alpha_associated) return false;
+  if (io.metadata.m.have_preview) {
+    doPremultiplyAlpha(io.preview_frame);
+  }
+  for (ImageBundle& ib : io.frames) {
+    doPremultiplyAlpha(ib);
+  }
+  eci->alpha_associated = true;
+  return true;
+}
+
+bool UnpremultiplyAlpha(CodecInOut& io) {
+  const auto doUnpremultiplyAlpha = [](ImageBundle& bundle) {
+    if (!bundle.HasAlpha()) return;
+    if (!bundle.HasColor()) return;
+    auto* color = bundle.color();
+    const auto* alpha = bundle.alpha();
+    JXL_CHECK(color->ysize() == alpha->ysize());
+    JXL_CHECK(color->xsize() == alpha->xsize());
+    for (size_t y = 0; y < color->ysize(); y++) {
+      ::jxl::UnpremultiplyAlpha(color->PlaneRow(0, y), color->PlaneRow(1, y),
+                                color->PlaneRow(2, y), alpha->Row(y),
+                                color->xsize());
     }
+  };
+  ExtraChannelInfo* eci = io.metadata.m.Find(ExtraChannel::kAlpha);
+  if (eci == nullptr || !eci->alpha_associated) return false;
+  if (io.metadata.m.have_preview) {
+    doUnpremultiplyAlpha(io.preview_frame);
+  }
+  for (ImageBundle& ib : io.frames) {
+    doUnpremultiplyAlpha(ib);
   }
+  eci->alpha_associated = false;
+  return true;
 }
+}  // namespace
 
 TEST(JxlTest, RoundtripAlphaPremultiplied) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
   CodecInOut io, io_nopremul;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_nopremul, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io_nopremul));
 
   ASSERT_NE(io.xsize(), 0u);
   ASSERT_TRUE(io.metadata.m.HasAlpha());
@@ -870,18 +796,18 @@ TEST(JxlTest, RoundtripAlphaPremultiplied) {
 
   CompressParams cparams;
   cparams.butteraugli_distance = 1.0;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   EXPECT_FALSE(io.Main().AlphaIsPremultiplied());
-  EXPECT_TRUE(io.PremultiplyAlpha());
+  EXPECT_TRUE(PremultiplyAlpha(io));
   EXPECT_TRUE(io.Main().AlphaIsPremultiplied());
 
   EXPECT_FALSE(io_nopremul.Main().AlphaIsPremultiplied());
 
   PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
+  std::vector<uint8_t> compressed;
+  EXPECT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed));
+  EXPECT_LE(compressed.size(), 10000u);
 
   for (bool use_image_callback : {false, true}) {
     for (bool unpremul_alpha : {false, true}) {
@@ -893,29 +819,30 @@ TEST(JxlTest, RoundtripAlphaPremultiplied) {
             use_image_callback ? "image callback" : "image_buffer",
             unpremul_alpha ? "un" : "");
         CodecInOut io2;
-        extras::JXLDecompressParams dparams;
+        JXLDecompressParams dparams;
         dparams.use_image_callback = use_image_callback;
         dparams.unpremultiply_alpha = unpremul_alpha;
         if (use_uint8) {
           dparams.accepted_formats = {
               {4, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0}};
         }
-        EXPECT_TRUE(test::DecodeFile(dparams, compressed, &io2, pool));
+        EXPECT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &io2));
 
-        EXPECT_LE(compressed.size(), 10000u);
         EXPECT_EQ(unpremul_alpha, !io2.Main().AlphaIsPremultiplied());
         if (!unpremul_alpha) {
           EXPECT_THAT(
-              ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.25));
-          EXPECT_TRUE(io2.UnpremultiplyAlpha());
+              ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(1.111));
+          EXPECT_TRUE(UnpremultiplyAlpha(io2));
           EXPECT_FALSE(io2.Main().AlphaIsPremultiplied());
         }
-        EXPECT_THAT(ButteraugliDistance(io_nopremul, io2, cparams.ba_params,
-                                        GetJxlCms(),
-                                        /*distmap=*/nullptr, pool),
-                    IsSlightlyBelow(1.35));
+        EXPECT_THAT(
+            ButteraugliDistance(io_nopremul.frames, io2.frames,
+                                ButteraugliParams(), *JxlGetDefaultCms(),
+                                /*distmap=*/nullptr),
+            IsSlightlyBelow(1.55));
       }
     }
   }
@@ -923,513 +850,340 @@ TEST(JxlTest, RoundtripAlphaPremultiplied) {
 
 TEST(JxlTest, RoundtripAlphaResampling) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_TRUE(t.ppf().info.alpha_bits > 0);
 
-  ASSERT_NE(io.xsize(), 0u);
-  ASSERT_TRUE(io.metadata.m.HasAlpha());
-  ASSERT_TRUE(io.Main().HasAlpha());
-
-  CompressParams cparams;
-  cparams.resampling = 2;
-  cparams.ec_resampling = 2;
-  cparams.butteraugli_distance = 1.0;
-  cparams.speed_tier = SpeedTier::kHare;
-
-  PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
-  CodecInOut io2;
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
-
-  EXPECT_LE(compressed.size(), 15000u);
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 5);  // kHare
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2);
 
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(4.7));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 13655, 130);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(5.2));
 }
 
 TEST(JxlTest, RoundtripAlphaResamplingOnlyAlpha) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_TRUE(t.ppf().info.alpha_bits > 0);
 
-  ASSERT_NE(io.xsize(), 0u);
-  ASSERT_TRUE(io.metadata.m.HasAlpha());
-  ASSERT_TRUE(io.Main().HasAlpha());
-
-  CompressParams cparams;
-  cparams.ec_resampling = 2;
-  cparams.butteraugli_distance = 1.0;
-  cparams.speed_tier = SpeedTier::kFalcon;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2);
 
-  PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
-  CodecInOut io2;
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
-
-  EXPECT_LE(compressed.size(), 34200u);
-
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.85));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 33571, 400);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.49));
 }
 
 TEST(JxlTest, RoundtripAlphaNonMultipleOf8) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(12, 12);
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_TRUE(t.ppf().info.alpha_bits > 0);
+  EXPECT_EQ(t.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8);
 
-  ASSERT_NE(io.xsize(), 0u);
-  ASSERT_TRUE(io.metadata.m.HasAlpha());
-  ASSERT_TRUE(io.Main().HasAlpha());
-  io.ShrinkTo(12, 12);
-
-  CompressParams cparams;
-  cparams.butteraugli_distance = 1.0;
-
-  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
-  PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
-  CodecInOut io2;
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
-
-  EXPECT_LE(compressed.size(), 180u);
-
-  // TODO(robryk): Fix the following line in presence of different alpha_bits in
-  // the two contexts.
-  // EXPECT_TRUE(SamePixels(io.Main().alpha(), io2.Main().alpha()));
-  // TODO(robryk): Fix the distance estimate used in the encoder.
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(0.9));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 107, 10);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.95));
 }
 
 TEST(JxlTest, RoundtripAlpha16) {
-  ThreadPoolInternal pool(4);
-
+  ThreadPoolForTests pool(4);
+  // The image is wider than 512 pixels to ensure multiple groups are tested.
   size_t xsize = 1200, ysize = 160;
-  Image3F color(xsize, ysize);
-  ImageF alpha(xsize, ysize);
+  TestImage t;
+  t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = t.AddFrame();
   // Generate 16-bit pattern that uses various colors and alpha values.
+  const float mul = 1.0f / 65535;
   for (size_t y = 0; y < ysize; y++) {
     for (size_t x = 0; x < xsize; x++) {
-      color.PlaneRow(0, y)[x] = (y * 65535 / ysize) * (1.0f / 65535);
-      color.PlaneRow(1, y)[x] = (x * 65535 / xsize) * (1.0f / 65535);
-      color.PlaneRow(2, y)[x] =
-          ((y + x) * 65535 / (xsize + ysize)) * (1.0f / 65535);
-      alpha.Row(y)[x] = (x * 65535 / xsize) * (1.0f / 65535);
+      uint16_t r = y * 65535 / ysize;
+      uint16_t g = x * 65535 / xsize;
+      uint16_t b = (y + x) * 65535 / (xsize + ysize);
+      frame.SetValue(y, x, 0, r * mul);
+      frame.SetValue(y, x, 1, g * mul);
+      frame.SetValue(y, x, 2, b * mul);
+      frame.SetValue(y, x, 3, g * mul);
     }
   }
-  const bool is_gray = false;
-  CodecInOut io;
-  io.metadata.m.SetUintSamples(16);
-  io.metadata.m.SetAlphaBits(16);
-  io.metadata.m.color_encoding = ColorEncoding::SRGB(is_gray);
-  io.SetFromImage(std::move(color), io.metadata.m.color_encoding);
-  io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false);
-
-  // The image is wider than 512 pixels to ensure multiple groups are tested.
 
-  ASSERT_NE(io.xsize(), 0u);
-  ASSERT_TRUE(io.metadata.m.HasAlpha());
-  ASSERT_TRUE(io.Main().HasAlpha());
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  ASSERT_EQ(t.ppf().info.alpha_bits, 16);
 
-  CompressParams cparams;
-  cparams.butteraugli_distance = 0.5;
-  cparams.speed_tier = SpeedTier::kWombat;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 6);  // kWombat
+  cparams.distance = 0.5;
 
-  io.metadata.m.SetUintSamples(16);
-  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
-  PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, &pool));
-  CodecInOut io2;
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, &pool));
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, &pool),
-              IsSlightlyBelow(0.8));
+  PackedPixelFile ppf_out;
+  // TODO(szabadka) Investigate big size difference on i686
+  // This still keeps happening (2023-04-18).
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 3666, 120);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.65));
 }
 
 namespace {
-CompressParams CParamsForLossless() {
-  CompressParams cparams;
-  cparams.modular_mode = true;
-  cparams.color_transform = jxl::ColorTransform::kNone;
-  cparams.butteraugli_distance = 0.f;
-  cparams.options.predictor = {Predictor::Weighted};
+JXLCompressParams CompressParamsForLossless() {
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 6);  // Weighted
+  cparams.distance = 0;
   return cparams;
 }
 }  // namespace
 
 TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
-      ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
-
-  CompressParams cparams = CParamsForLossless();
-
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 3500000u);
-  // If this test fails with a very close to 0.0 but not exactly 0.0 butteraugli
-  // distance, then there is likely a floating point issue, that could be
-  // happening either in io or io2. The values of io are generated by
-  // external_image.cc, and those in io2 by the jxl decoder. If they use
-  // slightly different floating point operations (say, one casts int to float
-  // while other divides the int through 255.0f and later multiplies it by
-  // 255 again) they will get slightly different values. To fix, ensure both
-  // sides do the following formula for converting integer range 0-255 to
-  // floating point range 0.0f-255.0f: static_cast<float>(i)
-  // without any further intermediate operations.
-  // Note that this precision issue is not a problem in practice if the values
-  // are equal when rounded to 8-bit int, but currently full exact precision is
-  // tested.
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0.0);
-}
-
-TEST(JxlTest, JXL_SLOW_TEST(RoundtripLosslessNoEncoderFastPathWP)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams = CParamsForLossless();
-  cparams.speed_tier = SpeedTier::kFalcon;
-  cparams.options.skip_encoder_fast_path = true;
+  JXLCompressParams cparams = CompressParamsForLossless();
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 3500000u);
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0.0);
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 223058);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }
 
-TEST(JxlTest, JXL_SLOW_TEST(RoundtripLosslessNoEncoderFastPathGradient)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8ThunderGradient)) {
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams = CParamsForLossless();
-  cparams.speed_tier = SpeedTier::kThunder;
-  cparams.options.skip_encoder_fast_path = true;
-  cparams.options.predictor = {Predictor::Gradient};
+  JXLCompressParams cparams = CompressParamsForLossless();
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 2);             // kThunder
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 5);  // Gradient
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 3500000u);
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0.0);
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 261684);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }
 
-TEST(JxlTest, JXL_SLOW_TEST(RoundtripLosslessNoEncoderVeryFastPathGradient)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8LightningGradient)) {
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams = CParamsForLossless();
-  cparams.speed_tier = SpeedTier::kLightning;
-  cparams.options.skip_encoder_fast_path = true;
-  cparams.options.predictor = {Predictor::Gradient};
+  JXLCompressParams cparams = CompressParamsForLossless();
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);  // kLightning
 
-  CodecInOut io2, io3;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 3500000u);
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0.0);
-  cparams.options.skip_encoder_fast_path = false;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io3), 3500000u);
-  EXPECT_EQ(ComputeDistance2(io.Main(), io3.Main(), GetJxlCms()), 0.0);
+  PackedPixelFile ppf_out;
+  // Lax comparison because different SIMD will cause different compression.
+  EXPECT_THAT(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out),
+              IsSlightlyBelow(286848u));
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }
 
 TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8Falcon)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
 
-  CompressParams cparams = CParamsForLossless();
-  cparams.speed_tier = SpeedTier::kFalcon;
+  JXLCompressParams cparams = CompressParamsForLossless();
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 3500000u);
-  EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                     /*distmap=*/nullptr, &pool));
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 230766);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }
 
 TEST(JxlTest, RoundtripLossless8Alpha) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_alpha.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  EXPECT_EQ(8u, io.metadata.m.GetAlphaBits());
-  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_EQ(t.ppf().info.alpha_bits, 8);
+  EXPECT_EQ(t.ppf().frames[0].color.format.data_type, JXL_TYPE_UINT8);
 
-  CompressParams cparams = CParamsForLossless();
+  JXLCompressParams cparams = CompressParamsForLossless();
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 350000u);
-  // If fails, see note about floating point in RoundtripLossless8.
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0.0);
-  EXPECT_TRUE(SamePixels(*io.Main().alpha(), *io2.Main().alpha()));
-  EXPECT_EQ(8u, io2.metadata.m.GetAlphaBits());
-  EXPECT_EQ(8u, io2.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io2.metadata.m.bit_depth.exponent_bits_per_sample);
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 251470);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.info.alpha_bits, 8);
+  EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out));
 }
 
 TEST(JxlTest, RoundtripLossless16Alpha) {
   ThreadPool* pool = nullptr;
-
   size_t xsize = 1200, ysize = 160;
-  Image3F color(xsize, ysize);
-  ImageF alpha(xsize, ysize);
+  TestImage t;
+  t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = t.AddFrame();
   // Generate 16-bit pattern that uses various colors and alpha values.
+  const float mul = 1.0f / 65535;
   for (size_t y = 0; y < ysize; y++) {
     for (size_t x = 0; x < xsize; x++) {
-      color.PlaneRow(0, y)[x] = (y * 65535 / ysize) * (1.0f / 65535);
-      color.PlaneRow(1, y)[x] = (x * 65535 / xsize) * (1.0f / 65535);
-      color.PlaneRow(2, y)[x] =
-          ((y + x) * 65535 / (xsize + ysize)) * (1.0f / 65535);
-      alpha.Row(y)[x] = (x * 65535 / xsize) * (1.0f / 65535);
+      uint16_t r = y * 65535 / ysize;
+      uint16_t g = x * 65535 / xsize + 37;
+      uint16_t b = (y + x) * 65535 / (xsize + ysize);
+      frame.SetValue(y, x, 0, r * mul);
+      frame.SetValue(y, x, 1, g * mul);
+      frame.SetValue(y, x, 2, b * mul);
+      frame.SetValue(y, x, 3, g * mul);
     }
   }
-  const bool is_gray = false;
-  CodecInOut io;
-  io.metadata.m.SetUintSamples(16);
-  io.metadata.m.SetAlphaBits(16);
-  io.metadata.m.color_encoding = ColorEncoding::SRGB(is_gray);
-  io.SetFromImage(std::move(color), io.metadata.m.color_encoding);
-  io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false);
-
-  EXPECT_EQ(16u, io.metadata.m.GetAlphaBits());
-  EXPECT_EQ(16u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
+  ASSERT_EQ(t.ppf().info.bits_per_sample, 16);
+  ASSERT_EQ(t.ppf().info.alpha_bits, 16);
 
-  CompressParams cparams = CParamsForLossless();
+  JXLCompressParams cparams = CompressParamsForLossless();
 
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 7100u);
-  // If this test fails with a very close to 0.0 but not exactly 0.0 butteraugli
-  // distance, then there is likely a floating point issue, that could be
-  // happening either in io or io2. The values of io are generated by
-  // external_image.cc, and those in io2 by the jxl decoder. If they use
-  // slightly different floating point operations (say, one does "i / 257.0f"
-  // while the other does "i * (1.0f / 257)" they will get slightly different
-  // values. To fix, ensure both sides do the following formula for converting
-  // integer range 0-65535 to Image3F floating point range 0.0f-255.0f:
-  // "i * (1.0f / 257)".
-  // Note that this precision issue is not a problem in practice if the values
-  // are equal when rounded to 16-bit int, but currently full exact precision is
-  // tested.
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0.0);
-  EXPECT_TRUE(SamePixels(*io.Main().alpha(), *io2.Main().alpha()));
-  EXPECT_EQ(16u, io2.metadata.m.GetAlphaBits());
-  EXPECT_EQ(16u, io2.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io2.metadata.m.bit_depth.exponent_bits_per_sample);
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  // TODO(szabadka) Investigate big size difference on i686
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 4884, 100);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.info.alpha_bits, 16);
+  EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out));
 }
 
 TEST(JxlTest, RoundtripLossless16AlphaNotMisdetectedAs8Bit) {
   ThreadPool* pool = nullptr;
-
   size_t xsize = 128, ysize = 128;
-  Image3F color(xsize, ysize);
-  ImageF alpha(xsize, ysize);
+  TestImage t;
+  t.SetDimensions(xsize, ysize).SetChannels(4).SetAllBitDepths(16);
+  TestImage::Frame frame = t.AddFrame();
   // All 16-bit values, both color and alpha, of this image are below 64.
   // This allows testing if a code path wrongly concludes it's an 8-bit instead
   // of 16-bit image (or even 6-bit).
+  const float mul = 1.0f / 65535;
   for (size_t y = 0; y < ysize; y++) {
     for (size_t x = 0; x < xsize; x++) {
-      color.PlaneRow(0, y)[x] = (y * 64 / ysize) * (1.0f / 65535);
-      color.PlaneRow(1, y)[x] = (x * 64 / xsize) * (1.0f / 65535);
-      color.PlaneRow(2, y)[x] =
-          ((y + x) * 64 / (xsize + ysize)) * (1.0f / 65535);
-      alpha.Row(y)[x] = (64 * x / xsize) * (1.0f / 65535);
+      uint16_t r = y * 64 / ysize;
+      uint16_t g = x * 64 / xsize + 37;
+      uint16_t b = (y + x) * 64 / (xsize + ysize);
+      frame.SetValue(y, x, 0, r * mul);
+      frame.SetValue(y, x, 1, g * mul);
+      frame.SetValue(y, x, 2, b * mul);
+      frame.SetValue(y, x, 3, g * mul);
     }
   }
-  const bool is_gray = false;
-  CodecInOut io;
-  io.metadata.m.SetUintSamples(16);
-  io.metadata.m.SetAlphaBits(16);
-  io.metadata.m.color_encoding = ColorEncoding::SRGB(is_gray);
-  io.SetFromImage(std::move(color), io.metadata.m.color_encoding);
-  io.Main().SetAlpha(std::move(alpha), /*alpha_is_premultiplied=*/false);
-
-  EXPECT_EQ(16u, io.metadata.m.GetAlphaBits());
-  EXPECT_EQ(16u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-
-  CompressParams cparams = CParamsForLossless();
-
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 3100u);
-  EXPECT_EQ(16u, io2.metadata.m.GetAlphaBits());
-  EXPECT_EQ(16u, io2.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io2.metadata.m.bit_depth.exponent_bits_per_sample);
-  // If fails, see note about floating point in RoundtripLossless8.
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0.0);
-  EXPECT_TRUE(SamePixels(*io.Main().alpha(), *io2.Main().alpha()));
-}
+  ASSERT_EQ(t.ppf().info.bits_per_sample, 16);
+  ASSERT_EQ(t.ppf().info.alpha_bits, 16);
 
-TEST(JxlTest, RoundtripYCbCr420) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  const PaddedBytes yuv420 = ReadTestData("jxl/flower/flower.png.ffmpeg.y4m");
-  CodecInOut io2;
-  ASSERT_TRUE(test::DecodeImageY4M(Span<const uint8_t>(yuv420), &io2));
-
-  CompressParams cparams = CParamsForLossless();
-  cparams.speed_tier = SpeedTier::kThunder;
-
-  PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io2, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
-  CodecInOut io3;
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io3, pool));
+  JXLCompressParams cparams = CompressParamsForLossless();
 
-  EXPECT_LE(compressed.size(), 2000000u);
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
 
-  // we're comparing an original PNG with a YCbCr 4:2:0 version
-  EXPECT_THAT(ComputeDistance2(io.Main(), io3.Main(), GetJxlCms()),
-              IsSlightlyBelow(4.3));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 591, 50);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.info.bits_per_sample, 16);
+  EXPECT_EQ(ppf_out.info.alpha_bits, 16);
+  EXPECT_TRUE(test::SameAlpha(t.ppf(), ppf_out));
 }
 
 TEST(JxlTest, RoundtripDots) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-
-  ASSERT_NE(io.xsize(), 0u);
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  EXPECT_EQ(t.ppf().info.bits_per_sample, 8);
+  EXPECT_EQ(t.ppf().color_encoding.transfer_function,
+            JXL_TRANSFER_FUNCTION_SRGB);
 
-  CompressParams cparams;
-  cparams.dots = Override::kOn;
-  cparams.butteraugli_distance = 0.04;
-  cparams.speed_tier = SpeedTier::kSquirrel;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSkirrel
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1);
+  cparams.distance = 0.04;
 
-  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
-  PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
-  CodecInOut io2;
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
-
-  EXPECT_LE(compressed.size(), 400000u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(0.3));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 280333, 4000);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.35));
 }
 
 TEST(JxlTest, RoundtripNoise) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
-      ReadTestData("external/wesaturate/500px/cvo9xd_keong_macan_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-
-  ASSERT_NE(io.xsize(), 0u);
+  const std::vector<uint8_t> orig =
+      ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_NE(t.ppf().info.xsize, 0);
+  EXPECT_EQ(t.ppf().info.bits_per_sample, 8);
+  EXPECT_EQ(t.ppf().color_encoding.transfer_function,
+            JXL_TRANSFER_FUNCTION_SRGB);
 
-  CompressParams cparams;
-  cparams.noise = Override::kOn;
-  cparams.speed_tier = SpeedTier::kSquirrel;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSkirrel
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_NOISE, 1);
 
-  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_TRUE(io.metadata.m.color_encoding.tf.IsSRGB());
-  PassesEncoderState enc_state;
-  AuxOut* aux_out = nullptr;
-  PaddedBytes compressed;
-  EXPECT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         aux_out, pool));
-  CodecInOut io2;
-  EXPECT_TRUE(test::DecodeFile({}, compressed, &io2, pool));
-
-  EXPECT_LE(compressed.size(), 40000u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.6));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 42345, 750);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.35));
 }
 
 TEST(JxlTest, RoundtripLossless8Gray) {
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData(
+  const std::vector<uint8_t> orig = ReadTestData(
       "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  TestImage t;
+  t.SetColorEncoding("Gra_D65_Rel_SRG").DecodeFromBytes(orig).ClearMetadata();
+  EXPECT_EQ(t.ppf().color_encoding.color_space, JXL_COLOR_SPACE_GRAY);
+  EXPECT_EQ(t.ppf().info.bits_per_sample, 8);
 
-  CompressParams cparams = CParamsForLossless();
+  JXLCompressParams cparams = CompressParamsForLossless();
 
-  EXPECT_TRUE(io.Main().IsGray());
-  EXPECT_EQ(8u, io.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io.metadata.m.bit_depth.exponent_bits_per_sample);
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 130000u);
-  // If fails, see note about floating point in RoundtripLossless8.
-  EXPECT_EQ(ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()), 0);
-  EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                     /*distmap=*/nullptr, pool));
-  EXPECT_TRUE(io2.Main().IsGray());
-  EXPECT_EQ(8u, io2.metadata.m.bit_depth.bits_per_sample);
-  EXPECT_FALSE(io2.metadata.m.bit_depth.floating_point_sample);
-  EXPECT_EQ(0u, io2.metadata.m.bit_depth.exponent_bits_per_sample);
-}
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
 
-#if JPEGXL_ENABLE_GIF
+  PackedPixelFile ppf_out;
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out), 92185);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+  EXPECT_EQ(ppf_out.color_encoding.color_space, JXL_COLOR_SPACE_GRAY);
+  EXPECT_EQ(ppf_out.info.bits_per_sample, 8);
+}
 
 TEST(JxlTest, RoundtripAnimation) {
+  if (!jxl::extras::CanDecode(jxl::extras::Codec::kGIF)) {
+    fprintf(stderr, "Skipping test because of missing GIF decoder.\n");
+    return;
+  }
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData("jxl/traffic_light.gif");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  ASSERT_EQ(4u, io.frames.size());
+  const std::vector<uint8_t> orig = ReadTestData("jxl/traffic_light.gif");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  EXPECT_EQ(4, t.ppf().frames.size());
 
-  CompressParams cparams;
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 3000u);
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
 
-  EXPECT_EQ(io2.frames.size(), io.frames.size());
-  test::CoalesceGIFAnimationWithAlpha(&io);
-  EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                /*distmap=*/nullptr, pool),
+  PackedPixelFile ppf_out;
+  EXPECT_THAT(Roundtrip(t.ppf(), {}, dparams, pool, &ppf_out),
+              IsSlightlyBelow(2888));
+
+  t.CoalesceGIFAnimationWithAlpha();
+  ASSERT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out),
 #if JXL_HIGH_PRECISION
             1.55);
 #else
@@ -1438,71 +1192,67 @@ TEST(JxlTest, RoundtripAnimation) {
 }
 
 TEST(JxlTest, RoundtripLosslessAnimation) {
+  if (!jxl::extras::CanDecode(jxl::extras::Codec::kGIF)) {
+    fprintf(stderr, "Skipping test because of missing GIF decoder.\n");
+    return;
+  }
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData("jxl/traffic_light.gif");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  ASSERT_EQ(4u, io.frames.size());
+  const std::vector<uint8_t> orig = ReadTestData("jxl/traffic_light.gif");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  EXPECT_EQ(4, t.ppf().frames.size());
 
-  CompressParams cparams = CParamsForLossless();
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 1200u);
+  JXLCompressParams cparams = CompressParamsForLossless();
+
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  EXPECT_THAT(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out),
+              IsSlightlyBelow(958));
 
-  EXPECT_EQ(io2.frames.size(), io.frames.size());
-  test::CoalesceGIFAnimationWithAlpha(&io);
-  EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                /*distmap=*/nullptr, pool),
-            5e-4);
+  t.CoalesceGIFAnimationWithAlpha();
+  ASSERT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
+  EXPECT_LE(ButteraugliDistance(t.ppf(), ppf_out), 5e-4);
 }
 
 TEST(JxlTest, RoundtripAnimationPatches) {
+  if (!jxl::extras::CanDecode(jxl::extras::Codec::kGIF)) {
+    fprintf(stderr, "Skipping test because of missing GIF decoder.\n");
+    return;
+  }
   ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData("jxl/animation_patches.gif");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  ASSERT_EQ(2u, io.frames.size());
+  const std::vector<uint8_t> orig = ReadTestData("jxl/animation_patches.gif");
 
-  CompressParams cparams;
-  cparams.patches = Override::kOn;
-  CodecInOut io2;
-  // 40k with no patches, 27k with patch frames encoded multiple times.
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 24000u);
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  ASSERT_EQ(2u, t.ppf().frames.size());
+
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PATCHES, 1);
 
-  EXPECT_EQ(io2.frames.size(), io.frames.size());
-  // >10 with broken patches
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.2));
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+
+  PackedPixelFile ppf_out;
+  // 40k with no patches, 27k with patch frames encoded multiple times.
+  EXPECT_THAT(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out),
+              IsSlightlyBelow(19252));
+  EXPECT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
+  // >10 with broken patches; not all patches are detected on borders.
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.9));
 }
 
-#endif  // JPEGXL_ENABLE_GIF
+size_t RoundtripJpeg(const std::vector<uint8_t>& jpeg_in, ThreadPool* pool) {
+  std::vector<uint8_t> compressed;
+  EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &jpeg_in,
+                                     &compressed));
 
-size_t RoundtripJpeg(const PaddedBytes& jpeg_in, ThreadPool* pool) {
-  CodecInOut io;
-  EXPECT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(jpeg_in), &io));
-  CompressParams cparams;
-  cparams.color_transform = jxl::ColorTransform::kYCbCr;
-
-  PassesEncoderState passes_enc_state;
-  PaddedBytes compressed, codestream;
-
-  EXPECT_TRUE(EncodeFile(cparams, &io, &passes_enc_state, &codestream,
-                         GetJxlCms(),
-                         /*aux_out=*/nullptr, pool));
-  jpegxl::tools::JpegXlContainer enc_container;
-  enc_container.codestream = std::move(codestream);
-  jpeg::JPEGData data_in = *io.Main().jpeg_data;
-  jxl::PaddedBytes jpeg_data;
-  EXPECT_TRUE(EncodeJPEGData(data_in, &jpeg_data, cparams));
-  enc_container.jpeg_reconstruction = jpeg_data.data();
-  enc_container.jpeg_reconstruction_size = jpeg_data.size();
-  EXPECT_TRUE(EncodeJpegXlContainerOneShot(enc_container, &compressed));
-
-  jxl::extras::JXLDecompressParams dparams;
-  dparams.runner = pool->runner();
-  dparams.runner_opaque = pool->runner_opaque();
+  jxl::JXLDecompressParams dparams;
+  test::DefaultAcceptedFormats(dparams);
+  test::SetThreadParallelRunner(dparams, pool);
   std::vector<uint8_t> out;
-  jxl::extras::PackedPixelFile ppf;
+  jxl::PackedPixelFile ppf;
   EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
                              nullptr, &ppf, &out));
   EXPECT_EQ(out.size(), jpeg_in.size());
@@ -1519,226 +1269,318 @@ size_t RoundtripJpeg(const PaddedBytes& jpeg_in, ThreadPool* pool) {
   return compressed.size();
 }
 
+void RoundtripJpegToPixels(const std::vector<uint8_t>& jpeg_in,
+                           JXLDecompressParams dparams, ThreadPool* pool,
+                           PackedPixelFile* ppf_out) {
+  std::vector<uint8_t> jpeg_bytes(jpeg_in.data(),
+                                  jpeg_in.data() + jpeg_in.size());
+  std::vector<uint8_t> compressed;
+  EXPECT_TRUE(extras::EncodeImageJXL({}, extras::PackedPixelFile(), &jpeg_bytes,
+                                     &compressed));
+
+  test::DefaultAcceptedFormats(dparams);
+  test::SetThreadParallelRunner(dparams, pool);
+  EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             nullptr, ppf_out, nullptr));
+}
+
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
   // JPEG size is 696,659 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 570000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 568940u, 10);
 }
 
-#if JPEGXL_ENABLE_JPEG
-
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
-  CodecInOut io;
-  ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
-
-  CodecInOut io2;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io2, &pool));
+  TEST_LIBJPEG_SUPPORT();
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower.png.im_q85_444.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
 
-  CompressParams cparams;
-  cparams.color_transform = jxl::ColorTransform::kYCbCr;
-
-  CodecInOut io3;
-  Roundtrip(&io, cparams, {}, &pool, &io3);
-
-  // TODO(eustas): investigate, why SJPEG and JpegRecompression pixels are
-  // different.
-  EXPECT_THAT(ComputeDistance2(io2.Main(), io3.Main(), GetJxlCms()),
-              IsSlightlyBelow(12));
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(12));
 }
 
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
-  CodecInOut io;
-  ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
-
-  CodecInOut io2;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io2, &pool));
+  TEST_LIBJPEG_SUPPORT();
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
 
-  CompressParams cparams;
-  cparams.color_transform = jxl::ColorTransform::kYCbCr;
-
-  CodecInOut io3;
-  Roundtrip(&io, cparams, {}, &pool, &io3);
-
-  EXPECT_THAT(ComputeDistance2(io2.Main(), io3.Main(), GetJxlCms()),
-              IsSlightlyBelow(11));
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(11));
 }
 
 TEST(JxlTest,
      JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420EarlyFlush)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
-  CodecInOut io;
-  ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
-
-  CodecInOut io2;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io2, &pool));
-
-  CompressParams cparams;
-  cparams.color_transform = jxl::ColorTransform::kYCbCr;
-
-  extras::JXLDecompressParams dparams;
+  TEST_LIBJPEG_SUPPORT();
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
+
+  JXLDecompressParams dparams;
   dparams.max_downsampling = 8;
 
-  CodecInOut io3;
-  Roundtrip(&io, cparams, dparams, &pool, &io3);
-
-  EXPECT_THAT(ComputeDistance2(io2.Main(), io3.Main(), GetJxlCms()),
-              IsSlightlyBelow(4410));
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, dparams, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(4410));
 }
 
 TEST(JxlTest,
      JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels420Mul16)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower_cropped.jpg");
-  CodecInOut io;
-  ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
+  TEST_LIBJPEG_SUPPORT();
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower_cropped.jpg");
+  TestImage t;
+  t.DecodeFromBytes(orig);
 
-  CodecInOut io2;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io2, &pool));
-
-  CompressParams cparams;
-  cparams.color_transform = jxl::ColorTransform::kYCbCr;
-
-  CodecInOut io3;
-  Roundtrip(&io, cparams, {}, &pool, &io3);
-
-  EXPECT_THAT(ComputeDistance2(io2.Main(), io3.Main(), GetJxlCms()),
-              IsSlightlyBelow(4));
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(4));
 }
 
 TEST(JxlTest,
      JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionToPixels_asymmetric)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  TEST_LIBJPEG_SUPPORT();
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg");
-  CodecInOut io;
-  ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
-
-  CodecInOut io2;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io2, &pool));
-
-  CompressParams cparams;
-  cparams.color_transform = jxl::ColorTransform::kYCbCr;
+  TestImage t;
+  t.DecodeFromBytes(orig);
 
-  CodecInOut io3;
-  Roundtrip(&io, cparams, {}, &pool, &io3);
-
-  EXPECT_THAT(ComputeDistance2(io2.Main(), io3.Main(), GetJxlCms()),
-              IsSlightlyBelow(10));
+  PackedPixelFile ppf_out;
+  RoundtripJpegToPixels(orig, {}, &pool, &ppf_out);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(10));
 }
 
-#endif
-
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionGray)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("jxl/flower/flower.png.im_q85_gray.jpg");
   // JPEG size is 456,528 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 390000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 387496u, 200);
 }
 
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower.png.im_q85_420.jpg");
   // JPEG size is 546,797 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 460000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 455560u, 10);
 }
 
 TEST(JxlTest,
      JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_luma_subsample)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("jxl/flower/flower.png.im_q85_luma_subsample.jpg");
   // JPEG size is 400,724 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 330000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 325354u, 15);
 }
 
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression444_12)) {
   // 444 JPEG that has an interesting sampling-factor (1x2, 1x2, 1x2).
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("jxl/flower/flower.png.im_q85_444_1x2.jpg");
   // JPEG size is 703,874 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 570000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 569679u, 10);
 }
 
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression422)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png.im_q85_422.jpg");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower.png.im_q85_422.jpg");
   // JPEG size is 522,057 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 500000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 499282u, 10);
 }
 
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression440)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png.im_q85_440.jpg");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/flower/flower.png.im_q85_440.jpg");
   // JPEG size is 603,623 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 510000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 501151u, 10);
 }
 
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression_asymmetric)) {
   // 2x vertical downsample of one chroma channel, 2x horizontal downsample of
   // the other.
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("jxl/flower/flower.png.im_q85_asymmetric.jpg");
   // JPEG size is 604,601 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 510000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 500602u, 10);
 }
 
 TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompression420Progr)) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("jxl/flower/flower.png.im_q85_420_progr.jpg");
   // JPEG size is 522,057 bytes.
-  EXPECT_LE(RoundtripJpeg(orig, &pool), 460000u);
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 455499u, 10);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionMetadata)) {
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/jpeg_reconstruction/1x1_exif_xmp.jpg");
+  // JPEG size is 4290 bytes
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 1400u, 30);
+}
+
+TEST(JxlTest, JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionRestarts)) {
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/jpeg_reconstruction/bicycles_restarts.jpg");
+  // JPEG size is 87478 bytes
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 76125u, 30);
+}
+
+TEST(JxlTest,
+     JXL_TRANSCODE_JPEG_TEST(RoundtripJpegRecompressionOrientationICC)) {
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
+      ReadTestData("jxl/jpeg_reconstruction/sideways_bench.jpg");
+  // JPEG size is 15252 bytes
+  EXPECT_NEAR(RoundtripJpeg(orig, &pool), 12000u, 470);
+  // TODO(jon): investigate why 'Cross-compiling i686-linux-gnu' produces a
+  // larger result
 }
 
 TEST(JxlTest, RoundtripProgressive) {
-  ThreadPoolInternal pool(4);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
-  io.ShrinkTo(600, 1024);
+  ThreadPoolForTests pool(4);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024);
 
-  CompressParams cparams;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);
 
-  cparams.butteraugli_distance = 1.0f;
-  cparams.progressive_dc = 1;
-  cparams.responsive = true;
-  cparams.progressive_mode = true;
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 61700u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, &pool),
-              IsSlightlyBelow(1.17f));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 71444, 750);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.4));
 }
 
 TEST(JxlTest, RoundtripProgressiveLevel2Slow) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
-  io.ShrinkTo(600, 1024);
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata().SetDimensions(600, 1024);
 
-  CompressParams cparams;
+  JXLCompressParams cparams;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9);  // kTortoise
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, 2);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);
 
-  cparams.butteraugli_distance = 1.0f;
-  cparams.progressive_dc = 2;
-  cparams.speed_tier = SpeedTier::kTortoise;
-  cparams.responsive = true;
-  cparams.progressive_mode = true;
-  CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, &pool, &io2), 71000u);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, &pool),
-              IsSlightlyBelow(1.2f));
+  PackedPixelFile ppf_out;
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 76666, 1000);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.17));
 }
 
+TEST(JxlTest, RoundtripUnsignedCustomBitdepthLossless) {
+  ThreadPool* pool = nullptr;
+  for (uint32_t num_channels = 1; num_channels < 6; ++num_channels) {
+    for (JxlEndianness endianness : {JXL_LITTLE_ENDIAN, JXL_BIG_ENDIAN}) {
+      for (uint32_t bitdepth = 3; bitdepth <= 16; ++bitdepth) {
+        if (bitdepth <= 8 && endianness == JXL_BIG_ENDIAN) continue;
+        printf("Testing %u channel unsigned %u bit %s endian lossless.\n",
+               num_channels, bitdepth,
+               endianness == JXL_LITTLE_ENDIAN ? "little" : "big");
+        TestImage t;
+        t.SetDimensions(256, 256).SetChannels(num_channels);
+        t.SetAllBitDepths(bitdepth).SetEndianness(endianness);
+        TestImage::Frame frame = t.AddFrame();
+        frame.RandomFill();
+
+        JXLCompressParams cparams = CompressParamsForLossless();
+        cparams.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+        JXLDecompressParams dparams;
+        dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+        dparams.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+        PackedPixelFile ppf_out;
+        Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out);
+
+        ASSERT_TRUE(test::SamePixels(t.ppf(), ppf_out));
+      }
+    }
+  }
+}
+
+TEST(JxlTest, LosslessPNMRoundtrip) {
+  static const char* kChannels[] = {"", "g", "ga", "rgb", "rgba"};
+  static const char* kExtension[] = {"", ".pgm", ".pam", ".ppm", ".pam"};
+  for (size_t bit_depth = 1; bit_depth <= 16; ++bit_depth) {
+    for (size_t channels = 1; channels <= 4; ++channels) {
+      if (bit_depth == 1 && (channels == 2 || channels == 4)) continue;
+      std::string extension(kExtension[channels]);
+      std::string filename = "jxl/flower/flower_small." +
+                             std::string(kChannels[channels]) + ".depth" +
+                             std::to_string(bit_depth) + extension;
+      const std::vector<uint8_t> orig = ReadTestData(filename);
+      test::TestImage t;
+      if (channels < 3) t.SetColorEncoding("Gra_D65_Rel_SRG");
+      t.DecodeFromBytes(orig);
+
+      JXLCompressParams cparams = CompressParamsForLossless();
+      cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);  // kLightning
+      cparams.input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+      JXLDecompressParams dparams;
+      dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);
+      dparams.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+
+      PackedPixelFile ppf_out;
+      Roundtrip(t.ppf(), cparams, dparams, nullptr, &ppf_out);
+
+      extras::EncodedImage encoded;
+      auto encoder = extras::Encoder::FromExtension(extension);
+      ASSERT_TRUE(encoder.get());
+      ASSERT_TRUE(encoder->Encode(ppf_out, &encoded, nullptr));
+      ASSERT_EQ(encoded.bitstreams.size(), 1);
+      ASSERT_EQ(orig.size(), encoded.bitstreams[0].size());
+      EXPECT_EQ(0,
+                memcmp(orig.data(), encoded.bitstreams[0].data(), orig.size()));
+    }
+  }
+}
+
+class JxlTest : public ::testing::TestWithParam<const char*> {};
+
+TEST_P(JxlTest, LosslessSmallFewColors) {
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData(GetParam());
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+
+  JXLCompressParams cparams;
+  cparams.distance = 0;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);
+
+  PackedPixelFile ppf_out;
+  Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out);
+  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
+}
+
+JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
+    ImageTests, JxlTest,
+    ::testing::Values("jxl/blending/cropped_traffic_light_frame-0.png",
+                      "palette/358colors.png"));
+
 }  // namespace
 }  // namespace jxl
index 74109c8..acda762 100644 (file)
 #include <numeric>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/random.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -86,7 +86,7 @@ void RoundtripSizeRange(ThreadPool* pool, uint32_t begin, uint32_t end) {
 }
 
 TEST(LehmerCodeTest, TestRoundtrips) {
-  ThreadPoolInternal pool(8);
+  test::ThreadPoolForTests pool(8);
 
   RoundtripSizeRange<uint16_t>(&pool, 1, 1026);
 
index 4a7af65..58b6941 100644 (file)
@@ -6,7 +6,7 @@ includedir=@PKGCONFIG_TARGET_INCLUDES@
 Name: libjxl
 Description: Loads and saves JPEG XL files
 Version: @JPEGXL_LIBRARY_VERSION@
-Requires.private: @JPEGXL_LIBRARY_REQUIRES@
+@JPEGXL_REQUIRES_TYPE@: @JPEGXL_LIBRARY_REQUIRES@
 Libs: -L${libdir} -ljxl
 Libs.private: -lm
 Cflags: -I${includedir}
diff --git a/lib/jxl/libjxl_cms.pc.in b/lib/jxl/libjxl_cms.pc.in
new file mode 100644 (file)
index 0000000..e31661a
--- /dev/null
@@ -0,0 +1,13 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=@PKGCONFIG_TARGET_LIBS@
+includedir=@PKGCONFIG_TARGET_INCLUDES@
+
+Name: libjxl_cms
+Description: CMS support library for libjxl
+Version: @JPEGXL_LIBRARY_VERSION@
+@JPEGXL_REQUIRES_TYPE@: @JPEGXL_CMS_LIBRARY_REQUIRES@
+Libs: @JXL_CMS_PK_LIBS@ -L${libdir} -ljxl_cms
+Libs.private: -lm
+Cflags: -I${includedir}
+Cflags.private: -DJXL_CMS_STATIC_DEFINE
diff --git a/lib/jxl/linalg.cc b/lib/jxl/linalg.cc
deleted file mode 100644 (file)
index 61d66dd..0000000
+++ /dev/null
@@ -1,235 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/linalg.h"
-
-#include <stdlib.h>
-
-#include <cmath>
-#include <deque>
-#include <utility>
-#include <vector>
-
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/image_ops.h"
-
-namespace jxl {
-
-void AssertSymmetric(const ImageD& A) {
-#if JXL_ENABLE_ASSERT
-  JXL_ASSERT(A.xsize() == A.ysize());
-  for (size_t i = 0; i < A.xsize(); ++i) {
-    for (size_t j = i + 1; j < A.xsize(); ++j) {
-      JXL_ASSERT(std::abs(A.Row(i)[j] - A.Row(j)[i]) < 1e-15);
-    }
-  }
-#endif
-}
-
-void Diagonalize2x2(const double a0, const double a1, const double b, double* c,
-                    double* s) {
-  if (std::abs(b) < 1e-15) {
-    *c = 1.0;
-    *s = 0.0;
-    return;
-  }
-  double phi = std::atan2(2 * b, a1 - a0);
-  double theta = b > 0.0 ? 0.5 * phi : 0.5 * phi + Pi(1.0);
-  *c = std::cos(theta);
-  *s = std::sin(theta);
-}
-
-void GivensRotation(const double x, const double y, double* c, double* s) {
-  if (y == 0.0) {
-    *c = x < 0.0 ? -1.0 : 1.0;
-    *s = 0.0;
-  } else {
-    const double h = hypot(x, y);
-    const double d = 1.0 / h;
-    *c = x * d;
-    *s = -y * d;
-  }
-}
-
-void RotateMatrixCols(ImageD* const JXL_RESTRICT U, int i, int j, double c,
-                      double s) {
-  JXL_ASSERT(U->xsize() == U->ysize());
-  const size_t N = U->xsize();
-  double* const JXL_RESTRICT u_i = U->Row(i);
-  double* const JXL_RESTRICT u_j = U->Row(j);
-  std::vector<double> rot_i, rot_j;
-  rot_i.reserve(N);
-  rot_j.reserve(N);
-  for (size_t k = 0; k < N; ++k) {
-    rot_i.push_back(u_i[k] * c - u_j[k] * s);
-    rot_j.push_back(u_i[k] * s + u_j[k] * c);
-  }
-  for (size_t k = 0; k < N; ++k) {
-    u_i[k] = rot_i[k];
-    u_j[k] = rot_j[k];
-  }
-}
-void HouseholderReflector(const size_t N, const double* x, double* u) {
-  const double sigma = x[0] <= 0.0 ? 1.0 : -1.0;
-  u[0] = x[0] - sigma * std::sqrt(DotProduct(N, x, x));
-  for (size_t k = 1; k < N; ++k) {
-    u[k] = x[k];
-  }
-  double u_norm = 1.0 / std::sqrt(DotProduct(N, u, u));
-  for (size_t k = 0; k < N; ++k) {
-    u[k] *= u_norm;
-  }
-}
-
-void ConvertToTridiagonal(const ImageD& A, ImageD* const JXL_RESTRICT T,
-                          ImageD* const JXL_RESTRICT U) {
-  AssertSymmetric(A);
-  const size_t N = A.xsize();
-  *U = Identity<double>(A.xsize());
-  *T = CopyImage(A);
-  std::vector<ImageD> u_stack;
-  for (size_t k = 0; k + 2 < N; ++k) {
-    if (DotProduct(N - k - 2, &T->Row(k)[k + 2], &T->Row(k)[k + 2]) > 1e-15) {
-      ImageD u(N, 1);
-      ZeroFillImage(&u);
-      HouseholderReflector(N - k - 1, &T->Row(k)[k + 1], &u.Row(0)[k + 1]);
-      ImageD v = MatMul(*T, u);
-      double scale = DotProduct(u, v);
-      v = LinComb(2.0, v, -2.0 * scale, u);
-      SubtractFrom(MatMul(u, Transpose(v)), T);
-      SubtractFrom(MatMul(v, Transpose(u)), T);
-      u_stack.emplace_back(std::move(u));
-    }
-  }
-  while (!u_stack.empty()) {
-    const ImageD& u = u_stack.back();
-    ImageD v = MatMul(Transpose(*U), u);
-    SubtractFrom(ScaleImage(2.0, MatMul(u, Transpose(v))), U);
-    u_stack.pop_back();
-  }
-}
-
-double WilkinsonShift(const double a0, const double a1, const double b) {
-  const double d = 0.5 * (a0 - a1);
-  if (d == 0.0) {
-    return a1 - std::abs(b);
-  }
-  const double sign_d = d > 0.0 ? 1.0 : -1.0;
-  return a1 - b * b / (d + sign_d * hypotf(d, b));
-}
-
-void ImplicitQRStep(ImageD* const JXL_RESTRICT U, double* const JXL_RESTRICT a,
-                    double* const JXL_RESTRICT b, int m0, int m1) {
-  JXL_ASSERT(m1 - m0 > 2);
-  double x = a[m0] - WilkinsonShift(a[m1 - 2], a[m1 - 1], b[m1 - 1]);
-  double y = b[m0 + 1];
-  for (int k = m0; k < m1 - 1; ++k) {
-    double c, s;
-    GivensRotation(x, y, &c, &s);
-    const double w = c * x - s * y;
-    const double d = a[k] - a[k + 1];
-    const double z = (2 * c * b[k + 1] + d * s) * s;
-    a[k] -= z;
-    a[k + 1] += z;
-    b[k + 1] = d * c * s + (c * c - s * s) * b[k + 1];
-    x = b[k + 1];
-    if (k > m0) {
-      b[k] = w;
-    }
-    if (k < m1 - 2) {
-      y = -s * b[k + 2];
-      b[k + 2] *= c;
-    }
-    RotateMatrixCols(U, k, k + 1, c, s);
-  }
-}
-
-void ScanInterval(const double* const JXL_RESTRICT a,
-                  const double* const JXL_RESTRICT b, int istart,
-                  const int iend, const double eps,
-                  std::deque<std::pair<int, int> >* intervals) {
-  for (int k = istart; k < iend; ++k) {
-    if ((k + 1 == iend) ||
-        std::abs(b[k + 1]) < eps * (std::abs(a[k]) + std::abs(a[k + 1]))) {
-      if (k > istart) {
-        intervals->push_back(std::make_pair(istart, k + 1));
-      }
-      istart = k + 1;
-    }
-  }
-}
-
-void ConvertToDiagonal(const ImageD& A, ImageD* const JXL_RESTRICT diag,
-                       ImageD* const JXL_RESTRICT U) {
-  AssertSymmetric(A);
-  const size_t N = A.xsize();
-  ImageD T;
-  ConvertToTridiagonal(A, &T, U);
-  // From now on, the algorithm keeps the transformed matrix tri-diagonal,
-  // so we only need to keep track of the diagonal and the off-diagonal entries.
-  std::vector<double> a(N);
-  std::vector<double> b(N);
-  for (size_t k = 0; k < N; ++k) {
-    a[k] = T.Row(k)[k];
-    if (k > 0) b[k] = T.Row(k)[k - 1];
-  }
-  // Run the symmetric tri-diagonal QR algorithm with implicit Wilkinson shift.
-  const double kEpsilon = 1e-14;
-  std::deque<std::pair<int, int> > intervals;
-  ScanInterval(&a[0], &b[0], 0, N, kEpsilon, &intervals);
-  while (!intervals.empty()) {
-    const int istart = intervals[0].first;
-    const int iend = intervals[0].second;
-    intervals.pop_front();
-    if (iend == istart + 2) {
-      double& a0 = a[istart];
-      double& a1 = a[istart + 1];
-      double& b1 = b[istart + 1];
-      double c, s;
-      Diagonalize2x2(a0, a1, b1, &c, &s);
-      const double d = a0 - a1;
-      const double z = (2 * c * b1 + d * s) * s;
-      a0 -= z;
-      a1 += z;
-      b1 = 0.0;
-      RotateMatrixCols(U, istart, istart + 1, c, s);
-    } else {
-      ImplicitQRStep(U, &a[0], &b[0], istart, iend);
-      ScanInterval(&a[0], &b[0], istart, iend, kEpsilon, &intervals);
-    }
-  }
-  *diag = ImageD(N, 1);
-  double* const JXL_RESTRICT diag_row = diag->Row(0);
-  for (size_t k = 0; k < N; ++k) {
-    diag_row[k] = a[k];
-  }
-}
-
-void ComputeQRFactorization(const ImageD& A, ImageD* const JXL_RESTRICT Q,
-                            ImageD* const JXL_RESTRICT R) {
-  JXL_ASSERT(A.xsize() == A.ysize());
-  const size_t N = A.xsize();
-  *Q = Identity<double>(N);
-  *R = CopyImage(A);
-  std::vector<ImageD> u_stack;
-  for (size_t k = 0; k + 1 < N; ++k) {
-    if (DotProduct(N - k - 1, &R->Row(k)[k + 1], &R->Row(k)[k + 1]) > 1e-15) {
-      ImageD u(N, 1);
-      FillImage(0.0, &u);
-      HouseholderReflector(N - k, &R->Row(k)[k], &u.Row(0)[k]);
-      ImageD v = MatMul(Transpose(u), *R);
-      SubtractFrom(ScaleImage(2.0, MatMul(u, v)), R);
-      u_stack.emplace_back(std::move(u));
-    }
-  }
-  while (!u_stack.empty()) {
-    const ImageD& u = u_stack.back();
-    ImageD v = MatMul(Transpose(u), *Q);
-    SubtractFrom(ScaleImage(2.0, MatMul(u, v)), Q);
-    u_stack.pop_back();
-  }
-}
-}  // namespace jxl
diff --git a/lib/jxl/linalg.h b/lib/jxl/linalg.h
deleted file mode 100644 (file)
index e44dd85..0000000
+++ /dev/null
@@ -1,295 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_LINALG_H_
-#define LIB_JXL_LINALG_H_
-
-// Linear algebra.
-
-#include <stddef.h>
-
-#include <algorithm>
-#include <cmath>
-#include <vector>
-
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/image.h"
-#include "lib/jxl/image_ops.h"
-
-namespace jxl {
-
-using ImageD = Plane<double>;
-
-template <typename T>
-inline T DotProduct(const size_t N, const T* const JXL_RESTRICT a,
-                    const T* const JXL_RESTRICT b) {
-  T sum = 0.0;
-  for (size_t k = 0; k < N; ++k) {
-    sum += a[k] * b[k];
-  }
-  return sum;
-}
-
-template <typename T>
-inline T L2NormSquared(const size_t N, const T* const JXL_RESTRICT a) {
-  return DotProduct(N, a, a);
-}
-
-template <typename T>
-inline T L1Norm(const size_t N, const T* const JXL_RESTRICT a) {
-  T sum = 0;
-  for (size_t k = 0; k < N; ++k) {
-    sum += a[k] >= 0 ? a[k] : -a[k];
-  }
-  return sum;
-}
-
-inline double DotProduct(const ImageD& a, const ImageD& b) {
-  JXL_ASSERT(a.ysize() == 1);
-  JXL_ASSERT(b.ysize() == 1);
-  JXL_ASSERT(a.xsize() == b.xsize());
-  const double* const JXL_RESTRICT row_a = a.Row(0);
-  const double* const JXL_RESTRICT row_b = b.Row(0);
-  return DotProduct(a.xsize(), row_a, row_b);
-}
-
-inline ImageD Transpose(const ImageD& A) {
-  ImageD out(A.ysize(), A.xsize());
-  for (size_t x = 0; x < A.xsize(); ++x) {
-    double* const JXL_RESTRICT row_out = out.Row(x);
-    for (size_t y = 0; y < A.ysize(); ++y) {
-      row_out[y] = A.Row(y)[x];
-    }
-  }
-  return out;
-}
-
-template <typename Tout, typename Tin1, typename Tin2>
-Plane<Tout> MatMul(const Plane<Tin1>& A, const Plane<Tin2>& B) {
-  JXL_ASSERT(A.ysize() == B.xsize());
-  Plane<Tout> out(A.xsize(), B.ysize());
-  for (size_t y = 0; y < B.ysize(); ++y) {
-    const Tin2* const JXL_RESTRICT row_b = B.Row(y);
-    Tout* const JXL_RESTRICT row_out = out.Row(y);
-    for (size_t x = 0; x < A.xsize(); ++x) {
-      row_out[x] = 0.0;
-      for (size_t k = 0; k < B.xsize(); ++k) {
-        row_out[x] += A.Row(k)[x] * row_b[k];
-      }
-    }
-  }
-  return out;
-}
-
-template <typename T1, typename T2>
-ImageD MatMul(const Plane<T1>& A, const Plane<T2>& B) {
-  return MatMul<double, T1, T2>(A, B);
-}
-
-template <typename T1, typename T2>
-ImageI MatMulI(const Plane<T1>& A, const Plane<T2>& B) {
-  return MatMul<int, T1, T2>(A, B);
-}
-
-// Computes A = B * C, with sizes rows*cols: A=ha*wa, B=wa*wb, C=ha*wb
-template <typename T>
-void MatMul(const T* a, const T* b, int ha, int wa, int wb, T* c) {
-  std::vector<T> temp(wa);  // Make better use of cache lines
-  for (int x = 0; x < wb; x++) {
-    for (int z = 0; z < wa; z++) {
-      temp[z] = b[z * wb + x];
-    }
-    for (int y = 0; y < ha; y++) {
-      double e = 0;
-      for (int z = 0; z < wa; z++) {
-        e += a[y * wa + z] * temp[z];
-      }
-      c[y * wb + x] = e;
-    }
-  }
-}
-
-// Computes C = A + factor * B
-template <typename T, typename F>
-void MatAdd(const T* a, const T* b, F factor, int h, int w, T* c) {
-  for (int i = 0; i < w * h; i++) {
-    c[i] = a[i] + b[i] * factor;
-  }
-}
-
-template <typename T>
-inline Plane<T> Identity(const size_t N) {
-  Plane<T> out(N, N);
-  for (size_t i = 0; i < N; ++i) {
-    T* JXL_RESTRICT row = out.Row(i);
-    std::fill(row, row + N, 0);
-    row[i] = static_cast<T>(1.0);
-  }
-  return out;
-}
-
-inline ImageD Diagonal(const ImageD& d) {
-  JXL_ASSERT(d.ysize() == 1);
-  ImageD out(d.xsize(), d.xsize());
-  const double* JXL_RESTRICT row_diag = d.Row(0);
-  for (size_t k = 0; k < d.xsize(); ++k) {
-    double* JXL_RESTRICT row_out = out.Row(k);
-    std::fill(row_out, row_out + d.xsize(), 0.0);
-    row_out[k] = row_diag[k];
-  }
-  return out;
-}
-
-// Computes c, s such that c^2 + s^2 = 1 and
-//   [c -s] [x] = [ * ]
-//   [s  c] [y]   [ 0 ]
-void GivensRotation(double x, double y, double* c, double* s);
-
-// U = U * Givens(i, j, c, s)
-void RotateMatrixCols(ImageD* JXL_RESTRICT U, int i, int j, double c, double s);
-
-// A is symmetric, U is orthogonal, T is tri-diagonal and
-// A = U * T * Transpose(U).
-void ConvertToTridiagonal(const ImageD& A, ImageD* JXL_RESTRICT T,
-                          ImageD* JXL_RESTRICT U);
-
-// A is symmetric, U is orthogonal, and A = U * Diagonal(diag) * Transpose(U).
-void ConvertToDiagonal(const ImageD& A, ImageD* JXL_RESTRICT diag,
-                       ImageD* JXL_RESTRICT U);
-
-// A is square matrix, Q is orthogonal, R is upper triangular and A = Q * R;
-void ComputeQRFactorization(const ImageD& A, ImageD* JXL_RESTRICT Q,
-                            ImageD* JXL_RESTRICT R);
-
-// Inverts a 3x3 matrix in place
-template <typename T>
-Status Inv3x3Matrix(T* matrix) {
-  // Intermediate computation is done in double precision.
-  double temp[9];
-  temp[0] = static_cast<double>(matrix[4]) * matrix[8] -
-            static_cast<double>(matrix[5]) * matrix[7];
-  temp[1] = static_cast<double>(matrix[2]) * matrix[7] -
-            static_cast<double>(matrix[1]) * matrix[8];
-  temp[2] = static_cast<double>(matrix[1]) * matrix[5] -
-            static_cast<double>(matrix[2]) * matrix[4];
-  temp[3] = static_cast<double>(matrix[5]) * matrix[6] -
-            static_cast<double>(matrix[3]) * matrix[8];
-  temp[4] = static_cast<double>(matrix[0]) * matrix[8] -
-            static_cast<double>(matrix[2]) * matrix[6];
-  temp[5] = static_cast<double>(matrix[2]) * matrix[3] -
-            static_cast<double>(matrix[0]) * matrix[5];
-  temp[6] = static_cast<double>(matrix[3]) * matrix[7] -
-            static_cast<double>(matrix[4]) * matrix[6];
-  temp[7] = static_cast<double>(matrix[1]) * matrix[6] -
-            static_cast<double>(matrix[0]) * matrix[7];
-  temp[8] = static_cast<double>(matrix[0]) * matrix[4] -
-            static_cast<double>(matrix[1]) * matrix[3];
-  double det = matrix[0] * temp[0] + matrix[1] * temp[3] + matrix[2] * temp[6];
-  if (std::abs(det) < 1e-10) {
-    return JXL_FAILURE("Matrix determinant is too close to 0");
-  }
-  double idet = 1.0 / det;
-  for (int i = 0; i < 9; i++) {
-    matrix[i] = temp[i] * idet;
-  }
-  return true;
-}
-
-// Solves system of linear equations A * X = B using the conjugate gradient
-// method. Matrix a must be a n*n, symmetric and positive definite.
-// Vectors b and x must have n elements
-template <typename T>
-void ConjugateGradient(const T* a, int n, const T* b, T* x) {
-  std::vector<T> r(n);
-  MatMul(a, x, n, n, 1, r.data());
-  MatAdd(b, r.data(), -1, n, 1, r.data());
-  std::vector<T> p = r;
-  T rr;
-  MatMul(r.data(), r.data(), 1, n, 1, &rr);  // inner product
-
-  if (rr == 0) return;  // The initial values were already optimal
-
-  for (int i = 0; i < n; i++) {
-    std::vector<T> ap(n);
-    MatMul(a, p.data(), n, n, 1, ap.data());
-    T alpha;
-    MatMul(r.data(), ap.data(), 1, n, 1, &alpha);
-    // Normally alpha couldn't be zero here but if numerical issues caused it,
-    // return assuming the solution is close.
-    if (alpha == 0) return;
-    alpha = rr / alpha;
-    MatAdd(x, p.data(), alpha, n, 1, x);
-    MatAdd(r.data(), ap.data(), -alpha, n, 1, r.data());
-
-    T rr2;
-    MatMul(r.data(), r.data(), 1, n, 1, &rr2);  // inner product
-    if (rr2 < 1e-20) break;
-
-    T beta = rr2 / rr;
-    MatAdd(r.data(), p.data(), beta, 1, n, p.data());
-    rr = rr2;
-  }
-}
-
-// Computes optimal coefficients r to approximate points p with linear
-// combination of functions f. The matrix f has h rows and w columns, r has h
-// values, p has w values. h is the amount of functions, w the amount of points.
-// Uses the finite element method and minimizes mean square error.
-template <typename T>
-void FEM(const T* f, int h, int w, const T* p, T* r) {
-  // Compute "Gramian" matrix G = F * F^T
-  // Speed up multiplication by using non-zero intervals in sparse F.
-  std::vector<int> start(h);
-  std::vector<int> end(h);
-  for (int y = 0; y < h; y++) {
-    start[y] = end[y] = 0;
-    for (int x = 0; x < w; x++) {
-      if (f[y * w + x] != 0) {
-        start[y] = x;
-        break;
-      }
-    }
-    for (int x = w - 1; x >= 0; x--) {
-      if (f[y * w + x] != 0) {
-        end[y] = x + 1;
-        break;
-      }
-    }
-  }
-
-  std::vector<T> g(h * h);
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x <= y; x++) {
-      T v = 0;
-      // Intersection of the two sparse intervals.
-      int s = std::max(start[x], start[y]);
-      int e = std::min(end[x], end[y]);
-      for (int z = s; z < e; z++) {
-        v += f[x * w + z] * f[y * w + z];
-      }
-      // Symmetric, so two values output at once
-      g[y * h + x] = v;
-      g[x * h + y] = v;
-    }
-  }
-
-  // B vector: sum of each column of F multiplied by corresponding p
-  std::vector<T> b(h, 0);
-  for (int y = 0; y < h; y++) {
-    T v = 0;
-    for (int x = 0; x < w; x++) {
-      v += f[y * w + x] * p[x];
-    }
-    b[y] = v;
-  }
-
-  ConjugateGradient(g.data(), h, b.data(), r);
-}
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_LINALG_H_
diff --git a/lib/jxl/linalg_test.cc b/lib/jxl/linalg_test.cc
deleted file mode 100644 (file)
index 292b984..0000000
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/linalg.h"
-
-#include "gtest/gtest.h"
-#include "lib/jxl/image_test_utils.h"
-
-namespace jxl {
-namespace {
-
-template <typename T>
-Plane<T> RandomMatrix(const size_t xsize, const size_t ysize, Rng& rng,
-                      const T vmin, const T vmax) {
-  Plane<T> A(xsize, ysize);
-  GenerateImage(rng, &A, vmin, vmax);
-  return A;
-}
-
-template <typename T>
-Plane<T> RandomSymmetricMatrix(const size_t N, Rng& rng, const T vmin,
-                               const T vmax) {
-  Plane<T> A = RandomMatrix<T>(N, N, rng, vmin, vmax);
-  for (size_t i = 0; i < N; ++i) {
-    for (size_t j = 0; j < i; ++j) {
-      A.Row(j)[i] = A.Row(i)[j];
-    }
-  }
-  return A;
-}
-void VerifyMatrixEqual(const ImageD& A, const ImageD& B, const double eps) {
-  ASSERT_EQ(A.xsize(), B.xsize());
-  ASSERT_EQ(A.ysize(), B.ysize());
-  for (size_t y = 0; y < A.ysize(); ++y) {
-    for (size_t x = 0; x < A.xsize(); ++x) {
-      ASSERT_NEAR(A.Row(y)[x], B.Row(y)[x], eps);
-    }
-  }
-}
-
-void VerifyOrthogonal(const ImageD& A, const double eps) {
-  VerifyMatrixEqual(Identity<double>(A.xsize()), MatMul(Transpose(A), A), eps);
-}
-
-void VerifyTridiagonal(const ImageD& T, const double eps) {
-  ASSERT_EQ(T.xsize(), T.ysize());
-  for (size_t i = 0; i < T.xsize(); ++i) {
-    for (size_t j = i + 2; j < T.xsize(); ++j) {
-      ASSERT_NEAR(T.Row(i)[j], 0.0, eps);
-      ASSERT_NEAR(T.Row(j)[i], 0.0, eps);
-    }
-  }
-}
-
-void VerifyUpperTriangular(const ImageD& R, const double eps) {
-  ASSERT_EQ(R.xsize(), R.ysize());
-  for (size_t i = 0; i < R.xsize(); ++i) {
-    for (size_t j = i + 1; j < R.xsize(); ++j) {
-      ASSERT_NEAR(R.Row(i)[j], 0.0, eps);
-    }
-  }
-}
-
-TEST(LinAlgTest, ConvertToTridiagonal) {
-  {
-    ImageD I = Identity<double>(5);
-    ImageD T, U;
-    ConvertToTridiagonal(I, &T, &U);
-    VerifyMatrixEqual(I, T, 1e-15);
-    VerifyMatrixEqual(I, U, 1e-15);
-  }
-  {
-    ImageD A = Identity<double>(5);
-    A.Row(0)[1] = A.Row(1)[0] = 2.0;
-    A.Row(0)[4] = A.Row(4)[0] = 3.0;
-    A.Row(2)[3] = A.Row(3)[2] = 2.0;
-    A.Row(3)[4] = A.Row(4)[3] = 2.0;
-    ImageD U, d;
-    ConvertToDiagonal(A, &d, &U);
-    VerifyOrthogonal(U, 1e-12);
-    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
-  }
-  Rng rng(0);
-  for (int N = 2; N < 100; ++N) {
-    ImageD A = RandomSymmetricMatrix(N, rng, -1.0, 1.0);
-    ImageD T, U;
-    ConvertToTridiagonal(A, &T, &U);
-    VerifyOrthogonal(U, 1e-12);
-    VerifyTridiagonal(T, 1e-12);
-    VerifyMatrixEqual(A, MatMul(U, MatMul(T, Transpose(U))), 1e-12);
-  }
-}
-
-TEST(LinAlgTest, ConvertToDiagonal) {
-  {
-    ImageD I = Identity<double>(5);
-    ImageD U, d;
-    ConvertToDiagonal(I, &d, &U);
-    VerifyMatrixEqual(I, U, 1e-15);
-    for (int k = 0; k < 5; ++k) {
-      ASSERT_NEAR(d.Row(0)[k], 1.0, 1e-15);
-    }
-  }
-  {
-    ImageD A = Identity<double>(5);
-    A.Row(0)[1] = A.Row(1)[0] = 2.0;
-    A.Row(2)[3] = A.Row(3)[2] = 2.0;
-    A.Row(3)[4] = A.Row(4)[3] = 2.0;
-    ImageD U, d;
-    ConvertToDiagonal(A, &d, &U);
-    VerifyOrthogonal(U, 1e-12);
-    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
-  }
-  Rng rng(0);
-  for (int N = 2; N < 100; ++N) {
-    ImageD A = RandomSymmetricMatrix(N, rng, -1.0, 1.0);
-    ImageD U, d;
-    ConvertToDiagonal(A, &d, &U);
-    VerifyOrthogonal(U, 1e-12);
-    VerifyMatrixEqual(A, MatMul(U, MatMul(Diagonal(d), Transpose(U))), 1e-12);
-  }
-}
-
-TEST(LinAlgTest, ComputeQRFactorization) {
-  {
-    ImageD I = Identity<double>(5);
-    ImageD Q, R;
-    ComputeQRFactorization(I, &Q, &R);
-    VerifyMatrixEqual(I, Q, 1e-15);
-    VerifyMatrixEqual(I, R, 1e-15);
-  }
-  Rng rng(0);
-  for (int N = 2; N < 100; ++N) {
-    ImageD A = RandomMatrix(N, N, rng, -1.0, 1.0);
-    ImageD Q, R;
-    ComputeQRFactorization(A, &Q, &R);
-    VerifyOrthogonal(Q, 1e-12);
-    VerifyUpperTriangular(R, 1e-12);
-    VerifyMatrixEqual(A, MatMul(Q, R), 1e-12);
-  }
-}
-
-}  // namespace
-}  // namespace jxl
index 1aec0f7..5afe876 100644 (file)
@@ -5,7 +5,6 @@
 
 #include "lib/jxl/loop_filter.h"
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/fields.h"
 
index 6250167..e4b418b 100644 (file)
 #include <stddef.h>
 #include <stdint.h>
 
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/field_encodings.h"
 
 namespace jxl {
index d5ce75a..7af4b2f 100644 (file)
@@ -5,17 +5,15 @@
 
 #include "lib/jxl/luminance.h"
 
-#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/image_metadata.h"
 
 namespace jxl {
 
-void SetIntensityTarget(CodecInOut* io) { SetIntensityTarget(&io->metadata.m); }
-
 void SetIntensityTarget(ImageMetadata* m) {
-  if (m->color_encoding.tf.IsPQ()) {
+  if (m->color_encoding.Tf().IsPQ()) {
     // Peak luminance of PQ as defined by SMPTE ST 2084:2014.
     m->SetIntensityTarget(10000);
-  } else if (m->color_encoding.tf.IsHLG()) {
+  } else if (m->color_encoding.Tf().IsHLG()) {
     // Nominal display peak luminance used as a reference by
     // Rec. ITU-R BT.2100-2.
     m->SetIntensityTarget(1000);
index 92f889a..3181576 100644 (file)
@@ -12,10 +12,9 @@ namespace jxl {
 // image, if known. For SDR images or images not known to be HDR, returns
 // kDefaultIntensityTarget, for images known to have PQ or HLG transfer function
 // returns a higher value.
-class CodecInOut;
-void SetIntensityTarget(CodecInOut* io);
 
 struct ImageMetadata;
+// TODO(eustas): rename
 void SetIntensityTarget(ImageMetadata* m);
 
 }  // namespace jxl
index b4a7890..f8a5cd8 100644 (file)
@@ -8,6 +8,7 @@
 
 // Memory allocator with support for alignment + misalignment.
 
+#include <jxl/memory_manager.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -16,7 +17,6 @@
 #include <atomic>
 #include <memory>
 
-#include "jxl/memory_manager.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 
index 914cd6a..4c3a33a 100644 (file)
@@ -65,7 +65,17 @@ struct State {
   const Header header;
 
   // Allows to approximate division by a number from 1 to 64.
-  uint32_t divlookup[64];
+  //  for (int i = 0; i < 64; i++) divlookup[i] = (1 << 24) / (i + 1);
+
+  const uint32_t divlookup[64] = {
+      16777216, 8388608, 5592405, 4194304, 3355443, 2796202, 2396745, 2097152,
+      1864135,  1677721, 1525201, 1398101, 1290555, 1198372, 1118481, 1048576,
+      986895,   932067,  883011,  838860,  798915,  762600,  729444,  699050,
+      671088,   645277,  621378,  599186,  578524,  559240,  541200,  524288,
+      508400,   493447,  479349,  466033,  453438,  441505,  430185,  419430,
+      409200,   399457,  390167,  381300,  372827,  364722,  356962,  349525,
+      342392,   335544,  328965,  322638,  316551,  310689,  305040,  299593,
+      294337,   289262,  284359,  279620,  275036,  270600,  266305,  262144};
 
   constexpr static pixel_type_w AddBits(pixel_type_w x) {
     return uint64_t(x) << kPredExtraBits;
@@ -78,10 +88,6 @@ struct State {
       pred_errors[i].resize((xsize + 2) * 2);
     }
     error.resize((xsize + 2) * 2);
-    // Initialize division lookup table.
-    for (int i = 0; i < 64; i++) {
-      divlookup[i] = (1 << 24) / (i + 1);
-    }
   }
 
   // Approximates 4+(maxweight<<24)/(x+1), avoiding division
@@ -282,15 +288,15 @@ struct FlatDecisionNode {
     PropertyVal splitval0;
     Predictor predictor;
   };
-  uint32_t childID;  // childID is ctx id if leaf.
   // Property+splitval of the two child nodes.
   union {
     PropertyVal splitvals[2];
     int32_t multiplier;
   };
+  uint32_t childID;  // childID is ctx id if leaf.
   union {
-    int32_t properties[2];
-    int64_t predictor_offset;
+    int16_t properties[2];
+    int32_t predictor_offset;
   };
 };
 using FlatTree = std::vector<FlatDecisionNode>;
@@ -301,22 +307,27 @@ class MATreeLookup {
   struct LookupResult {
     uint32_t context;
     Predictor predictor;
-    int64_t offset;
+    int32_t offset;
     int32_t multiplier;
   };
   JXL_INLINE LookupResult Lookup(const Properties &properties) const {
     uint32_t pos = 0;
     while (true) {
-      const FlatDecisionNode &node = nodes_[pos];
-      if (node.property0 < 0) {
-        return {node.childID, node.predictor, node.predictor_offset,
-                node.multiplier};
-      }
-      bool p0 = properties[node.property0] <= node.splitval0;
-      uint32_t off0 = properties[node.properties[0]] <= node.splitvals[0];
-      uint32_t off1 =
-          2 | (properties[node.properties[1]] <= node.splitvals[1] ? 1 : 0);
-      pos = node.childID + (p0 ? off1 : off0);
+#define TRAVERSE_THE_TREE                                                      \
+  {                                                                            \
+    const FlatDecisionNode &node = nodes_[pos];                                \
+    if (node.property0 < 0) {                                                  \
+      return {node.childID, node.predictor, node.predictor_offset,             \
+              node.multiplier};                                                \
+    }                                                                          \
+    bool p0 = properties[node.property0] <= node.splitval0;                    \
+    uint32_t off0 = properties[node.properties[0]] <= node.splitvals[0];       \
+    uint32_t off1 = 2 | (properties[node.properties[1]] <= node.splitvals[1]); \
+    pos = node.childID + (p0 ? off1 : off0);                                   \
+  }
+
+      TRAVERSE_THE_TREE;
+      TRAVERSE_THE_TREE;
     }
   }
 
@@ -485,8 +496,8 @@ JXL_INLINE PredictionResult Predict(
     // location
     (*p)[offset++] = x;
     // neighbors
-    (*p)[offset++] = std::abs(top);
-    (*p)[offset++] = std::abs(left);
+    (*p)[offset++] = top > 0 ? top : -top;
+    (*p)[offset++] = left > 0 ? left : -left;
     (*p)[offset++] = top;
     (*p)[offset++] = left;
 
@@ -589,6 +600,18 @@ inline PredictionResult PredictTreeWP(Properties *p, size_t w,
       p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
       wp_state, /*predictions=*/nullptr);
 }
+JXL_INLINE PredictionResult PredictTreeWPNEC(Properties *p, size_t w,
+                                             const pixel_type *JXL_RESTRICT pp,
+                                             const intptr_t onerow, const int x,
+                                             const int y,
+                                             const MATreeLookup &tree_lookup,
+                                             const Channel &references,
+                                             weighted::State *wp_state) {
+  return detail::Predict<detail::kUseTree | detail::kUseWP |
+                         detail::kNoEdgeCases>(
+      p, w, pp, onerow, x, y, Predictor::Zero, &tree_lookup, &references,
+      wp_state, /*predictions=*/nullptr);
+}
 
 inline PredictionResult PredictLearn(Properties *p, size_t w,
                                      const pixel_type *JXL_RESTRICT pp,
@@ -612,6 +635,29 @@ inline void PredictLearnAll(Properties *p, size_t w,
       p, w, pp, onerow, x, y, Predictor::Zero,
       /*lookup=*/nullptr, &references, wp_state, predictions);
 }
+inline PredictionResult PredictLearnNEC(Properties *p, size_t w,
+                                        const pixel_type *JXL_RESTRICT pp,
+                                        const intptr_t onerow, const int x,
+                                        const int y, Predictor predictor,
+                                        const Channel &references,
+                                        weighted::State *wp_state) {
+  return detail::Predict<detail::kForceComputeProperties | detail::kUseWP |
+                         detail::kNoEdgeCases>(
+      p, w, pp, onerow, x, y, predictor, /*lookup=*/nullptr, &references,
+      wp_state, /*predictions=*/nullptr);
+}
+
+inline void PredictLearnAllNEC(Properties *p, size_t w,
+                               const pixel_type *JXL_RESTRICT pp,
+                               const intptr_t onerow, const int x, const int y,
+                               const Channel &references,
+                               weighted::State *wp_state,
+                               pixel_type_w *predictions) {
+  detail::Predict<detail::kForceComputeProperties | detail::kUseWP |
+                  detail::kAllPredictions | detail::kNoEdgeCases>(
+      p, w, pp, onerow, x, y, Predictor::Zero,
+      /*lookup=*/nullptr, &references, wp_state, predictions);
+}
 
 inline void PredictAllNoWP(size_t w, const pixel_type *JXL_RESTRICT pp,
                            const intptr_t onerow, const int x, const int y,
index 66562f7..5996877 100644 (file)
@@ -9,6 +9,7 @@
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/modular/encoding/ma_common.h"
 #include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/pack_signed.h"
 
 namespace jxl {
 
index eeed2ae..828c9de 100644 (file)
 #include <unordered_map>
 #include <unordered_set>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/modular/encoding/ma_common.h"
 #include "lib/jxl/modular/options.h"
 #include "lib/jxl/modular/transform/transform.h"
+#include "lib/jxl/pack_signed.h"
 #include "lib/jxl/toc.h"
 
 namespace jxl {
 
 namespace {
 // Plot tree (if enabled) and predictor usage map.
-constexpr bool kWantDebug = false;
-constexpr bool kPrintTree = false;
+constexpr bool kWantDebug = true;
+// constexpr bool kPrintTree = false;
 
 inline std::array<uint8_t, 3> PredictorColor(Predictor p) {
   switch (p) {
@@ -105,29 +108,62 @@ void GatherTreeData(const Image &image, pixel_type chan, size_t group_id,
   Channel references(properties.size() - kNumNonrefProperties, channel.w);
   weighted::State wp_state(wp_header, channel.w, channel.h);
   tree_samples.PrepareForSamples(pixel_fraction * channel.h * channel.w + 64);
+  const bool multiple_predictors = tree_samples.NumPredictors() != 1;
+  auto compute_sample = [&](const pixel_type *p, size_t x, size_t y) {
+    pixel_type_w pred[kNumModularPredictors];
+    if (multiple_predictors) {
+      PredictLearnAll(&properties, channel.w, p + x, onerow, x, y, references,
+                      &wp_state, pred);
+    } else {
+      pred[static_cast<int>(tree_samples.PredictorFromIndex(0))] =
+          PredictLearn(&properties, channel.w, p + x, onerow, x, y,
+                       tree_samples.PredictorFromIndex(0), references,
+                       &wp_state)
+              .guess;
+    }
+    (*total_pixels)++;
+    if (use_sample()) {
+      tree_samples.AddSample(p[x], properties, pred);
+    }
+    wp_state.UpdateErrors(p[x], x, y, channel.w);
+  };
+
   for (size_t y = 0; y < channel.h; y++) {
     const pixel_type *JXL_RESTRICT p = channel.Row(y);
     PrecomputeReferences(channel, y, image, chan, &references);
     InitPropsRow(&properties, static_props, y);
+
     // TODO(veluca): avoid computing WP if we don't use its property or
     // predictions.
-    for (size_t x = 0; x < channel.w; x++) {
-      pixel_type_w pred[kNumModularPredictors];
-      if (tree_samples.NumPredictors() != 1) {
-        PredictLearnAll(&properties, channel.w, p + x, onerow, x, y, references,
-                        &wp_state, pred);
-      } else {
-        pred[static_cast<int>(tree_samples.PredictorFromIndex(0))] =
-            PredictLearn(&properties, channel.w, p + x, onerow, x, y,
-                         tree_samples.PredictorFromIndex(0), references,
-                         &wp_state)
-                .guess;
+    if (y > 1 && channel.w > 8 && references.w == 0) {
+      for (size_t x = 0; x < 2; x++) {
+        compute_sample(p, x, y);
       }
-      (*total_pixels)++;
-      if (use_sample()) {
-        tree_samples.AddSample(p[x], properties, pred);
+      for (size_t x = 2; x < channel.w - 2; x++) {
+        pixel_type_w pred[kNumModularPredictors];
+        if (multiple_predictors) {
+          PredictLearnAllNEC(&properties, channel.w, p + x, onerow, x, y,
+                             references, &wp_state, pred);
+        } else {
+          pred[static_cast<int>(tree_samples.PredictorFromIndex(0))] =
+              PredictLearnNEC(&properties, channel.w, p + x, onerow, x, y,
+                              tree_samples.PredictorFromIndex(0), references,
+                              &wp_state)
+                  .guess;
+        }
+        (*total_pixels)++;
+        if (use_sample()) {
+          tree_samples.AddSample(p[x], properties, pred);
+        }
+        wp_state.UpdateErrors(p[x], x, y, channel.w);
+      }
+      for (size_t x = channel.w - 2; x < channel.w; x++) {
+        compute_sample(p, x, y);
+      }
+    } else {
+      for (size_t x = 0; x < channel.w; x++) {
+        compute_sample(p, x, y);
       }
-      wp_state.UpdateErrors(p[x], x, y, channel.w);
     }
   }
 }
@@ -193,13 +229,12 @@ Status EncodeModularChannelMAANS(const Image &image, pixel_type chan,
   // Check if this tree is a WP-only tree with a small enough property value
   // range.
   // Initialized to avoid clang-tidy complaining.
-  uint16_t context_lookup[2 * kPropRangeFast] = {};
-  int8_t offsets[2 * kPropRangeFast] = {};
+  auto tree_lut = jxl::make_unique<TreeLut<uint16_t, false>>();
   if (is_wp_only) {
-    is_wp_only = TreeToLookupTable(tree, context_lookup, offsets);
+    is_wp_only = TreeToLookupTable(tree, *tree_lut);
   }
   if (is_gradient_only) {
-    is_gradient_only = TreeToLookupTable(tree, context_lookup, offsets);
+    is_gradient_only = TreeToLookupTable(tree, *tree_lut);
   }
 
   if (is_wp_only && !skip_encoder_fast_path) {
@@ -226,8 +261,8 @@ Status EncodeModularChannelMAANS(const Image &image, pixel_type chan,
         uint32_t pos =
             kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
                                       kPropRangeFast - 1);
-        uint32_t ctx_id = context_lookup[pos];
-        int32_t residual = r[x] - guess - offsets[pos];
+        uint32_t ctx_id = tree_lut->context_lookup[pos];
+        int32_t residual = r[x] - guess - tree_lut->offsets[pos];
         *tokenp++ = Token(ctx_id, PackSigned(residual));
         wp_state.UpdateErrors(r[x], x, y, channel.w);
       }
@@ -269,8 +304,8 @@ Status EncodeModularChannelMAANS(const Image &image, pixel_type chan,
             std::min<pixel_type_w>(
                 std::max<pixel_type_w>(-kPropRangeFast, top + left - topleft),
                 kPropRangeFast - 1);
-        uint32_t ctx_id = context_lookup[pos];
-        int32_t residual = r[x] - guess - offsets[pos];
+        uint32_t ctx_id = tree_lut->context_lookup[pos];
+        int32_t residual = r[x] - guess - tree_lut->offsets[pos];
         *tokenp++ = Token(ctx_id, PackSigned(residual));
       }
     }
@@ -331,7 +366,7 @@ Status EncodeModularChannelMAANS(const Image &image, pixel_type chan,
           }
         }
         pixel_type_w residual = p[x] - res.guess;
-        JXL_ASSERT(residual % res.multiplier == 0);
+        JXL_DASSERT(residual % res.multiplier == 0);
         *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier));
       }
     }
@@ -359,17 +394,20 @@ Status EncodeModularChannelMAANS(const Image &image, pixel_type chan,
           }
         }
         pixel_type_w residual = p[x] - res.guess;
-        JXL_ASSERT(residual % res.multiplier == 0);
+        JXL_DASSERT(residual % res.multiplier == 0);
         *tokenp++ = Token(res.context, PackSigned(residual / res.multiplier));
         wp_state.UpdateErrors(p[x], x, y, channel.w);
       }
     }
   }
-  if (kWantDebug && WantDebugOutput(aux_out)) {
-    aux_out->DumpImage(
+  /* TODO(szabadka): Add cparams to the call stack here.
+  if (kWantDebug && WantDebugOutput(cparams)) {
+    DumpImage(
+        cparams,
         ("pred_" + ToString(group_id) + "_" + ToString(chan)).c_str(),
         predictor_img);
   }
+  */
   *tokenpp = tokenp;
   return true;
 }
@@ -423,11 +461,12 @@ Status ModularEncode(const Image &image, const ModularOptions &options,
       std::vector<uint32_t> channel_pixel_count;
       CollectPixelSamples(image, options, 0, group_pixel_count,
                           channel_pixel_count, pixel_samples, diff_samples);
-      std::vector<ModularMultiplierInfo> dummy_multiplier_info;
+      std::vector<ModularMultiplierInfo> placeholder_multiplier_info;
       StaticPropRange range;
       tree_samples_storage.PreQuantizeProperties(
-          range, dummy_multiplier_info, group_pixel_count, channel_pixel_count,
-          pixel_samples, diff_samples, options.max_property_values);
+          range, placeholder_multiplier_info, group_pixel_count,
+          channel_pixel_count, pixel_samples, diff_samples,
+          options.max_property_values);
     }
     for (size_t i = 0; i < nb_channels; i++) {
       if (!image.channel[i].w || !image.channel[i].h) {
@@ -465,9 +504,11 @@ Status ModularEncode(const Image &image, const ModularOptions &options,
     JXL_ASSERT(tree->size() == decoded_tree.size());
     tree_storage = std::move(decoded_tree);
 
+    /* TODO(szabadka) Add text output callback
     if (kWantDebug && kPrintTree && WantDebugOutput(aux_out)) {
       PrintTree(*tree, aux_out->debug_prefix + "/tree_" + ToString(group_id));
-    }
+    } */
+
     // Write tree
     BuildAndEncodeHistograms(HistogramParams(), kNumTreeContexts, tree_tokens,
                              &code, &context_map, writer, kLayerModularTree,
index 8491c93..4eba457 100644 (file)
@@ -6,28 +6,21 @@
 #ifndef LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
 #define LIB_JXL_MODULAR_ENCODING_ENC_ENCODING_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
+#include <cstddef>
 #include <vector>
 
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/span.h"
-#include "lib/jxl/dec_ans.h"
 #include "lib/jxl/enc_ans.h"
 #include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/modular/encoding/dec_ma.h"
 #include "lib/jxl/modular/encoding/enc_ma.h"
-#include "lib/jxl/modular/encoding/encoding.h"
-#include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/options.h"
-#include "lib/jxl/modular/transform/transform.h"
 
 namespace jxl {
 
+struct AuxOut;
+struct GroupHeader;
+
 Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
                const ModularOptions &options,
                const std::vector<ModularMultiplierInfo> &multiplier_info = {},
index 90b11ba..ef72b24 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
+#include "lib/jxl/base/fast_math-inl.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/enc_ans.h"
-#include "lib/jxl/fast_math-inl.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
 #include "lib/jxl/modular/options.h"
+#include "lib/jxl/pack_signed.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -32,38 +33,29 @@ namespace HWY_NAMESPACE {
 using hwy::HWY_NAMESPACE::Eq;
 using hwy::HWY_NAMESPACE::IfThenElse;
 using hwy::HWY_NAMESPACE::Lt;
+using hwy::HWY_NAMESPACE::Max;
 
 const HWY_FULL(float) df;
 const HWY_FULL(int32_t) di;
 size_t Padded(size_t x) { return RoundUpTo(x, Lanes(df)); }
 
-float EstimateBits(const int32_t *counts, int32_t *rounded_counts,
-                   size_t num_symbols) {
-  // Try to approximate the effect of rounding up nonzero probabilities.
+// Compute entropy of the histogram, taking into account the minimum probability
+// for symbols with non-zero counts.
+float EstimateBits(const int32_t *counts, size_t num_symbols) {
   int32_t total = std::accumulate(counts, counts + num_symbols, 0);
-  const auto min = Set(di, (total + ANS_TAB_SIZE - 1) >> ANS_LOG_TAB_SIZE);
-  const auto zero_i = Zero(di);
-  for (size_t i = 0; i < num_symbols; i += Lanes(df)) {
-    auto counts_v = LoadU(di, &counts[i]);
-    counts_v = IfThenElse(Eq(counts_v, zero_i), zero_i,
-                          IfThenElse(Lt(counts_v, min), min, counts_v));
-    StoreU(counts_v, di, &rounded_counts[i]);
-  }
-  // Compute entropy of the "rounded" probabilities.
   const auto zero = Zero(df);
-  const size_t total_scalar =
-      std::accumulate(rounded_counts, rounded_counts + num_symbols, 0);
-  const auto inv_total = Set(df, 1.0f / total_scalar);
+  const auto minprob = Set(df, 1.0f / ANS_TAB_SIZE);
+  const auto inv_total = Set(df, 1.0f / total);
   auto bits_lanes = Zero(df);
-  auto total_v = Set(di, total_scalar);
+  auto total_v = Set(di, total);
   for (size_t i = 0; i < num_symbols; i += Lanes(df)) {
-    const auto counts_v = ConvertTo(df, LoadU(di, &counts[i]));
-    const auto round_counts_v = LoadU(di, &rounded_counts[i]);
-    const auto probs = Mul(ConvertTo(df, round_counts_v), inv_total);
-    const auto nbps = IfThenElse(Eq(round_counts_v, total_v), BitCast(di, zero),
-                                 BitCast(di, FastLog2f(df, probs)));
-    bits_lanes = Sub(bits_lanes, IfThenElse(Eq(counts_v, zero), zero,
-                                            Mul(counts_v, BitCast(df, nbps))));
+    const auto counts_iv = LoadU(di, &counts[i]);
+    const auto counts_fv = ConvertTo(df, counts_iv);
+    const auto probs = Mul(counts_fv, inv_total);
+    const auto mprobs = Max(probs, minprob);
+    const auto nbps = IfThenElse(Eq(counts_iv, total_v), BitCast(di, zero),
+                                 BitCast(di, FastLog2f(df, mprobs)));
+    bits_lanes = Sub(bits_lanes, Mul(counts_fv, BitCast(df, nbps)));
   }
   return GetLane(SumOfLanes(df, bits_lanes));
 }
@@ -225,7 +217,6 @@ void FindBestSplit(TreeSamples &tree_samples, float threshold,
       }
     }
     max_symbols = Padded(max_symbols);
-    std::vector<int32_t> rounded_counts(max_symbols);
     std::vector<int32_t> counts(max_symbols * num_predictors);
     std::vector<uint32_t> tot_extra_bits(num_predictors);
     for (size_t pred = 0; pred < num_predictors; pred++) {
@@ -240,9 +231,9 @@ void FindBestSplit(TreeSamples &tree_samples, float threshold,
     float base_bits;
     {
       size_t pred = tree_samples.PredictorIndex((*tree)[pos].predictor);
-      base_bits = EstimateBits(counts.data() + pred * max_symbols,
-                               rounded_counts.data(), max_symbols) +
-                  tot_extra_bits[pred];
+      base_bits =
+          EstimateBits(counts.data() + pred * max_symbols, max_symbols) +
+          tot_extra_bits[pred];
     }
 
     SplitInfo *best = &best_split_nonstatic;
@@ -353,11 +344,9 @@ void FindBestSplit(TreeSamples &tree_samples, float threshold,
               counts_below[sym] += count_increase[i * max_symbols + sym];
               count_increase[i * max_symbols + sym] = 0;
             }
-            float rcost = EstimateBits(counts_above.data(),
-                                       rounded_counts.data(), max_symbols) +
+            float rcost = EstimateBits(counts_above.data(), max_symbols) +
                           tot_extra_bits[pred] - extra_bits_below;
-            float lcost = EstimateBits(counts_below.data(),
-                                       rounded_counts.data(), max_symbols) +
+            float lcost = EstimateBits(counts_below.data(), max_symbols) +
                           extra_bits_below;
             JXL_DASSERT(extra_bits_below <= tot_extra_bits[pred]);
             float penalty = 0;
@@ -733,14 +722,14 @@ std::vector<int32_t> QuantizeHistogram(const std::vector<uint32_t> &histogram,
   // TODO(veluca): selecting distinct quantiles is likely not the best
   // way to go about this.
   std::vector<int32_t> thresholds;
-  size_t sum = std::accumulate(histogram.begin(), histogram.end(), 0LU);
-  size_t cumsum = 0;
-  size_t threshold = 0;
+  uint64_t sum = std::accumulate(histogram.begin(), histogram.end(), 0LU);
+  uint64_t cumsum = 0;
+  uint64_t threshold = 1;
   for (size_t i = 0; i + 1 < histogram.size(); i++) {
     cumsum += histogram[i];
-    if (cumsum > (threshold + 1) * sum / num_chunks) {
+    if (cumsum >= threshold * sum / num_chunks) {
       thresholds.push_back(i);
-      while (cumsum >= (threshold + 1) * sum / num_chunks) threshold++;
+      while (cumsum > threshold * sum / num_chunks) threshold++;
     }
   }
   return thresholds;
@@ -933,7 +922,7 @@ void CollectPixelSamples(const Image &image, const ModularOptions &options,
   Rng rng(group_id);
   // Sample 10% of the final number of samples for property quantization.
   float fraction = std::min(options.nb_repeats * 0.1, 0.99);
-  Rng::GeometricDistribution dist(fraction);
+  Rng::GeometricDistribution dist = Rng::MakeGeometric(fraction);
   size_t total_pixels = 0;
   std::vector<size_t> channel_ids;
   for (size_t i = 0; i < image.channel.size(); i++) {
@@ -977,7 +966,7 @@ void CollectPixelSamples(const Image &image, const ModularOptions &options,
     const pixel_type *row = image.channel[channel_ids[i]].Row(y);
     pixel_samples.push_back(row[x]);
     size_t xp = x == 0 ? 1 : x - 1;
-    diff_samples.push_back(row[x] - row[xp]);
+    diff_samples.push_back((int64_t)row[x] - row[xp]);
   }
 }
 
index 9d2c3e5..a6abdcf 100644 (file)
 
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/scope_guard.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
 #include "lib/jxl/modular/options.h"
+#include "lib/jxl/pack_signed.h"
 
 namespace jxl {
 
@@ -92,7 +95,8 @@ FlatTree FilterTree(const Tree &global_tree,
           cur_child = global_tree[cur_child].rchild;
         }
       }
-      // We ended up in a leaf, add a dummy decision and two copies of the leaf.
+      // We ended up in a leaf, add a placeholder decision and two copies of the
+      // leaf.
       if (global_tree[cur_child].property == -1) {
         flat.properties[i] = 0;
         flat.splitvals[i] = 0;
@@ -125,11 +129,14 @@ FlatTree FilterTree(const Tree &global_tree,
   return output;
 }
 
+namespace detail {
+template <bool uses_lz77>
 Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
                                  const std::vector<uint8_t> &context_map,
                                  const Tree &global_tree,
                                  const weighted::Header &wp_header,
                                  pixel_type chan, size_t group_id,
+                                 TreeLut<uint8_t, true> &tree_lut,
                                  Image *image) {
   Channel &channel = image->channel[chan];
 
@@ -192,7 +199,8 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
           for (size_t y = 0; y < channel.h; y++) {
             pixel_type *JXL_RESTRICT r = channel.Row(y);
             for (size_t x = 0; x < channel.w; x++) {
-              uint32_t v = reader->ReadHybridUintClustered(ctx_id, br);
+              uint32_t v =
+                  reader->ReadHybridUintClusteredInlined<uses_lz77>(ctx_id, br);
               r[x] = UnpackSigned(v);
             }
           }
@@ -200,13 +208,16 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
           for (size_t y = 0; y < channel.h; y++) {
             pixel_type *JXL_RESTRICT r = channel.Row(y);
             for (size_t x = 0; x < channel.w; x++) {
-              uint32_t v = reader->ReadHybridUintClustered(ctx_id, br);
+              uint32_t v =
+                  reader->ReadHybridUintClusteredMaybeInlined<uses_lz77>(ctx_id,
+                                                                         br);
               r[x] = make_pixel(v, multiplier, offset);
             }
           }
         }
       }
-    } else if (predictor == Predictor::Gradient && offset == 0 &&
+      return true;
+    } else if (uses_lz77 && predictor == Predictor::Gradient && offset == 0 &&
                multiplier == 1 && reader->HuffRleOnly()) {
       JXL_DEBUG_V(8, "Gradient RLE (fjxl) very fast track.");
       uint32_t run = 0;
@@ -239,6 +250,7 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
           r[x] = sv + guess;
         }
       }
+      return true;
     } else if (predictor == Predictor::Gradient && offset == 0 &&
                multiplier == 1) {
       JXL_DEBUG_V(8, "Gradient very fast track.");
@@ -250,57 +262,22 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
           pixel_type top = (y ? *(r + x - onerow) : left);
           pixel_type topleft = (x && y ? *(r + x - 1 - onerow) : left);
           pixel_type guess = ClampedGradient(top, left, topleft);
-          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
+          uint64_t v = reader->ReadHybridUintClusteredMaybeInlined<uses_lz77>(
+              ctx_id, br);
           r[x] = make_pixel(v, 1, guess);
         }
       }
-    } else if (predictor != Predictor::Weighted) {
-      // special optimized case: no wp
-      JXL_DEBUG_V(8, "Quite fast track.");
-      const intptr_t onerow = channel.plane.PixelsPerRow();
-      for (size_t y = 0; y < channel.h; y++) {
-        pixel_type *JXL_RESTRICT r = channel.Row(y);
-        for (size_t x = 0; x < channel.w; x++) {
-          PredictionResult pred =
-              PredictNoTreeNoWP(channel.w, r + x, onerow, x, y, predictor);
-          pixel_type_w g = pred.guess + offset;
-          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
-          // NOTE: pred.multiplier is unset.
-          r[x] = make_pixel(v, multiplier, g);
-        }
-      }
-    } else {
-      JXL_DEBUG_V(8, "Somewhat fast track.");
-      const intptr_t onerow = channel.plane.PixelsPerRow();
-      weighted::State wp_state(wp_header, channel.w, channel.h);
-      for (size_t y = 0; y < channel.h; y++) {
-        pixel_type *JXL_RESTRICT r = channel.Row(y);
-        for (size_t x = 0; x < channel.w; x++) {
-          pixel_type_w g = PredictNoTreeWP(channel.w, r + x, onerow, x, y,
-                                           predictor, &wp_state)
-                               .guess +
-                           offset;
-          uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
-          r[x] = make_pixel(v, multiplier, g);
-          wp_state.UpdateErrors(r[x], x, y, channel.w);
-        }
-      }
+      return true;
     }
-    return true;
   }
 
   // Check if this tree is a WP-only tree with a small enough property value
   // range.
-  // Initialized to avoid clang-tidy complaining.
-  uint8_t context_lookup[2 * kPropRangeFast] = {};
-  int8_t multipliers[2 * kPropRangeFast] = {};
-  int8_t offsets[2 * kPropRangeFast] = {};
   if (is_wp_only) {
-    is_wp_only = TreeToLookupTable(tree, context_lookup, offsets, multipliers);
+    is_wp_only = TreeToLookupTable(tree, tree_lut);
   }
   if (is_gradient_only) {
-    is_gradient_only =
-        TreeToLookupTable(tree, context_lookup, offsets, multipliers);
+    is_gradient_only = TreeToLookupTable(tree, tree_lut);
   }
 
   if (is_gradient_only) {
@@ -318,37 +295,77 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
             std::min<pixel_type_w>(
                 std::max<pixel_type_w>(-kPropRangeFast, top + left - topleft),
                 kPropRangeFast - 1);
-        uint32_t ctx_id = context_lookup[pos];
-        uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
-        r[x] = make_pixel(v, multipliers[pos],
-                          static_cast<pixel_type_w>(offsets[pos]) + guess);
+        uint32_t ctx_id = tree_lut.context_lookup[pos];
+        uint64_t v =
+            reader->ReadHybridUintClusteredMaybeInlined<uses_lz77>(ctx_id, br);
+        r[x] = make_pixel(
+            v, tree_lut.multipliers[pos],
+            static_cast<pixel_type_w>(tree_lut.offsets[pos]) + guess);
       }
     }
-  } else if (is_wp_only) {
+  } else if (!uses_lz77 && is_wp_only && channel.w > 8) {
     JXL_DEBUG_V(8, "WP fast track.");
-    const intptr_t onerow = channel.plane.PixelsPerRow();
     weighted::State wp_state(wp_header, channel.w, channel.h);
     Properties properties(1);
     for (size_t y = 0; y < channel.h; y++) {
       pixel_type *JXL_RESTRICT r = channel.Row(y);
-      for (size_t x = 0; x < channel.w; x++) {
+      const pixel_type *JXL_RESTRICT rtop = (y ? channel.Row(y - 1) : r - 1);
+      const pixel_type *JXL_RESTRICT rtoptop =
+          (y > 1 ? channel.Row(y - 2) : rtop);
+      const pixel_type *JXL_RESTRICT rtopleft =
+          (y ? channel.Row(y - 1) - 1 : r - 1);
+      const pixel_type *JXL_RESTRICT rtopright =
+          (y ? channel.Row(y - 1) + 1 : r - 1);
+      size_t x = 0;
+      {
         size_t offset = 0;
-        pixel_type_w left = (x ? r[x - 1] : y ? *(r + x - onerow) : 0);
-        pixel_type_w top = (y ? *(r + x - onerow) : left);
-        pixel_type_w topleft = (x && y ? *(r + x - 1 - onerow) : left);
-        pixel_type_w topright =
-            (x + 1 < channel.w && y ? *(r + x + 1 - onerow) : top);
-        pixel_type_w toptop = (y > 1 ? *(r + x - onerow - onerow) : top);
+        pixel_type_w left = y ? rtop[x] : 0;
+        pixel_type_w toptop = y ? rtoptop[x] : 0;
+        pixel_type_w topright = (x + 1 < channel.w && y ? rtop[x + 1] : left);
         int32_t guess = wp_state.Predict</*compute_properties=*/true>(
-            x, y, channel.w, top, left, topright, topleft, toptop, &properties,
+            x, y, channel.w, left, left, topright, left, toptop, &properties,
             offset);
         uint32_t pos =
             kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
                                       kPropRangeFast - 1);
-        uint32_t ctx_id = context_lookup[pos];
-        uint64_t v = reader->ReadHybridUintClustered(ctx_id, br);
-        r[x] = make_pixel(v, multipliers[pos],
-                          static_cast<pixel_type_w>(offsets[pos]) + guess);
+        uint32_t ctx_id = tree_lut.context_lookup[pos];
+        uint64_t v =
+            reader->ReadHybridUintClusteredInlined<uses_lz77>(ctx_id, br);
+        r[x] = make_pixel(
+            v, tree_lut.multipliers[pos],
+            static_cast<pixel_type_w>(tree_lut.offsets[pos]) + guess);
+        wp_state.UpdateErrors(r[x], x, y, channel.w);
+      }
+      for (x = 1; x + 1 < channel.w; x++) {
+        size_t offset = 0;
+        int32_t guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, channel.w, rtop[x], r[x - 1], rtopright[x], rtopleft[x],
+            rtoptop[x], &properties, offset);
+        uint32_t pos =
+            kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
+                                      kPropRangeFast - 1);
+        uint32_t ctx_id = tree_lut.context_lookup[pos];
+        uint64_t v =
+            reader->ReadHybridUintClusteredInlined<uses_lz77>(ctx_id, br);
+        r[x] = make_pixel(
+            v, tree_lut.multipliers[pos],
+            static_cast<pixel_type_w>(tree_lut.offsets[pos]) + guess);
+        wp_state.UpdateErrors(r[x], x, y, channel.w);
+      }
+      {
+        size_t offset = 0;
+        int32_t guess = wp_state.Predict</*compute_properties=*/true>(
+            x, y, channel.w, rtop[x], r[x - 1], rtop[x], rtopleft[x],
+            rtoptop[x], &properties, offset);
+        uint32_t pos =
+            kPropRangeFast + std::min(std::max(-kPropRangeFast, properties[0]),
+                                      kPropRangeFast - 1);
+        uint32_t ctx_id = tree_lut.context_lookup[pos];
+        uint64_t v =
+            reader->ReadHybridUintClusteredInlined<uses_lz77>(ctx_id, br);
+        r[x] = make_pixel(
+            v, tree_lut.multipliers[pos],
+            static_cast<pixel_type_w>(tree_lut.offsets[pos]) + guess);
         wp_state.UpdateErrors(r[x], x, y, channel.w);
       }
     }
@@ -369,21 +386,24 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
           PredictionResult res =
               PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
                               tree_lookup, references);
-          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          uint64_t v =
+              reader->ReadHybridUintClustered<uses_lz77>(res.context, br);
           p[x] = make_pixel(v, res.multiplier, res.guess);
         }
         for (size_t x = 2; x < channel.w - 2; x++) {
           PredictionResult res =
               PredictTreeNoWPNEC(&properties, channel.w, p + x, onerow, x, y,
                                  tree_lookup, references);
-          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          uint64_t v = reader->ReadHybridUintClusteredInlined<uses_lz77>(
+              res.context, br);
           p[x] = make_pixel(v, res.multiplier, res.guess);
         }
         for (size_t x = channel.w - 2; x < channel.w; x++) {
           PredictionResult res =
               PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
                               tree_lookup, references);
-          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          uint64_t v =
+              reader->ReadHybridUintClustered<uses_lz77>(res.context, br);
           p[x] = make_pixel(v, res.multiplier, res.guess);
         }
       } else {
@@ -391,7 +411,8 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
           PredictionResult res =
               PredictTreeNoWP(&properties, channel.w, p + x, onerow, x, y,
                               tree_lookup, references);
-          uint64_t v = reader->ReadHybridUintClustered(res.context, br);
+          uint64_t v = reader->ReadHybridUintClusteredMaybeInlined<uses_lz77>(
+              res.context, br);
           p[x] = make_pixel(v, res.multiplier, res.guess);
         }
       }
@@ -407,18 +428,68 @@ Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
       pixel_type *JXL_RESTRICT p = channel.Row(y);
       InitPropsRow(&properties, static_props, y);
       PrecomputeReferences(channel, y, *image, chan, &references);
-      for (size_t x = 0; x < channel.w; x++) {
-        PredictionResult res =
-            PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
-                          tree_lookup, references, &wp_state);
-        uint64_t v = reader->ReadHybridUintClustered(res.context, br);
-        p[x] = make_pixel(v, res.multiplier, res.guess);
-        wp_state.UpdateErrors(p[x], x, y, channel.w);
+      if (!uses_lz77 && y > 1 && channel.w > 8 && references.w == 0) {
+        for (size_t x = 0; x < 2; x++) {
+          PredictionResult res =
+              PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
+                            tree_lookup, references, &wp_state);
+          uint64_t v =
+              reader->ReadHybridUintClustered<uses_lz77>(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+          wp_state.UpdateErrors(p[x], x, y, channel.w);
+        }
+        for (size_t x = 2; x < channel.w - 2; x++) {
+          PredictionResult res =
+              PredictTreeWPNEC(&properties, channel.w, p + x, onerow, x, y,
+                               tree_lookup, references, &wp_state);
+          uint64_t v = reader->ReadHybridUintClusteredInlined<uses_lz77>(
+              res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+          wp_state.UpdateErrors(p[x], x, y, channel.w);
+        }
+        for (size_t x = channel.w - 2; x < channel.w; x++) {
+          PredictionResult res =
+              PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
+                            tree_lookup, references, &wp_state);
+          uint64_t v =
+              reader->ReadHybridUintClustered<uses_lz77>(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+          wp_state.UpdateErrors(p[x], x, y, channel.w);
+        }
+      } else {
+        for (size_t x = 0; x < channel.w; x++) {
+          PredictionResult res =
+              PredictTreeWP(&properties, channel.w, p + x, onerow, x, y,
+                            tree_lookup, references, &wp_state);
+          uint64_t v =
+              reader->ReadHybridUintClustered<uses_lz77>(res.context, br);
+          p[x] = make_pixel(v, res.multiplier, res.guess);
+          wp_state.UpdateErrors(p[x], x, y, channel.w);
+        }
       }
     }
   }
   return true;
 }
+}  // namespace detail
+
+Status DecodeModularChannelMAANS(BitReader *br, ANSSymbolReader *reader,
+                                 const std::vector<uint8_t> &context_map,
+                                 const Tree &global_tree,
+                                 const weighted::Header &wp_header,
+                                 pixel_type chan, size_t group_id,
+                                 TreeLut<uint8_t, true> &tree_lut,
+                                 Image *image) {
+  if (reader->UsesLZ77()) {
+    return detail::DecodeModularChannelMAANS</*uses_lz77=*/true>(
+        br, reader, context_map, global_tree, wp_header, chan, group_id,
+        tree_lut, image);
+  } else {
+    return detail::DecodeModularChannelMAANS</*uses_lz77=*/false>(
+        br, reader, context_map, global_tree, wp_header, chan, group_id,
+        tree_lut, image);
+  }
+}
 
 GroupHeader::GroupHeader() { Bundle::Init(this); }
 
@@ -450,7 +521,7 @@ Status ModularDecode(BitReader *br, Image &image, GroupHeader &header,
                      size_t group_id, ModularOptions *options,
                      const Tree *global_tree, const ANSCode *global_code,
                      const std::vector<uint8_t> *global_ctx_map,
-                     bool allow_truncated_group) {
+                     const bool allow_truncated_group) {
   if (image.channel.empty()) return true;
 
   // decode transforms
@@ -500,12 +571,12 @@ Status ModularDecode(BitReader *br, Image &image, GroupHeader &header,
 
   size_t next_channel = 0;
   auto scope_guard = MakeScopeGuard([&]() {
-    // Do not do anything if truncated groups are not allowed.
-    if (!allow_truncated_group) return;
-    for (size_t c = next_channel; c < nb_channels; c++) {
+    for (size_t c = next_channel; c < image.channel.size(); c++) {
       ZeroFillImage(&image.channel[c].plane);
     }
   });
+  // Do not do anything if truncated groups are not allowed.
+  if (allow_truncated_group) scope_guard.Disarm();
 
   // Read tree.
   Tree tree_storage;
@@ -515,24 +586,17 @@ Status ModularDecode(BitReader *br, Image &image, GroupHeader &header,
   const ANSCode *code = &code_storage;
   const std::vector<uint8_t> *context_map = &context_map_storage;
   if (!header.use_global_tree) {
-    size_t max_tree_size = 1024;
+    uint64_t max_tree_size = 1024;
     for (size_t i = 0; i < nb_channels; i++) {
       Channel &channel = image.channel[i];
-      if (!channel.w || !channel.h) {
-        continue;  // skip empty channels
-      }
       if (i >= image.nb_meta_channels && (channel.w > options->max_chan_size ||
                                           channel.h > options->max_chan_size)) {
         break;
       }
-      size_t pixels = channel.w * channel.h;
-      if (pixels / channel.w != channel.h) {
-        return JXL_FAILURE("Tree size overflow");
-      }
+      uint64_t pixels = channel.w * channel.h;
       max_tree_size += pixels;
-      if (max_tree_size < pixels) return JXL_FAILURE("Tree size overflow");
     }
-    max_tree_size = std::min(static_cast<size_t>(1 << 20), max_tree_size);
+    max_tree_size = std::min(static_cast<uint64_t>(1 << 20), max_tree_size);
     JXL_RETURN_IF_ERROR(DecodeTree(br, &tree_storage, max_tree_size));
     JXL_RETURN_IF_ERROR(DecodeHistograms(br, (tree_storage.size() + 1) / 2,
                                          &code_storage, &context_map_storage));
@@ -548,6 +612,7 @@ Status ModularDecode(BitReader *br, Image &image, GroupHeader &header,
 
   // Read channels
   ANSSymbolReader reader(code, br, distance_multiplier);
+  auto tree_lut = jxl::make_unique<TreeLut<uint8_t, true>>();
   for (; next_channel < nb_channels; next_channel++) {
     Channel &channel = image.channel[next_channel];
     if (!channel.w || !channel.h) {
@@ -560,7 +625,8 @@ Status ModularDecode(BitReader *br, Image &image, GroupHeader &header,
     }
     JXL_RETURN_IF_ERROR(DecodeModularChannelMAANS(
         br, &reader, *context_map, *tree, header.wp_header, next_channel,
-        group_id, &image));
+        group_id, *tree_lut, &image));
+
     // Truncated group.
     if (!br->AllReadsWithinBounds()) {
       if (!allow_truncated_group) return JXL_FAILURE("Truncated input");
index 89697bc..2500780 100644 (file)
@@ -6,12 +6,14 @@
 #ifndef LIB_JXL_MODULAR_ENCODING_ENCODING_H_
 #define LIB_JXL_MODULAR_ENCODING_ENCODING_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
+#include <array>
+#include <cstddef>
+#include <cstdint>
 #include <vector>
 
-#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/field_encodings.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
 #include "lib/jxl/modular/encoding/dec_ma.h"
@@ -21,6 +23,9 @@
 
 namespace jxl {
 
+struct ANSCode;
+class BitReader;
+
 // Valid range of properties for using lookup tables instead of trees.
 constexpr int32_t kPropRangeFast = 512;
 
@@ -54,11 +59,15 @@ FlatTree FilterTree(const Tree &global_tree,
                     size_t *num_props, bool *use_wp, bool *wp_only,
                     bool *gradient_only);
 
-template <typename T>
-bool TreeToLookupTable(const FlatTree &tree,
-                       T context_lookup[2 * kPropRangeFast],
-                       int8_t offsets[2 * kPropRangeFast],
-                       int8_t multipliers[2 * kPropRangeFast] = nullptr) {
+template <typename T, bool HAS_MULTIPLIERS>
+struct TreeLut {
+  std::array<T, 2 * kPropRangeFast> context_lookup;
+  std::array<int8_t, 2 * kPropRangeFast> offsets;
+  std::array<int8_t, HAS_MULTIPLIERS ? (2 * kPropRangeFast) : 0> multipliers;
+};
+
+template <typename T, bool HAS_MULTIPLIERS>
+bool TreeToLookupTable(const FlatTree &tree, TreeLut<T, HAS_MULTIPLIERS> &lut) {
   struct TreeRange {
     // Begin *excluded*, end *included*. This works best with > vs <= decision
     // nodes.
@@ -86,13 +95,15 @@ bool TreeToLookupTable(const FlatTree &tree,
           node.multiplier > std::numeric_limits<int8_t>::max()) {
         return false;
       }
-      if (multipliers == nullptr && node.multiplier != 1) {
+      if (!HAS_MULTIPLIERS && node.multiplier != 1) {
         return false;
       }
       for (int i = cur.begin + 1; i < cur.end + 1; i++) {
-        context_lookup[i + kPropRangeFast] = node.childID;
-        if (multipliers) multipliers[i + kPropRangeFast] = node.multiplier;
-        offsets[i + kPropRangeFast] = node.predictor_offset;
+        lut.context_lookup[i + kPropRangeFast] = node.childID;
+        if (HAS_MULTIPLIERS) {
+          lut.multipliers[i + kPropRangeFast] = node.multiplier;
+        }
+        lut.offsets[i + kPropRangeFast] = node.predictor_offset;
       }
       continue;
     }
index 785d0c5..746d7c8 100644 (file)
@@ -8,7 +8,6 @@
 #include <sstream>
 
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/transform/transform.h"
 
 namespace jxl {
@@ -60,6 +59,7 @@ Image Image::clone() {
   return c;
 }
 
+#if JXL_DEBUG_V_LEVEL >= 1
 std::string Image::DebugString() const {
   std::ostringstream os;
   os << w << "x" << h << ", depth: " << bitdepth;
@@ -73,5 +73,6 @@ std::string Image::DebugString() const {
   }
   return os.str();
 }
+#endif
 
 }  // namespace jxl
index 3e9b5a8..56e80d8 100644 (file)
@@ -8,7 +8,6 @@
 
 #include <stddef.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <string.h>
 
 #include <string>
index 7065f80..f5172aa 100644 (file)
@@ -9,9 +9,9 @@
 #include <map>
 #include <set>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/transform/enc_transform.h"
@@ -196,6 +196,7 @@ Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
       JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx);
       Channel pch(idx, 1);
       pch.hshift = -1;
+      pch.vshift = -1;
       nb_colors = idx;
       idx = 0;
       pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
@@ -232,6 +233,7 @@ Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
     JXL_DEBUG_V(6, "Channel %i uses only %i colors.", begin_c, idx);
     Channel pch(idx, 1);
     pch.hshift = -1;
+    pch.vshift = -1;
     nb_colors = idx;
     idx = 0;
     pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
@@ -266,12 +268,12 @@ Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
       begin_c, end_c, nb_colors);
   nb_deltas = 0;
   bool delta_used = false;
-  std::set<std::vector<pixel_type>>
-      candidate_palette;  // ordered lexicographically
+  std::set<std::vector<pixel_type>> candidate_palette;
   std::vector<std::vector<pixel_type>> candidate_palette_imageorder;
   std::vector<pixel_type> color(nb);
   std::vector<float> color_with_error(nb);
   std::vector<const pixel_type *> p_in(nb);
+  std::map<std::vector<pixel_type>, size_t> inv_palette;
 
   if (lossy) {
     palette_iteration_data.FindFrequentColorDeltas(w * h, input.bitdepth);
@@ -337,6 +339,7 @@ Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
 
   Channel pch(nb_colors, nb);
   pch.hshift = -1;
+  pch.vshift = -1;
   pixel_type *JXL_RESTRICT p_palette = pch.Row(0);
   intptr_t onerow = pch.plane.PixelsPerRow();
   intptr_t onerow_image = input.channel[begin_c].plane.PixelsPerRow();
@@ -352,28 +355,30 @@ Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
   }
 
   int x = 0;
-  if (ordered) {
-    JXL_DEBUG_V(7, "Palette of %i colors, using lexicographic order",
-                nb_colors);
-    for (auto pcol : candidate_palette) {
-      JXL_DEBUG_V(9, "  Color %i :  ", x);
-      for (size_t i = 0; i < nb; i++) {
-        p_palette[nb_deltas + i * onerow + x] = pcol[i];
-      }
-      for (size_t i = 0; i < nb; i++) {
-        JXL_DEBUG_V(9, "%i ", pcol[i]);
-      }
-      x++;
-    }
+  if (ordered && nb >= 3) {
+    JXL_DEBUG_V(7, "Palette of %i colors, using luma order", nb_colors);
+    // sort on luma (multiplied by alpha if available)
+    std::sort(candidate_palette_imageorder.begin(),
+              candidate_palette_imageorder.end(),
+              [](std::vector<pixel_type> ap, std::vector<pixel_type> bp) {
+                float ay, by;
+                ay = (0.299f * ap[0] + 0.587f * ap[1] + 0.114f * ap[2] + 0.1f);
+                if (ap.size() > 3) ay *= 1.f + ap[3];
+                by = (0.299f * bp[0] + 0.587f * bp[1] + 0.114f * bp[2] + 0.1f);
+                if (bp.size() > 3) by *= 1.f + bp[3];
+                return ay < by;
+              });
   } else {
     JXL_DEBUG_V(7, "Palette of %i colors, using image order", nb_colors);
-    for (auto pcol : candidate_palette_imageorder) {
-      JXL_DEBUG_V(9, "  Color %i :  ", x);
-      for (size_t i = 0; i < nb; i++)
-        p_palette[nb_deltas + i * onerow + x] = pcol[i];
-      for (size_t i = 0; i < nb; i++) JXL_DEBUG_V(9, "%i ", pcol[i]);
-      x++;
+  }
+  for (auto pcol : candidate_palette_imageorder) {
+    JXL_DEBUG_V(9, "  Color %i :  ", x);
+    for (size_t i = 0; i < nb; i++) {
+      p_palette[nb_deltas + i * onerow + x] = pcol[i];
+      JXL_DEBUG_V(9, "%i ", pcol[i]);
     }
+    inv_palette[pcol] = x;
+    x++;
   }
   std::vector<weighted::State> wp_states;
   for (size_t c = 0; c < nb; c++) {
@@ -402,20 +407,7 @@ Status FwdPaletteIteration(Image &input, uint32_t begin_c, uint32_t end_c,
       int index;
       if (!lossy) {
         for (size_t c = 0; c < nb; c++) color[c] = p_in[c][x];
-        // Exact search.
-        for (index = 0; static_cast<uint32_t>(index) < nb_colors; index++) {
-          bool found = true;
-          for (size_t c = 0; c < nb; c++) {
-            if (color[c] != p_palette[c * onerow + index]) {
-              found = false;
-              break;
-            }
-          }
-          if (found) break;
-        }
-        if (index < static_cast<int>(nb_deltas)) {
-          delta_used = true;
-        }
+        index = inv_palette[color];
       } else {
         int best_index = 0;
         bool best_is_delta = false;
index 050563a..6493027 100644 (file)
@@ -6,7 +6,6 @@
 #include "lib/jxl/modular/transform/enc_rct.h"
 
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
 
index dfd90cd..489f72a 100644 (file)
@@ -8,7 +8,6 @@
 #include <stdlib.h>
 
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/transform/squeeze.h"
 #include "lib/jxl/modular/transform/transform.h"
diff --git a/lib/jxl/modular/transform/palette.cc b/lib/jxl/modular/transform/palette.cc
new file mode 100644 (file)
index 0000000..bffbacf
--- /dev/null
@@ -0,0 +1,177 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/modular/transform/palette.h"
+
+namespace jxl {
+
+Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors,
+                  uint32_t nb_deltas, Predictor predictor,
+                  const weighted::Header &wp_header, ThreadPool *pool) {
+  if (input.nb_meta_channels < 1) {
+    return JXL_FAILURE("Error: Palette transform without palette.");
+  }
+  std::atomic<int> num_errors{0};
+  int nb = input.channel[0].h;
+  uint32_t c0 = begin_c + 1;
+  if (c0 >= input.channel.size()) {
+    return JXL_FAILURE("Channel is out of range.");
+  }
+  size_t w = input.channel[c0].w;
+  size_t h = input.channel[c0].h;
+  if (nb < 1) return JXL_FAILURE("Corrupted transforms");
+  for (int i = 1; i < nb; i++) {
+    input.channel.insert(
+        input.channel.begin() + c0 + 1,
+        Channel(w, h, input.channel[c0].hshift, input.channel[c0].vshift));
+  }
+  const Channel &palette = input.channel[0];
+  const pixel_type *JXL_RESTRICT p_palette = input.channel[0].Row(0);
+  intptr_t onerow = input.channel[0].plane.PixelsPerRow();
+  intptr_t onerow_image = input.channel[c0].plane.PixelsPerRow();
+  const int bit_depth = std::min(input.bitdepth, 24);
+
+  if (w == 0) {
+    // Nothing to do.
+    // Avoid touching "empty" channels with non-zero height.
+  } else if (nb_deltas == 0 && predictor == Predictor::Zero) {
+    if (nb == 1) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, h, ThreadPool::NoInit,
+          [&](const uint32_t task, size_t /* thread */) {
+            const size_t y = task;
+            pixel_type *p = input.channel[c0].Row(y);
+            for (size_t x = 0; x < w; x++) {
+              const int index = Clamp1<int>(p[x], 0, (pixel_type)palette.w - 1);
+              p[x] = palette_internal::GetPaletteValue(
+                  p_palette, index, /*c=*/0,
+                  /*palette_size=*/palette.w,
+                  /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+            }
+          },
+          "UndoChannelPalette"));
+    } else {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, h, ThreadPool::NoInit,
+          [&](const uint32_t task, size_t /* thread */) {
+            const size_t y = task;
+            std::vector<pixel_type *> p_out(nb);
+            const pixel_type *p_index = input.channel[c0].Row(y);
+            for (int c = 0; c < nb; c++)
+              p_out[c] = input.channel[c0 + c].Row(y);
+            for (size_t x = 0; x < w; x++) {
+              const int index = p_index[x];
+              for (int c = 0; c < nb; c++) {
+                p_out[c][x] = palette_internal::GetPaletteValue(
+                    p_palette, index, /*c=*/c,
+                    /*palette_size=*/palette.w,
+                    /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+              }
+            }
+          },
+          "UndoPalette"));
+    }
+  } else {
+    // Parallelized per channel.
+    ImageI indices = std::move(input.channel[c0].plane);
+    input.channel[c0].plane = ImageI(indices.xsize(), indices.ysize());
+    if (predictor == Predictor::Weighted) {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, nb, ThreadPool::NoInit,
+          [&](const uint32_t c, size_t /* thread */) {
+            Channel &channel = input.channel[c0 + c];
+            weighted::State wp_state(wp_header, channel.w, channel.h);
+            for (size_t y = 0; y < channel.h; y++) {
+              pixel_type *JXL_RESTRICT p = channel.Row(y);
+              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
+              for (size_t x = 0; x < channel.w; x++) {
+                int index = idx[x];
+                pixel_type_w val = 0;
+                const pixel_type palette_entry =
+                    palette_internal::GetPaletteValue(
+                        p_palette, index, /*c=*/c,
+                        /*palette_size=*/palette.w, /*onerow=*/onerow,
+                        /*bit_depth=*/bit_depth);
+                if (index < static_cast<int32_t>(nb_deltas)) {
+                  PredictionResult pred =
+                      PredictNoTreeWP(channel.w, p + x, onerow_image, x, y,
+                                      predictor, &wp_state);
+                  val = pred.guess + palette_entry;
+                } else {
+                  val = palette_entry;
+                }
+                p[x] = val;
+                wp_state.UpdateErrors(p[x], x, y, channel.w);
+              }
+            }
+          },
+          "UndoDeltaPaletteWP"));
+    } else {
+      JXL_RETURN_IF_ERROR(RunOnPool(
+          pool, 0, nb, ThreadPool::NoInit,
+          [&](const uint32_t c, size_t /* thread */) {
+            Channel &channel = input.channel[c0 + c];
+            for (size_t y = 0; y < channel.h; y++) {
+              pixel_type *JXL_RESTRICT p = channel.Row(y);
+              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
+              for (size_t x = 0; x < channel.w; x++) {
+                int index = idx[x];
+                pixel_type_w val = 0;
+                const pixel_type palette_entry =
+                    palette_internal::GetPaletteValue(
+                        p_palette, index, /*c=*/c,
+                        /*palette_size=*/palette.w,
+                        /*onerow=*/onerow, /*bit_depth=*/bit_depth);
+                if (index < static_cast<int32_t>(nb_deltas)) {
+                  PredictionResult pred = PredictNoTreeNoWP(
+                      channel.w, p + x, onerow_image, x, y, predictor);
+                  val = pred.guess + palette_entry;
+                } else {
+                  val = palette_entry;
+                }
+                p[x] = val;
+              }
+            }
+          },
+          "UndoDeltaPaletteNoWP"));
+    }
+  }
+  if (c0 >= input.nb_meta_channels) {
+    // Palette was done on normal channels
+    input.nb_meta_channels--;
+  } else {
+    // Palette was done on metachannels
+    JXL_ASSERT(static_cast<int>(input.nb_meta_channels) >= 2 - nb);
+    input.nb_meta_channels -= 2 - nb;
+    JXL_ASSERT(begin_c + nb - 1 < input.nb_meta_channels);
+  }
+  input.channel.erase(input.channel.begin(), input.channel.begin() + 1);
+  return num_errors.load(std::memory_order_relaxed) == 0;
+}
+
+Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                   uint32_t nb_colors, uint32_t nb_deltas, bool lossy) {
+  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c));
+
+  size_t nb = end_c - begin_c + 1;
+  if (begin_c >= input.nb_meta_channels) {
+    // Palette was done on normal channels
+    input.nb_meta_channels++;
+  } else {
+    // Palette was done on metachannels
+    JXL_ASSERT(end_c < input.nb_meta_channels);
+    // we remove nb-1 metachannels and add one
+    input.nb_meta_channels += 2 - nb;
+  }
+  input.channel.erase(input.channel.begin() + begin_c + 1,
+                      input.channel.begin() + end_c + 1);
+  Channel pch(nb_colors + nb_deltas, nb);
+  pch.hshift = -1;
+  pch.vshift = -1;
+  input.channel.insert(input.channel.begin(), std::move(pch));
+  return true;
+}
+
+}  // namespace jxl
index ed2d33b..279ef04 100644 (file)
@@ -10,7 +10,6 @@
 
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
@@ -45,9 +44,9 @@ static inline pixel_type Scale(uint64_t value, uint64_t bit_depth,
 // palette indices to implicit values. If index < nb_deltas, indicating that the
 // result is a delta palette entry, it is the responsibility of the caller to
 // treat it as such.
-static pixel_type GetPaletteValue(const pixel_type *const palette, int index,
-                                  const size_t c, const int palette_size,
-                                  const int onerow, const int bit_depth) {
+static JXL_MAYBE_UNUSED pixel_type
+GetPaletteValue(const pixel_type *const palette, int index, const size_t c,
+                const int palette_size, const int onerow, const int bit_depth) {
   if (index < 0) {
     static constexpr std::array<std::array<pixel_type, 3>, 72> kDeltaPalette = {
         {
@@ -117,170 +116,12 @@ static pixel_type GetPaletteValue(const pixel_type *const palette, int index,
 
 }  // namespace palette_internal
 
-static Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors,
-                         uint32_t nb_deltas, Predictor predictor,
-                         const weighted::Header &wp_header, ThreadPool *pool) {
-  if (input.nb_meta_channels < 1) {
-    return JXL_FAILURE("Error: Palette transform without palette.");
-  }
-  std::atomic<int> num_errors{0};
-  int nb = input.channel[0].h;
-  uint32_t c0 = begin_c + 1;
-  if (c0 >= input.channel.size()) {
-    return JXL_FAILURE("Channel is out of range.");
-  }
-  size_t w = input.channel[c0].w;
-  size_t h = input.channel[c0].h;
-  if (nb < 1) return JXL_FAILURE("Corrupted transforms");
-  for (int i = 1; i < nb; i++) {
-    input.channel.insert(
-        input.channel.begin() + c0 + 1,
-        Channel(w, h, input.channel[c0].hshift, input.channel[c0].vshift));
-  }
-  const Channel &palette = input.channel[0];
-  const pixel_type *JXL_RESTRICT p_palette = input.channel[0].Row(0);
-  intptr_t onerow = input.channel[0].plane.PixelsPerRow();
-  intptr_t onerow_image = input.channel[c0].plane.PixelsPerRow();
-  const int bit_depth = std::min(input.bitdepth, 24);
-
-  if (w == 0) {
-    // Nothing to do.
-    // Avoid touching "empty" channels with non-zero height.
-  } else if (nb_deltas == 0 && predictor == Predictor::Zero) {
-    if (nb == 1) {
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, h, ThreadPool::NoInit,
-          [&](const uint32_t task, size_t /* thread */) {
-            const size_t y = task;
-            pixel_type *p = input.channel[c0].Row(y);
-            for (size_t x = 0; x < w; x++) {
-              const int index = Clamp1<int>(p[x], 0, (pixel_type)palette.w - 1);
-              p[x] = palette_internal::GetPaletteValue(
-                  p_palette, index, /*c=*/0,
-                  /*palette_size=*/palette.w,
-                  /*onerow=*/onerow, /*bit_depth=*/bit_depth);
-            }
-          },
-          "UndoChannelPalette"));
-    } else {
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, h, ThreadPool::NoInit,
-          [&](const uint32_t task, size_t /* thread */) {
-            const size_t y = task;
-            std::vector<pixel_type *> p_out(nb);
-            const pixel_type *p_index = input.channel[c0].Row(y);
-            for (int c = 0; c < nb; c++)
-              p_out[c] = input.channel[c0 + c].Row(y);
-            for (size_t x = 0; x < w; x++) {
-              const int index = p_index[x];
-              for (int c = 0; c < nb; c++) {
-                p_out[c][x] = palette_internal::GetPaletteValue(
-                    p_palette, index, /*c=*/c,
-                    /*palette_size=*/palette.w,
-                    /*onerow=*/onerow, /*bit_depth=*/bit_depth);
-              }
-            }
-          },
-          "UndoPalette"));
-    }
-  } else {
-    // Parallelized per channel.
-    ImageI indices = CopyImage(input.channel[c0].plane);
-    if (predictor == Predictor::Weighted) {
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, nb, ThreadPool::NoInit,
-          [&](const uint32_t c, size_t /* thread */) {
-            Channel &channel = input.channel[c0 + c];
-            weighted::State wp_state(wp_header, channel.w, channel.h);
-            for (size_t y = 0; y < channel.h; y++) {
-              pixel_type *JXL_RESTRICT p = channel.Row(y);
-              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
-              for (size_t x = 0; x < channel.w; x++) {
-                int index = idx[x];
-                pixel_type_w val = 0;
-                const pixel_type palette_entry =
-                    palette_internal::GetPaletteValue(
-                        p_palette, index, /*c=*/c,
-                        /*palette_size=*/palette.w, /*onerow=*/onerow,
-                        /*bit_depth=*/bit_depth);
-                if (index < static_cast<int32_t>(nb_deltas)) {
-                  PredictionResult pred =
-                      PredictNoTreeWP(channel.w, p + x, onerow_image, x, y,
-                                      predictor, &wp_state);
-                  val = pred.guess + palette_entry;
-                } else {
-                  val = palette_entry;
-                }
-                p[x] = val;
-                wp_state.UpdateErrors(p[x], x, y, channel.w);
-              }
-            }
-          },
-          "UndoDeltaPaletteWP"));
-    } else {
-      JXL_RETURN_IF_ERROR(RunOnPool(
-          pool, 0, nb, ThreadPool::NoInit,
-          [&](const uint32_t c, size_t /* thread */) {
-            Channel &channel = input.channel[c0 + c];
-            for (size_t y = 0; y < channel.h; y++) {
-              pixel_type *JXL_RESTRICT p = channel.Row(y);
-              const pixel_type *JXL_RESTRICT idx = indices.Row(y);
-              for (size_t x = 0; x < channel.w; x++) {
-                int index = idx[x];
-                pixel_type_w val = 0;
-                const pixel_type palette_entry =
-                    palette_internal::GetPaletteValue(
-                        p_palette, index, /*c=*/c,
-                        /*palette_size=*/palette.w,
-                        /*onerow=*/onerow, /*bit_depth=*/bit_depth);
-                if (index < static_cast<int32_t>(nb_deltas)) {
-                  PredictionResult pred = PredictNoTreeNoWP(
-                      channel.w, p + x, onerow_image, x, y, predictor);
-                  val = pred.guess + palette_entry;
-                } else {
-                  val = palette_entry;
-                }
-                p[x] = val;
-              }
-            }
-          },
-          "UndoDeltaPaletteNoWP"));
-    }
-  }
-  if (c0 >= input.nb_meta_channels) {
-    // Palette was done on normal channels
-    input.nb_meta_channels--;
-  } else {
-    // Palette was done on metachannels
-    JXL_ASSERT(static_cast<int>(input.nb_meta_channels) >= 2 - nb);
-    input.nb_meta_channels -= 2 - nb;
-    JXL_ASSERT(begin_c + nb - 1 < input.nb_meta_channels);
-  }
-  input.channel.erase(input.channel.begin(), input.channel.begin() + 1);
-  return num_errors.load(std::memory_order_relaxed) == 0;
-}
+Status InvPalette(Image &input, uint32_t begin_c, uint32_t nb_colors,
+                  uint32_t nb_deltas, Predictor predictor,
+                  const weighted::Header &wp_header, ThreadPool *pool);
 
-static Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c,
-                          uint32_t nb_colors, uint32_t nb_deltas, bool lossy) {
-  JXL_RETURN_IF_ERROR(CheckEqualChannels(input, begin_c, end_c));
-
-  size_t nb = end_c - begin_c + 1;
-  if (begin_c >= input.nb_meta_channels) {
-    // Palette was done on normal channels
-    input.nb_meta_channels++;
-  } else {
-    // Palette was done on metachannels
-    JXL_ASSERT(end_c < input.nb_meta_channels);
-    // we remove nb-1 metachannels and add one
-    input.nb_meta_channels += 2 - nb;
-  }
-  input.channel.erase(input.channel.begin() + begin_c + 1,
-                      input.channel.begin() + end_c + 1);
-  Channel pch(nb_colors + nb_deltas, nb);
-  pch.hshift = -1;
-  input.channel.insert(input.channel.begin(), std::move(pch));
-  return true;
-}
+Status MetaPalette(Image &input, uint32_t begin_c, uint32_t end_c,
+                   uint32_t nb_colors, uint32_t nb_deltas, bool lossy);
 
 }  // namespace jxl
 
index aef6562..1ab57fe 100644 (file)
@@ -7,7 +7,6 @@
 #define LIB_JXL_MODULAR_TRANSFORM_RCT_H_
 
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/transform/transform.h"  // CheckEqualChannels
 
index 3431189..e9892ea 100644 (file)
@@ -7,9 +7,9 @@
 
 #include <stdlib.h>
 
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/transform/transform.h"
 #undef HWY_TARGET_INCLUDE
@@ -449,22 +449,23 @@ Status MetaSqueeze(Image &image, std::vector<SqueezeParams> *parameters) {
       }
       size_t w = image.channel[c].w;
       size_t h = image.channel[c].h;
+      if (w == 0 || h == 0) return JXL_FAILURE("Squeezing empty channel");
       if (horizontal) {
         image.channel[c].w = (w + 1) / 2;
-        image.channel[c].hshift++;
+        if (image.channel[c].hshift >= 0) image.channel[c].hshift++;
         w = w - (w + 1) / 2;
       } else {
         image.channel[c].h = (h + 1) / 2;
-        image.channel[c].vshift++;
+        if (image.channel[c].vshift >= 0) image.channel[c].vshift++;
         h = h - (h + 1) / 2;
       }
       image.channel[c].shrink();
-      Channel dummy(w, h);
-      dummy.hshift = image.channel[c].hshift;
-      dummy.vshift = image.channel[c].vshift;
+      Channel placeholder(w, h);
+      placeholder.hshift = image.channel[c].hshift;
+      placeholder.vshift = image.channel[c].vshift;
 
       image.channel.insert(image.channel.begin() + offset + (c - beginc),
-                           std::move(dummy));
+                           std::move(placeholder));
       JXL_DEBUG_V(8, "MetaSqueeze applied, current image: %s",
                   image.DebugString().c_str());
     }
index fb18710..305a0ca 100644 (file)
@@ -26,7 +26,6 @@
 #include <stdlib.h>
 
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/transform/transform.h"
 
index c87be68..cdff03f 100644 (file)
@@ -3,31 +3,27 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <stdint.h>
-#include <stdio.h>
+#include <jxl/cms.h>
 
 #include <array>
+#include <cstdint>
 #include <string>
 #include <utility>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/jxl.h"
-#include "lib/jxl/aux_out.h"
+#include "lib/extras/metrics.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_butteraugli_pnorm.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_toc.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/modular/encoding/enc_encoding.h"
 #include "lib/jxl/modular/encoding/encoding.h"
 #include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/padded_bytes.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
+
+using test::ReadTestData;
 using test::Roundtrip;
 
 void TestLosslessGroups(size_t group_size_shift) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   CompressParams cparams;
   cparams.SetLossless();
   cparams.modular_group_size_shift = group_size_shift;
 
   CodecInOut io_out;
-  size_t compressed_size;
 
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(io.xsize() / 4, io.ysize() / 4);
 
-  compressed_size = Roundtrip(&io, cparams, {}, pool, &io_out);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
   EXPECT_LE(compressed_size, 280000u);
-  EXPECT_LE(ButteraugliDistance(io, io_out, cparams.ba_params, GetJxlCms(),
-                                /*distmap=*/nullptr, pool),
-            0.0);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _));
 }
 
 TEST(ModularTest, RoundtripLosslessGroups128) { TestLosslessGroups(0); }
@@ -76,8 +72,7 @@ TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups1024)) {
 }
 
 TEST(ModularTest, RoundtripLosslessCustomWP_PermuteRCT) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CompressParams cparams;
   cparams.SetLossless();
@@ -88,22 +83,19 @@ TEST(ModularTest, RoundtripLosslessCustomWP_PermuteRCT) {
   cparams.options.predictor = {Predictor::Weighted};
 
   CodecInOut io_out;
-  size_t compressed_size;
 
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(100, 100);
 
-  compressed_size = Roundtrip(&io, cparams, {}, pool, &io_out);
-  EXPECT_LE(compressed_size, 10150u);
-  EXPECT_LE(ButteraugliDistance(io, io_out, cparams.ba_params, GetJxlCms(),
-                                /*distmap=*/nullptr, pool),
-            0.0);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  EXPECT_LE(compressed_size, 10169u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _));
 }
 
 TEST(ModularTest, RoundtripLossyDeltaPalette) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CompressParams cparams;
   cparams.modular_mode = true;
@@ -112,22 +104,21 @@ TEST(ModularTest, RoundtripLossyDeltaPalette) {
   cparams.palette_colors = 0;
 
   CodecInOut io_out;
-  size_t compressed_size;
 
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(300, 100);
 
-  compressed_size = Roundtrip(&io, cparams, {}, pool, &io_out);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
   EXPECT_LE(compressed_size, 6800u);
-  cparams.ba_params.intensity_target = 80.0f;
-  EXPECT_THAT(ButteraugliDistance(io, io_out, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
               IsSlightlyBelow(1.5));
 }
 TEST(ModularTest, RoundtripLossyDeltaPaletteWP) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CompressParams cparams;
   cparams.SetLossless();
@@ -136,63 +127,65 @@ TEST(ModularTest, RoundtripLossyDeltaPaletteWP) {
   cparams.options.predictor = jxl::Predictor::Weighted;
 
   CodecInOut io_out;
-  size_t compressed_size;
 
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(300, 100);
 
-  compressed_size = Roundtrip(&io, cparams, {}, pool, &io_out);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
   EXPECT_LE(compressed_size, 7000u);
-  cparams.ba_params.intensity_target = 80.0f;
-  EXPECT_THAT(ButteraugliDistance(io, io_out, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(10.0));
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(10.1));
 }
 
 TEST(ModularTest, RoundtripLossy) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CompressParams cparams;
   cparams.modular_mode = true;
   cparams.butteraugli_distance = 2.f;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   CodecInOut io_out;
-  size_t compressed_size;
 
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
 
-  compressed_size = Roundtrip(&io, cparams, {}, pool, &io_out);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
   EXPECT_LE(compressed_size, 30000u);
-  cparams.ba_params.intensity_target = 80.0f;
-  EXPECT_THAT(ButteraugliDistance(io, io_out, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
               IsSlightlyBelow(2.3));
 }
 
 TEST(ModularTest, RoundtripLossy16) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/raw.pixls/DJI-FC6310-16bit_709_v4_krita.png");
   CompressParams cparams;
   cparams.modular_mode = true;
   cparams.butteraugli_distance = 2.f;
 
   CodecInOut io_out;
-  size_t compressed_size;
 
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
-  JXL_CHECK(io.TransformTo(ColorEncoding::SRGB(), GetJxlCms(), pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
+  JXL_CHECK(!io.metadata.m.have_preview);
+  JXL_CHECK(io.frames.size() == 1);
+  JXL_CHECK(
+      io.frames[0].TransformTo(ColorEncoding::SRGB(), *JxlGetDefaultCms()));
   io.metadata.m.color_encoding = ColorEncoding::SRGB();
 
-  compressed_size = Roundtrip(&io, cparams, {}, pool, &io_out);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
   EXPECT_LE(compressed_size, 300u);
-  cparams.ba_params.intensity_target = 80.0f;
-  EXPECT_THAT(ButteraugliDistance(io, io_out, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
+  EXPECT_THAT(ButteraugliDistance(io.frames, io_out.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
               IsSlightlyBelow(1.6));
 }
 
@@ -239,11 +232,10 @@ TEST(ModularTest, RoundtripExtraProperties) {
 }
 
 TEST(ModularTest, RoundtripLosslessCustomSqueeze) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
 
   CompressParams cparams;
   cparams.modular_mode = true;
@@ -265,9 +257,10 @@ TEST(ModularTest, RoundtripLosslessCustomSqueeze) {
   cparams.squeezes.push_back(p);
 
   CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 265000u);
-  EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                     /*distmap=*/nullptr, pool));
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 265000u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _));
 }
 
 struct RoundtripLosslessConfig {
@@ -306,10 +299,10 @@ TEST_P(ModularTestParam, RoundtripLossless) {
 
   ThreadPool* pool = nullptr;
   Rng generator(123);
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io1;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io1, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io1, pool));
 
   // vary the dimensions a bit, in case of bugs related to
   // even vs odd width or height.
@@ -349,9 +342,10 @@ TEST_P(ModularTestParam, RoundtripLossless) {
   cparams.speed_tier = SpeedTier::kThunder;
   cparams.responsive = responsive;
   CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2),
-            bitdepth * xsize * ysize / 3);
-  EXPECT_LE(0, ComputeDistance2(io.Main(), io2.Main(), GetJxlCms()));
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, bitdepth * xsize * ysize / 3);
+  EXPECT_LE(0, ComputeDistance2(io.Main(), io2.Main(), *JxlGetDefaultCms()));
   size_t different = 0;
   for (size_t c = 0; c < 3; c++) {
     for (size_t y = 0; y < ysize; y++) {
@@ -369,7 +363,6 @@ TEST_P(ModularTestParam, RoundtripLossless) {
 }
 
 TEST(ModularTest, RoundtripLosslessCustomFloat) {
-  ThreadPool* pool = nullptr;
   CodecInOut io;
   size_t xsize = 100, ysize = 300;
   io.SetSize(xsize, ysize);
@@ -378,7 +371,7 @@ TEST(ModularTest, RoundtripLosslessCustomFloat) {
   io.metadata.m.bit_depth.floating_point_sample = true;
   io.metadata.m.modular_16_bit_buffer_sufficient = false;
   ColorEncoding color_encoding;
-  color_encoding.tf.SetTransferFunction(TransferFunction::kLinear);
+  color_encoding.Tf().SetTransferFunction(TransferFunction::kLinear);
   color_encoding.SetColorSpace(ColorSpace::kRGB);
   Image3F testimage(xsize, ysize);
   float factor = 1.f / (1 << 14);
@@ -403,16 +396,17 @@ TEST(ModularTest, RoundtripLosslessCustomFloat) {
   cparams.decoding_speed_tier = 2;
 
   CodecInOut io2;
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 23000u);
-  EXPECT_EQ(0.0, ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                     /*distmap=*/nullptr, pool));
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 23000u);
+  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _));
 }
 
 void WriteHeaders(BitWriter* writer, size_t xsize, size_t ysize) {
   BitWriter::Allotment allotment(writer, 16);
   writer->Write(8, 0xFF);
   writer->Write(8, kCodestreamMarker);
-  ReclaimAndCharge(writer, &allotment, 0, nullptr);
+  allotment.ReclaimAndCharge(writer, 0, nullptr);
   CodecMetadata metadata;
   EXPECT_TRUE(metadata.size.Set(xsize, ysize));
   EXPECT_TRUE(WriteSizeHeader(metadata.size, writer, 0, nullptr));
@@ -473,7 +467,7 @@ TEST(ModularTest, PredictorIntegerOverflow) {
     bw->Write(8, 119);
     bw->Write(28, 0xfffffff);
     bw->ZeroPadToByte();
-    ReclaimAndCharge(bw, &allotment, 0, nullptr);
+    allotment.ReclaimAndCharge(bw, 0, nullptr);
   }
   EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr));
   writer.AppendByteAligned(group_codes);
@@ -481,7 +475,7 @@ TEST(ModularTest, PredictorIntegerOverflow) {
   PaddedBytes compressed = std::move(writer).TakeBytes();
   extras::PackedPixelFile ppf;
   extras::JXLDecompressParams params;
-  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0});
+  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0});
   EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params,
                              nullptr, &ppf));
   ASSERT_EQ(1, ppf.frames.size());
@@ -521,7 +515,7 @@ TEST(ModularTest, UnsqueezeIntegerOverflow) {
       bw->Write(28, 0xffffffe);
     }
     bw->ZeroPadToByte();
-    ReclaimAndCharge(bw, &allotment, 0, nullptr);
+    allotment.ReclaimAndCharge(bw, 0, nullptr);
   }
   EXPECT_TRUE(WriteGroupOffsets(group_codes, nullptr, &writer, nullptr));
   writer.AppendByteAligned(group_codes);
@@ -529,7 +523,7 @@ TEST(ModularTest, UnsqueezeIntegerOverflow) {
   PaddedBytes compressed = std::move(writer).TakeBytes();
   extras::PackedPixelFile ppf;
   extras::JXLDecompressParams params;
-  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0});
+  params.accepted_formats.push_back({1, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0});
   EXPECT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), params,
                              nullptr, &ppf));
   ASSERT_EQ(1, ppf.frames.size());
index d897ea3..585fab0 100644 (file)
@@ -38,7 +38,7 @@ struct NoiseParams {
 
 static inline std::pair<int, float> IndexAndFrac(float x) {
   constexpr size_t kScaleNumerator = NoiseParams::kNumNoisePoints - 2;
-  // TODO: instead of 1, this should be a proper Y range.
+  // TODO(user): instead of 1, this should be a proper Y range.
   constexpr float kScale = kScaleNumerator / 1;
   float scaled_x = std::max(0.f, x * kScale);
   float floor_x;
index 7573d6b..b591e10 100644 (file)
@@ -3,18 +3,16 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <stdio.h>
-
-#include <hwy/tests/test_util-inl.h>
+#include <jxl/cms.h>
 
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/color_management.h"
+#include "lib/jxl/base/matrix_ops.h"
+#include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/dec_xyb.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/linalg.h"
 #include "lib/jxl/opsin_params.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -35,7 +33,7 @@ void LinearSrgbToOpsin(float rgb_r, float rgb_g, float rgb_b,
   ImageBundle ib(&metadata);
   ib.SetFromImage(std::move(linear), metadata.color_encoding);
   Image3F opsin(1, 1);
-  (void)ToXYB(ib, /*pool=*/nullptr, &opsin, GetJxlCms());
+  (void)ToXYB(ib, /*pool=*/nullptr, &opsin, *JxlGetDefaultCms());
 
   *xyb_x = opsin.PlaneRow(0, 0)[0];
   *xyb_y = opsin.PlaneRow(1, 0)[0];
@@ -77,7 +75,7 @@ TEST(OpsinImageTest, VerifyOpsinAbsorbanceInverseMatrix) {
   }
   EXPECT_TRUE(Inv3x3Matrix(matrix));
   for (int i = 0; i < 9; i++) {
-    EXPECT_NEAR(matrix[i], kOpsinAbsorbanceMatrix[i], 1e-6);
+    EXPECT_NEAR(matrix[i], jxl::cms::kOpsinAbsorbanceMatrix[i], 1e-6);
   }
 }
 
@@ -115,7 +113,7 @@ TEST(OpsinImageTest, VerifyGray) {
     float x, y, b;
     LinearSrgbToOpsin(i / 255., i / 255., i / 255., &x, &y, &b);
     EXPECT_NEAR(0, x, 1e-6);
-    EXPECT_NEAR(kYToBRatio, b / y, 3e-5);
+    EXPECT_NEAR(jxl::cms::kYToBRatio, b / y, 3e-5);
   }
 }
 
index 9fa8290..76e8d29 100644 (file)
@@ -3,17 +3,17 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "gtest/gtest.h"
+#include <jxl/cms.h>
+
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/dec_xyb.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -25,16 +25,18 @@ TEST(OpsinInverseTest, LinearInverseInverts) {
   CodecInOut io;
   io.metadata.m.SetFloat32Samples();
   io.metadata.m.color_encoding = ColorEncoding::LinearSRGB();
-  io.SetFromImage(CopyImage(linear), io.metadata.m.color_encoding);
+  Image3F linear2(128, 128);
+  CopyImageTo(linear, &linear2);
+  io.SetFromImage(std::move(linear2), io.metadata.m.color_encoding);
   ThreadPool* null_pool = nullptr;
   Image3F opsin(io.xsize(), io.ysize());
-  (void)ToXYB(io.Main(), null_pool, &opsin, GetJxlCms());
+  (void)ToXYB(io.Main(), null_pool, &opsin, *JxlGetDefaultCms());
 
   OpsinParams opsin_params;
   opsin_params.Init(/*intensity_target=*/255.0f);
   OpsinToLinearInplace(&opsin, /*pool=*/nullptr, opsin_params);
 
-  VerifyRelativeError(linear, opsin, 3E-3, 2E-4);
+  JXL_ASSERT_OK(VerifyRelativeError(linear, opsin, 3E-3, 2E-4, _));
 }
 
 TEST(OpsinInverseTest, YcbCrInverts) {
@@ -50,7 +52,7 @@ TEST(OpsinInverseTest, YcbCrInverts) {
   Image3F rgb2(rgb.xsize(), rgb.ysize());
   YcbcrToRgb(ycbcr, &rgb2, Rect(rgb));
 
-  VerifyRelativeError(rgb, rgb2, 4E-5, 4E-7);
+  JXL_ASSERT_OK(VerifyRelativeError(rgb, rgb2, 4E-5, 4E-7, _));
 }
 
 }  // namespace
index f80a18a..e1fdda5 100644 (file)
@@ -5,17 +5,19 @@
 
 #include "lib/jxl/opsin_params.h"
 
-#include <stdlib.h>
+#include "lib/jxl/cms/opsin_params.h"
 
-#include "lib/jxl/linalg.h"
+#define INVERSE_OPSIN_FROM_SPEC 1
 
-namespace jxl {
+#if not(INVERSE_OPSIN_FROM_SPEC)
+#include "lib/jxl/base/matrix_ops.h"
+#endif
 
-#define INVERSE_OPSIN_FROM_SPEC 1
+namespace jxl {
 
 const float* GetOpsinAbsorbanceInverseMatrix() {
 #if INVERSE_OPSIN_FROM_SPEC
-  return DefaultInverseOpsinAbsorbanceMatrix();
+  return jxl::cms::DefaultInverseOpsinAbsorbanceMatrix();
 #else   // INVERSE_OPSIN_FROM_SPEC
   // Compute the inverse opsin matrix from the forward matrix. Less precise
   // than taking the values from the specification, but must be used if the
index e8e2e43..fc285ac 100644 (file)
@@ -8,49 +8,10 @@
 
 // Constants that define the XYB color space.
 
-#include <stdlib.h>
-
-#include <cmath>
-
 #include "lib/jxl/base/compiler_specific.h"
 
 namespace jxl {
 
-// Parameters for opsin absorbance.
-static const float kM02 = 0.078f;
-static const float kM00 = 0.30f;
-static const float kM01 = 1.0f - kM02 - kM00;
-
-static const float kM12 = 0.078f;
-static const float kM10 = 0.23f;
-static const float kM11 = 1.0f - kM12 - kM10;
-
-static const float kM20 = 0.24342268924547819f;
-static const float kM21 = 0.20476744424496821f;
-static const float kM22 = 1.0f - kM20 - kM21;
-
-static const float kBScale = 1.0f;
-static const float kYToBRatio = 1.0f;  // works better with 0.50017729543783418
-static const float kBToYRatio = 1.0f / kYToBRatio;
-
-static const float kB0 = 0.0037930732552754493f;
-static const float kB1 = kB0;
-static const float kB2 = kB0;
-
-// Opsin absorbance matrix is now frozen.
-static const float kOpsinAbsorbanceMatrix[9] = {
-    kM00, kM01, kM02, kM10, kM11, kM12, kM20, kM21, kM22,
-};
-
-// Must be the inverse matrix of kOpsinAbsorbanceMatrix and match the spec.
-static inline const float* DefaultInverseOpsinAbsorbanceMatrix() {
-  static float kDefaultInverseOpsinAbsorbanceMatrix[9] = {
-      11.031566901960783f,  -9.866943921568629f, -0.16462299647058826f,
-      -3.254147380392157f,  4.418770392156863f,  -0.16462299647058826f,
-      -3.6588512862745097f, 2.7129230470588235f, 1.9459282392156863f};
-  return kDefaultInverseOpsinAbsorbanceMatrix;
-}
-
 // Returns 3x3 row-major matrix inverse of kOpsinAbsorbanceMatrix.
 // opsin_image_test verifies this is actually the inverse.
 const float* GetOpsinAbsorbanceInverseMatrix();
@@ -59,16 +20,6 @@ void InitSIMDInverseMatrix(const float* JXL_RESTRICT inverse,
                            float* JXL_RESTRICT simd_inverse,
                            float intensity_target);
 
-static const float kOpsinAbsorbanceBias[3] = {
-    kB0,
-    kB1,
-    kB2,
-};
-
-static const float kNegOpsinAbsorbanceBiasRGB[4] = {
-    -kOpsinAbsorbanceBias[0], -kOpsinAbsorbanceBias[1],
-    -kOpsinAbsorbanceBias[2], 1.0f};
-
 }  // namespace jxl
 
 #endif  // LIB_JXL_OPSIN_PARAMS_H_
diff --git a/lib/jxl/pack_signed.h b/lib/jxl/pack_signed.h
new file mode 100644 (file)
index 0000000..326f06e
--- /dev/null
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_PACK_H_
+#define LIB_JXL_PACK_H_
+
+// Pack/UnpackSigned utilities.
+
+#include <cstddef>
+#include <cstdint>
+
+#include "lib/jxl/base/compiler_specific.h"
+
+namespace jxl {
+// Encodes non-negative (X) into (2 * X), negative (-X) into (2 * X - 1)
+constexpr uint32_t PackSigned(int32_t value)
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  return (static_cast<uint32_t>(value) << 1) ^
+         ((static_cast<uint32_t>(~value) >> 31) - 1);
+}
+
+// Reverse to PackSigned, i.e. UnpackSigned(PackSigned(X)) == X.
+// (((~value) & 1) - 1) is either 0 or 0xFF...FF and it will have an expected
+// unsigned-integer-overflow.
+constexpr intptr_t UnpackSigned(size_t value)
+    JXL_NO_SANITIZE("unsigned-integer-overflow") {
+  return static_cast<intptr_t>((value >> 1) ^ (((~value) & 1) - 1));
+}
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_PACK_H_
similarity index 81%
rename from lib/jxl/base/padded_bytes.h
rename to lib/jxl/padded_bytes.h
index 4534ddf..0d69647 100644 (file)
@@ -16,9 +16,9 @@
 #include <initializer_list>
 #include <utility>  // swap
 
-#include "lib/jxl/base/cache_aligned.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/cache_aligned.h"
 
 namespace jxl {
 
@@ -32,20 +32,18 @@ class PaddedBytes {
   PaddedBytes() : size_(0), capacity_(0) {}
 
   explicit PaddedBytes(size_t size) : size_(size), capacity_(0) {
-    if (size != 0) IncreaseCapacityTo(size);
+    reserve(size);
   }
 
   PaddedBytes(size_t size, uint8_t value) : size_(size), capacity_(0) {
-    if (size != 0) {
-      IncreaseCapacityTo(size);
-    }
+    reserve(size);
     if (size_ != 0) {
       memset(data(), value, size);
     }
   }
 
   PaddedBytes(const PaddedBytes& other) : size_(other.size_), capacity_(0) {
-    if (size_ != 0) IncreaseCapacityTo(size_);
+    reserve(size_);
     if (data() != nullptr) memcpy(data(), other.data(), size_);
   }
   PaddedBytes& operator=(const PaddedBytes& other) {
@@ -79,8 +77,38 @@ class PaddedBytes {
     std::swap(data_, other.data_);
   }
 
+  // If current capacity is greater than requested, then no-op. Otherwise
+  // copies existing data to newly allocated "data_". If allocation fails,
+  // data() == nullptr and size_ = capacity_ = 0.
+  // The new capacity will be at least 1.5 times the old capacity. This ensures
+  // that we avoid quadratic behaviour.
   void reserve(size_t capacity) {
-    if (capacity > capacity_) IncreaseCapacityTo(capacity);
+    if (capacity <= capacity_) return;
+
+    size_t new_capacity = std::max(capacity, 3 * capacity_ / 2);
+    new_capacity = std::max<size_t>(64, new_capacity);
+
+    // BitWriter writes up to 7 bytes past the end.
+    CacheAlignedUniquePtr new_data = AllocateArray(new_capacity + 8);
+    if (new_data == nullptr) {
+      // Allocation failed, discard all data to ensure this is noticed.
+      size_ = capacity_ = 0;
+      return;
+    }
+
+    if (data_ == nullptr) {
+      // First allocation: ensure first byte is initialized (won't be copied).
+      new_data[0] = 0;
+    } else {
+      // Subsequent resize: copy existing data to new location.
+      memcpy(new_data.get(), data_.get(), size_);
+      // Ensure that the first new byte is initialized, to allow write_bits to
+      // safely append to the newly-resized PaddedBytes.
+      new_data[size_] = 0;
+    }
+
+    capacity_ = new_capacity;
+    std::swap(new_data, data_);
   }
 
   // NOTE: unlike vector, this does not initialize the new data!
@@ -88,7 +116,7 @@ class PaddedBytes {
   // the resize, as we zero-initialize the first new byte of data.
   // If size < capacity(), does not invalidate the memory.
   void resize(size_t size) {
-    if (size > capacity_) IncreaseCapacityTo(size);
+    reserve(size);
     size_ = (data() == nullptr) ? 0 : size;
   }
 
@@ -104,7 +132,7 @@ class PaddedBytes {
   // Amortized constant complexity due to exponential growth.
   void push_back(uint8_t x) {
     if (size_ == capacity_) {
-      IncreaseCapacityTo(capacity_ + 1);
+      reserve(capacity_ + 1);
       if (data() == nullptr) return;
     }
 
@@ -127,9 +155,6 @@ class PaddedBytes {
     memcpy(data(), il.begin(), il.size());
   }
 
-  // Replaces data() with [new_begin, new_end); potentially reallocates.
-  void assign(const uint8_t* new_begin, const uint8_t* new_end);
-
   uint8_t* begin() { return data(); }
   const uint8_t* begin() const { return data(); }
   uint8_t* end() { return begin() + size(); }
@@ -173,12 +198,6 @@ class PaddedBytes {
     JXL_ASSERT(i <= size());
   }
 
-  // Copies existing data to newly allocated "data_". If allocation fails,
-  // data() == nullptr and size_ = capacity_ = 0.
-  // The new capacity will be at least 1.5 times the old capacity. This ensures
-  // that we avoid quadratic behaviour.
-  void IncreaseCapacityTo(size_t capacity);
-
   size_t size_;
   size_t capacity_;
   CacheAlignedUniquePtr data_;
index d8005e4..83d1da9 100644 (file)
@@ -3,12 +3,12 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/padded_bytes.h"
 
 #include <numeric>  // iota
 #include <vector>
 
-#include "gtest/gtest.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -60,67 +60,5 @@ TEST(PaddedBytesTest, TestFillWithMoreReserve) {
   EXPECT_GT(pb.capacity(), 170u);
 }
 
-// Can assign() a subset of the valid data.
-TEST(PaddedBytesTest, TestAssignFromWithin) {
-  PaddedBytes pb;
-  pb.reserve(256);
-  for (size_t i = 0; i < 256; ++i) {
-    pb.push_back(i);
-  }
-  pb.assign(pb.data() + 64, pb.data() + 192);
-  EXPECT_EQ(128u, pb.size());
-  for (size_t i = 0; i < 128; ++i) {
-    EXPECT_EQ(i + 64, pb[i]);
-  }
-}
-
-// Can assign() a range with both valid and previously-allocated data.
-TEST(PaddedBytesTest, TestAssignReclaim) {
-  PaddedBytes pb;
-  pb.reserve(256);
-  for (size_t i = 0; i < 256; ++i) {
-    pb.push_back(i);
-  }
-
-  const uint8_t* mem = pb.data();
-  pb.resize(200);
-  // Just shrank without reallocating
-  EXPECT_EQ(mem, pb.data());
-  EXPECT_EQ(256u, pb.capacity());
-
-  // Reclaim part of initial allocation
-  pb.assign(pb.data() + 100, pb.data() + 240);
-  EXPECT_EQ(140u, pb.size());
-
-  for (size_t i = 0; i < 140; ++i) {
-    EXPECT_EQ(i + 100, pb[i]);
-  }
-}
-
-// Can assign() smaller and larger ranges outside the current allocation.
-TEST(PaddedBytesTest, TestAssignOutside) {
-  PaddedBytes pb;
-  pb.resize(400);
-  std::iota(pb.begin(), pb.end(), 1);
-
-  std::vector<uint8_t> small(64);
-  std::iota(small.begin(), small.end(), 500);
-
-  pb.assign(small.data(), small.data() + small.size());
-  EXPECT_EQ(64u, pb.size());
-  for (size_t i = 0; i < 64; ++i) {
-    EXPECT_EQ((i + 500) & 0xFF, pb[i]);
-  }
-
-  std::vector<uint8_t> large(1000);
-  std::iota(large.begin(), large.end(), 600);
-
-  pb.assign(large.data(), large.data() + large.size());
-  EXPECT_EQ(1000u, pb.size());
-  for (size_t i = 0; i < 1000; ++i) {
-    EXPECT_EQ((i + 600) & 0xFF, pb[i]);
-  }
-}
-
 }  // namespace
 }  // namespace jxl
index 2f287ec..fde526b 100644 (file)
@@ -7,7 +7,7 @@
 
 #include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/coeff_order.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/frame_dimensions.h"
 
 namespace jxl {
 
index 069d7ac..d066155 100644 (file)
@@ -9,7 +9,6 @@
 #include "lib/jxl/ac_context.h"
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_patch_dictionary.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
@@ -74,12 +73,7 @@ struct PassesSharedState {
   Image3F dc_frames[4];
 
   struct {
-    ImageBundle storage;
-    // Can either point to `storage`, if this is a frame that is not stored in
-    // the CodecInOut, or can point to an existing ImageBundle.
-    // TODO(veluca): pointing to ImageBundles in CodecInOut is not possible for
-    // now, as they are stored in a vector and thus may be moved. Fix this.
-    ImageBundle* JXL_RESTRICT frame = &storage;
+    ImageBundle frame;
     // ImageBundle doesn't yet have a simple way to state it is in XYB.
     bool ib_is_in_xyb = false;
   } reference_frames[4] = {};
index a58aadc..229d8e5 100644 (file)
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/cms.h>
 #include <stddef.h>
 
 #include <future>
 #include <string>
 #include <utility>
 
-#include "gtest/gtest.h"
 #include "lib/extras/codec.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_file.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
-namespace {
+
+using test::ReadTestData;
 using test::Roundtrip;
+using test::ThreadPoolForTests;
+
+namespace {
 
 TEST(PassesTest, RoundtripSmallPasses) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
 
   CompressParams cparams;
   cparams.butteraugli_distance = 1.0;
   cparams.progressive_mode = true;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   CodecInOut io2;
-  Roundtrip(&io, cparams, {}, pool, &io2);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.0));
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(0.8222));
 }
 
 TEST(PassesTest, RoundtripUnalignedPasses) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(io.xsize() / 12, io.ysize() / 7);
 
   CompressParams cparams;
   cparams.butteraugli_distance = 2.0;
   cparams.progressive_mode = true;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   CodecInOut io2;
-  Roundtrip(&io, cparams, {}, pool, &io2);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
-              IsSlightlyBelow(1.6));
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
+              IsSlightlyBelow(1.72));
 }
 
 TEST(PassesTest, RoundtripMultiGroupPasses) {
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   CodecInOut io;
   {
-    ThreadPoolInternal pool(4);
-    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+    ThreadPoolForTests pool(4);
+    ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
   }
   io.ShrinkTo(600, 1024);  // partial X, full Y group
 
   auto test = [&](float target_distance, float threshold) {
-    ThreadPoolInternal pool(4);
+    ThreadPoolForTests pool(4);
     CompressParams cparams;
     cparams.butteraugli_distance = target_distance;
     cparams.progressive_mode = true;
+    cparams.SetCms(*JxlGetDefaultCms());
     CodecInOut io2;
-    Roundtrip(&io, cparams, {}, &pool, &io2);
-    EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _,
+                            /* compressed_size */ nullptr, &pool));
+    EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                    *JxlGetDefaultCms(),
                                     /*distmap=*/nullptr, &pool),
                 IsSlightlyBelow(target_distance + threshold));
   };
 
-  auto run1 = std::async(std::launch::async, test, 1.0f, 0.3f);
-  auto run2 = std::async(std::launch::async, test, 2.0f, 0.3f);
+  auto run1 = std::async(std::launch::async, test, 1.0f, 0.15f);
+  auto run2 = std::async(std::launch::async, test, 2.0f, 0.0f);
 }
 
 TEST(PassesTest, RoundtripLargeFastPasses) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
 
   CompressParams cparams;
   cparams.speed_tier = SpeedTier::kSquirrel;
   cparams.progressive_mode = true;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   CodecInOut io2;
-  Roundtrip(&io, cparams, {}, &pool, &io2);
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _,
+                          /* compressed_size */ nullptr, &pool));
 }
 
 // Checks for differing size/distance in two consecutive runs of distance 2,
 // which involves additional processing including adaptive reconstruction.
 // Failing this may be a sign of race conditions or invalid memory accesses.
 TEST(PassesTest, RoundtripProgressiveConsistent) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
 
   CompressParams cparams;
   cparams.speed_tier = SpeedTier::kSquirrel;
   cparams.progressive_mode = true;
   cparams.butteraugli_distance = 2.0;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   // Try each xsize mod kBlockDim to verify right border handling.
   for (size_t xsize = 48; xsize > 40; --xsize) {
     io.ShrinkTo(xsize, 15);
 
     CodecInOut io2;
-    const size_t size2 = Roundtrip(&io, cparams, {}, &pool, &io2);
+    size_t size2;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &size2, &pool));
 
     CodecInOut io3;
-    const size_t size3 = Roundtrip(&io, cparams, {}, &pool, &io3);
+    size_t size3;
+    JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io3, _, &size3, &pool));
 
     // Exact same compressed size.
     EXPECT_EQ(size2, size3);
 
     // Exact same distance.
-    const float dist2 =
-        ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                            /*distmap=*/nullptr, &pool);
-    const float dist3 =
-        ButteraugliDistance(io, io3, cparams.ba_params, GetJxlCms(),
-                            /*distmap=*/nullptr, &pool);
+    const float dist2 = ButteraugliDistance(
+        io.frames, io2.frames, ButteraugliParams(), *JxlGetDefaultCms(),
+        /*distmap=*/nullptr, &pool);
+    const float dist3 = ButteraugliDistance(
+        io.frames, io3.frames, ButteraugliParams(), *JxlGetDefaultCms(),
+        /*distmap=*/nullptr, &pool);
     EXPECT_EQ(dist2, dist3);
   }
 }
 
 TEST(PassesTest, AllDownsampleFeasible) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
 
-  PaddedBytes compressed;
-  AuxOut aux;
+  std::vector<uint8_t> compressed;
 
   CompressParams cparams;
   cparams.speed_tier = SpeedTier::kSquirrel;
   cparams.progressive_mode = true;
   cparams.butteraugli_distance = 1.0;
   PassesEncoderState enc_state;
-  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         &aux, &pool));
+  ASSERT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed, &pool));
 
   EXPECT_LE(compressed.size(), 240000u);
   float target_butteraugli[9] = {};
@@ -183,10 +191,11 @@ TEST(PassesTest, AllDownsampleFeasible) {
     extras::JXLDecompressParams dparams;
     dparams.max_downsampling = downsampling;
     CodecInOut output;
-    ASSERT_TRUE(test::DecodeFile(dparams, compressed, &output, nullptr));
+    ASSERT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &output));
     EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling;
     EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling;
-    EXPECT_LE(ButteraugliDistance(io, output, cparams.ba_params, GetJxlCms(),
+    EXPECT_LE(ButteraugliDistance(io.frames, output.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
                                   /*distmap=*/nullptr, nullptr),
               target_butteraugli[downsampling])
         << "downsampling: " << downsampling;
@@ -196,22 +205,20 @@ TEST(PassesTest, AllDownsampleFeasible) {
 }
 
 TEST(PassesTest, AllDownsampleFeasibleQProgressive) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig =
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
 
-  PaddedBytes compressed;
-  AuxOut aux;
+  std::vector<uint8_t> compressed;
 
   CompressParams cparams;
   cparams.speed_tier = SpeedTier::kSquirrel;
   cparams.qprogressive_mode = true;
   cparams.butteraugli_distance = 1.0;
   PassesEncoderState enc_state;
-  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         &aux, &pool));
+  ASSERT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed, &pool));
 
   EXPECT_LE(compressed.size(), 220000u);
 
@@ -230,11 +237,12 @@ TEST(PassesTest, AllDownsampleFeasibleQProgressive) {
     extras::JXLDecompressParams dparams;
     dparams.max_downsampling = downsampling;
     CodecInOut output;
-    ASSERT_TRUE(test::DecodeFile(dparams, compressed, &output, nullptr));
+    ASSERT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &output));
     EXPECT_EQ(output.xsize(), io.xsize()) << "downsampling = " << downsampling;
     EXPECT_EQ(output.ysize(), io.ysize()) << "downsampling = " << downsampling;
-    EXPECT_LE(ButteraugliDistance(io, output, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, nullptr),
+    EXPECT_LE(ButteraugliDistance(io.frames, output.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
               target_butteraugli[downsampling])
         << "downsampling: " << downsampling;
   };
@@ -243,11 +251,11 @@ TEST(PassesTest, AllDownsampleFeasibleQProgressive) {
 }
 
 TEST(PassesTest, ProgressiveDownsample2DegradesCorrectlyGrayscale) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData(
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData(
       "external/wesaturate/500px/cvo9xd_keong_macan_grayscale.png");
   CodecInOut io_orig;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_orig, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io_orig, &pool));
   Rect rect(0, 0, io_orig.xsize(), 128);
   // need 2 DC groups for the DC frame to actually be progressive.
   Image3F large(4242, rect.ysize());
@@ -257,8 +265,7 @@ TEST(PassesTest, ProgressiveDownsample2DegradesCorrectlyGrayscale) {
   io.metadata = io_orig.metadata;
   io.SetFromImage(std::move(large), io_orig.Main().c_current());
 
-  PaddedBytes compressed;
-  AuxOut aux;
+  std::vector<uint8_t> compressed;
 
   CompressParams cparams;
   cparams.speed_tier = SpeedTier::kSquirrel;
@@ -267,34 +274,33 @@ TEST(PassesTest, ProgressiveDownsample2DegradesCorrectlyGrayscale) {
   cparams.qprogressive_mode = true;
   cparams.butteraugli_distance = 1.0;
   PassesEncoderState enc_state;
-  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         &aux, &pool));
+  ASSERT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed, &pool));
 
   EXPECT_LE(compressed.size(), 10000u);
 
   extras::JXLDecompressParams dparams;
   dparams.max_downsampling = 1;
   CodecInOut output;
-  ASSERT_TRUE(test::DecodeFile(dparams, compressed, &output, nullptr));
+  ASSERT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &output));
 
   dparams.max_downsampling = 2;
   CodecInOut output_d2;
-  ASSERT_TRUE(test::DecodeFile(dparams, compressed, &output_d2, nullptr));
+  ASSERT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &output_d2));
 
   // 0 if reading all the passes, ~15 if skipping the 8x pass.
-  float butteraugli_distance_down2_full =
-      ButteraugliDistance(output, output_d2, cparams.ba_params, GetJxlCms(),
-                          /*distmap=*/nullptr, nullptr);
+  float butteraugli_distance_down2_full = ButteraugliDistance(
+      output.frames, output_d2.frames, ButteraugliParams(), *JxlGetDefaultCms(),
+      /*distmap=*/nullptr);
 
   EXPECT_LE(butteraugli_distance_down2_full, 3.2f);
   EXPECT_GE(butteraugli_distance_down2_full, 1.0f);
 }
 
 TEST(PassesTest, ProgressiveDownsample2DegradesCorrectly) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   CodecInOut io_orig;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_orig, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io_orig, &pool));
   Rect rect(0, 0, io_orig.xsize(), 128);
   // need 2 DC groups for the DC frame to actually be progressive.
   Image3F large(4242, rect.ysize());
@@ -303,8 +309,7 @@ TEST(PassesTest, ProgressiveDownsample2DegradesCorrectly) {
   CodecInOut io;
   io.SetFromImage(std::move(large), io_orig.Main().c_current());
 
-  PaddedBytes compressed;
-  AuxOut aux;
+  std::vector<uint8_t> compressed;
 
   CompressParams cparams;
   cparams.speed_tier = SpeedTier::kSquirrel;
@@ -313,73 +318,71 @@ TEST(PassesTest, ProgressiveDownsample2DegradesCorrectly) {
   cparams.qprogressive_mode = true;
   cparams.butteraugli_distance = 1.0;
   PassesEncoderState enc_state;
-  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         &aux, &pool));
+  ASSERT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed, &pool));
 
   EXPECT_LE(compressed.size(), 220000u);
 
   extras::JXLDecompressParams dparams;
   dparams.max_downsampling = 1;
   CodecInOut output;
-  ASSERT_TRUE(test::DecodeFile(dparams, compressed, &output, nullptr));
+  ASSERT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &output));
 
   dparams.max_downsampling = 2;
   CodecInOut output_d2;
-  ASSERT_TRUE(test::DecodeFile(dparams, compressed, &output_d2, nullptr));
+  ASSERT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &output_d2));
 
   // 0 if reading all the passes, ~15 if skipping the 8x pass.
-  float butteraugli_distance_down2_full =
-      ButteraugliDistance(output, output_d2, cparams.ba_params, GetJxlCms(),
-                          /*distmap=*/nullptr, nullptr);
+  float butteraugli_distance_down2_full = ButteraugliDistance(
+      output.frames, output_d2.frames, ButteraugliParams(), *JxlGetDefaultCms(),
+      /*distmap=*/nullptr);
 
   EXPECT_LE(butteraugli_distance_down2_full, 3.0f);
   EXPECT_GE(butteraugli_distance_down2_full, 1.0f);
 }
 
 TEST(PassesTest, NonProgressiveDCImage) {
-  ThreadPoolInternal pool(8);
-  const PaddedBytes orig = ReadTestData("jxl/flower/flower.png");
+  ThreadPoolForTests pool(8);
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
 
-  PaddedBytes compressed;
-  AuxOut aux;
+  std::vector<uint8_t> compressed;
 
   CompressParams cparams;
   cparams.speed_tier = SpeedTier::kSquirrel;
   cparams.progressive_mode = false;
   cparams.butteraugli_distance = 2.0;
   PassesEncoderState enc_state;
-  ASSERT_TRUE(EncodeFile(cparams, &io, &enc_state, &compressed, GetJxlCms(),
-                         &aux, &pool));
+  ASSERT_TRUE(test::EncodeFile(cparams, &io, &enc_state, &compressed, &pool));
 
   // Even in non-progressive mode, it should be possible to return a DC-only
   // image.
   extras::JXLDecompressParams dparams;
   dparams.max_downsampling = 100;
   CodecInOut output;
-  ASSERT_TRUE(test::DecodeFile(dparams, compressed, &output, &pool));
+  ASSERT_TRUE(test::DecodeFile(dparams, Bytes(compressed), &output, &pool));
   EXPECT_EQ(output.xsize(), io.xsize());
   EXPECT_EQ(output.ysize(), io.ysize());
 }
 
 TEST(PassesTest, RoundtripSmallNoGaborishPasses) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
 
   CompressParams cparams;
   cparams.gaborish = Override::kOff;
   cparams.butteraugli_distance = 1.0;
   cparams.progressive_mode = true;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   CodecInOut io2;
-  Roundtrip(&io, cparams, {}, pool, &io2);
-  EXPECT_THAT(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                  /*distmap=*/nullptr, pool),
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_THAT(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                  *JxlGetDefaultCms(),
+                                  /*distmap=*/nullptr),
               IsSlightlyBelow(1.2));
 }
 
index 3a34b83..d7feded 100644 (file)
@@ -3,24 +3,28 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "gtest/gtest.h"
+#include <jxl/cms.h>
+
+#include <cstdint>
+#include <vector>
+
 #include "lib/extras/codec.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
 
-using ::jxl::test::Roundtrip;
+using test::ReadTestData;
+using test::Roundtrip;
 
 TEST(PatchDictionaryTest, GrayscaleModular) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData("jxl/grayscale_patches.png");
+  const std::vector<uint8_t> orig = ReadTestData("jxl/grayscale_patches.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
 
   CompressParams cparams;
   cparams.SetLossless();
@@ -28,25 +32,30 @@ TEST(PatchDictionaryTest, GrayscaleModular) {
 
   CodecInOut io2;
   // Without patches: ~25k
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 8000u);
-  VerifyRelativeError(*io.Main().color(), *io2.Main().color(), 1e-7f, 0);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 8000u);
+  JXL_ASSERT_OK(VerifyRelativeError(*io.Main().color(), *io2.Main().color(),
+                                    1e-7f, 0, _));
 }
 
 TEST(PatchDictionaryTest, GrayscaleVarDCT) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig = ReadTestData("jxl/grayscale_patches.png");
+  const std::vector<uint8_t> orig = ReadTestData("jxl/grayscale_patches.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
 
   CompressParams cparams;
   cparams.patches = jxl::Override::kOn;
 
   CodecInOut io2;
   // Without patches: ~47k
-  EXPECT_LE(Roundtrip(&io, cparams, {}, pool, &io2), 14000u);
+  size_t compressed_size;
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
+  EXPECT_LE(compressed_size, 14000u);
   // Without patches: ~1.2
-  EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
-                                /*distmap=*/nullptr, pool),
+  EXPECT_LE(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                *JxlGetDefaultCms(),
+                                /*distmap=*/nullptr),
             1.1);
 }
 
index 35ec70b..7e88f51 100644 (file)
@@ -3,38 +3,38 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <stddef.h>
+#include <jxl/cms.h>
 
+#include <cstddef>
+#include <cstdint>
 #include <string>
+#include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/extras/codec.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_file.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/headers.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
+using test::ReadTestData;
 using test::Roundtrip;
 
 TEST(PreviewTest, RoundtripGivenPreview) {
-  ThreadPool* pool = nullptr;
-  const PaddedBytes orig =
+  const std::vector<uint8_t> orig =
       ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, pool));
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
   io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
   // Same as main image
   io.preview_frame = io.Main().Copy();
@@ -48,22 +48,23 @@ TEST(PreviewTest, RoundtripGivenPreview) {
   CompressParams cparams;
   cparams.butteraugli_distance = 2.0;
   cparams.speed_tier = SpeedTier::kSquirrel;
+  cparams.SetCms(*JxlGetDefaultCms());
 
   CodecInOut io2;
-  Roundtrip(&io, cparams, {}, pool, &io2);
+  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _));
   EXPECT_EQ(preview_xsize, io2.metadata.m.preview_size.xsize());
   EXPECT_EQ(preview_ysize, io2.metadata.m.preview_size.ysize());
   EXPECT_EQ(preview_xsize, io2.preview_frame.xsize());
   EXPECT_EQ(preview_ysize, io2.preview_frame.ysize());
 
   EXPECT_LE(ButteraugliDistance(io.preview_frame, io2.preview_frame,
-                                cparams.ba_params, GetJxlCms(),
-                                /*distmap=*/nullptr, pool),
+                                ButteraugliParams(), *JxlGetDefaultCms(),
+                                /*distmap=*/nullptr),
+            2.5);
+  EXPECT_LE(ButteraugliDistance(io.Main(), io2.Main(), ButteraugliParams(),
+                                *JxlGetDefaultCms(),
+                                /*distmap=*/nullptr),
             2.5);
-  EXPECT_LE(
-      ButteraugliDistance(io.Main(), io2.Main(), cparams.ba_params, GetJxlCms(),
-                          /*distmap=*/nullptr, pool),
-      2.5);
 }
 
 }  // namespace
diff --git a/lib/jxl/progressive_split.cc b/lib/jxl/progressive_split.cc
deleted file mode 100644 (file)
index d0a16b9..0000000
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/progressive_split.h"
-
-#include <string.h>
-
-#include <algorithm>
-#include <memory>
-
-#include "lib/jxl/common.h"
-#include "lib/jxl/image.h"
-
-namespace jxl {
-
-bool ProgressiveSplitter::SuperblockIsSalient(size_t row_start,
-                                              size_t col_start, size_t num_rows,
-                                              size_t num_cols) const {
-  if (saliency_map_ == nullptr || saliency_map_->xsize() == 0 ||
-      saliency_threshold_ == 0.0) {
-    // If we do not have a saliency-map, or the threshold says to include
-    // every block, we straightaway classify the superblock as 'salient'.
-    return true;
-  }
-  const size_t row_end = std::min(saliency_map_->ysize(), row_start + num_rows);
-  const size_t col_end = std::min(saliency_map_->xsize(), col_start + num_cols);
-  for (size_t num_row = row_start; num_row < row_end; num_row++) {
-    const float* JXL_RESTRICT map_row = saliency_map_->ConstRow(num_row);
-    for (size_t num_col = col_start; num_col < col_end; num_col++) {
-      if (map_row[num_col] >= saliency_threshold_) {
-        // One of the blocks covered by this superblock is above the saliency
-        // threshold.
-        return true;
-      }
-    }
-  }
-  // We did not see any block above the saliency threshold.
-  return false;
-}
-
-template <typename T>
-void ProgressiveSplitter::SplitACCoefficients(
-    const T* JXL_RESTRICT block, size_t size, const AcStrategy& acs, size_t bx,
-    size_t by, size_t offset, T* JXL_RESTRICT output[kMaxNumPasses][3]) {
-  auto shift_right_round0 = [&](T v, int shift) {
-    T one_if_negative = static_cast<uint32_t>(v) >> 31;
-    T add = (one_if_negative << shift) - one_if_negative;
-    return (v + add) >> shift;
-  };
-  // Early quit for the simple case of only one pass.
-  if (mode_.num_passes == 1) {
-    for (size_t c = 0; c < 3; c++) {
-      memcpy(output[0][c] + offset, block + c * size, sizeof(T) * size);
-    }
-    return;
-  }
-  size_t ncoeffs_all_done_from_earlier_passes = 1;
-  size_t previous_pass_salient_only = false;
-
-  int previous_pass_shift = 0;
-  for (size_t num_pass = 0; num_pass < mode_.num_passes; num_pass++) {  // pass
-    // Zero out output block.
-    for (size_t c = 0; c < 3; c++) {
-      memset(output[num_pass][c] + offset, 0, size * sizeof(T));
-    }
-    const bool current_pass_salient_only = mode_.passes[num_pass].salient_only;
-    const int pass_shift = mode_.passes[num_pass].shift;
-    size_t frame_ncoeffs = mode_.passes[num_pass].num_coefficients;
-    for (size_t c = 0; c < 3; c++) {  // color-channel
-      size_t xsize = acs.covered_blocks_x();
-      size_t ysize = acs.covered_blocks_y();
-      CoefficientLayout(&ysize, &xsize);
-      if (current_pass_salient_only || previous_pass_salient_only) {
-        // Current or previous pass is salient-only.
-        const bool superblock_is_salient =
-            SuperblockIsSalient(by, bx, ysize, xsize);
-        if (current_pass_salient_only != superblock_is_salient) {
-          // Current pass is salient-only, but block is not salient,
-          // OR last pass was salient-only, and block is salient
-          // (hence was already included in last pass).
-          continue;
-        }
-      }
-      for (size_t y = 0; y < ysize * frame_ncoeffs; y++) {    // superblk-y
-        for (size_t x = 0; x < xsize * frame_ncoeffs; x++) {  // superblk-x
-          size_t pos = y * xsize * kBlockDim + x;
-          if (x < xsize * ncoeffs_all_done_from_earlier_passes &&
-              y < ysize * ncoeffs_all_done_from_earlier_passes) {
-            // This coefficient was already included in an earlier pass,
-            // which included a genuinely smaller set of coefficients
-            // (= is not about saliency-splitting).
-            continue;
-          }
-          T v = block[c * size + pos];
-          // Previous pass discarded some bits: do not encode them again.
-          if (previous_pass_shift != 0) {
-            T previous_v = shift_right_round0(v, previous_pass_shift) *
-                           (1 << previous_pass_shift);
-            v -= previous_v;
-          }
-          output[num_pass][c][offset + pos] = shift_right_round0(v, pass_shift);
-        }  // superblk-x
-      }    // superblk-y
-    }      // color-channel
-    if (!current_pass_salient_only) {
-      // We just finished a non-salient pass.
-      // Hence, we are now guaranteed to have included all coeffs up to
-      // frame_ncoeffs in every block, unless the current pass is shifted.
-      if (mode_.passes[num_pass].shift == 0) {
-        ncoeffs_all_done_from_earlier_passes = frame_ncoeffs;
-      }
-    }
-    previous_pass_salient_only = current_pass_salient_only;
-    previous_pass_shift = mode_.passes[num_pass].shift;
-  }  // num_pass
-}
-
-template void ProgressiveSplitter::SplitACCoefficients<int32_t>(
-    const int32_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t,
-    size_t, int32_t* JXL_RESTRICT[kMaxNumPasses][3]);
-
-template void ProgressiveSplitter::SplitACCoefficients<int16_t>(
-    const int16_t* JXL_RESTRICT, size_t, const AcStrategy&, size_t, size_t,
-    size_t, int16_t* JXL_RESTRICT[kMaxNumPasses][3]);
-
-}  // namespace jxl
index 756a481..70b3b9e 100644 (file)
@@ -13,9 +13,7 @@
 #include <utility>
 
 #include "lib/jxl/base/bits.h"
-#include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_scales.h"
 #include "lib/jxl/dec_modular.h"
 #include "lib/jxl/fields.h"
@@ -26,7 +24,7 @@
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/base/fast_math-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -525,7 +523,7 @@ constexpr float V(float v) { return static_cast<float>(v); }
 namespace {
 struct DequantMatricesLibraryDef {
   // DCT8
-  static constexpr const QuantEncodingInternal DCT() {
+  static constexpr QuantEncodingInternal DCT() {
     return QuantEncodingInternal::DCT(DctQuantWeightParams({{{{
                                                                  V(3150.0),
                                                                  V(0.0),
@@ -554,7 +552,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // Identity
-  static constexpr const QuantEncodingInternal IDENTITY() {
+  static constexpr QuantEncodingInternal IDENTITY() {
     return QuantEncodingInternal::Identity({{{{
                                                  V(280.0),
                                                  V(3160.0),
@@ -573,7 +571,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT2
-  static constexpr const QuantEncodingInternal DCT2X2() {
+  static constexpr QuantEncodingInternal DCT2X2() {
     return QuantEncodingInternal::DCT2({{{{
                                              V(3840.0),
                                              V(2560.0),
@@ -601,7 +599,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT4 (quant_kind 3)
-  static constexpr const QuantEncodingInternal DCT4X4() {
+  static constexpr QuantEncodingInternal DCT4X4() {
     return QuantEncodingInternal::DCT4(DctQuantWeightParams({{{{
                                                                   V(2200.0),
                                                                   V(0.0),
@@ -637,7 +635,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT16
-  static constexpr const QuantEncodingInternal DCT16X16() {
+  static constexpr QuantEncodingInternal DCT16X16() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(8996.8725711814115328),
@@ -670,7 +668,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT32
-  static constexpr const QuantEncodingInternal DCT32X32() {
+  static constexpr QuantEncodingInternal DCT32X32() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(15718.40830982518931456),
@@ -706,7 +704,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT16X8
-  static constexpr const QuantEncodingInternal DCT8X16() {
+  static constexpr QuantEncodingInternal DCT8X16() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(7240.7734393502),
@@ -739,7 +737,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT32X8
-  static constexpr const QuantEncodingInternal DCT8X32() {
+  static constexpr QuantEncodingInternal DCT8X32() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(16283.2494710648897),
@@ -775,7 +773,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT32X16
-  static constexpr const QuantEncodingInternal DCT16X32() {
+  static constexpr QuantEncodingInternal DCT16X32() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(13844.97076442300573),
@@ -811,7 +809,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT4X8 and 8x4
-  static constexpr const QuantEncodingInternal DCT4X8() {
+  static constexpr QuantEncodingInternal DCT4X8() {
     return QuantEncodingInternal::DCT4X8(
         DctQuantWeightParams({{
                                  {{
@@ -842,7 +840,7 @@ struct DequantMatricesLibraryDef {
         }});
   }
   // AFV
-  static const QuantEncodingInternal AFV0() {
+  static QuantEncodingInternal AFV0() {
     return QuantEncodingInternal::AFV(DCT4X8().dct_params, DCT4X4().dct_params,
                                       {{{{
                                             // 4x4/4x8 DC tendency.
@@ -889,7 +887,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT64
-  static const QuantEncodingInternal DCT64X64() {
+  static QuantEncodingInternal DCT64X64() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(0.9 * 26629.073922049845),
@@ -925,7 +923,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT64X32
-  static const QuantEncodingInternal DCT32X64() {
+  static QuantEncodingInternal DCT32X64() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(0.65 * 23629.073922049845),
@@ -960,7 +958,7 @@ struct DequantMatricesLibraryDef {
                              8));
   }
   // DCT128X128
-  static const QuantEncodingInternal DCT128X128() {
+  static QuantEncodingInternal DCT128X128() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(1.8 * 26629.073922049845),
@@ -996,7 +994,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT128X64
-  static const QuantEncodingInternal DCT64X128() {
+  static QuantEncodingInternal DCT64X128() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(1.3 * 23629.073922049845),
@@ -1031,7 +1029,7 @@ struct DequantMatricesLibraryDef {
                              8));
   }
   // DCT256X256
-  static const QuantEncodingInternal DCT256X256() {
+  static QuantEncodingInternal DCT256X256() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(3.6 * 26629.073922049845),
@@ -1067,7 +1065,7 @@ struct DequantMatricesLibraryDef {
   }
 
   // DCT256X128
-  static const QuantEncodingInternal DCT128X256() {
+  static QuantEncodingInternal DCT128X256() {
     return QuantEncodingInternal::DCT(
         DctQuantWeightParams({{{{
                                    V(2.6 * 23629.073922049845),
@@ -1104,7 +1102,7 @@ struct DequantMatricesLibraryDef {
 };
 }  // namespace
 
-const DequantMatrices::DequantLibraryInternal DequantMatrices::LibraryInit() {
+DequantMatrices::DequantLibraryInternal DequantMatrices::LibraryInit() {
   static_assert(kNum == 17,
                 "Update this function when adding new quantization kinds.");
   static_assert(kNumPredefinedTables == 1,
index 92a2d9e..3004176 100644 (file)
 #include <vector>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out_fwd.h"
-#include "lib/jxl/base/cache_aligned.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/image.h"
 
@@ -248,8 +245,8 @@ class QuantEncoding final : public QuantEncodingInternal {
   // create a QuantEncodingInternal instance is if you need a constexpr version
   // of this class. Note that RAW() is not supported in that case since it uses
   // a std::vector.
-  static QuantEncoding Library(uint8_t predefined) {
-    return QuantEncoding(QuantEncodingInternal::Library(predefined));
+  static QuantEncoding Library(uint8_t predefined_arg) {
+    return QuantEncoding(QuantEncodingInternal::Library(predefined_arg));
   }
   static QuantEncoding Identity(const IdWeights& xybweights) {
     return QuantEncoding(QuantEncodingInternal::Identity(xybweights));
@@ -288,8 +285,8 @@ class QuantEncoding final : public QuantEncodingInternal {
   explicit QuantEncoding(const QuantEncodingInternal& other)
       : QuantEncodingInternal(other) {}
 
-  explicit QuantEncoding(QuantEncodingInternal::Mode mode)
-      : QuantEncodingInternal(mode) {}
+  explicit QuantEncoding(QuantEncodingInternal::Mode mode_arg)
+      : QuantEncodingInternal(mode_arg) {}
 };
 
 // A constexpr QuantEncodingInternal instance is often downcasted to the
@@ -372,7 +369,7 @@ class DequantMatrices {
   // Return the array of library kNumPredefinedTables QuantEncoding entries as
   // a constexpr array. Use Library() to obtain a pointer to the copy in the
   // .cc file.
-  static const DequantLibraryInternal LibraryInit();
+  static DequantLibraryInternal LibraryInit();
 
   // Returns aligned memory.
   JXL_INLINE const float* Matrix(size_t quant_kind, size_t c) const {
index f049794..2dd5138 100644 (file)
@@ -9,7 +9,7 @@
 #include <algorithm>
 #include <cmath>
 #include <hwy/base.h>  // HWY_ALIGN_MAX
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 #include <numeric>
 
 #include "lib/jxl/base/random.h"
@@ -18,6 +18,7 @@
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/enc_quant_weights.h"
 #include "lib/jxl/enc_transforms.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -179,7 +180,7 @@ TEST_P(QuantWeightsTargetTest, DCTUniform) {
                              1.0f / kUniformQuant};
   DequantMatricesSetCustomDC(&dequant_matrices, dc_quant);
 
-  HWY_ALIGN_MAX float scratch_space[16 * 16 * 2];
+  HWY_ALIGN_MAX float scratch_space[16 * 16 * 5];
 
   // DCT8
   {
index 814aea2..b9ea43e 100644 (file)
@@ -5,7 +5,6 @@
 
 #include "lib/jxl/quantizer.h"
 
-#include <stdio.h>
 #include <string.h>
 
 #include <algorithm>
@@ -125,12 +124,11 @@ Status QuantizerParams::VisitFields(Visitor* JXL_RESTRICT visitor) {
   return true;
 }
 
-Status Quantizer::Encode(BitWriter* writer, size_t layer,
-                         AuxOut* aux_out) const {
+QuantizerParams Quantizer::GetParams() const {
   QuantizerParams params;
   params.global_scale = global_scale_;
   params.quant_dc = quant_dc_;
-  return Bundle::Write(params, writer, layer, aux_out);
+  return params;
 }
 
 Status Quantizer::Decode(BitReader* reader) {
index 09e2e5e..4e34ac7 100644 (file)
 #include <vector>
 
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/aux_out_fwd.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/linalg.h"
 #include "lib/jxl/quant_weights.h"
 
 // Quantizes DC and AC coefficients, with separate quantization tables according
@@ -63,6 +58,8 @@ static constexpr float kDefaultQuantBias[4] = {
     0.145f,
 };
 
+struct QuantizerParams;
+
 class Quantizer {
  public:
   explicit Quantizer(const DequantMatrices* dequant);
@@ -118,7 +115,7 @@ class Quantizer {
   // Dequantize by multiplying with this times dequant_matrix.
   float inv_quant_ac(int32_t quant) const { return inv_global_scale_ / quant; }
 
-  Status Encode(BitWriter* writer, size_t layer, AuxOut* aux_out) const;
+  QuantizerParams GetParams() const;
 
   Status Decode(BitReader* reader);
 
index d570bf6..aff19f4 100644 (file)
@@ -5,12 +5,12 @@
 
 #include "lib/jxl/quantizer.h"
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -39,7 +39,8 @@ TEST(QuantizerTest, BitStreamRoundtripSameQuant) {
   ImageI raw_quant_field(qxsize, qysize);
   quantizer1.SetQuant(0.17f, 0.17f, &raw_quant_field);
   BitWriter writer;
-  EXPECT_TRUE(quantizer1.Encode(&writer, 0, nullptr));
+  QuantizerParams params = quantizer1.GetParams();
+  EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr));
   writer.ZeroPadToByte();
   const size_t bits_written = writer.BitsWritten();
   Quantizer quantizer2(&dequant);
@@ -63,7 +64,8 @@ TEST(QuantizerTest, BitStreamRoundtripRandomQuant) {
   RandomFillImage(&qf, 0.0f, 1.0f);
   quantizer1.SetQuantField(quant_dc, qf, &raw_quant_field);
   BitWriter writer;
-  EXPECT_TRUE(quantizer1.Encode(&writer, 0, nullptr));
+  QuantizerParams params = quantizer1.GetParams();
+  EXPECT_TRUE(WriteQuantizerParams(params, &writer, 0, nullptr));
   writer.ZeroPadToByte();
   const size_t bits_written = writer.BitsWritten();
   Quantizer quantizer2(&dequant);
index 13fc044..bc31cdd 100644 (file)
@@ -3,8 +3,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <stdio.h>
-
 #include <cmath>
 #include <string>
 
 #define HWY_TARGET_INCLUDE "lib/jxl/rational_polynomial_test.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/rational_polynomial-inl.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/rational_polynomial-inl.h"
+#include "lib/jxl/testing.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
index 9114730..9aefdd0 100644 (file)
@@ -9,8 +9,8 @@
 #include <queue>
 #include <tuple>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/arch_macros.h"
+#include "lib/jxl/image_ops.h"
 
 namespace jxl {
 std::pair<size_t, size_t>
@@ -294,7 +294,7 @@ void LowMemoryRenderPipeline::Init() {
   }
   for (size_t i = first_image_dim_stage_; i < stages_.size(); i++) {
     if (stages_[i]->SwitchToImageDimensions()) {
-      JXL_ABORT("Cannot switch to image dimensions multiple times");
+      JXL_UNREACHABLE("Cannot switch to image dimensions multiple times");
     }
     std::vector<std::pair<size_t, size_t>> input_sizes(shifts.size());
     for (size_t c = 0; c < shifts.size(); c++) {
@@ -360,7 +360,6 @@ void LowMemoryRenderPipeline::Init() {
 void LowMemoryRenderPipeline::PrepareForThreadsInternal(size_t num,
                                                         bool use_group_ids) {
   const auto& shifts = channel_shifts_[0];
-
   use_group_ids_ = use_group_ids;
   size_t num_buffers = use_group_ids_ ? frame_dimensions_.num_groups : num;
   for (size_t t = group_data_.size(); t < num_buffers; t++) {
index 3cece17..3a47bdb 100644 (file)
@@ -5,25 +5,30 @@
 
 #include "lib/jxl/render_pipeline/render_pipeline.h"
 
-#include <stdint.h>
-#include <stdio.h>
+#include <jxl/cms.h>
 
 #include <algorithm>
+#include <cstdint>
+#include <cstdio>
 #include <utility>
 #include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/extras/codec.h"
-#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION, JPEGXL_ENABLE_TRANSCODE_JPEG
 #include "lib/jxl/dec_frame.h"
+#include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/fake_parallel_runner_testonly.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/icc_codec.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
 #include "lib/jxl/render_pipeline/test_render_pipeline_stages.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -40,13 +45,11 @@ Status DecodeFile(const Span<const uint8_t> file, bool use_slow_pipeline,
     io->metadata.transform_data.nonserialized_xyb_encoded =
         io->metadata.m.xyb_encoded;
     JXL_RETURN_IF_ERROR(Bundle::Read(&reader, &io->metadata.transform_data));
-    size_t xsize = io->metadata.xsize();
-    size_t ysize = io->metadata.ysize();
-    JXL_RETURN_IF_ERROR(VerifyDimensions(&io->constraints, xsize, ysize));
     if (io->metadata.m.color_encoding.WantICC()) {
-      PaddedBytes icc;
-      JXL_RETURN_IF_ERROR(ReadICC(&reader, &icc));
-      JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC(std::move(icc)));
+      std::vector<uint8_t> icc;
+      JXL_RETURN_IF_ERROR(test::ReadICC(&reader, &icc));
+      JXL_RETURN_IF_ERROR(io->metadata.m.color_encoding.SetICC(
+          std::move(icc), JxlGetDefaultCms()));
     }
     PassesDecoderState dec_state;
     JXL_RETURN_IF_ERROR(
@@ -179,13 +182,13 @@ TEST_P(RenderPipelineTestParam, PipelineTest) {
   // border handling bugs.
   FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
   ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
-  const PaddedBytes orig = ReadTestData(config.input_path);
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(config.input_path);
 
   CodecInOut io;
   if (config.jpeg_transcode) {
-    ASSERT_TRUE(jpeg::DecodeImageJPG(Span<const uint8_t>(orig), &io));
+    ASSERT_TRUE(jpeg::DecodeImageJPG(Bytes(orig), &io));
   } else {
-    ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+    ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
   }
   io.ShrinkTo(config.xsize, config.ysize);
 
@@ -214,19 +217,18 @@ TEST_P(RenderPipelineTestParam, PipelineTest) {
     io.frames[0].SetExtraChannels(std::move(ec));
   }
 
-  PaddedBytes compressed;
+  std::vector<uint8_t> compressed;
 
   PassesEncoderState enc_state;
   enc_state.shared.image_features.splines = config.splines;
-  ASSERT_TRUE(EncodeFile(config.cparams, &io, &enc_state, &compressed,
-                         GetJxlCms(), /*aux_out=*/nullptr, &pool));
-
+  ASSERT_TRUE(
+      test::EncodeFile(config.cparams, &io, &enc_state, &compressed, &pool));
 
   CodecInOut io_default;
-  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+  ASSERT_TRUE(DecodeFile(Bytes(compressed),
                          /*use_slow_pipeline=*/false, &io_default, &pool));
   CodecInOut io_slow_pipeline;
-  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+  ASSERT_TRUE(DecodeFile(Bytes(compressed),
                          /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
 
   ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
@@ -234,16 +236,16 @@ TEST_P(RenderPipelineTestParam, PipelineTest) {
 #if JXL_HIGH_PRECISION
     constexpr float kMaxError = 1e-5;
 #else
-    constexpr float kMaxError = 1e-4;
+    constexpr float kMaxError = 5e-4;
 #endif
     Image3F def = std::move(*io_default.frames[i].color());
     Image3F pip = std::move(*io_slow_pipeline.frames[i].color());
-    VerifyRelativeError(pip, def, kMaxError, kMaxError);
+    JXL_ASSERT_OK(VerifyRelativeError(pip, def, kMaxError, kMaxError, _));
     for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
          ec++) {
-      VerifyRelativeError(io_slow_pipeline.frames[i].extra_channels()[ec],
-                          io_default.frames[i].extra_channels()[ec], kMaxError,
-                          kMaxError);
+      JXL_ASSERT_OK(VerifyRelativeError(
+          io_slow_pipeline.frames[i].extra_channels()[ec],
+          io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
     }
   }
 }
@@ -529,14 +531,14 @@ TEST(RenderPipelineDecodingTest, Animation) {
   FakeParallelRunner fake_pool(/*order_seed=*/123, /*num_threads=*/8);
   ThreadPool pool(&JxlFakeParallelRunner, &fake_pool);
 
-  PaddedBytes compressed =
-      ReadTestData("jxl/blending/cropped_traffic_light.jxl");
+  std::vector<uint8_t> compressed =
+      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
 
   CodecInOut io_default;
-  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+  ASSERT_TRUE(DecodeFile(Bytes(compressed),
                          /*use_slow_pipeline=*/false, &io_default, &pool));
   CodecInOut io_slow_pipeline;
-  ASSERT_TRUE(DecodeFile(Span<const uint8_t>(compressed),
+  ASSERT_TRUE(DecodeFile(Bytes(compressed),
                          /*use_slow_pipeline=*/true, &io_slow_pipeline, &pool));
 
   ASSERT_EQ(io_default.frames.size(), io_slow_pipeline.frames.size());
@@ -549,12 +551,13 @@ TEST(RenderPipelineDecodingTest, Animation) {
 
     Image3F fast_pipeline = std::move(*io_default.frames[i].color());
     Image3F slow_pipeline = std::move(*io_slow_pipeline.frames[i].color());
-    VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError, kMaxError);
+    JXL_ASSERT_OK(VerifyRelativeError(slow_pipeline, fast_pipeline, kMaxError,
+                                      kMaxError, _))
     for (size_t ec = 0; ec < io_default.frames[i].extra_channels().size();
          ec++) {
-      VerifyRelativeError(io_slow_pipeline.frames[i].extra_channels()[ec],
-                          io_default.frames[i].extra_channels()[ec], kMaxError,
-                          kMaxError);
+      JXL_ASSERT_OK(VerifyRelativeError(
+          io_slow_pipeline.frames[i].extra_channels()[ec],
+          io_default.frames[i].extra_channels()[ec], kMaxError, kMaxError, _));
     }
   }
 }
index 6e6bcb7..4495288 100644 (file)
@@ -5,7 +5,8 @@
 
 #include "lib/jxl/render_pipeline/simple_render_pipeline.h"
 
-#include "hwy/base.h"
+#include <hwy/base.h>
+
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/render_pipeline/render_pipeline_stage.h"
 #include "lib/jxl/sanitizers.h"
index 5d36c0a..a66a60d 100644 (file)
@@ -30,7 +30,7 @@ class BlendingStage : public RenderPipelineStage {
     info_ = state_.frame_header.blending_info;
     const std::vector<BlendingInfo>& ec_info =
         state_.frame_header.extra_channel_blending_info;
-    ImageBundle& bg = *state_.reference_frames[info_.source].frame;
+    const ImageBundle& bg = state_.reference_frames[info_.source].frame;
     bg_ = &bg;
     if (bg.xsize() == 0 || bg.ysize() == 0) {
       zeroes_.resize(image_xsize_, 0.f);
@@ -42,20 +42,33 @@ class BlendingStage : public RenderPipelineStage {
     } else if (std::any_of(ec_info.begin(), ec_info.end(),
                            [this](const BlendingInfo& info) {
                              const ImageBundle& bg =
-                                 *state_.reference_frames[info.source].frame;
+                                 state_.reference_frames[info.source].frame;
                              return bg.xsize() == 0 || bg.ysize() == 0;
                            })) {
       zeroes_.resize(image_xsize_, 0.f);
     }
 
-    if (bg.xsize() != 0 && bg.ysize() != 0 &&
-        (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ ||
-         bg.origin.x0 != 0 || bg.origin.y0 != 0)) {
-      initialized_ = JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS
-                                 " crop as a background",
-                                 bg.xsize(), bg.ysize());
+    auto verify_bg_size = [&](const ImageBundle& bg) -> Status {
+      if (bg.xsize() != 0 && bg.ysize() != 0 &&
+          (bg.xsize() < image_xsize_ || bg.ysize() < image_ysize_ ||
+           bg.origin.x0 != 0 || bg.origin.y0 != 0)) {
+        return JXL_FAILURE("Trying to use a %" PRIuS "x%" PRIuS
+                           " crop as a background",
+                           bg.xsize(), bg.ysize());
+      }
+      return true;
+    };
+
+    Status ok = verify_bg_size(bg);
+    for (const auto& info : ec_info) {
+      const ImageBundle& bg = state_.reference_frames[info.source].frame;
+      if (!!ok) ok = verify_bg_size(bg);
+    }
+    if (!ok) {
+      initialized_ = ok;
       return;
     }
+
     if (state_.metadata->m.xyb_encoded) {
       if (!dec_state->output_encoding_info.color_encoding_is_original) {
         initialized_ = JXL_FAILURE("Blending in unsupported color space");
@@ -89,7 +102,8 @@ class BlendingStage : public RenderPipelineStage {
           break;
         }
         default: {
-          JXL_ABORT("Invalid blend mode");  // should have failed to decode
+          JXL_UNREACHABLE(
+              "Invalid blend mode");  // should have failed to decode
         }
       }
     };
@@ -104,7 +118,6 @@ class BlendingStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("Blend");
     JXL_ASSERT(initialized_);
     const FrameOrigin& frame_origin = state_.frame_header.frame_origin;
     ssize_t bg_xpos = frame_origin.x0 + static_cast<ssize_t>(xpos);
@@ -130,17 +143,16 @@ class BlendingStage : public RenderPipelineStage {
     for (size_t c = 0; c < num_c; ++c) {
       fg_row_ptrs_[c] = GetInputRow(input_rows, c, 0) + offset;
       if (c < 3) {
-        bg_row_ptrs_[c] =
-            bg_->xsize() != 0 && bg_->ysize() != 0
-                ? bg_->color()->ConstPlaneRow(c, bg_ypos) + bg_xpos
-                : zeroes_.data();
+        bg_row_ptrs_[c] = bg_->xsize() != 0 && bg_->ysize() != 0
+                              ? bg_->color().ConstPlaneRow(c, bg_ypos) + bg_xpos
+                              : zeroes_.data();
       } else {
         const ImageBundle& ec_bg =
-            *state_
-                 .reference_frames[state_.frame_header
-                                       .extra_channel_blending_info[c - 3]
-                                       .source]
-                 .frame;
+            state_
+                .reference_frames[state_.frame_header
+                                      .extra_channel_blending_info[c - 3]
+                                      .source]
+                .frame;
         bg_row_ptrs_[c] =
             ec_bg.xsize() != 0 && ec_bg.ysize() != 0
                 ? ec_bg.extra_channels()[c - 3].ConstRow(bg_ypos) + bg_xpos
@@ -174,16 +186,16 @@ class BlendingStage : public RenderPipelineStage {
     } else {
       for (size_t c = 0; c < 3; ++c) {
         memcpy(GetInputRow(output_rows, c, 0),
-               bg_->color()->ConstPlaneRow(c, ypos) + xpos,
+               bg_->color().ConstPlaneRow(c, ypos) + xpos,
                xsize * sizeof(float));
       }
     }
     for (size_t ec = 0; ec < extra_channel_info_->size(); ++ec) {
       const ImageBundle& ec_bg =
-          *state_
-               .reference_frames
-                   [state_.frame_header.extra_channel_blending_info[ec].source]
-               .frame;
+          state_
+              .reference_frames
+                  [state_.frame_header.extra_channel_blending_info[ec].source]
+              .frame;
       if (ec_bg.xsize() == 0 || ec_bg.ysize() == 0) {
         memset(GetInputRow(output_rows, 3 + ec, 0), 0, xsize * sizeof(float));
       } else {
@@ -199,7 +211,7 @@ class BlendingStage : public RenderPipelineStage {
  private:
   const PassesSharedState& state_;
   BlendingInfo info_;
-  ImageBundle* bg_;
+  const ImageBundle* bg_;
   Status initialized_ = true;
   size_t image_xsize_;
   size_t image_ysize_;
index 9b73ee9..936fbd3 100644 (file)
@@ -30,7 +30,6 @@ class HorizontalChromaUpsamplingStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("HorizontalChromaUpsampling");
     HWY_FULL(float) df;
     xextra = RoundUpTo(xextra, Lanes(df));
     auto threefour = Set(df, 0.75f);
@@ -69,7 +68,6 @@ class VerticalChromaUpsamplingStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("VerticalChromaUpsampling");
     HWY_FULL(float) df;
     xextra = RoundUpTo(xextra, Lanes(df));
     auto threefour = Set(df, 0.75f);
index b8bfc15..b4d0cbd 100644 (file)
@@ -7,7 +7,6 @@
 #define LIB_JXL_RENDER_PIPELINE_STAGE_CHROMA_UPSAMPLING_H_
 #include <math.h>
 #include <stdint.h>
-#include <stdio.h>
 
 #include <algorithm>
 #include <utility>
diff --git a/lib/jxl/render_pipeline/stage_cms.cc b/lib/jxl/render_pipeline/stage_cms.cc
new file mode 100644 (file)
index 0000000..2465146
--- /dev/null
@@ -0,0 +1,134 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/render_pipeline/stage_cms.h"
+
+#include <memory>
+
+#include "jxl/cms_interface.h"
+#include "jxl/color_encoding.h"
+#include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dec_xyb.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/render_pipeline/stage_cms.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+#include "lib/jxl/dec_xyb-inl.h"
+#include "lib/jxl/sanitizers.h"
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+class CmsStage : public RenderPipelineStage {
+ public:
+  explicit CmsStage(OutputEncodingInfo output_encoding_info)
+      : RenderPipelineStage(RenderPipelineStage::Settings()),
+        output_encoding_info_(std::move(output_encoding_info)) {
+    c_src_ = output_encoding_info_.linear_color_encoding;
+  }
+
+  bool IsNeeded() const {
+    const size_t channels_src = (c_src_.IsCMYK() ? 4 : c_src_.Channels());
+    const size_t channels_dst = output_encoding_info_.color_encoding.Channels();
+    const bool not_mixing_color_and_grey =
+        (channels_src == channels_dst ||
+         (channels_src == 4 && channels_dst == 3));
+    return (output_encoding_info_.cms_set) &&
+           !c_src_.SameColorEncoding(output_encoding_info_.color_encoding) &&
+           not_mixing_color_and_grey;
+  }
+
+  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
+                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
+                  size_t thread_id) const final {
+    JXL_ASSERT(xsize == xsize_);
+    // TODO(firsching): handle grey case seperately
+    //  interleave
+    float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
+    float* JXL_RESTRICT row1 = GetInputRow(input_rows, 1, 0);
+    float* JXL_RESTRICT row2 = GetInputRow(input_rows, 2, 0);
+    float* mutable_buf_src = color_space_transform->BufSrc(thread_id);
+
+    for (size_t x = 0; x < xsize; x++) {
+      mutable_buf_src[3 * x + 0] = row0[x];
+      mutable_buf_src[3 * x + 1] = row1[x];
+      mutable_buf_src[3 * x + 2] = row2[x];
+    }
+    const float* buf_src = mutable_buf_src;
+    float* JXL_RESTRICT buf_dst = color_space_transform->BufDst(thread_id);
+    if (!color_space_transform->Run(thread_id, buf_src, buf_dst)) {
+      // TODO(firsching): somehow mark failing here?
+      return;
+    }
+    // de-interleave
+    for (size_t x = 0; x < xsize; x++) {
+      row0[x] = buf_dst[3 * x + 0];
+      row1[x] = buf_dst[3 * x + 1];
+      row2[x] = buf_dst[3 * x + 2];
+    }
+  }
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    return c < 3 ? RenderPipelineChannelMode::kInPlace
+                 : RenderPipelineChannelMode::kIgnored;
+  }
+
+  const char* GetName() const override { return "Cms"; }
+
+ private:
+  OutputEncodingInfo output_encoding_info_;
+  size_t xsize_;
+  std::unique_ptr<jxl::ColorSpaceTransform> color_space_transform;
+  ColorEncoding c_src_;
+
+  void SetInputSizes(
+      const std::vector<std::pair<size_t, size_t>>& input_sizes) override {
+#if JXL_ENABLE_ASSERT
+    JXL_ASSERT(input_sizes.size() >= 3);
+    for (size_t c = 1; c < input_sizes.size(); c++) {
+      JXL_ASSERT(input_sizes[c].first == input_sizes[0].first);
+      JXL_ASSERT(input_sizes[c].second == input_sizes[0].second);
+    }
+#endif
+    xsize_ = input_sizes[0].first;
+  }
+
+  Status PrepareForThreads(size_t num_threads) override {
+    color_space_transform = jxl::make_unique<jxl::ColorSpaceTransform>(
+        output_encoding_info_.color_management_system);
+    JXL_RETURN_IF_ERROR(color_space_transform->Init(
+        c_src_, output_encoding_info_.color_encoding,
+        output_encoding_info_.desired_intensity_target, xsize_, num_threads));
+    return true;
+  }
+};
+
+std::unique_ptr<RenderPipelineStage> GetCmsStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  auto stage = jxl::make_unique<CmsStage>(output_encoding_info);
+  if (!stage->IsNeeded()) return nullptr;
+  return stage;
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(GetCmsStage);
+
+std::unique_ptr<RenderPipelineStage> GetCmsStage(
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetCmsStage)(output_encoding_info);
+}
+
+}  // namespace jxl
+#endif
diff --git a/lib/jxl/render_pipeline/stage_cms.h b/lib/jxl/render_pipeline/stage_cms.h
new file mode 100644 (file)
index 0000000..23277ae
--- /dev/null
@@ -0,0 +1,21 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_
+#define LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_
+
+#include <memory>
+
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
+
+namespace jxl {
+
+std::unique_ptr<RenderPipelineStage> GetCmsStage(
+    const OutputEncodingInfo& output_encoding_info);
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_RENDER_PIPELINE_STAGE_CMS_H_
index d59c497..5d1a379 100644 (file)
@@ -5,6 +5,8 @@
 
 #include "lib/jxl/render_pipeline/stage_epf.h"
 
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/epf.h"
 #include "lib/jxl/sanitizers.h"
 
@@ -516,7 +518,7 @@ std::unique_ptr<RenderPipelineStage> GetEPFStage(const LoopFilter& lf,
     case 2:
       return HWY_DYNAMIC_DISPATCH(GetEPFStage2)(lf, sigma);
     default:
-      JXL_ABORT("Invalid EPF stage");
+      JXL_UNREACHABLE("Invalid EPF stage");
   }
 }
 
index 81f546c..6b1f646 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/cms/tone_mapping-inl.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/sanitizers.h"
-#include "lib/jxl/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
+namespace {
 
 // These templates are not found via ADL.
 using hwy::HWY_NAMESPACE::IfThenZeroElse;
@@ -57,10 +59,12 @@ struct OpRgb {
 };
 
 struct OpPq {
+  explicit OpPq(const float intensity_target) : tf_pq_(intensity_target) {}
   template <typename D, typename T>
   T Transform(D d, const T& linear) const {
-    return TF_PQ().EncodedFromDisplay(d, linear);
+    return tf_pq_.EncodedFromDisplay(d, linear);
   }
+  TF_PQ tf_pq_;
 };
 
 struct OpHlg {
@@ -104,7 +108,6 @@ class FromLinearStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("FromLinear");
     const HWY_FULL(float) d;
     const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
     float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
@@ -148,28 +151,30 @@ std::unique_ptr<FromLinearStage<Op>> MakeFromLinearStage(Op&& op) {
 
 std::unique_ptr<RenderPipelineStage> GetFromLinearStage(
     const OutputEncodingInfo& output_encoding_info) {
-  if (output_encoding_info.color_encoding.tf.IsLinear()) {
+  const auto& tf = output_encoding_info.color_encoding.Tf();
+  if (tf.IsLinear()) {
     return MakeFromLinearStage(MakePerChannelOp(OpLinear()));
-  } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
+  } else if (tf.IsSRGB()) {
     return MakeFromLinearStage(MakePerChannelOp(OpRgb()));
-  } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
-    return MakeFromLinearStage(MakePerChannelOp(OpPq()));
-  } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
+  } else if (tf.IsPQ()) {
+    return MakeFromLinearStage(
+        MakePerChannelOp(OpPq(output_encoding_info.orig_intensity_target)));
+  } else if (tf.IsHLG()) {
     return MakeFromLinearStage(
         OpHlg(output_encoding_info.luminances,
               output_encoding_info.desired_intensity_target));
-  } else if (output_encoding_info.color_encoding.tf.Is709()) {
+  } else if (tf.Is709()) {
     return MakeFromLinearStage(MakePerChannelOp(Op709()));
-  } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
-             output_encoding_info.color_encoding.tf.IsDCI()) {
+  } else if (tf.have_gamma || tf.IsDCI()) {
     return MakeFromLinearStage(
         MakePerChannelOp(OpGamma{output_encoding_info.inverse_gamma}));
   } else {
     // This is a programming error.
-    JXL_ABORT("Invalid target encoding");
+    JXL_UNREACHABLE("Invalid target encoding");
   }
 }
 
+}  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace jxl
index fc90acb..0917db3 100644 (file)
@@ -47,8 +47,6 @@ class GaborishStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("Gaborish");
-
     const HWY_FULL(float) d;
     for (size_t c = 0; c < 3; c++) {
       float* JXL_RESTRICT row_t = GetInputRow(input_rows, c, -1);
index 761800f..55166e3 100644 (file)
@@ -7,7 +7,6 @@
 #define LIB_JXL_RENDER_PIPELINE_STAGE_GABORISH_H_
 #include <math.h>
 #include <stdint.h>
-#include <stdio.h>
 
 #include <algorithm>
 #include <utility>
index 9f0cee3..5cf8a6e 100644 (file)
 #include <hwy/highway.h>
 
 #include "lib/jxl/sanitizers.h"
-#include "lib/jxl/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
 
 // These templates are not found via ADL.
+using hwy::HWY_NAMESPACE::Add;
+using hwy::HWY_NAMESPACE::And;
+using hwy::HWY_NAMESPACE::Floor;
+using hwy::HWY_NAMESPACE::Ge;
+using hwy::HWY_NAMESPACE::IfThenElse;
 using hwy::HWY_NAMESPACE::Max;
-using hwy::HWY_NAMESPACE::ShiftRight;
+using hwy::HWY_NAMESPACE::Min;
+using hwy::HWY_NAMESPACE::Mul;
+using hwy::HWY_NAMESPACE::MulAdd;
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Sub;
+using hwy::HWY_NAMESPACE::TableLookupBytes;
 using hwy::HWY_NAMESPACE::Vec;
 using hwy::HWY_NAMESPACE::ZeroIfNegative;
 
@@ -68,9 +77,10 @@ class StrengthEvalLut {
     auto scaled_vx = Max(Zero(D()), Mul(vx, Set(D(), kScale)));
     auto floor_x = Floor(scaled_vx);
     auto frac_x = Sub(scaled_vx, floor_x);
-    floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale)), Set(D(), kScale - 1),
+    floor_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), kScale),
                          floor_x);
-    frac_x = IfThenElse(Ge(scaled_vx, Set(D(), kScale)), Set(D(), 1), frac_x);
+    frac_x =
+        IfThenElse(Ge(scaled_vx, Set(D(), kScale + 1)), Set(D(), 1), frac_x);
     auto floor_x_int = ConvertTo(DI(), floor_x);
 #if HWY_TARGET == HWY_SCALAR
     auto low = Set(D(), noise_params_.lut[floor_x_int.raw]);
@@ -154,8 +164,6 @@ class AddNoiseStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("Noise apply");
-
     if (!noise_params_.HasAny()) return;
     const StrengthEvalLut noise_model(noise_params_);
     D d;
@@ -236,8 +244,6 @@ class ConvolveNoiseStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("Noise convolve");
-
     const HWY_FULL(float) d;
     for (size_t c = first_c_; c < first_c_ + 3; c++) {
       float* JXL_RESTRICT rows[5];
index 527be03..c5a75b0 100644 (file)
@@ -17,7 +17,6 @@ class PatchDictionaryStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("RenderPatches");
     JXL_ASSERT(xpos == 0 || xpos >= xextra);
     size_t x0 = xpos ? xpos - xextra : 0;
     std::vector<float*> row_ptrs(num_channels_);
index d97d97e..4a0529c 100644 (file)
@@ -23,7 +23,6 @@ class SplineStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("RenderSplines");
     float* row_x = GetInputRow(input_rows, 0, 0);
     float* row_y = GetInputRow(input_rows, 1, 0);
     float* row_b = GetInputRow(input_rows, 2, 0);
index d4f6152..a43cb4e 100644 (file)
@@ -19,7 +19,6 @@ class SpotColorStage : public RenderPipelineStage {
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
     // TODO(veluca): add SIMD.
-    PROFILER_ZONE("RenderSpotColors");
     float scale = spot_color_[3];
     for (size_t c = 0; c < 3; c++) {
       float* JXL_RESTRICT p = GetInputRow(input_rows, c, 0);
index bf79481..85eca2f 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/cms/tone_mapping-inl.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
 #include "lib/jxl/sanitizers.h"
-#include "lib/jxl/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
+namespace {
 
 // These templates are not found via ADL.
 using hwy::HWY_NAMESPACE::IfThenZeroElse;
@@ -53,10 +54,12 @@ struct OpRgb {
 };
 
 struct OpPq {
+  explicit OpPq(const float intensity_target) : tf_pq_(intensity_target) {}
   template <typename D, typename T>
   T Transform(D d, const T& encoded) const {
-    return TF_PQ().DisplayFromEncoded(d, encoded);
+    return tf_pq_.DisplayFromEncoded(d, encoded);
   }
+  TF_PQ tf_pq_;
 };
 
 struct OpHlg {
@@ -67,10 +70,10 @@ struct OpHlg {
   template <typename D, typename T>
   void Transform(D d, T* r, T* g, T* b) const {
     for (T* val : {r, g, b}) {
-      float vals[MaxLanes(d)];
+      HWY_ALIGN float vals[MaxLanes(d)];
       Store(*val, d, vals);
       for (size_t i = 0; i < Lanes(d); ++i) {
-        vals[i] = TF_HLG().DisplayFromEncoded(vals[i]);
+        vals[i] = TF_HLG_Base::DisplayFromEncoded(vals[i]);
       }
       *val = Load(d, vals);
     }
@@ -113,8 +116,6 @@ class ToLinearStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("ToLinear");
-
     const HWY_FULL(float) d;
     const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
     float* JXL_RESTRICT row0 = GetInputRow(input_rows, 0, 0);
@@ -161,19 +162,20 @@ std::unique_ptr<ToLinearStage<Op>> MakeToLinearStage(Op&& op) {
 
 std::unique_ptr<RenderPipelineStage> GetToLinearStage(
     const OutputEncodingInfo& output_encoding_info) {
-  if (output_encoding_info.color_encoding.tf.IsLinear()) {
+  const auto& tf = output_encoding_info.color_encoding.Tf();
+  if (tf.IsLinear()) {
     return MakeToLinearStage(MakePerChannelOp(OpLinear()));
-  } else if (output_encoding_info.color_encoding.tf.IsSRGB()) {
+  } else if (tf.IsSRGB()) {
     return MakeToLinearStage(MakePerChannelOp(OpRgb()));
-  } else if (output_encoding_info.color_encoding.tf.IsPQ()) {
-    return MakeToLinearStage(MakePerChannelOp(OpPq()));
-  } else if (output_encoding_info.color_encoding.tf.IsHLG()) {
+  } else if (tf.IsPQ()) {
+    return MakeToLinearStage(
+        MakePerChannelOp(OpPq(output_encoding_info.orig_intensity_target)));
+  } else if (tf.IsHLG()) {
     return MakeToLinearStage(OpHlg(output_encoding_info.luminances,
                                    output_encoding_info.orig_intensity_target));
-  } else if (output_encoding_info.color_encoding.tf.Is709()) {
+  } else if (tf.Is709()) {
     return MakeToLinearStage(MakePerChannelOp(Op709()));
-  } else if (output_encoding_info.color_encoding.tf.IsGamma() ||
-             output_encoding_info.color_encoding.tf.IsDCI()) {
+  } else if (tf.have_gamma || tf.IsDCI()) {
     return MakeToLinearStage(
         MakePerChannelOp(OpGamma{1.f / output_encoding_info.inverse_gamma}));
   } else {
@@ -181,6 +183,7 @@ std::unique_ptr<RenderPipelineStage> GetToLinearStage(
   }
 }
 
+}  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
 }  // namespace HWY_NAMESPACE
 }  // namespace jxl
index 7609534..2a272e1 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/dec_tone_mapping-inl.h"
+#include "lib/jxl/cms/tone_mapping-inl.h"
 #include "lib/jxl/dec_xyb-inl.h"
 #include "lib/jxl/sanitizers.h"
-#include "lib/jxl/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -29,9 +28,10 @@ class ToneMappingStage : public RenderPipelineStage {
       // No tone mapping requested.
       return;
     }
-    if (output_encoding_info_.orig_color_encoding.tf.IsPQ() &&
-        output_encoding_info_.desired_intensity_target <
-            output_encoding_info_.orig_intensity_target) {
+    const auto& orig_tf = output_encoding_info_.orig_color_encoding.Tf();
+    const auto& dest_tf = output_encoding_info_.color_encoding.Tf();
+    if (orig_tf.IsPQ() && output_encoding_info_.desired_intensity_target <
+                              output_encoding_info_.orig_intensity_target) {
       tone_mapper_ = jxl::make_unique<ToneMapper>(
           /*source_range=*/std::pair<float, float>(
               0, output_encoding_info_.orig_intensity_target),
@@ -39,16 +39,14 @@ class ToneMappingStage : public RenderPipelineStage {
           std::pair<float, float>(
               0, output_encoding_info_.desired_intensity_target),
           output_encoding_info_.luminances);
-    } else if (output_encoding_info_.orig_color_encoding.tf.IsHLG() &&
-               !output_encoding_info_.color_encoding.tf.IsHLG()) {
+    } else if (orig_tf.IsHLG() && !dest_tf.IsHLG()) {
       hlg_ootf_ = jxl::make_unique<HlgOOTF>(
           /*source_luminance=*/output_encoding_info_.orig_intensity_target,
           /*target_luminance=*/output_encoding_info_.desired_intensity_target,
           output_encoding_info_.luminances);
     }
 
-    if (output_encoding_info_.color_encoding.tf.IsPQ() &&
-        (tone_mapper_ || hlg_ootf_)) {
+    if (dest_tf.IsPQ() && (tone_mapper_ || hlg_ootf_)) {
       to_intensity_target_ =
           10000.f / output_encoding_info_.orig_intensity_target;
       from_desired_intensity_target_ =
@@ -61,8 +59,6 @@ class ToneMappingStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("ToneMapping");
-
     if (!(tone_mapper_ || hlg_ootf_)) return;
 
     const HWY_FULL(float) d;
index 99824f8..57eb9a9 100644 (file)
@@ -7,7 +7,6 @@
 #define LIB_JXL_RENDER_PIPELINE_STAGE_TONE_MAPPING_H_
 #include <math.h>
 #include <stdint.h>
-#include <stdio.h>
 
 #include <algorithm>
 #include <utility>
index a75e259..bb8d9b2 100644 (file)
@@ -47,7 +47,6 @@ class UpsamplingStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("Upsampling");
     static HWY_FULL(float) df;
     size_t shift = settings_.shift_x;
     size_t N = 1 << shift;
@@ -100,7 +99,7 @@ class UpsamplingStage : public RenderPipelineStage {
                     [x % 8 < 4 ? x % 4 : 3 - x % 4][y % 8 < 4 ? iy : 4 - iy]
                     [x % 8 < 4 ? ix : 4 - ix];
     }
-    JXL_ABORT("Invalid upsample");
+    JXL_UNREACHABLE("Invalid upsample");
   }
 
   template <ssize_t N>
index 71c4d97..3a5d999 100644 (file)
@@ -6,7 +6,7 @@
 #include "lib/jxl/render_pipeline/stage_write.h"
 
 #include "lib/jxl/alpha.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/sanitizers.h"
@@ -22,151 +22,451 @@ namespace HWY_NAMESPACE {
 
 // These templates are not found via ADL.
 using hwy::HWY_NAMESPACE::Clamp;
+using hwy::HWY_NAMESPACE::Div;
+using hwy::HWY_NAMESPACE::Max;
 using hwy::HWY_NAMESPACE::Mul;
 using hwy::HWY_NAMESPACE::NearestInt;
-using hwy::HWY_NAMESPACE::U8FromU32;
-
-template <typename D, typename V>
-void StoreRGBA(D d, V r, V g, V b, V a, bool alpha, size_t n, size_t extra,
-               uint8_t* buf) {
-#if HWY_TARGET == HWY_SCALAR
-  buf[0] = r.raw;
-  buf[1] = g.raw;
-  buf[2] = b.raw;
-  if (alpha) {
-    buf[3] = a.raw;
-  }
-#elif HWY_TARGET == HWY_NEON
-  if (alpha) {
-    uint8x8x4_t data = {r.raw, g.raw, b.raw, a.raw};
-    if (extra >= 8) {
-      vst4_u8(buf, data);
-    } else {
-      uint8_t tmp[8 * 4];
-      vst4_u8(tmp, data);
-      memcpy(buf, tmp, n * 4);
-    }
-  } else {
-    uint8x8x3_t data = {r.raw, g.raw, b.raw};
-    if (extra >= 8) {
-      vst3_u8(buf, data);
-    } else {
-      uint8_t tmp[8 * 3];
-      vst3_u8(tmp, data);
-      memcpy(buf, tmp, n * 3);
-    }
-  }
-#else
-  // TODO(veluca): implement this for x86.
-  size_t mul = alpha ? 4 : 3;
-  HWY_ALIGN uint8_t bytes[16];
-  StoreU(r, d, bytes);
-  for (size_t i = 0; i < n; i++) {
-    buf[mul * i] = bytes[i];
-  }
-  StoreU(g, d, bytes);
-  for (size_t i = 0; i < n; i++) {
-    buf[mul * i + 1] = bytes[i];
-  }
-  StoreU(b, d, bytes);
-  for (size_t i = 0; i < n; i++) {
-    buf[mul * i + 2] = bytes[i];
-  }
-  if (alpha) {
-    StoreU(a, d, bytes);
-    for (size_t i = 0; i < n; i++) {
-      buf[4 * i + 3] = bytes[i];
-    }
-  }
-#endif
-}
+using hwy::HWY_NAMESPACE::Or;
+using hwy::HWY_NAMESPACE::Rebind;
+using hwy::HWY_NAMESPACE::ShiftLeftSame;
+using hwy::HWY_NAMESPACE::ShiftRightSame;
 
-class WriteToU8Stage : public RenderPipelineStage {
+class WriteToOutputStage : public RenderPipelineStage {
  public:
-  WriteToU8Stage(uint8_t* rgb, size_t stride, size_t height, bool rgba,
-                 bool has_alpha, size_t alpha_c)
+  WriteToOutputStage(const ImageOutput& main_output, size_t width,
+                     size_t height, bool has_alpha, bool unpremul_alpha,
+                     size_t alpha_c, Orientation undo_orientation,
+                     const std::vector<ImageOutput>& extra_output)
       : RenderPipelineStage(RenderPipelineStage::Settings()),
-        rgb_(rgb),
-        stride_(stride),
+        width_(width),
         height_(height),
-        rgba_(rgba),
+        main_(main_output),
+        num_color_(main_.num_channels_ < 3 ? 1 : 3),
+        want_alpha_(main_.num_channels_ == 2 || main_.num_channels_ == 4),
         has_alpha_(has_alpha),
-        alpha_c_(alpha_c) {}
+        unpremul_alpha_(unpremul_alpha),
+        alpha_c_(alpha_c),
+        flip_x_(ShouldFlipX(undo_orientation)),
+        flip_y_(ShouldFlipY(undo_orientation)),
+        transpose_(ShouldTranspose(undo_orientation)),
+        opaque_alpha_(kMaxPixelsPerCall, 1.0f) {
+    for (size_t ec = 0; ec < extra_output.size(); ++ec) {
+      if (extra_output[ec].callback.IsPresent() || extra_output[ec].buffer) {
+        Output extra(extra_output[ec]);
+        extra.channel_index_ = 3 + ec;
+        extra_channels_.push_back(extra);
+      }
+    }
+  }
+
+  WriteToOutputStage(const WriteToOutputStage&) = delete;
+  WriteToOutputStage& operator=(const WriteToOutputStage&) = delete;
+  WriteToOutputStage(WriteToOutputStage&&) = delete;
+  WriteToOutputStage& operator=(WriteToOutputStage&&) = delete;
+
+  ~WriteToOutputStage() override {
+    if (main_.run_opaque_) {
+      main_.pixel_callback_.destroy(main_.run_opaque_);
+    }
+    for (auto& extra : extra_channels_) {
+      if (extra.run_opaque_) {
+        extra.pixel_callback_.destroy(extra.run_opaque_);
+      }
+    }
+  }
 
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    if (ypos >= height_) return;
     JXL_DASSERT(xextra == 0);
-    size_t bytes = rgba_ ? 4 : 3;
-    const float* JXL_RESTRICT row_in_r = GetInputRow(input_rows, 0, 0);
-    const float* JXL_RESTRICT row_in_g = GetInputRow(input_rows, 1, 0);
-    const float* JXL_RESTRICT row_in_b = GetInputRow(input_rows, 2, 0);
-    const float* JXL_RESTRICT row_in_a =
-        has_alpha_ ? GetInputRow(input_rows, alpha_c_, 0) : nullptr;
-    size_t base_ptr = ypos * stride_ + bytes * (xpos - xextra);
-    using D = HWY_CAPPED(float, 4);
-    const D d;
-    D::Rebind<uint32_t> du;
-    auto zero = Zero(d);
-    auto one = Set(d, 1.0f);
-    auto mul = Set(d, 255.0f);
+    JXL_DASSERT(main_.run_opaque_ || main_.buffer_);
+    if (ypos >= height_) return;
+    if (xpos >= width_) return;
+    if (flip_y_) {
+      ypos = height_ - 1u - ypos;
+    }
+    size_t limit = std::min(xsize, width_ - xpos);
+    for (size_t x0 = 0; x0 < limit; x0 += kMaxPixelsPerCall) {
+      size_t xstart = xpos + x0;
+      size_t len = std::min<size_t>(kMaxPixelsPerCall, limit - x0);
+
+      const float* line_buffers[4];
+      for (size_t c = 0; c < num_color_; c++) {
+        line_buffers[c] = GetInputRow(input_rows, c, 0) + x0;
+      }
+      if (has_alpha_) {
+        line_buffers[num_color_] = GetInputRow(input_rows, alpha_c_, 0) + x0;
+      } else {
+        // opaque_alpha_ is a way to set all values to 1.0f.
+        line_buffers[num_color_] = opaque_alpha_.data();
+      }
+      if (has_alpha_ && want_alpha_ && unpremul_alpha_) {
+        UnpremulAlpha(thread_id, len, line_buffers);
+      }
+      OutputBuffers(main_, thread_id, ypos, xstart, len, line_buffers);
+      for (const auto& extra : extra_channels_) {
+        line_buffers[0] = GetInputRow(input_rows, extra.channel_index_, 0) + x0;
+        OutputBuffers(extra, thread_id, ypos, xstart, len, line_buffers);
+      }
+    }
+  }
+
+  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
+    if (c < num_color_ || (has_alpha_ && c == alpha_c_)) {
+      return RenderPipelineChannelMode::kInput;
+    }
+    for (const auto& extra : extra_channels_) {
+      if (c == extra.channel_index_) {
+        return RenderPipelineChannelMode::kInput;
+      }
+    }
+    return RenderPipelineChannelMode::kIgnored;
+  }
 
-    ssize_t x1 = RoundUpTo(xsize, Lanes(d));
+  const char* GetName() const override { return "WritePixelCB"; }
 
-    msan::UnpoisonMemory(row_in_r + xsize, sizeof(float) * (x1 - xsize));
-    msan::UnpoisonMemory(row_in_g + xsize, sizeof(float) * (x1 - xsize));
-    msan::UnpoisonMemory(row_in_b + xsize, sizeof(float) * (x1 - xsize));
-    if (row_in_a) {
-      msan::UnpoisonMemory(row_in_a + xsize, sizeof(float) * (x1 - xsize));
+ private:
+  struct Output {
+    Output(const ImageOutput& image_out)
+        : pixel_callback_(image_out.callback),
+          buffer_(image_out.buffer),
+          buffer_size_(image_out.buffer_size),
+          stride_(image_out.stride),
+          num_channels_(image_out.format.num_channels),
+          swap_endianness_(SwapEndianness(image_out.format.endianness)),
+          data_type_(image_out.format.data_type),
+          bits_per_sample_(image_out.bits_per_sample) {}
+
+    Status PrepareForThreads(size_t num_threads) {
+      if (pixel_callback_.IsPresent()) {
+        run_opaque_ =
+            pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall);
+        JXL_RETURN_IF_ERROR(run_opaque_ != nullptr);
+      } else {
+        JXL_RETURN_IF_ERROR(buffer_ != nullptr);
+      }
+      return true;
+    }
+
+    PixelCallback pixel_callback_;
+    void* run_opaque_ = nullptr;
+    void* buffer_ = nullptr;
+    size_t buffer_size_;
+    size_t stride_;
+    size_t num_channels_;
+    bool swap_endianness_;
+    JxlDataType data_type_;
+    size_t bits_per_sample_;
+    size_t channel_index_;  // used for extra_channels
+  };
+
+  Status PrepareForThreads(size_t num_threads) override {
+    JXL_RETURN_IF_ERROR(main_.PrepareForThreads(num_threads));
+    for (auto& extra : extra_channels_) {
+      JXL_RETURN_IF_ERROR(extra.PrepareForThreads(num_threads));
+    }
+    temp_out_.resize(num_threads);
+    for (CacheAlignedUniquePtr& temp : temp_out_) {
+      temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall *
+                           main_.num_channels_);
+    }
+    if ((has_alpha_ && want_alpha_ && unpremul_alpha_) || flip_x_) {
+      temp_in_.resize(num_threads * main_.num_channels_);
+      for (CacheAlignedUniquePtr& temp : temp_in_) {
+        temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall);
+      }
+    }
+    return true;
+  }
+  static bool ShouldFlipX(Orientation undo_orientation) {
+    return (undo_orientation == Orientation::kFlipHorizontal ||
+            undo_orientation == Orientation::kRotate180 ||
+            undo_orientation == Orientation::kRotate270 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+  static bool ShouldFlipY(Orientation undo_orientation) {
+    return (undo_orientation == Orientation::kFlipVertical ||
+            undo_orientation == Orientation::kRotate180 ||
+            undo_orientation == Orientation::kRotate90 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+  static bool ShouldTranspose(Orientation undo_orientation) {
+    return (undo_orientation == Orientation::kTranspose ||
+            undo_orientation == Orientation::kRotate90 ||
+            undo_orientation == Orientation::kRotate270 ||
+            undo_orientation == Orientation::kAntiTranspose);
+  }
+
+  void UnpremulAlpha(size_t thread_id, size_t len,
+                     const float** line_buffers) const {
+    const HWY_FULL(float) d;
+    auto one = Set(d, 1.0f);
+    float* temp_in[4];
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      size_t tix = thread_id * main_.num_channels_ + c;
+      temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+      memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+    }
+    auto small_alpha = Set(d, kSmallAlpha);
+    for (size_t ix = 0; ix < len; ix += Lanes(d)) {
+      auto alpha = LoadU(d, temp_in[num_color_] + ix);
+      auto mul = Div(one, Max(small_alpha, alpha));
+      for (size_t c = 0; c < num_color_; ++c) {
+        auto val = LoadU(d, temp_in[c] + ix);
+        StoreU(Mul(val, mul), d, temp_in[c] + ix);
+      }
+    }
+    for (size_t c = 0; c < main_.num_channels_; ++c) {
+      line_buffers[c] = temp_in[c];
     }
+  }
 
-    for (ssize_t x = 0; x < x1; x += Lanes(d)) {
-      auto rf = Mul(Clamp(zero, LoadU(d, row_in_r + x), one), mul);
-      auto gf = Mul(Clamp(zero, LoadU(d, row_in_g + x), one), mul);
-      auto bf = Mul(Clamp(zero, LoadU(d, row_in_b + x), one), mul);
-      auto af = row_in_a ? Mul(Clamp(zero, LoadU(d, row_in_a + x), one), mul)
-                         : Set(d, 255.0f);
-      auto r8 = U8FromU32(BitCast(du, NearestInt(rf)));
-      auto g8 = U8FromU32(BitCast(du, NearestInt(gf)));
-      auto b8 = U8FromU32(BitCast(du, NearestInt(bf)));
-      auto a8 = U8FromU32(BitCast(du, NearestInt(af)));
-      size_t n = xsize - x;
-      if (JXL_LIKELY(n >= Lanes(d))) {
-        StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, rgba_, Lanes(d), n,
-                  rgb_ + base_ptr + bytes * x);
+  void OutputBuffers(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, const float* input[4]) const {
+    if (flip_x_) {
+      FlipX(out, thread_id, len, &xstart, input);
+    }
+    if (out.data_type_ == JXL_TYPE_UINT8) {
+      uint8_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint8_t*>(temp_out_[thread_id].get());
+      StoreUnsignedRow(out, input, len, temp);
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_UINT16 ||
+               out.data_type_ == JXL_TYPE_FLOAT16) {
+      uint16_t* JXL_RESTRICT temp =
+          reinterpret_cast<uint16_t*>(temp_out_[thread_id].get());
+      if (out.data_type_ == JXL_TYPE_UINT16) {
+        StoreUnsignedRow(out, input, len, temp);
       } else {
-        StoreRGBA(D::Rebind<uint8_t>(), r8, g8, b8, a8, rgba_, n, n,
-                  rgb_ + base_ptr + bytes * x);
+        StoreFloat16Row(out, input, len, temp);
+      }
+      if (out.swap_endianness_) {
+        const HWY_FULL(uint16_t) du;
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; j += Lanes(du)) {
+          auto v = LoadU(du, temp + j);
+          auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
+          StoreU(vswap, du, temp + j);
+        }
       }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
+    } else if (out.data_type_ == JXL_TYPE_FLOAT) {
+      float* JXL_RESTRICT temp =
+          reinterpret_cast<float*>(temp_out_[thread_id].get());
+      StoreFloatRow(out, input, len, temp);
+      if (out.swap_endianness_) {
+        size_t output_len = len * out.num_channels_;
+        for (size_t j = 0; j < output_len; ++j) {
+          temp[j] = BSwapFloat(temp[j]);
+        }
+      }
+      WriteToOutput(out, thread_id, ypos, xstart, len, temp);
     }
   }
 
-  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
-    return c < 3 || (has_alpha_ && c == alpha_c_)
-               ? RenderPipelineChannelMode::kInput
-               : RenderPipelineChannelMode::kIgnored;
+  void FlipX(const Output& out, size_t thread_id, size_t len, size_t* xstart,
+             const float** line_buffers) const {
+    float* temp_in[4];
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      size_t tix = thread_id * main_.num_channels_ + c;
+      temp_in[c] = reinterpret_cast<float*>(temp_in_[tix].get());
+      if (temp_in[c] != line_buffers[c]) {
+        memcpy(temp_in[c], line_buffers[c], sizeof(float) * len);
+      }
+    }
+    size_t last = (len - 1u);
+    size_t num = (len / 2);
+    for (size_t i = 0; i < num; ++i) {
+      for (size_t c = 0; c < out.num_channels_; ++c) {
+        std::swap(temp_in[c][i], temp_in[c][last - i]);
+      }
+    }
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      line_buffers[c] = temp_in[c];
+    }
+    *xstart = width_ - *xstart - len;
   }
 
-  const char* GetName() const override { return "WriteToU8"; }
+  template <typename T>
+  void StoreUnsignedRow(const Output& out, const float* input[4], size_t len,
+                        T* output) const {
+    const HWY_FULL(float) d;
+    auto zero = Zero(d);
+    auto one = Set(d, 1.0f);
+    auto mul = Set(d, (1u << (out.bits_per_sample_)) - 1);
+    const Rebind<T, decltype(d)> du;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
+      }
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = Mul(Clamp(zero, LoadU(d, &input[0][i]), one), mul);
+        auto v1 = Mul(Clamp(zero, LoadU(d, &input[1][i]), one), mul);
+        auto v2 = Mul(Clamp(zero, LoadU(d, &input[2][i]), one), mul);
+        auto v3 = Mul(Clamp(zero, LoadU(d, &input[3][i]), one), mul);
+        StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
+                          DemoteTo(du, NearestInt(v1)),
+                          DemoteTo(du, NearestInt(v2)),
+                          DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
 
- private:
-  uint8_t* rgb_;
-  size_t stride_;
+  void StoreFloat16Row(const Output& out, const float* input[4], size_t len,
+                       uint16_t* output) const {
+    const HWY_FULL(float) d;
+    const Rebind<uint16_t, decltype(d)> du;
+    const Rebind<hwy::float16_t, decltype(d)> df16;
+    const size_t padding = RoundUpTo(len, Lanes(d)) - len;
+    for (size_t c = 0; c < out.num_channels_; ++c) {
+      msan::UnpoisonMemory(input[c] + len, sizeof(input[c][0]) * padding);
+    }
+    if (out.num_channels_ == 1) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        StoreU(BitCast(du, DemoteTo(df16, v0)), du, &output[i]);
+      }
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        StoreInterleaved2(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)), du, &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        StoreInterleaved3(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)), du, &output[3 * i]);
+      }
+    } else if (out.num_channels_ == 4) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        auto v0 = LoadU(d, &input[0][i]);
+        auto v1 = LoadU(d, &input[1][i]);
+        auto v2 = LoadU(d, &input[2][i]);
+        auto v3 = LoadU(d, &input[3][i]);
+        StoreInterleaved4(BitCast(du, DemoteTo(df16, v0)),
+                          BitCast(du, DemoteTo(df16, v1)),
+                          BitCast(du, DemoteTo(df16, v2)),
+                          BitCast(du, DemoteTo(df16, v3)), du, &output[4 * i]);
+      }
+    }
+    msan::PoisonMemory(output + out.num_channels_ * len,
+                       sizeof(output[0]) * out.num_channels_ * padding);
+  }
+
+  void StoreFloatRow(const Output& out, const float* input[4], size_t len,
+                     float* output) const {
+    const HWY_FULL(float) d;
+    if (out.num_channels_ == 1) {
+      memcpy(output, input[0], len * sizeof(output[0]));
+    } else if (out.num_channels_ == 2) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved2(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]), d,
+                          &output[2 * i]);
+      }
+    } else if (out.num_channels_ == 3) {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved3(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), d, &output[3 * i]);
+      }
+    } else {
+      for (size_t i = 0; i < len; i += Lanes(d)) {
+        StoreInterleaved4(LoadU(d, &input[0][i]), LoadU(d, &input[1][i]),
+                          LoadU(d, &input[2][i]), LoadU(d, &input[3][i]), d,
+                          &output[4 * i]);
+      }
+    }
+  }
+
+  template <typename T>
+  void WriteToOutput(const Output& out, size_t thread_id, size_t ypos,
+                     size_t xstart, size_t len, T* output) const {
+    if (transpose_) {
+      // TODO(szabadka) Buffer 8x8 chunks and transpose with SIMD.
+      if (out.run_opaque_) {
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          out.pixel_callback_.run(out.run_opaque_, thread_id, ypos, xstart + i,
+                                  1, output + j);
+        }
+      } else {
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);
+        const size_t offset = xstart * out.stride_ + ypos * pixel_stride;
+        for (size_t i = 0, j = 0; i < len; ++i, j += out.num_channels_) {
+          const size_t ix = offset + i * out.stride_;
+          JXL_DASSERT(ix + pixel_stride <= out.buffer_size_);
+          memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + ix, output + j,
+                 pixel_stride);
+        }
+      }
+    } else {
+      if (out.run_opaque_) {
+        out.pixel_callback_.run(out.run_opaque_, thread_id, xstart, ypos, len,
+                                output);
+      } else {
+        const size_t pixel_stride = out.num_channels_ * sizeof(T);
+        const size_t offset = ypos * out.stride_ + xstart * pixel_stride;
+        JXL_DASSERT(offset + len * pixel_stride <= out.buffer_size_);
+        memcpy(reinterpret_cast<uint8_t*>(out.buffer_) + offset, output,
+               len * pixel_stride);
+      }
+    }
+  }
+
+  static constexpr size_t kMaxPixelsPerCall = 1024;
+  size_t width_;
   size_t height_;
-  bool rgba_;
+  Output main_;  // color + alpha
+  size_t num_color_;
+  bool want_alpha_;
   bool has_alpha_;
+  bool unpremul_alpha_;
   size_t alpha_c_;
+  bool flip_x_;
+  bool flip_y_;
+  bool transpose_;
+  std::vector<Output> extra_channels_;
   std::vector<float> opaque_alpha_;
+  std::vector<CacheAlignedUniquePtr> temp_in_;
+  std::vector<CacheAlignedUniquePtr> temp_out_;
 };
 
-std::unique_ptr<RenderPipelineStage> GetWriteToU8Stage(uint8_t* rgb,
-                                                       size_t stride,
-                                                       size_t height, bool rgba,
-                                                       bool has_alpha,
-                                                       size_t alpha_c) {
-  return jxl::make_unique<WriteToU8Stage>(rgb, stride, height, rgba, has_alpha,
-                                          alpha_c);
+constexpr size_t WriteToOutputStage::kMaxPixelsPerCall;
+
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {
+  return jxl::make_unique<WriteToOutputStage>(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -178,7 +478,7 @@ HWY_AFTER_NAMESPACE();
 
 namespace jxl {
 
-HWY_EXPORT(GetWriteToU8Stage);
+HWY_EXPORT(GetWriteToOutputStage);
 
 namespace {
 class WriteToImageBundleStage : public RenderPipelineStage {
@@ -275,107 +575,6 @@ class WriteToImage3FStage : public RenderPipelineStage {
   Image3F* image_;
 };
 
-class WriteToPixelCallbackStage : public RenderPipelineStage {
- public:
-  WriteToPixelCallbackStage(const PixelCallback& pixel_callback, size_t width,
-                            size_t height, bool rgba, bool has_alpha,
-                            bool unpremul_alpha, size_t alpha_c)
-      : RenderPipelineStage(RenderPipelineStage::Settings()),
-        pixel_callback_(pixel_callback),
-        width_(width),
-        height_(height),
-        rgba_(rgba),
-        has_alpha_(has_alpha),
-        unpremul_alpha_(unpremul_alpha),
-        alpha_c_(alpha_c),
-        opaque_alpha_(kMaxPixelsPerCall, 1.0f) {}
-
-  WriteToPixelCallbackStage(const WriteToPixelCallbackStage&) = delete;
-  WriteToPixelCallbackStage& operator=(const WriteToPixelCallbackStage&) =
-      delete;
-  WriteToPixelCallbackStage(WriteToPixelCallbackStage&&) = delete;
-  WriteToPixelCallbackStage& operator=(WriteToPixelCallbackStage&&) = delete;
-
-  ~WriteToPixelCallbackStage() override {
-    if (run_opaque_) {
-      pixel_callback_.destroy(run_opaque_);
-    }
-  }
-
-  void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
-                  size_t xextra, size_t xsize, size_t xpos, size_t ypos,
-                  size_t thread_id) const final {
-    JXL_DASSERT(run_opaque_);
-    if (ypos >= height_) return;
-    const float* line_buffers[4];
-    for (size_t c = 0; c < 3; c++) {
-      line_buffers[c] = GetInputRow(input_rows, c, 0) - xextra;
-    }
-    if (has_alpha_) {
-      line_buffers[3] = GetInputRow(input_rows, alpha_c_, 0) - xextra;
-    } else {
-      // No xextra offset; opaque_alpha_ is a way to set all values to 1.0f.
-      line_buffers[3] = opaque_alpha_.data();
-    }
-
-    // TODO(veluca): SIMD.
-    ssize_t limit = std::min(xextra + xsize, width_ - xpos);
-    for (ssize_t x0 = -xextra; x0 < limit; x0 += kMaxPixelsPerCall) {
-      size_t j = 0;
-      size_t ix = 0;
-      float* JXL_RESTRICT temp =
-          reinterpret_cast<float*>(temp_[thread_id].get());
-      for (; ix < kMaxPixelsPerCall && ssize_t(ix) + x0 < limit; ix++) {
-        temp[j++] = line_buffers[0][ix];
-        temp[j++] = line_buffers[1][ix];
-        temp[j++] = line_buffers[2][ix];
-        if (rgba_) {
-          temp[j++] = line_buffers[3][ix];
-        }
-      }
-      if (has_alpha_ && rgba_ && unpremul_alpha_) {
-        // TODO(szabadka) SIMDify (possibly in a separate pipeline stage).
-        UnpremultiplyAlpha(temp, ix);
-      }
-      pixel_callback_.run(run_opaque_, thread_id, xpos + x0, ypos, ix, temp);
-      for (size_t c = 0; c < 3; c++) line_buffers[c] += kMaxPixelsPerCall;
-      if (has_alpha_) line_buffers[3] += kMaxPixelsPerCall;
-    }
-  }
-
-  RenderPipelineChannelMode GetChannelMode(size_t c) const final {
-    return c < 3 || (has_alpha_ && c == alpha_c_)
-               ? RenderPipelineChannelMode::kInput
-               : RenderPipelineChannelMode::kIgnored;
-  }
-
-  const char* GetName() const override { return "WritePixelCB"; }
-
- private:
-  Status PrepareForThreads(size_t num_threads) override {
-    run_opaque_ =
-        pixel_callback_.Init(num_threads, /*num_pixels=*/kMaxPixelsPerCall);
-    JXL_RETURN_IF_ERROR(run_opaque_ != nullptr);
-    temp_.resize(num_threads);
-    for (CacheAlignedUniquePtr& temp : temp_) {
-      temp = AllocateArray(sizeof(float) * kMaxPixelsPerCall * (rgba_ ? 4 : 3));
-    }
-    return true;
-  }
-
-  static constexpr size_t kMaxPixelsPerCall = 1024;
-  PixelCallback pixel_callback_;
-  void* run_opaque_ = nullptr;
-  size_t width_;
-  size_t height_;
-  bool rgba_;
-  bool has_alpha_;
-  bool unpremul_alpha_;
-  size_t alpha_c_;
-  std::vector<float> opaque_alpha_;
-  std::vector<CacheAlignedUniquePtr> temp_;
-};
-
 }  // namespace
 
 std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
@@ -388,20 +587,13 @@ std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image) {
   return jxl::make_unique<WriteToImage3FStage>(image);
 }
 
-std::unique_ptr<RenderPipelineStage> GetWriteToU8Stage(uint8_t* rgb,
-                                                       size_t stride,
-                                                       size_t height, bool rgba,
-                                                       bool has_alpha,
-                                                       size_t alpha_c) {
-  return HWY_DYNAMIC_DISPATCH(GetWriteToU8Stage)(rgb, stride, height, rgba,
-                                                 has_alpha, alpha_c);
-}
-
-std::unique_ptr<RenderPipelineStage> GetWriteToPixelCallbackStage(
-    const PixelCallback& pixel_callback, size_t width, size_t height, bool rgba,
-    bool has_alpha, bool unpremul_alpha, size_t alpha_c) {
-  return jxl::make_unique<WriteToPixelCallbackStage>(
-      pixel_callback, width, height, rgba, has_alpha, unpremul_alpha, alpha_c);
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output) {
+  return HWY_DYNAMIC_DISPATCH(GetWriteToOutputStage)(
+      main_output, width, height, has_alpha, unpremul_alpha, alpha_c,
+      undo_orientation, extra_output);
 }
 
 }  // namespace jxl
index b942fd6..c5f844e 100644 (file)
@@ -20,17 +20,11 @@ std::unique_ptr<RenderPipelineStage> GetWriteToImageBundleStage(
 // Gets a stage to write color channels to an Image3F.
 std::unique_ptr<RenderPipelineStage> GetWriteToImage3FStage(Image3F* image);
 
-// Gets a stage to write to a uint8 buffer.
-std::unique_ptr<RenderPipelineStage> GetWriteToU8Stage(uint8_t* rgb,
-                                                       size_t stride,
-                                                       size_t height, bool rgba,
-                                                       bool has_alpha,
-                                                       size_t alpha_c);
-
-// Gets a stage to write to a pixel callback.
-std::unique_ptr<RenderPipelineStage> GetWriteToPixelCallbackStage(
-    const PixelCallback& pixel_callback, size_t width, size_t height, bool rgba,
-    bool has_alpha, bool unpremul_alpha, size_t alpha_c);
+// Gets a stage to write to a pixel callback or image buffer.
+std::unique_ptr<RenderPipelineStage> GetWriteToOutputStage(
+    const ImageOutput& main_output, size_t width, size_t height, bool has_alpha,
+    bool unpremul_alpha, size_t alpha_c, Orientation undo_orientation,
+    std::vector<ImageOutput>& extra_output);
 
 }  // namespace jxl
 
index 0022a61..56e86e6 100644 (file)
@@ -10,6 +10,9 @@
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/cms/opsin_params.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/dec_xyb-inl.h"
 #include "lib/jxl/sanitizers.h"
 
@@ -19,15 +22,15 @@ namespace HWY_NAMESPACE {
 
 class XYBStage : public RenderPipelineStage {
  public:
-  explicit XYBStage(const OpsinParams& opsin_params)
+  explicit XYBStage(const OutputEncodingInfo& output_encoding_info)
       : RenderPipelineStage(RenderPipelineStage::Settings()),
-        opsin_params_(opsin_params) {}
+        opsin_params_(output_encoding_info.opsin_params),
+        output_is_xyb_(output_encoding_info.color_encoding.GetColorSpace() ==
+                       ColorSpace::kXYB) {}
 
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("UndoXYB");
-
     const HWY_FULL(float) d;
     JXL_ASSERT(xextra == 0);
     const size_t xsize_v = RoundUpTo(xsize, Lanes(d));
@@ -42,18 +45,38 @@ class XYBStage : public RenderPipelineStage {
     msan::UnpoisonMemory(row2 + xsize, sizeof(float) * (xsize_v - xsize));
     // TODO(eustas): when using frame origin, addresses might be unaligned;
     //               making them aligned will void performance penalty.
-    for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
-      const auto in_opsin_x = LoadU(d, row0 + x);
-      const auto in_opsin_y = LoadU(d, row1 + x);
-      const auto in_opsin_b = LoadU(d, row2 + x);
-      auto r = Undefined(d);
-      auto g = Undefined(d);
-      auto b = Undefined(d);
-      XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g,
-               &b);
-      StoreU(r, d, row0 + x);
-      StoreU(g, d, row1 + x);
-      StoreU(b, d, row2 + x);
+    if (output_is_xyb_) {
+      const auto scale_x = Set(d, jxl::cms::kScaledXYBScale[0]);
+      const auto scale_y = Set(d, jxl::cms::kScaledXYBScale[1]);
+      const auto scale_bmy = Set(d, jxl::cms::kScaledXYBScale[2]);
+      const auto offset_x = Set(d, jxl::cms::kScaledXYBOffset[0]);
+      const auto offset_y = Set(d, jxl::cms::kScaledXYBOffset[1]);
+      const auto offset_bmy = Set(d, jxl::cms::kScaledXYBOffset[2]);
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_x = LoadU(d, row0 + x);
+        const auto in_y = LoadU(d, row1 + x);
+        const auto in_b = LoadU(d, row2 + x);
+        auto out_x = Mul(Add(in_x, offset_x), scale_x);
+        auto out_y = Mul(Add(in_y, offset_y), scale_y);
+        auto out_b = Mul(Add(Sub(in_b, in_y), offset_bmy), scale_bmy);
+        StoreU(out_x, d, row0 + x);
+        StoreU(out_y, d, row1 + x);
+        StoreU(out_b, d, row2 + x);
+      }
+    } else {
+      for (ssize_t x = -xextra; x < (ssize_t)(xsize + xextra); x += Lanes(d)) {
+        const auto in_opsin_x = LoadU(d, row0 + x);
+        const auto in_opsin_y = LoadU(d, row1 + x);
+        const auto in_opsin_b = LoadU(d, row2 + x);
+        auto r = Undefined(d);
+        auto g = Undefined(d);
+        auto b = Undefined(d);
+        XybToRgb(d, in_opsin_x, in_opsin_y, in_opsin_b, opsin_params_, &r, &g,
+                 &b);
+        StoreU(r, d, row0 + x);
+        StoreU(g, d, row1 + x);
+        StoreU(b, d, row2 + x);
+      }
     }
     msan::PoisonMemory(row0 + xsize, sizeof(float) * (xsize_v - xsize));
     msan::PoisonMemory(row1 + xsize, sizeof(float) * (xsize_v - xsize));
@@ -69,11 +92,12 @@ class XYBStage : public RenderPipelineStage {
 
  private:
   const OpsinParams opsin_params_;
+  const bool output_is_xyb_;
 };
 
 std::unique_ptr<RenderPipelineStage> GetXYBStage(
-    const OpsinParams& opsin_params) {
-  return jxl::make_unique<XYBStage>(opsin_params);
+    const OutputEncodingInfo& output_encoding_info) {
+  return jxl::make_unique<XYBStage>(output_encoding_info);
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -87,10 +111,11 @@ namespace jxl {
 HWY_EXPORT(GetXYBStage);
 
 std::unique_ptr<RenderPipelineStage> GetXYBStage(
-    const OpsinParams& opsin_params) {
-  return HWY_DYNAMIC_DISPATCH(GetXYBStage)(opsin_params);
+    const OutputEncodingInfo& output_encoding_info) {
+  return HWY_DYNAMIC_DISPATCH(GetXYBStage)(output_encoding_info);
 }
 
+#if !JXL_HIGH_PRECISION
 namespace {
 class FastXYBStage : public RenderPipelineStage {
  public:
@@ -147,6 +172,7 @@ std::unique_ptr<RenderPipelineStage> GetFastXYBTosRGB8Stage(
   return make_unique<FastXYBStage>(rgb, stride, width, height, rgba, has_alpha,
                                    alpha_c);
 }
+#endif
 
 }  // namespace jxl
 #endif
index 2bc5075..7b06345 100644 (file)
@@ -14,7 +14,7 @@ namespace jxl {
 
 // Converts the color channels from XYB to linear with appropriate primaries.
 std::unique_ptr<RenderPipelineStage> GetXYBStage(
-    const OpsinParams& output_encoding_info);
+    const OutputEncodingInfo& output_encoding_info);
 
 // Gets a stage to convert with fixed point arithmetic from XYB to sRGB8 and
 // write to a uint8 buffer.
index 5cba4a7..30ad327 100644 (file)
@@ -25,8 +25,6 @@ class kYCbCrStage : public RenderPipelineStage {
   void ProcessRow(const RowInfo& input_rows, const RowInfo& output_rows,
                   size_t xextra, size_t xsize, size_t xpos, size_t ypos,
                   size_t thread_id) const final {
-    PROFILER_ZONE("UndoYCbCr");
-
     const HWY_FULL(float) df;
 
     // Full-range BT.601 as defined by JFIF Clause 7:
index 9320c97..3e99af7 100644 (file)
@@ -7,7 +7,6 @@
 #define LIB_JXL_RENDER_PIPELINE_STAGE_YCBCR_H_
 #include <math.h>
 #include <stdint.h>
-#include <stdio.h>
 
 #include <algorithm>
 #include <utility>
index d057172..0eb4393 100644 (file)
@@ -3,28 +3,32 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/cms.h>
+#include <jxl/codestream_header.h>
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+#include <jxl/types.h>
+
 #include <cmath>  // std::abs
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #include <vector>
 
-#include "gtest/gtest.h"
-#include "jxl/codestream_header.h"
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/encode.h"
-#include "jxl/encode_cxx.h"
-#include "jxl/types.h"
 #include "lib/extras/codec.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_external_image.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
 #include "lib/jxl/enc_comparator.h"
 #include "lib/jxl/enc_external_image.h"
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace {
 
@@ -34,7 +38,7 @@ namespace {
 jxl::CodecInOut ConvertTestImage(const std::vector<uint8_t>& buf,
                                  const size_t xsize, const size_t ysize,
                                  const JxlPixelFormat& pixel_format,
-                                 const jxl::PaddedBytes& icc_profile) {
+                                 const jxl::Bytes& icc_profile) {
   jxl::CodecInOut io;
   io.SetSize(xsize, ysize);
 
@@ -56,58 +60,52 @@ jxl::CodecInOut ConvertTestImage(const std::vector<uint8_t>& buf,
         io.metadata.m.SetAlphaBits(16);
         break;
       default:
-        EXPECT_TRUE(false) << "Roundtrip tests for data type "
-                           << pixel_format.data_type << " not yet implemented.";
+        ADD_FAILURE() << "Roundtrip tests for data type "
+                      << pixel_format.data_type << " not yet implemented.";
     }
   }
   size_t bitdepth = 0;
-  bool float_in = false;
   switch (pixel_format.data_type) {
     case JXL_TYPE_FLOAT:
       bitdepth = 32;
-      float_in = true;
       io.metadata.m.SetFloat32Samples();
       break;
     case JXL_TYPE_FLOAT16:
       bitdepth = 16;
-      float_in = true;
       io.metadata.m.SetFloat16Samples();
       break;
     case JXL_TYPE_UINT8:
       bitdepth = 8;
-      float_in = false;
       io.metadata.m.SetUintSamples(8);
       break;
     case JXL_TYPE_UINT16:
       bitdepth = 16;
-      float_in = false;
       io.metadata.m.SetUintSamples(16);
       break;
     default:
-      EXPECT_TRUE(false) << "Roundtrip tests for data type "
-                         << pixel_format.data_type << " not yet implemented.";
+      ADD_FAILURE() << "Roundtrip tests for data type "
+                    << pixel_format.data_type << " not yet implemented.";
   }
   jxl::ColorEncoding color_encoding;
   if (!icc_profile.empty()) {
-    jxl::PaddedBytes icc_profile_copy(icc_profile);
-    EXPECT_TRUE(color_encoding.SetICC(std::move(icc_profile_copy)));
+    jxl::IccBytes icc_profile_copy;
+    icc_profile.AppendTo(&icc_profile_copy);
+    EXPECT_TRUE(
+        color_encoding.SetICC(std::move(icc_profile_copy), JxlGetDefaultCms()));
   } else if (pixel_format.data_type == JXL_TYPE_FLOAT) {
     color_encoding = jxl::ColorEncoding::LinearSRGB(is_gray);
   } else {
     color_encoding = jxl::ColorEncoding::SRGB(is_gray);
   }
-  EXPECT_TRUE(ConvertFromExternal(
-      jxl::Span<const uint8_t>(buf.data(), buf.size()), xsize, ysize,
-      color_encoding, pixel_format.num_channels,
-      /*alpha_is_premultiplied=*/false,
-      /*bits_per_sample=*/bitdepth, pixel_format.endianness,
-      /*pool=*/nullptr, &io.Main(), float_in,
-      /*align=*/0));
+  EXPECT_TRUE(ConvertFromExternal(jxl::Bytes(buf.data(), buf.size()), xsize,
+                                  ysize, color_encoding,
+                                  /*bits_per_sample=*/bitdepth, pixel_format,
+                                  /*pool=*/nullptr, &io.Main()));
   return io;
 }
 
 template <typename T>
-T ConvertTestPixel(const float val);
+T ConvertTestPixel(float val);
 
 template <>
 float ConvertTestPixel<float>(const float val) {
@@ -175,7 +173,7 @@ void EncodeWithEncoder(JxlEncoder* enc, std::vector<uint8_t>* compressed) {
   EXPECT_EQ(JXL_ENC_SUCCESS, process_result);
 }
 
-// Generates some pixels using using some dimensions and pixel_format,
+// Generates some pixels using some dimensions and pixel_format,
 // compresses them, and verifies that the decoded version is similar to the
 // original pixels.
 // TODO(firsching): change this to be a parameterized test, like in
@@ -188,7 +186,8 @@ void VerifyRoundtripCompression(
     const bool use_container, const uint32_t resampling = 1,
     const bool already_downsampled = false,
     const std::vector<std::pair<JxlExtraChannelType, std::string>>&
-        extra_channels = {}) {
+        extra_channels = {},
+    const int upsampling_mode = -1) {
   size_t orig_xsize = xsize;
   size_t orig_ysize = ysize;
   if (already_downsampled) {
@@ -231,19 +230,13 @@ void VerifyRoundtripCompression(
   }
   if (alpha_in_extra_channels_vector && !has_interleaved_alpha) {
     jxl::ImageF alpha_channel(xsize, ysize);
-
-    EXPECT_EQ(
-        jxl::ConvertFromExternal(
-            jxl::Span<const uint8_t>(extra_channel_bytes.data(),
-                                     extra_channel_bytes.size()),
-            xsize, ysize, basic_info.bits_per_sample,
-            input_pixel_format.endianness, /*pool=*/nullptr, &alpha_channel,
-            /*float_in=*/input_pixel_format.data_type == JXL_TYPE_FLOAT,
-            /*align=*/0),
-        true);
+    EXPECT_TRUE(jxl::ConvertFromExternal(
+        jxl::Bytes(extra_channel_bytes.data(), extra_channel_bytes.size()),
+        xsize, ysize, basic_info.bits_per_sample, extra_channel_pixel_format, 0,
+        /*pool=*/nullptr, &alpha_channel));
 
     original_io.metadata.m.SetAlphaBits(basic_info.bits_per_sample);
-    original_io.Main().SetAlpha(std::move(alpha_channel), false);
+    original_io.Main().SetAlpha(std::move(alpha_channel));
     output_pixel_format_with_extra_channel_alpha.num_channels++;
   }
   // Those are the num_extra_channels including a potential alpha channel.
@@ -281,6 +274,13 @@ void VerifyRoundtripCompression(
                                             name.c_str(), name.length()));
   }
   EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetColorEncoding(enc, &color_encoding));
+  if (resampling > 1) {
+    EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetUpsamplingMode(enc, 3, 0));
+    EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetUpsamplingMode(enc, resampling, -2));
+    EXPECT_EQ(JXL_ENC_ERROR, JxlEncoderSetUpsamplingMode(enc, resampling, 2));
+  }
+  EXPECT_EQ(JXL_ENC_SUCCESS,
+            JxlEncoderSetUpsamplingMode(enc, resampling, upsampling_mode));
   JxlEncoderFrameSettings* frame_settings =
       JxlEncoderFrameSettingsCreate(enc, nullptr);
   JxlEncoderSetFrameLossless(frame_settings, lossless);
@@ -306,7 +306,7 @@ void VerifyRoundtripCompression(
   for (size_t index = 0; index < channel_infos.size(); index++) {
     EXPECT_EQ(JXL_ENC_SUCCESS,
               JxlEncoderSetExtraChannelBuffer(
-                  frame_settings, &input_pixel_format,
+                  frame_settings, &extra_channel_pixel_format,
                   (void*)extra_channel_bytes.data(), extra_channel_bytes.size(),
                   index + has_interleaved_alpha));
   }
@@ -353,14 +353,12 @@ void VerifyRoundtripCompression(
 
   size_t icc_profile_size;
   EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetICCProfileSize(
-                dec, &output_pixel_format_with_extra_channel_alpha,
-                JXL_COLOR_PROFILE_TARGET_DATA, &icc_profile_size));
-  jxl::PaddedBytes icc_profile(icc_profile_size);
-  EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetColorAsICCProfile(
-                dec, &output_pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
-                icc_profile.data(), icc_profile.size()));
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                        &icc_profile_size));
+  std::vector<uint8_t> icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                 dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 icc_profile.data(), icc_profile.size()));
 
   std::vector<uint8_t> decoded_bytes(buffer_size);
 
@@ -414,7 +412,7 @@ void VerifyRoundtripCompression(
 
   jxl::CodecInOut decoded_io = ConvertTestImage(
       decoded_bytes, xsize, ysize, output_pixel_format_with_extra_channel_alpha,
-      icc_profile);
+      jxl::Bytes(icc_profile));
 
   if (already_downsampled) {
     jxl::Image3F* color = decoded_io.Main().color();
@@ -426,14 +424,20 @@ void VerifyRoundtripCompression(
     decoded_io.SetSize(color->xsize(), color->ysize());
   }
 
-  jxl::ButteraugliParams ba;
-  float butteraugli_score =
-      ButteraugliDistance(original_io, decoded_io, ba, jxl::GetJxlCms(),
-                          /*distmap=*/nullptr, nullptr);
   if (lossless && !already_downsampled) {
-    EXPECT_LE(butteraugli_score, 0.0f);
+    JXL_EXPECT_OK(jxl::SamePixels(*original_io.Main().color(),
+                                  *decoded_io.Main().color(), _));
   } else {
-    EXPECT_LE(butteraugli_score, 2.0f);
+    jxl::ButteraugliParams ba;
+    float butteraugli_score = ButteraugliDistance(
+        original_io.frames, decoded_io.frames, ba, *JxlGetDefaultCms(),
+        /*distmap=*/nullptr, nullptr);
+    float target_score = 1.3f;
+    // upsampling mode 1 (unlike default and NN) does not downscale back to the
+    // already downsampled image
+    if (upsampling_mode == 1 && resampling >= 4 && already_downsampled)
+      target_score = 15.f;
+    EXPECT_LE(butteraugli_score, target_score);
   }
   JxlPixelFormat extra_channel_output_pixel_format = output_pixel_format;
   extra_channel_output_pixel_format.num_channels = 1;
@@ -540,7 +544,7 @@ TEST(RoundtripTest, TestNonlinearSrgbAsXybEncoded) {
           JxlPixelFormat{num_channels, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
       VerifyRoundtripCompression<uint8_t>(
           63, 129, pixel_format_in, pixel_format_out,
-          /*lossless=*/false, (bool)use_container, {});
+          /*lossless=*/false, (bool)use_container, 1, false, {});
     }
   }
 }
@@ -556,10 +560,15 @@ TEST(RoundtripTest, Resampling) {
   // TODO(lode): also make this work for odd sizes. This requires a fix in
   // enc_frame.cc to not set custom_size_or_origin to true due to even/odd
   // mismatch.
-  VerifyRoundtripCompression<uint8_t>(64, 128, pixel_format, pixel_format,
-                                      /*lossless=*/true,
-                                      /*use_container=*/false, 2,
-                                      /*already_downsampled=*/true);
+  for (int factor : {2, 4, 8}) {
+    for (int upsampling_mode : {-1, 0, 1}) {
+      VerifyRoundtripCompression<uint8_t>(
+          64, 128, pixel_format, pixel_format,
+          /*lossless=*/true,
+          /*use_container=*/false, factor,
+          /*already_downsampled=*/true, /*extra_channels=*/{}, upsampling_mode);
+    }
+  }
 }
 
 TEST(RoundtripTest, ExtraBoxesTest) {
@@ -638,14 +647,12 @@ TEST(RoundtripTest, ExtraBoxesTest) {
 
   size_t icc_profile_size;
   EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetICCProfileSize(dec, &pixel_format,
-                                        JXL_COLOR_PROFILE_TARGET_DATA,
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
                                         &icc_profile_size));
-  jxl::PaddedBytes icc_profile(icc_profile_size);
-  EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetColorAsICCProfile(
-                dec, &pixel_format, JXL_COLOR_PROFILE_TARGET_DATA,
-                icc_profile.data(), icc_profile.size()));
+  std::vector<uint8_t> icc_profile(icc_profile_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                 dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                 icc_profile.data(), icc_profile.size()));
 
   std::vector<uint8_t> decoded_bytes(buffer_size);
 
@@ -659,14 +666,152 @@ TEST(RoundtripTest, ExtraBoxesTest) {
 
   JxlDecoderDestroy(dec);
 
-  jxl::CodecInOut decoded_io =
-      ConvertTestImage(decoded_bytes, xsize, ysize, pixel_format, icc_profile);
+  jxl::CodecInOut decoded_io = ConvertTestImage(
+      decoded_bytes, xsize, ysize, pixel_format, jxl::Bytes(icc_profile));
 
   jxl::ButteraugliParams ba;
-  float butteraugli_score =
-      ButteraugliDistance(original_io, decoded_io, ba, jxl::GetJxlCms(),
-                          /*distmap=*/nullptr, nullptr);
-  EXPECT_LE(butteraugli_score, 2.0f);
+  float butteraugli_score = ButteraugliDistance(
+      original_io.frames, decoded_io.frames, ba, *JxlGetDefaultCms(),
+      /*distmap=*/nullptr, nullptr);
+  EXPECT_LE(butteraugli_score, 1.0f);
+}
+
+TEST(RoundtripTest, MultiFrameTest) {
+  JxlPixelFormat pixel_format =
+      JxlPixelFormat{4, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+  const size_t xsize = 61;
+  const size_t ysize = 71;
+  const size_t nb_frames = 4;
+  size_t compressed_size = 0;
+
+  for (int index_frames : {0, 1}) {
+    // use a vertical filmstrip of nb_frames frames
+    const std::vector<uint8_t> original_bytes =
+        GetTestImage<float>(xsize, ysize * nb_frames, pixel_format);
+    jxl::CodecInOut original_io = ConvertTestImage(
+        original_bytes, xsize, ysize * nb_frames, pixel_format, {});
+
+    JxlEncoder* enc = JxlEncoderCreate(nullptr);
+    EXPECT_NE(nullptr, enc);
+
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderUseContainer(enc, true));
+    JxlBasicInfo basic_info;
+    jxl::test::JxlBasicInfoSetFromPixelFormat(&basic_info, &pixel_format);
+    basic_info.xsize = xsize;
+    basic_info.ysize = ysize;
+    basic_info.uses_original_profile = JXL_FALSE;
+    basic_info.have_animation = JXL_TRUE;
+    basic_info.animation.tps_numerator = 1;
+    basic_info.animation.tps_denominator = 1;
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
+
+    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
+    JxlColorEncoding color_encoding;
+    if (pixel_format.data_type == JXL_TYPE_FLOAT) {
+      JxlColorEncodingSetToLinearSRGB(
+          &color_encoding,
+          /*is_gray=*/pixel_format.num_channels < 3);
+    } else {
+      JxlColorEncodingSetToSRGB(&color_encoding,
+                                /*is_gray=*/pixel_format.num_channels < 3);
+    }
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderSetColorEncoding(enc, &color_encoding));
+    JxlEncoderFrameSettings* frame_settings =
+        JxlEncoderFrameSettingsCreate(enc, nullptr);
+    JxlEncoderSetFrameLossless(frame_settings, false);
+    if (index_frames == 1) {
+      EXPECT_EQ(JXL_ENC_SUCCESS,
+                JxlEncoderFrameSettingsSetOption(frame_settings,
+                                                 JXL_ENC_FRAME_INDEX_BOX, 1));
+    }
+
+    size_t oneframesize = original_bytes.size() / nb_frames;
+    JxlFrameHeader frame_header;
+    JxlEncoderInitFrameHeader(&frame_header);
+    frame_header.duration = 1;
+    frame_header.is_last = JXL_FALSE;
+
+    for (size_t i = 0; i < nb_frames; i++) {
+      if (i + 1 == nb_frames) frame_header.is_last = JXL_TRUE;
+      JxlEncoderSetFrameHeader(frame_settings, &frame_header);
+      EXPECT_EQ(
+          JXL_ENC_SUCCESS,
+          JxlEncoderAddImageFrame(
+              frame_settings, &pixel_format,
+              (void*)(original_bytes.data() + oneframesize * i), oneframesize));
+    }
+    JxlEncoderCloseInput(enc);
+
+    std::vector<uint8_t> compressed;
+    EncodeWithEncoder(enc, &compressed);
+    JxlEncoderDestroy(enc);
+
+    JxlDecoder* dec = JxlDecoderCreate(nullptr);
+    EXPECT_NE(nullptr, dec);
+
+    const uint8_t* next_in = compressed.data();
+    size_t avail_in = compressed.size();
+
+    if (index_frames == 0) {
+      compressed_size = avail_in;
+    } else {
+      // a non-empty jxli box should be added
+      EXPECT_LE(avail_in, compressed_size + 50);
+      EXPECT_GE(avail_in, compressed_size + 10);
+    }
+
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderSubscribeEvents(dec, JXL_DEC_BASIC_INFO |
+                                                 JXL_DEC_COLOR_ENCODING |
+                                                 JXL_DEC_FULL_IMAGE));
+
+    JxlDecoderSetInput(dec, next_in, avail_in);
+    EXPECT_EQ(JXL_DEC_BASIC_INFO, JxlDecoderProcessInput(dec));
+    size_t buffer_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderImageOutBufferSize(dec, &pixel_format, &buffer_size));
+    EXPECT_EQ(buffer_size, oneframesize);
+
+    JxlBasicInfo info;
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetBasicInfo(dec, &info));
+    EXPECT_EQ(xsize, info.xsize);
+    EXPECT_EQ(ysize, info.ysize);
+
+    EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
+
+    size_t icc_profile_size;
+    EXPECT_EQ(JXL_DEC_SUCCESS,
+              JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                          &icc_profile_size));
+    std::vector<uint8_t> icc_profile(icc_profile_size);
+    EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                   dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                   icc_profile.data(), icc_profile.size()));
+
+    std::vector<uint8_t> decoded_bytes(buffer_size * nb_frames);
+
+    EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec));
+
+    for (size_t i = 0; i < nb_frames; i++) {
+      EXPECT_EQ(JXL_DEC_SUCCESS,
+                JxlDecoderSetImageOutBuffer(
+                    dec, &pixel_format, decoded_bytes.data() + i * oneframesize,
+                    buffer_size));
+
+      EXPECT_EQ(JXL_DEC_FULL_IMAGE, JxlDecoderProcessInput(dec));
+    }
+    JxlDecoderDestroy(dec);
+    jxl::CodecInOut decoded_io =
+        ConvertTestImage(decoded_bytes, xsize, ysize * nb_frames, pixel_format,
+                         jxl::Bytes(icc_profile));
+
+    jxl::ButteraugliParams ba;
+    float butteraugli_score = ButteraugliDistance(
+        original_io.frames, decoded_io.frames, ba, *JxlGetDefaultCms(),
+        /*distmap=*/nullptr, nullptr);
+    EXPECT_LE(butteraugli_score, 1.0f);
+  }
 }
 
 static const unsigned char kEncodedTestProfile[] = {
@@ -708,10 +853,10 @@ static const unsigned char kEncodedTestProfile[] = {
 TEST(RoundtripTest, TestICCProfile) {
   // JxlEncoderSetICCProfile parses the ICC profile, so a valid profile is
   // needed. The profile should be passed correctly through the roundtrip.
-  jxl::BitReader reader(jxl::Span<const uint8_t>(kEncodedTestProfile,
-                                                 sizeof(kEncodedTestProfile)));
-  jxl::PaddedBytes icc;
-  ASSERT_TRUE(ReadICC(&reader, &icc));
+  jxl::BitReader reader(
+      jxl::Bytes(kEncodedTestProfile, sizeof(kEncodedTestProfile)));
+  std::vector<uint8_t> icc;
+  ASSERT_TRUE(jxl::test::ReadICC(&reader, &icc));
   ASSERT_TRUE(reader.Close());
 
   JxlPixelFormat format =
@@ -772,16 +917,14 @@ TEST(RoundtripTest, TestICCProfile) {
   EXPECT_EQ(JXL_DEC_COLOR_ENCODING, JxlDecoderProcessInput(dec));
 
   size_t dec_icc_size;
-  EXPECT_EQ(
-      JXL_DEC_SUCCESS,
-      JxlDecoderGetICCProfileSize(
-          dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &dec_icc_size));
-  EXPECT_EQ(icc.size(), dec_icc_size);
-  jxl::PaddedBytes dec_icc(dec_icc_size);
   EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetColorAsICCProfile(dec, &format,
-                                           JXL_COLOR_PROFILE_TARGET_ORIGINAL,
-                                           dec_icc.data(), dec_icc.size()));
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                        &dec_icc_size));
+  EXPECT_EQ(icc.size(), dec_icc_size);
+  std::vector<uint8_t> dec_icc(dec_icc_size);
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                 dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                 dec_icc.data(), dec_icc.size()));
 
   std::vector<uint8_t> decoded_bytes(buffer_size);
 
@@ -798,13 +941,12 @@ TEST(RoundtripTest, TestICCProfile) {
   JxlDecoderDestroy(dec);
 }
 
-#if JPEGXL_ENABLE_JPEG  // Loading .jpg files requires libjpeg support.
 TEST(RoundtripTest, JXL_TRANSCODE_JPEG_TEST(TestJPEGReconstruction)) {
+  TEST_LIBJPEG_SUPPORT();
   const std::string jpeg_path = "jxl/flower/flower.png.im_q85_420.jpg";
-  const jxl::PaddedBytes orig = jxl::ReadTestData(jpeg_path);
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(jpeg_path);
   jxl::CodecInOut orig_io;
-  ASSERT_TRUE(
-      SetFromBytes(jxl::Span<const uint8_t>(orig), &orig_io, /*pool=*/nullptr));
+  ASSERT_TRUE(SetFromBytes(jxl::Bytes(orig), &orig_io, /*pool=*/nullptr));
 
   JxlEncoderPtr enc = JxlEncoderMake(nullptr);
   JxlEncoderFrameSettings* frame_settings =
@@ -845,4 +987,3 @@ TEST(RoundtripTest, JXL_TRANSCODE_JPEG_TEST(TestJPEGReconstruction)) {
   ASSERT_EQ(used, orig.size());
   EXPECT_EQ(0, memcmp(reconstructed_buffer.data(), orig.data(), used));
 }
-#endif  // JPEGXL_ENABLE_JPEG
diff --git a/lib/jxl/simd_util.cc b/lib/jxl/simd_util.cc
new file mode 100644 (file)
index 0000000..a3971ff
--- /dev/null
@@ -0,0 +1,40 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/simd_util.h"
+
+#undef HWY_TARGET_INCLUDE
+#define HWY_TARGET_INCLUDE "lib/jxl/simd_util.cc"
+#include <hwy/foreach_target.h>
+#include <hwy/highway.h>
+
+HWY_BEFORE_NAMESPACE();
+namespace jxl {
+namespace HWY_NAMESPACE {
+
+size_t MaxVectorSize() {
+  HWY_FULL(float) df;
+  return Lanes(df) * sizeof(float);
+}
+
+// NOLINTNEXTLINE(google-readability-namespace-comments)
+}  // namespace HWY_NAMESPACE
+}  // namespace jxl
+HWY_AFTER_NAMESPACE();
+
+#if HWY_ONCE
+namespace jxl {
+
+HWY_EXPORT(MaxVectorSize);
+
+size_t MaxVectorSize() {
+  // Ideally HWY framework should provide us this value.
+  // Less than ideal is to check all available targets and choose maximal.
+  // As for now, we just ask current active target, assuming it won't change.
+  return HWY_DYNAMIC_DISPATCH(MaxVectorSize)();
+}
+
+}  // namespace jxl
+#endif
diff --git a/lib/jxl/simd_util.h b/lib/jxl/simd_util.h
new file mode 100644 (file)
index 0000000..84938a9
--- /dev/null
@@ -0,0 +1,17 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_SIMD_UTIL_H_
+#define LIB_JXL_SIMD_UTIL_H_
+#include <stddef.h>
+
+namespace jxl {
+
+// Maximal vector size in bytes.
+size_t MaxVectorSize();
+
+}  // namespace jxl
+
+#endif  // LIB_JXL_SIMD_UTIL_H_
index b81f5d1..94f7788 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <stdio.h>
+#include "lib/jxl/testing.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "lib/jxl/simd_util_test.cc"
@@ -13,7 +13,7 @@
 
 // Test utils
 #include <hwy/highway.h>
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
index 9c120fe..078d763 100644 (file)
@@ -3,23 +3,23 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/cms.h>
+
+#include <cstdint>
 #include <string>
+#include <vector>
 
-#include "gtest/gtest.h"
 #include "lib/extras/codec.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_file.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -35,7 +35,7 @@ struct SpeedTierTestParams {
 std::ostream& operator<<(std::ostream& os, SpeedTierTestParams params) {
   auto previous_flags = os.flags();
   os << std::boolalpha;
-  os << "SpeedTierTestParams{" << SpeedTierName(params.speed_tier)
+  os << "SpeedTierTestParams{" << static_cast<size_t>(params.speed_tier)
      << ", /*shrink8=*/" << params.shrink8 << "}";
   os.flags(previous_flags);
   return os;
@@ -74,36 +74,46 @@ JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
                     SpeedTierTestParams{SpeedTier::kSquirrel,
                                         /*shrink8=*/false},
                     SpeedTierTestParams{SpeedTier::kKitten,
-                                        /*shrink8=*/true},
-                    SpeedTierTestParams{SpeedTier::kKitten,
                                         /*shrink8=*/false},
                     // Only downscaled image for Tortoise mode.
                     SpeedTierTestParams{SpeedTier::kTortoise,
+                                        /*shrink8=*/true},
+                    SpeedTierTestParams{SpeedTier::kGlacier,
                                         /*shrink8=*/true}));
 
 TEST_P(SpeedTierTest, Roundtrip) {
-  const PaddedBytes orig =
-      ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
+  const std::vector<uint8_t> orig = jxl::test::ReadTestData(
+      "external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
   CodecInOut io;
-  ThreadPoolInternal pool(8);
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io, &pool));
+  test::ThreadPoolForTests pool(8);
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io, &pool));
 
   const SpeedTierTestParams& params = GetParam();
 
-  if (params.shrink8) {
+  if (params.speed_tier == SpeedTier::kGlacier) {
+    // just a few pixels will already take enough time at this setting
+    io.ShrinkTo(8, 8);
+  } else if (params.shrink8) {
     io.ShrinkTo(io.xsize() / 8, io.ysize() / 8);
   }
 
   CompressParams cparams;
   cparams.speed_tier = params.speed_tier;
+  cparams.SetCms(*JxlGetDefaultCms());
 
-  CodecInOut io2;
-  test::Roundtrip(&io, cparams, {}, nullptr, &io2);
-
-  // Can be 2.2 in non-hare mode.
-  EXPECT_LE(ButteraugliDistance(io, io2, cparams.ba_params, GetJxlCms(),
+  CodecInOut io2, io3;
+  JXL_EXPECT_OK(test::Roundtrip(&io, cparams, {}, &io2, _));
+  EXPECT_LE(ButteraugliDistance(io.frames, io2.frames, ButteraugliParams(),
+                                *JxlGetDefaultCms(),
                                 /*distmap=*/nullptr, /*pool=*/nullptr),
-            2.8);
+            1.6);
+
+  if (params.shrink8) {
+    cparams.SetLossless();
+    JXL_EXPECT_OK(test::Roundtrip(&io, cparams, {}, &io3, _));
+
+    JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io3.Main().color(), _));
+  }
 }
 }  // namespace
 }  // namespace jxl
index edaaf27..9d80e50 100644 (file)
@@ -7,22 +7,24 @@
 
 #include <algorithm>
 #include <cmath>
+#include <limits>
 
-#include "lib/jxl/ans_params.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/dct_scales.h"
-#include "lib/jxl/entropy_coder.h"
-#include "lib/jxl/opsin_params.h"
+#include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/pack_signed.h"
 
 #undef HWY_TARGET_INCLUDE
 #define HWY_TARGET_INCLUDE "lib/jxl/splines.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/fast_math-inl.h"
+#include "lib/jxl/base/fast_math-inl.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -108,15 +110,10 @@ void DrawSegment(const SplineSegment& segment, const bool add, const size_t y,
 void ComputeSegments(const Spline::Point& center, const float intensity,
                      const float color[3], const float sigma,
                      std::vector<SplineSegment>& segments,
-                     std::vector<std::pair<size_t, size_t>>& segments_by_y,
-                     size_t* pixel_limit) {
-  // In worst case zero-sized dot spans over 2 rows / columns.
-  constexpr const float kThinDotSpan = 2.0f;
+                     std::vector<std::pair<size_t, size_t>>& segments_by_y) {
   // Sanity check sigma, inverse sigma and intensity
   if (!(std::isfinite(sigma) && sigma != 0.0f && std::isfinite(1.0f / sigma) &&
         std::isfinite(intensity))) {
-    // Even no-draw should still be accounted.
-    *pixel_limit -= std::min<size_t>(*pixel_limit, kThinDotSpan * kThinDotSpan);
     return;
   }
 #if JXL_HIGH_PRECISION
@@ -142,20 +139,6 @@ void ComputeSegments(const Spline::Point& center, const float intensity,
   segment.inv_sigma = 1.0f / sigma;
   segment.sigma_over_4_times_intensity = .25f * sigma * intensity;
   segment.maximum_distance = maximum_distance;
-  float cost = 2.0f * maximum_distance + kThinDotSpan;
-  // Check cost^2 fits size_t.
-  if (cost >= static_cast<float>(1 << 15)) {
-    // Too much to rasterize.
-    *pixel_limit = 0;
-    return;
-  }
-  size_t area_cost = static_cast<size_t>(cost * cost);
-  if (area_cost > *pixel_limit) {
-    *pixel_limit = 0;
-    return;
-  }
-  // TODO(eustas): perhaps we should charge less: (y1 - y0) <= cost
-  *pixel_limit -= area_cost;
   ssize_t y0 = center.y - maximum_distance + .5f;
   ssize_t y1 = center.y + maximum_distance + 1.5f;  // one-past-the-end
   for (ssize_t y = std::max<ssize_t>(y0, 0); y < y1; y++) {
@@ -184,8 +167,7 @@ void SegmentsFromPoints(
     const Spline& spline,
     const std::vector<std::pair<Spline::Point, float>>& points_to_draw,
     const float arc_length, std::vector<SplineSegment>& segments,
-    std::vector<std::pair<size_t, size_t>>& segments_by_y,
-    size_t* pixel_limit) {
+    std::vector<std::pair<size_t, size_t>>& segments_by_y) {
   const float inv_arc_length = 1.0f / arc_length;
   int k = 0;
   for (const auto& point_to_draw : points_to_draw) {
@@ -201,11 +183,7 @@ void SegmentsFromPoints(
     }
     const float sigma =
         ContinuousIDCT(spline.sigma_dct, (32 - 1) * progress_along_arc);
-    ComputeSegments(point, multiplier, color, sigma, segments, segments_by_y,
-                    pixel_limit);
-    if (*pixel_limit == 0) {
-      return;
-    }
+    ComputeSegments(point, multiplier, color, sigma, segments, segments_by_y);
   }
 }
 }  // namespace
@@ -290,9 +268,6 @@ Vector operator*(const float k, const Vector& vec) {
 Spline::Point operator+(const Spline::Point& p, const Vector& vec) {
   return {p.x + vec.x, p.y + vec.y};
 }
-Spline::Point operator-(const Spline::Point& p, const Vector& vec) {
-  return p + -vec;
-}
 Vector operator-(const Spline::Point& a, const Spline::Point& b) {
   return {a.x - b.x, a.y - b.y};
 }
@@ -349,7 +324,7 @@ void DrawCentripetalCatmullRomSpline(std::vector<Spline::Point> points,
 // TODO(eustas): this method always adds the last point, but never the first
 //               (unless those are one); I believe both ends matter.
 template <typename Points, typename Functor>
-bool ForEachEquallySpacedPoint(const Points& points, const Functor& functor) {
+void ForEachEquallySpacedPoint(const Points& points, const Functor& functor) {
   JXL_ASSERT(!points.empty());
   Spline::Point current = points.front();
   functor(current, kDesiredRenderingDistance);
@@ -359,7 +334,8 @@ bool ForEachEquallySpacedPoint(const Points& points, const Functor& functor) {
     float arclength_from_previous = 0.f;
     for (;;) {
       if (next == points.end()) {
-        return functor(*previous, arclength_from_previous);
+        functor(*previous, arclength_from_previous);
+        return;
       }
       const float arclength_to_next =
           std::sqrt((*next - *previous).SquaredNorm());
@@ -369,9 +345,7 @@ bool ForEachEquallySpacedPoint(const Points& points, const Functor& functor) {
             *previous + ((kDesiredRenderingDistance - arclength_from_previous) /
                          arclength_to_next) *
                             (*next - *previous);
-        if (!functor(current, kDesiredRenderingDistance)) {
-          return false;
-        }
+        functor(current, kDesiredRenderingDistance);
         break;
       }
       arclength_from_previous += arclength_to_next;
@@ -379,7 +353,6 @@ bool ForEachEquallySpacedPoint(const Points& points, const Functor& functor) {
       ++next;
     }
   }
-  return true;
 }
 
 }  // namespace
@@ -390,13 +363,13 @@ QuantizedSpline::QuantizedSpline(const Spline& original,
   JXL_ASSERT(!original.control_points.empty());
   control_points_.reserve(original.control_points.size() - 1);
   const Spline::Point& starting_point = original.control_points.front();
-  int previous_x = static_cast<int>(roundf(starting_point.x)),
-      previous_y = static_cast<int>(roundf(starting_point.y));
+  int previous_x = static_cast<int>(std::roundf(starting_point.x));
+  int previous_y = static_cast<int>(std::roundf(starting_point.y));
   int previous_delta_x = 0, previous_delta_y = 0;
   for (auto it = original.control_points.begin() + 1;
        it != original.control_points.end(); ++it) {
-    const int new_x = static_cast<int>(roundf(it->x));
-    const int new_y = static_cast<int>(roundf(it->y));
+    const int new_x = static_cast<int>(std::roundf(it->x));
+    const int new_y = static_cast<int>(std::roundf(it->y));
     const int new_delta_x = new_x - previous_x;
     const int new_delta_y = new_y - previous_y;
     control_points_.emplace_back(new_delta_x - previous_delta_x,
@@ -408,7 +381,10 @@ QuantizedSpline::QuantizedSpline(const Spline& original,
   }
 
   const auto to_int = [](float v) -> int {
-    return static_cast<int>(roundf(v));
+    // Maximal int representable with float.
+    constexpr float kMax = std::numeric_limits<int>::max() - 127;
+    constexpr float kMin = -kMax;
+    return static_cast<int>(std::roundf(Clamp1(v, kMin, kMax)));
   };
 
   const auto quant = AdjustedQuant(quantization_adjustment);
@@ -435,20 +411,32 @@ QuantizedSpline::QuantizedSpline(const Spline& original,
 Status QuantizedSpline::Dequantize(const Spline::Point& starting_point,
                                    const int32_t quantization_adjustment,
                                    const float y_to_x, const float y_to_b,
+                                   const uint64_t image_size,
+                                   uint64_t* total_estimated_area_reached,
                                    Spline& result) const {
+  constexpr uint64_t kOne = static_cast<uint64_t>(1);
+  const uint64_t area_limit =
+      std::min(1024 * image_size + (kOne << 32), kOne << 42);
+
   result.control_points.clear();
   result.control_points.reserve(control_points_.size() + 1);
-  float px = roundf(starting_point.x);
-  float py = roundf(starting_point.y);
+  float px = std::roundf(starting_point.x);
+  float py = std::roundf(starting_point.y);
   JXL_RETURN_IF_ERROR(ValidateSplinePointPos(px, py));
   int current_x = static_cast<int>(px);
   int current_y = static_cast<int>(py);
   result.control_points.push_back(Spline::Point{static_cast<float>(current_x),
                                                 static_cast<float>(current_y)});
   int current_delta_x = 0, current_delta_y = 0;
+  uint64_t manhattan_distance = 0;
   for (const auto& point : control_points_) {
     current_delta_x += point.first;
     current_delta_y += point.second;
+    manhattan_distance += std::abs(current_delta_x) + std::abs(current_delta_y);
+    if (manhattan_distance > area_limit) {
+      return JXL_FAILURE("Too large manhattan_distance reached: %" PRIu64,
+                         manhattan_distance);
+    }
     JXL_RETURN_IF_ERROR(
         ValidateSplinePointPos(current_delta_x, current_delta_y));
     current_x += current_delta_x;
@@ -470,10 +458,45 @@ Status QuantizedSpline::Dequantize(const Spline::Point& starting_point,
     result.color_dct[0][i] += y_to_x * result.color_dct[1][i];
     result.color_dct[2][i] += y_to_b * result.color_dct[1][i];
   }
+  uint64_t width_estimate = 0;
+
+  uint64_t color[3] = {};
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < 32; ++i) {
+      color[c] += static_cast<uint64_t>(
+          std::ceil(inv_quant * std::abs(color_dct_[c][i])));
+    }
+  }
+  color[0] += static_cast<uint64_t>(std::ceil(std::abs(y_to_x))) * color[1];
+  color[2] += static_cast<uint64_t>(std::ceil(std::abs(y_to_b))) * color[1];
+  // This is not taking kChannelWeight into account, but up to constant factors
+  // it gives an indication of the influence of the color values on the area
+  // that will need to be rendered.
+  const uint64_t max_color = std::max({color[1], color[0], color[2]});
+  uint64_t logcolor =
+      std::max(kOne, static_cast<uint64_t>(CeilLog2Nonzero(kOne + max_color)));
+
+  const float weight_limit =
+      std::ceil(std::sqrt((static_cast<float>(area_limit) / logcolor) /
+                          std::max<size_t>(1, manhattan_distance)));
+
   for (int i = 0; i < 32; ++i) {
     const float inv_dct_factor = (i == 0) ? kSqrt0_5 : 1.0f;
     result.sigma_dct[i] =
         sigma_dct_[i] * inv_dct_factor * kChannelWeight[3] * inv_quant;
+    // If we include the factor kChannelWeight[3]=.3333f here, we get a
+    // realistic area estimate. We leave it out to simplify the calculations,
+    // and understand that this way we underestimate the area by a factor of
+    // 1/(0.3333*0.3333). This is taken into account in the limits below.
+    float weight_f = std::ceil(inv_quant * std::abs(sigma_dct_[i]));
+    uint64_t weight =
+        static_cast<uint64_t>(std::min(weight_limit, std::max(1.0f, weight_f)));
+    width_estimate += weight * weight * logcolor;
+  }
+  *total_estimated_area_reached += (width_estimate * manhattan_distance);
+  if (*total_estimated_area_reached > area_limit) {
+    return JXL_FAILURE("Too large total_estimated_area eached: %" PRIu64,
+                       *total_estimated_area_reached);
   }
 
   return true;
@@ -510,9 +533,13 @@ Status QuantizedSpline::Decode(const std::vector<uint8_t>& context_map,
   }
 
   const auto decode_dct = [decoder, br, &context_map](int dct[32]) -> Status {
+    constexpr int kWeirdNumber = std::numeric_limits<int>::min();
     for (int i = 0; i < 32; ++i) {
       dct[i] =
           UnpackSigned(decoder->ReadHybridUint(kDCTContext, br, context_map));
+      if (dct[i] == kWeirdNumber) {
+        return JXL_FAILURE("The weird number in spline DCT");
+      }
     }
     return true;
   };
@@ -592,19 +619,15 @@ Status Splines::InitializeDrawCache(const size_t image_xsize,
   segment_indices_.clear();
   segment_y_start_.clear();
   std::vector<std::pair<size_t, size_t>> segments_by_y;
-  Spline spline;
-  // TODO(eustas): not in the spec; limit spline pixels with image area.
-  float pixel_limit = 16.0f * image_xsize * image_ysize + (1 << 16);
-  // Apply some extra cap to avoid overflows.
-  constexpr size_t kHardPixelLimit = 1u << 30;
-  size_t px_limit = (pixel_limit < static_cast<float>(kHardPixelLimit))
-                        ? static_cast<size_t>(pixel_limit)
-                        : kHardPixelLimit;
   std::vector<Spline::Point> intermediate_points;
+  uint64_t total_estimated_area_reached = 0;
+  std::vector<Spline> splines;
   for (size_t i = 0; i < splines_.size(); ++i) {
-    JXL_RETURN_IF_ERROR(
-        splines_[i].Dequantize(starting_points_[i], quantization_adjustment_,
-                               cmap.YtoXRatio(0), cmap.YtoBRatio(0), spline));
+    Spline spline;
+    JXL_RETURN_IF_ERROR(splines_[i].Dequantize(
+        starting_points_[i], quantization_adjustment_, cmap.YtoXRatio(0),
+        cmap.YtoBRatio(0), image_xsize * image_ysize,
+        &total_estimated_area_reached, spline));
     if (std::adjacent_find(spline.control_points.begin(),
                            spline.control_points.end()) !=
         spline.control_points.end()) {
@@ -613,17 +636,28 @@ Status Splines::InitializeDrawCache(const size_t image_xsize,
       return JXL_FAILURE(
           "identical successive control points in spline %" PRIuS, i);
     }
+    splines.push_back(spline);
+  }
+  // TODO(firsching) Change this into a JXL_FAILURE for level 5 codestreams.
+  if (total_estimated_area_reached >
+      std::min((8 * image_xsize * image_ysize + (uint64_t(1) << 25)),
+               (uint64_t(1) << 30))) {
+    JXL_WARNING(
+        "Large total_estimated_area_reached, expect slower decoding: %" PRIu64,
+        total_estimated_area_reached);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    return JXL_FAILURE("Total spline area is too large");
+#endif
+  }
+
+  for (Spline& spline : splines) {
     std::vector<std::pair<Spline::Point, float>> points_to_draw;
-    const auto add_point = [&](const Spline::Point& point,
-                               const float multiplier) -> bool {
+    auto add_point = [&](const Spline::Point& point, const float multiplier) {
       points_to_draw.emplace_back(point, multiplier);
-      return (points_to_draw.size() <= px_limit);
     };
     intermediate_points.clear();
     DrawCentripetalCatmullRomSpline(spline.control_points, intermediate_points);
-    if (!ForEachEquallySpacedPoint(intermediate_points, add_point)) {
-      return JXL_FAILURE("Too many pixels covered with splines");
-    }
+    ForEachEquallySpacedPoint(intermediate_points, add_point);
     const float arc_length =
         (points_to_draw.size() - 2) * kDesiredRenderingDistance +
         points_to_draw.back().second;
@@ -632,11 +666,9 @@ Status Splines::InitializeDrawCache(const size_t image_xsize,
       continue;
     }
     HWY_DYNAMIC_DISPATCH(SegmentsFromPoints)
-    (spline, points_to_draw, arc_length, segments_, segments_by_y, &px_limit);
-    if (px_limit == 0) {
-      return JXL_FAILURE("Too many pixels covered with splines");
-    }
+    (spline, points_to_draw, arc_length, segments_, segments_by_y);
   }
+
   // TODO(eustas): consider linear sorting here.
   std::sort(segments_by_y.begin(), segments_by_y.end());
   segment_indices_.resize(segments_by_y.size());
index 9d2b1a4..acdd085 100644 (file)
@@ -6,24 +6,22 @@
 #ifndef LIB_JXL_SPLINES_H_
 #define LIB_JXL_SPLINES_H_
 
-#include <stddef.h>
-#include <stdint.h>
-
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
 #include <utility>
 #include <vector>
 
-#include "lib/jxl/ans_params.h"
-#include "lib/jxl/aux_out.h"
-#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/chroma_from_luma.h"
-#include "lib/jxl/dec_ans.h"
-#include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/image.h"
 
 namespace jxl {
 
+class ANSSymbolReader;
+class BitReader;
+
 static constexpr float kDesiredRenderingDistance = 1.f;
 
 enum SplineEntropyContexts : size_t {
@@ -64,6 +62,7 @@ class QuantizedSpline {
 
   Status Dequantize(const Spline::Point& starting_point,
                     int32_t quantization_adjustment, float y_to_x, float y_to_b,
+                    uint64_t image_size, uint64_t* total_estimated_area_reached,
                     Spline& result) const;
 
   Status Decode(const std::vector<uint8_t>& context_map,
index 09b2dd5..06f994d 100644 (file)
@@ -5,14 +5,20 @@
 
 #include "lib/jxl/splines.h"
 
+#include <jxl/cms.h>
+
+#include <cstdint>
+#include <vector>
+
 #include "lib/extras/codec.h"
 #include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_splines.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 
@@ -27,6 +33,7 @@ std::ostream& operator<<(std::ostream& os, const Spline& spline) {
 
 namespace {
 
+using test::ReadTestData;
 using ::testing::AllOf;
 using ::testing::Field;
 using ::testing::FloatNear;
@@ -45,11 +52,12 @@ std::vector<Spline> DequantizeSplines(const Splines& splines) {
   JXL_CHECK(quantized_splines.size() == starting_points.size());
 
   std::vector<Spline> dequantized;
+  uint64_t total = 0;
   for (size_t i = 0; i < quantized_splines.size(); ++i) {
     dequantized.emplace_back();
-    JXL_CHECK(quantized_splines[i].Dequantize(starting_points[i],
-                                              kQuantizationAdjustment, kYToX,
-                                              kYToB, dequantized.back()));
+    JXL_CHECK(quantized_splines[i].Dequantize(
+        starting_points[i], kQuantizationAdjustment, kYToX, kYToB, 2u << 30u,
+        &total, dequantized.back()));
   }
   return dequantized;
 }
@@ -275,8 +283,8 @@ TEST(SplinesTest, DuplicatePoints) {
 
 TEST(SplinesTest, Drawing) {
   CodecInOut io_expected;
-  const PaddedBytes orig = ReadTestData("jxl/splines.pfm");
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(orig), &io_expected,
+  const std::vector<uint8_t> orig = ReadTestData("jxl/splines.pfm");
+  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io_expected,
                            /*pool=*/nullptr));
 
   std::vector<Spline::Point> control_points{{9, 54},  {118, 159}, {97, 3},
@@ -309,27 +317,29 @@ TEST(SplinesTest, Drawing) {
   splines.AddTo(&image, Rect(image), Rect(image));
 
   CodecInOut io_actual;
-  io_actual.SetFromImage(CopyImage(image), ColorEncoding::SRGB());
-  ASSERT_TRUE(
-      io_actual.TransformTo(io_expected.Main().c_current(), GetJxlCms()));
-
-  VerifyRelativeError(*io_expected.Main().color(), *io_actual.Main().color(),
-                      1e-2f, 1e-1f);
+  Image3F image2(320, 320);
+  CopyImageTo(image, &image2);
+  io_actual.SetFromImage(std::move(image2), ColorEncoding::SRGB());
+  ASSERT_TRUE(io_actual.frames[0].TransformTo(io_expected.Main().c_current(),
+                                              *JxlGetDefaultCms()));
+
+  JXL_ASSERT_OK(VerifyRelativeError(
+      *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _));
 }
 
 TEST(SplinesTest, ClearedEveryFrame) {
   CodecInOut io_expected;
-  const PaddedBytes bytes_expected =
+  const std::vector<uint8_t> bytes_expected =
       ReadTestData("jxl/spline_on_first_frame.png");
-  ASSERT_TRUE(SetFromBytes(Span<const uint8_t>(bytes_expected), &io_expected,
+  ASSERT_TRUE(SetFromBytes(Bytes(bytes_expected), &io_expected,
                            /*pool=*/nullptr));
   CodecInOut io_actual;
-  const PaddedBytes bytes_actual =
+  const std::vector<uint8_t> bytes_actual =
       ReadTestData("jxl/spline_on_first_frame.jxl");
-  ASSERT_TRUE(test::DecodeFile({}, bytes_actual, &io_actual,
-                               /*pool=*/nullptr));
+  ASSERT_TRUE(test::DecodeFile({}, Bytes(bytes_actual), &io_actual));
 
-  ASSERT_TRUE(io_actual.TransformTo(ColorEncoding::SRGB(), GetJxlCms()));
+  ASSERT_TRUE(io_actual.frames[0].TransformTo(ColorEncoding::SRGB(),
+                                              *JxlGetDefaultCms()));
   for (size_t c = 0; c < 3; ++c) {
     for (size_t y = 0; y < io_actual.ysize(); ++y) {
       float* const JXL_RESTRICT row = io_actual.Main().color()->PlaneRow(c, y);
@@ -338,8 +348,8 @@ TEST(SplinesTest, ClearedEveryFrame) {
       }
     }
   }
-  VerifyRelativeError(*io_expected.Main().color(), *io_actual.Main().color(),
-                      1e-2f, 1e-1f);
+  JXL_ASSERT_OK(VerifyRelativeError(
+      *io_expected.Main().color(), *io_actual.Main().color(), 1e-2f, 1e-1f, _));
 }
 
 }  // namespace jxl
diff --git a/lib/jxl/test_image.cc b/lib/jxl/test_image.cc
new file mode 100644 (file)
index 0000000..1c43226
--- /dev/null
@@ -0,0 +1,450 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/test_image.h"
+
+#include <jxl/encode.h>
+
+#include <algorithm>
+#include <cstring>
+#include <utility>
+
+#include "lib/extras/dec/color_description.h"
+#include "lib/extras/dec/color_hints.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
+
+namespace jxl {
+namespace test {
+
+namespace {
+
+void StoreValue(float val, size_t bits_per_sample, JxlPixelFormat format,
+                uint8_t** out) {
+  const float mul = (1u << bits_per_sample) - 1;
+  if (format.data_type == JXL_TYPE_UINT8) {
+    **out = val * mul;
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    uint16_t uval = val * mul;
+    if (SwapEndianness(format.endianness)) {
+      uval = JXL_BSWAP16(uval);
+    }
+    memcpy(*out, &uval, 2);
+  } else if (format.data_type == JXL_TYPE_FLOAT) {
+    // TODO(szabadka) Add support for custom bits / exponent bits floats.
+    if (SwapEndianness(format.endianness)) {
+      val = BSwapFloat(val);
+    }
+    memcpy(*out, &val, 4);
+  } else {
+    // TODO(szabadka) Add support for FLOAT16.
+  }
+  *out += extras::PackedImage::BitsPerChannel(format.data_type) / 8;
+}
+
+void FillPackedImage(size_t bits_per_sample, uint16_t seed,
+                     extras::PackedImage* image) {
+  const size_t xsize = image->xsize;
+  const size_t ysize = image->ysize;
+  const JxlPixelFormat format = image->format;
+
+  // Cause more significant image difference for successive seeds.
+  Rng generator(seed);
+
+  // Returns random integer in interval [0, max_value)
+  auto rngu = [&generator](size_t max_value) -> size_t {
+    return generator.UniformU(0, max_value);
+  };
+
+  // Returns random float in interval [0.0, max_value)
+  auto rngf = [&generator](float max_value) {
+    return generator.UniformF(0.0f, max_value);
+  };
+
+  // Dark background gradient color
+  float r0 = rngf(0.5f);
+  float g0 = rngf(0.5f);
+  float b0 = rngf(0.5f);
+  float a0 = rngf(0.5f);
+  float r1 = rngf(0.5f);
+  float g1 = rngf(0.5f);
+  float b1 = rngf(0.5f);
+  float a1 = rngf(0.5f);
+
+  // Circle with different color
+  size_t circle_x = rngu(xsize);
+  size_t circle_y = rngu(ysize);
+  size_t circle_r = rngu(std::min(xsize, ysize));
+
+  // Rectangle with random noise
+  size_t rect_x0 = rngu(xsize);
+  size_t rect_y0 = rngu(ysize);
+  size_t rect_x1 = rngu(xsize);
+  size_t rect_y1 = rngu(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  // Create pixel content to test, actual content does not matter as long as it
+  // can be compared after roundtrip.
+  uint8_t* out = reinterpret_cast<uint8_t*>(image->pixels());
+  const float imul16 = 1.0f / 65536.0f;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      float r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      float g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      float b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      float a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = std::min(1.0f, ((65535 - x * y) ^ seed) * imul16);
+        g = std::min(1.0f, ((x << 8) + y + seed) * imul16);
+        b = std::min(1.0f, ((y << 8) + x * seed) * imul16);
+        a = std::min(1.0f, (32768 + x * 256 - y) * imul16);
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rngf(1.0f);
+        g = rngf(1.0f);
+        b = rngf(1.0f);
+        a = rngf(1.0f);
+      }
+      if (format.num_channels == 1) {
+        StoreValue(g, bits_per_sample, format, &out);
+      } else if (format.num_channels == 2) {
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(a, bits_per_sample, format, &out);
+      } else if (format.num_channels == 3) {
+        StoreValue(r, bits_per_sample, format, &out);
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(b, bits_per_sample, format, &out);
+      } else if (format.num_channels == 4) {
+        StoreValue(r, bits_per_sample, format, &out);
+        StoreValue(g, bits_per_sample, format, &out);
+        StoreValue(b, bits_per_sample, format, &out);
+        StoreValue(a, bits_per_sample, format, &out);
+      }
+    }
+  }
+}
+
+}  // namespace
+
+std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
+                                      size_t num_channels, uint16_t seed) {
+  // Cause more significant image difference for successive seeds.
+  Rng generator(seed);
+
+  // Returns random integer in interval [0, max_value)
+  auto rng = [&generator](size_t max_value) -> size_t {
+    return generator.UniformU(0, max_value);
+  };
+
+  // Dark background gradient color
+  uint16_t r0 = rng(32768);
+  uint16_t g0 = rng(32768);
+  uint16_t b0 = rng(32768);
+  uint16_t a0 = rng(32768);
+  uint16_t r1 = rng(32768);
+  uint16_t g1 = rng(32768);
+  uint16_t b1 = rng(32768);
+  uint16_t a1 = rng(32768);
+
+  // Circle with different color
+  size_t circle_x = rng(xsize);
+  size_t circle_y = rng(ysize);
+  size_t circle_r = rng(std::min(xsize, ysize));
+
+  // Rectangle with random noise
+  size_t rect_x0 = rng(xsize);
+  size_t rect_y0 = rng(ysize);
+  size_t rect_x1 = rng(xsize);
+  size_t rect_y1 = rng(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  size_t num_pixels = xsize * ysize;
+  // 16 bits per channel, big endian, 4 channels
+  std::vector<uint8_t> pixels(num_pixels * num_channels * 2);
+  // Create pixel content to test, actual content does not matter as long as it
+  // can be compared after roundtrip.
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      uint16_t g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      uint16_t b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      uint16_t a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = (65535 - x * y) ^ seed;
+        g = (x << 8) + y + seed;
+        b = (y << 8) + x * seed;
+        a = 32768 + x * 256 - y;
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rng(65536);
+        g = rng(65536);
+        b = rng(65536);
+        a = rng(65536);
+      }
+      size_t i = (y * xsize + x) * 2 * num_channels;
+      pixels[i + 0] = (r >> 8);
+      pixels[i + 1] = (r & 255);
+      if (num_channels >= 2) {
+        // This may store what is called 'g' in the alpha channel of a 2-channel
+        // image, but that's ok since the content is arbitrary
+        pixels[i + 2] = (g >> 8);
+        pixels[i + 3] = (g & 255);
+      }
+      if (num_channels >= 3) {
+        pixels[i + 4] = (b >> 8);
+        pixels[i + 5] = (b & 255);
+      }
+      if (num_channels >= 4) {
+        pixels[i + 6] = (a >> 8);
+        pixels[i + 7] = (a & 255);
+      }
+    }
+  }
+  return pixels;
+}
+
+TestImage::TestImage() {
+  SetChannels(3);
+  SetAllBitDepths(8);
+  SetColorEncoding("RGB_D65_SRG_Rel_SRG");
+}
+
+TestImage& TestImage::DecodeFromBytes(const std::vector<uint8_t>& bytes) {
+  ColorEncoding c_enc;
+  JXL_CHECK(c_enc.FromExternal(ppf_.color_encoding));
+  extras::ColorHints color_hints;
+  color_hints.Add("color_space", Description(c_enc));
+  JXL_CHECK(extras::DecodeBytes(Bytes(bytes), color_hints, &ppf_));
+  return *this;
+}
+
+TestImage& TestImage::ClearMetadata() {
+  ppf_.metadata = extras::PackedMetadata();
+  return *this;
+}
+
+TestImage& TestImage::SetDimensions(size_t xsize, size_t ysize) {
+  if (xsize <= ppf_.info.xsize && ysize <= ppf_.info.ysize) {
+    for (auto& frame : ppf_.frames) {
+      CropLayerInfo(xsize, ysize, &frame.frame_info.layer_info);
+      CropImage(xsize, ysize, &frame.color);
+      for (auto& ec : frame.extra_channels) {
+        CropImage(xsize, ysize, &ec);
+      }
+    }
+  } else {
+    JXL_CHECK(ppf_.info.xsize == 0 && ppf_.info.ysize == 0);
+  }
+  ppf_.info.xsize = xsize;
+  ppf_.info.ysize = ysize;
+  return *this;
+}
+
+TestImage& TestImage::SetChannels(size_t num_channels) {
+  JXL_CHECK(ppf_.frames.empty());
+  JXL_CHECK(!ppf_.preview_frame);
+  ppf_.info.num_color_channels = num_channels < 3 ? 1 : 3;
+  ppf_.info.num_extra_channels = num_channels - ppf_.info.num_color_channels;
+  if (ppf_.info.num_extra_channels > 0 && ppf_.info.alpha_bits == 0) {
+    ppf_.info.alpha_bits = ppf_.info.bits_per_sample;
+    ppf_.info.alpha_exponent_bits = ppf_.info.exponent_bits_per_sample;
+  }
+  ppf_.extra_channels_info.clear();
+  for (size_t i = 1; i < ppf_.info.num_extra_channels; ++i) {
+    extras::PackedExtraChannel ec;
+    ec.index = i;
+    JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA, &ec.ec_info);
+    if (ec.ec_info.bits_per_sample == 0) {
+      ec.ec_info.bits_per_sample = ppf_.info.bits_per_sample;
+      ec.ec_info.exponent_bits_per_sample = ppf_.info.exponent_bits_per_sample;
+    }
+    ppf_.extra_channels_info.emplace_back(std::move(ec));
+  }
+  format_.num_channels = std::min(static_cast<size_t>(4), num_channels);
+  if (ppf_.info.num_color_channels == 1 &&
+      ppf_.color_encoding.color_space != JXL_COLOR_SPACE_GRAY) {
+    SetColorEncoding("Gra_D65_Rel_SRG");
+  }
+  return *this;
+}
+
+// Sets the same bit depth on color, alpha and all extra channels.
+TestImage& TestImage::SetAllBitDepths(uint32_t bits_per_sample,
+                                      uint32_t exponent_bits_per_sample) {
+  ppf_.info.bits_per_sample = bits_per_sample;
+  ppf_.info.exponent_bits_per_sample = exponent_bits_per_sample;
+  if (ppf_.info.num_extra_channels > 0) {
+    ppf_.info.alpha_bits = bits_per_sample;
+    ppf_.info.alpha_exponent_bits = exponent_bits_per_sample;
+  }
+  for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) {
+    extras::PackedExtraChannel& ec = ppf_.extra_channels_info[i];
+    ec.ec_info.bits_per_sample = bits_per_sample;
+    ec.ec_info.exponent_bits_per_sample = exponent_bits_per_sample;
+  }
+  format_.data_type = DefaultDataType(ppf_.info);
+  return *this;
+}
+
+TestImage& TestImage::SetDataType(JxlDataType data_type) {
+  format_.data_type = data_type;
+  return *this;
+}
+
+TestImage& TestImage::SetEndianness(JxlEndianness endianness) {
+  format_.endianness = endianness;
+  return *this;
+}
+
+TestImage& TestImage::SetColorEncoding(const std::string& description) {
+  JXL_CHECK(ParseDescription(description, &ppf_.color_encoding));
+  ColorEncoding c_enc;
+  JXL_CHECK(c_enc.FromExternal(ppf_.color_encoding));
+  IccBytes icc = c_enc.ICC();
+  JXL_CHECK(!icc.empty());
+  ppf_.icc.assign(icc.begin(), icc.end());
+  return *this;
+}
+
+TestImage& TestImage::CoalesceGIFAnimationWithAlpha() {
+  extras::PackedFrame canvas = ppf_.frames[0].Copy();
+  JXL_CHECK(canvas.color.format.num_channels == 3);
+  JXL_CHECK(canvas.color.format.data_type == JXL_TYPE_UINT8);
+  JXL_CHECK(canvas.extra_channels.size() == 1);
+  for (size_t i = 1; i < ppf_.frames.size(); i++) {
+    const extras::PackedFrame& frame = ppf_.frames[i];
+    JXL_CHECK(frame.extra_channels.size() == 1);
+    const JxlLayerInfo& layer_info = frame.frame_info.layer_info;
+    extras::PackedFrame rendered = canvas.Copy();
+    uint8_t* pixels_rendered =
+        reinterpret_cast<uint8_t*>(rendered.color.pixels());
+    const uint8_t* pixels_frame =
+        reinterpret_cast<const uint8_t*>(frame.color.pixels());
+    uint8_t* alpha_rendered =
+        reinterpret_cast<uint8_t*>(rendered.extra_channels[0].pixels());
+    const uint8_t* alpha_frame =
+        reinterpret_cast<const uint8_t*>(frame.extra_channels[0].pixels());
+    for (size_t y = 0; y < frame.color.ysize; y++) {
+      for (size_t x = 0; x < frame.color.xsize; x++) {
+        size_t idx_frame = y * frame.color.xsize + x;
+        size_t idx_rendered = ((layer_info.crop_y0 + y) * rendered.color.xsize +
+                               (layer_info.crop_x0 + x));
+        if (alpha_frame[idx_frame] != 0) {
+          memcpy(&pixels_rendered[idx_rendered * 3],
+                 &pixels_frame[idx_frame * 3], 3);
+          alpha_rendered[idx_rendered] = alpha_frame[idx_frame];
+        }
+      }
+    }
+    if (layer_info.save_as_reference != 0) {
+      canvas = rendered.Copy();
+    }
+    ppf_.frames[i] = std::move(rendered);
+  }
+  return *this;
+}
+
+TestImage::Frame::Frame(TestImage* parent, bool is_preview, size_t index)
+    : parent_(parent), is_preview_(is_preview), index_(index) {}
+
+void TestImage::Frame::ZeroFill() {
+  memset(frame().color.pixels(), 0, frame().color.pixels_size);
+  for (auto& ec : frame().extra_channels) {
+    memset(ec.pixels(), 0, ec.pixels_size);
+  }
+}
+
+void TestImage::Frame::RandomFill(uint16_t seed) {
+  FillPackedImage(ppf().info.bits_per_sample, seed, &frame().color);
+  for (size_t i = 0; i < ppf().extra_channels_info.size(); ++i) {
+    FillPackedImage(ppf().extra_channels_info[i].ec_info.bits_per_sample,
+                    seed + 1 + i, &frame().extra_channels[i]);
+  }
+}
+
+void TestImage::Frame::SetValue(size_t y, size_t x, size_t c, float val) {
+  const extras::PackedImage& color = frame().color;
+  JxlPixelFormat format = color.format;
+  JXL_CHECK(y < ppf().info.ysize);
+  JXL_CHECK(x < ppf().info.xsize);
+  JXL_CHECK(c < format.num_channels);
+  size_t pwidth = extras::PackedImage::BitsPerChannel(format.data_type) / 8;
+  size_t idx = ((y * color.xsize + x) * format.num_channels + c) * pwidth;
+  uint8_t* pixels = reinterpret_cast<uint8_t*>(frame().color.pixels());
+  uint8_t* p = pixels + idx;
+  StoreValue(val, ppf().info.bits_per_sample, frame().color.format, &p);
+}
+
+TestImage::Frame TestImage::AddFrame() {
+  size_t index = ppf_.frames.size();
+  extras::PackedFrame frame(ppf_.info.xsize, ppf_.info.ysize, format_);
+  for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) {
+    JxlPixelFormat ec_format = {1, format_.data_type, format_.endianness, 0};
+    extras::PackedImage image(ppf_.info.xsize, ppf_.info.ysize, ec_format);
+    frame.extra_channels.emplace_back(std::move(image));
+  }
+  ppf_.frames.emplace_back(std::move(frame));
+  return Frame(this, false, index);
+}
+
+TestImage::Frame TestImage::AddPreview(size_t xsize, size_t ysize) {
+  extras::PackedFrame frame(xsize, ysize, format_);
+  for (size_t i = 0; i < ppf_.extra_channels_info.size(); ++i) {
+    JxlPixelFormat ec_format = {1, format_.data_type, format_.endianness, 0};
+    extras::PackedImage image(xsize, ysize, ec_format);
+    frame.extra_channels.emplace_back(std::move(image));
+  }
+  ppf_.preview_frame = make_unique<extras::PackedFrame>(std::move(frame));
+  return Frame(this, true, 0);
+}
+
+void TestImage::CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info) {
+  if (info->crop_x0 < static_cast<ssize_t>(xsize)) {
+    info->xsize = std::min<size_t>(info->xsize, xsize - info->crop_x0);
+  } else {
+    info->xsize = 0;
+  }
+  if (info->crop_y0 < static_cast<ssize_t>(ysize)) {
+    info->ysize = std::min<size_t>(info->ysize, ysize - info->crop_y0);
+  } else {
+    info->ysize = 0;
+  }
+}
+
+void TestImage::CropImage(size_t xsize, size_t ysize,
+                          extras::PackedImage* image) {
+  size_t new_stride = (image->stride / image->xsize) * xsize;
+  uint8_t* buf = reinterpret_cast<uint8_t*>(image->pixels());
+  for (size_t y = 0; y < ysize; ++y) {
+    memmove(&buf[y * new_stride], &buf[y * image->stride], new_stride);
+  }
+  image->xsize = xsize;
+  image->ysize = ysize;
+  image->stride = new_stride;
+  image->pixels_size = ysize * new_stride;
+}
+
+JxlDataType TestImage::DefaultDataType(const JxlBasicInfo& info) {
+  if (info.bits_per_sample == 16 && info.exponent_bits_per_sample == 5) {
+    return JXL_TYPE_FLOAT16;
+  } else if (info.exponent_bits_per_sample > 0 || info.bits_per_sample > 16) {
+    return JXL_TYPE_FLOAT;
+  } else if (info.bits_per_sample > 8) {
+    return JXL_TYPE_UINT16;
+  } else {
+    return JXL_TYPE_UINT8;
+  }
+}
+
+}  // namespace test
+}  // namespace jxl
index 0093443..137806b 100644 (file)
@@ -6,11 +6,16 @@
 #ifndef LIB_JXL_TEST_IMAGE_H_
 #define LIB_JXL_TEST_IMAGE_H_
 
-#include <stdint.h>
+#include <jxl/codestream_header.h>
+#include <jxl/types.h>
+#include <stddef.h>
 
+#include <cstdint>
+#include <string>
 #include <vector>
 
-#include "lib/jxl/base/random.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/span.h"
 
 namespace jxl {
 namespace test {
@@ -19,83 +24,69 @@ namespace test {
 // channel, big endian order, 1 to 4 channels
 // The seed parameter allows to create images with different pixel content.
 std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
-                                      size_t num_channels, uint16_t seed) {
-  // Cause more significant image difference for successive seeds.
-  Rng generator(seed);
+                                      size_t num_channels, uint16_t seed);
 
-  // Returns random integer in interval [0, max_value)
-  auto rng = [&generator](size_t max_value) -> size_t {
-    return generator.UniformU(0, max_value);
-  };
+class TestImage {
+ public:
+  TestImage();
+
+  extras::PackedPixelFile& ppf() { return ppf_; }
+
+  TestImage& DecodeFromBytes(const std::vector<uint8_t>& bytes);
+
+  TestImage& ClearMetadata();
+
+  TestImage& SetDimensions(size_t xsize, size_t ysize);
+
+  TestImage& SetChannels(size_t num_channels);
+
+  // Sets the same bit depth on color, alpha and all extra channels.
+  TestImage& SetAllBitDepths(uint32_t bits_per_sample,
+                             uint32_t exponent_bits_per_sample = 0);
+
+  TestImage& SetDataType(JxlDataType data_type);
+
+  TestImage& SetEndianness(JxlEndianness endianness);
+
+  TestImage& SetColorEncoding(const std::string& description);
+
+  TestImage& CoalesceGIFAnimationWithAlpha();
+
+  class Frame {
+   public:
+    Frame(TestImage* parent, bool is_preview, size_t index);
 
-  // Dark background gradient color
-  uint16_t r0 = rng(32768);
-  uint16_t g0 = rng(32768);
-  uint16_t b0 = rng(32768);
-  uint16_t a0 = rng(32768);
-  uint16_t r1 = rng(32768);
-  uint16_t g1 = rng(32768);
-  uint16_t b1 = rng(32768);
-  uint16_t a1 = rng(32768);
-
-  // Circle with different color
-  size_t circle_x = rng(xsize);
-  size_t circle_y = rng(ysize);
-  size_t circle_r = rng(std::min(xsize, ysize));
-
-  // Rectangle with random noise
-  size_t rect_x0 = rng(xsize);
-  size_t rect_y0 = rng(ysize);
-  size_t rect_x1 = rng(xsize);
-  size_t rect_y1 = rng(ysize);
-  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1);
-  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
-
-  size_t num_pixels = xsize * ysize;
-  // 16 bits per channel, big endian, 4 channels
-  std::vector<uint8_t> pixels(num_pixels * num_channels * 2);
-  // Create pixel content to test, actual content does not matter as long as it
-  // can be compared after roundtrip.
-  for (size_t y = 0; y < ysize; y++) {
-    for (size_t x = 0; x < xsize; x++) {
-      uint16_t r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
-      uint16_t g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
-      uint16_t b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
-      uint16_t a = a0 * (ysize - y - 1) / ysize + a1 * y / ysize;
-      // put some shape in there for visual debugging
-      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
-          circle_r * circle_r) {
-        r = (65535 - x * y) ^ seed;
-        g = (x << 8) + y + seed;
-        b = (y << 8) + x * seed;
-        a = 32768 + x * 256 - y;
-      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
-        r = rng(65536);
-        g = rng(65536);
-        b = rng(65536);
-        a = rng(65536);
-      }
-      size_t i = (y * xsize + x) * 2 * num_channels;
-      pixels[i + 0] = (r >> 8);
-      pixels[i + 1] = (r & 255);
-      if (num_channels >= 2) {
-        // This may store what is called 'g' in the alpha channel of a 2-channel
-        // image, but that's ok since the content is arbitrary
-        pixels[i + 2] = (g >> 8);
-        pixels[i + 3] = (g & 255);
-      }
-      if (num_channels >= 3) {
-        pixels[i + 4] = (b >> 8);
-        pixels[i + 5] = (b & 255);
-      }
-      if (num_channels >= 4) {
-        pixels[i + 6] = (a >> 8);
-        pixels[i + 7] = (a & 255);
-      }
+    void ZeroFill();
+    void RandomFill(uint16_t seed = 177);
+
+    void SetValue(size_t y, size_t x, size_t c, float val);
+
+   private:
+    extras::PackedPixelFile& ppf() const { return parent_->ppf(); }
+
+    extras::PackedFrame& frame() {
+      return is_preview_ ? *ppf().preview_frame : ppf().frames[index_];
     }
-  }
-  return pixels;
-}
+
+    TestImage* parent_;
+    bool is_preview_;
+    size_t index_;
+  };
+
+  Frame AddFrame();
+
+  Frame AddPreview(size_t xsize, size_t ysize);
+
+ private:
+  extras::PackedPixelFile ppf_;
+  JxlPixelFormat format_ = {3, JXL_TYPE_UINT8, JXL_LITTLE_ENDIAN, 0};
+
+  static void CropLayerInfo(size_t xsize, size_t ysize, JxlLayerInfo* info);
+
+  static void CropImage(size_t xsize, size_t ysize, extras::PackedImage* image);
+
+  static JxlDataType DefaultDataType(const JxlBasicInfo& info);
+};
 
 }  // namespace test
 }  // namespace jxl
diff --git a/lib/jxl/test_utils.cc b/lib/jxl/test_utils.cc
new file mode 100644 (file)
index 0000000..5758135
--- /dev/null
@@ -0,0 +1,802 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "lib/jxl/test_utils.h"
+
+#include <jxl/cms.h>
+#include <jxl/cms_interface.h>
+
+#include <cstddef>
+#include <fstream>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "lib/extras/metrics.h"
+#include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/float.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/enc_butteraugli_comparator.h"
+#include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_external_image.h"
+#include "lib/jxl/enc_fields.h"
+#include "lib/jxl/enc_frame.h"
+#include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/padded_bytes.h"
+
+#if !defined(TEST_DATA_PATH)
+#include "tools/cpp/runfiles/runfiles.h"
+#endif
+
+namespace jxl {
+namespace test {
+
+#if defined(TEST_DATA_PATH)
+std::string GetTestDataPath(const std::string& filename) {
+  return std::string(TEST_DATA_PATH "/") + filename;
+}
+#else
+using bazel::tools::cpp::runfiles::Runfiles;
+const std::unique_ptr<Runfiles> kRunfiles(Runfiles::Create(""));
+std::string GetTestDataPath(const std::string& filename) {
+  std::string root(JPEGXL_ROOT_PACKAGE "/testdata/");
+  return kRunfiles->Rlocation(root + filename);
+}
+#endif
+
+std::vector<uint8_t> ReadTestData(const std::string& filename) {
+  std::string full_path = GetTestDataPath(filename);
+  fprintf(stderr, "ReadTestData %s\n", full_path.c_str());
+  std::ifstream file(full_path, std::ios::binary);
+  std::vector<char> str((std::istreambuf_iterator<char>(file)),
+                        std::istreambuf_iterator<char>());
+  JXL_CHECK(file.good());
+  const uint8_t* raw = reinterpret_cast<const uint8_t*>(str.data());
+  std::vector<uint8_t> data(raw, raw + str.size());
+  printf("Test data %s is %d bytes long.\n", filename.c_str(),
+         static_cast<int>(data.size()));
+  return data;
+}
+
+void DefaultAcceptedFormats(extras::JXLDecompressParams& dparams) {
+  if (dparams.accepted_formats.empty()) {
+    for (const uint32_t num_channels : {1, 2, 3, 4}) {
+      dparams.accepted_formats.push_back(
+          {num_channels, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, /*align=*/0});
+    }
+  }
+}
+
+Status DecodeFile(extras::JXLDecompressParams dparams,
+                  const Span<const uint8_t> file, CodecInOut* JXL_RESTRICT io,
+                  ThreadPool* pool) {
+  DefaultAcceptedFormats(dparams);
+  SetThreadParallelRunner(dparams, pool);
+  extras::PackedPixelFile ppf;
+  JXL_RETURN_IF_ERROR(DecodeImageJXL(file.data(), file.size(), dparams,
+                                     /*decoded_bytes=*/nullptr, &ppf));
+  JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
+  return true;
+}
+
+void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
+                                    const JxlPixelFormat* pixel_format) {
+  JxlEncoderInitBasicInfo(basic_info);
+  switch (pixel_format->data_type) {
+    case JXL_TYPE_FLOAT:
+      basic_info->bits_per_sample = 32;
+      basic_info->exponent_bits_per_sample = 8;
+      break;
+    case JXL_TYPE_FLOAT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 5;
+      break;
+    case JXL_TYPE_UINT8:
+      basic_info->bits_per_sample = 8;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    case JXL_TYPE_UINT16:
+      basic_info->bits_per_sample = 16;
+      basic_info->exponent_bits_per_sample = 0;
+      break;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+  if (pixel_format->num_channels < 3) {
+    basic_info->num_color_channels = 1;
+  } else {
+    basic_info->num_color_channels = 3;
+  }
+  if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) {
+    basic_info->alpha_exponent_bits = basic_info->exponent_bits_per_sample;
+    basic_info->alpha_bits = basic_info->bits_per_sample;
+    basic_info->num_extra_channels = 1;
+  } else {
+    basic_info->alpha_exponent_bits = 0;
+    basic_info->alpha_bits = 0;
+  }
+}
+
+ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc) {
+  ColorEncoding c;
+  c.SetColorSpace(desc.color_space);
+  if (desc.color_space != ColorSpace::kXYB) {
+    JXL_CHECK(c.SetWhitePointType(desc.white_point));
+    if (desc.color_space != ColorSpace::kGray) {
+      JXL_CHECK(c.SetPrimariesType(desc.primaries));
+    }
+    c.Tf().SetTransferFunction(desc.tf);
+  }
+  c.SetRenderingIntent(desc.rendering_intent);
+  JXL_CHECK(c.CreateICC());
+  return c;
+}
+
+namespace {
+void CheckSameEncodings(const std::vector<ColorEncoding>& a,
+                        const std::vector<ColorEncoding>& b,
+                        const std::string& check_name,
+                        std::stringstream& failures) {
+  JXL_CHECK(a.size() == b.size());
+  for (size_t i = 0; i < a.size(); ++i) {
+    if ((a[i].ICC() == b[i].ICC()) ||
+        ((a[i].GetPrimariesType() == b[i].GetPrimariesType()) &&
+         a[i].Tf().IsSame(b[i].Tf()))) {
+      continue;
+    }
+    failures << "CheckSameEncodings " << check_name << ": " << i
+             << "-th encoding mismatch\n";
+  }
+}
+}  // namespace
+
+bool Roundtrip(const CodecInOut* io, const CompressParams& cparams,
+               extras::JXLDecompressParams dparams,
+               CodecInOut* JXL_RESTRICT io2, std::stringstream& failures,
+               size_t* compressed_size, ThreadPool* pool) {
+  DefaultAcceptedFormats(dparams);
+  if (compressed_size) {
+    *compressed_size = static_cast<size_t>(-1);
+  }
+  std::vector<uint8_t> compressed;
+
+  std::vector<ColorEncoding> original_metadata_encodings;
+  std::vector<ColorEncoding> original_current_encodings;
+  std::vector<ColorEncoding> metadata_encodings_1;
+  std::vector<ColorEncoding> metadata_encodings_2;
+  std::vector<ColorEncoding> current_encodings_2;
+  original_metadata_encodings.reserve(io->frames.size());
+  original_current_encodings.reserve(io->frames.size());
+  metadata_encodings_1.reserve(io->frames.size());
+  metadata_encodings_2.reserve(io->frames.size());
+  current_encodings_2.reserve(io->frames.size());
+
+  for (const ImageBundle& ib : io->frames) {
+    // Remember original encoding, will be returned by decoder.
+    original_metadata_encodings.push_back(ib.metadata()->color_encoding);
+    // c_current should not change during encoding.
+    original_current_encodings.push_back(ib.c_current());
+  }
+
+  std::unique_ptr<PassesEncoderState> enc_state =
+      jxl::make_unique<PassesEncoderState>();
+  JXL_CHECK(test::EncodeFile(cparams, io, enc_state.get(), &compressed, pool));
+
+  for (const ImageBundle& ib1 : io->frames) {
+    metadata_encodings_1.push_back(ib1.metadata()->color_encoding);
+  }
+
+  // Should still be in the same color space after encoding.
+  CheckSameEncodings(metadata_encodings_1, original_metadata_encodings,
+                     "original vs after encoding", failures);
+
+  JXL_CHECK(DecodeFile(dparams, Bytes(compressed), io2, pool));
+  JXL_CHECK(io2->frames.size() == io->frames.size());
+
+  for (const ImageBundle& ib2 : io2->frames) {
+    metadata_encodings_2.push_back(ib2.metadata()->color_encoding);
+    current_encodings_2.push_back(ib2.c_current());
+  }
+
+  // We always produce the original color encoding if a color transform hook is
+  // set.
+  CheckSameEncodings(current_encodings_2, original_current_encodings,
+                     "current: original vs decoded", failures);
+
+  // Decoder returns the originals passed to the encoder.
+  CheckSameEncodings(metadata_encodings_2, original_metadata_encodings,
+                     "metadata: original vs decoded", failures);
+
+  if (compressed_size) {
+    *compressed_size = compressed.size();
+  }
+
+  return failures.str().empty();
+}
+
+size_t Roundtrip(const extras::PackedPixelFile& ppf_in,
+                 extras::JXLCompressParams cparams,
+                 extras::JXLDecompressParams dparams, ThreadPool* pool,
+                 extras::PackedPixelFile* ppf_out) {
+  DefaultAcceptedFormats(dparams);
+  SetThreadParallelRunner(cparams, pool);
+  SetThreadParallelRunner(dparams, pool);
+  std::vector<uint8_t> compressed;
+  JXL_CHECK(extras::EncodeImageJXL(cparams, ppf_in, /*jpeg_bytes=*/nullptr,
+                                   &compressed));
+  size_t decoded_bytes = 0;
+  JXL_CHECK(extras::DecodeImageJXL(compressed.data(), compressed.size(),
+                                   dparams, &decoded_bytes, ppf_out));
+  JXL_CHECK(decoded_bytes == compressed.size());
+  return compressed.size();
+}
+
+std::vector<ColorEncodingDescriptor> AllEncodings() {
+  std::vector<ColorEncodingDescriptor> all_encodings;
+  all_encodings.reserve(300);
+
+  for (ColorSpace cs : Values<ColorSpace>()) {
+    if (cs == ColorSpace::kUnknown || cs == ColorSpace::kXYB ||
+        cs == ColorSpace::kGray) {
+      continue;
+    }
+
+    for (WhitePoint wp : Values<WhitePoint>()) {
+      if (wp == WhitePoint::kCustom) continue;
+      for (Primaries primaries : Values<Primaries>()) {
+        if (primaries == Primaries::kCustom) continue;
+        for (TransferFunction tf : Values<TransferFunction>()) {
+          if (tf == TransferFunction::kUnknown) continue;
+          for (RenderingIntent ri : Values<RenderingIntent>()) {
+            ColorEncodingDescriptor cdesc;
+            cdesc.color_space = cs;
+            cdesc.white_point = wp;
+            cdesc.primaries = primaries;
+            cdesc.tf = tf;
+            cdesc.rendering_intent = ri;
+            all_encodings.push_back(cdesc);
+          }
+        }
+      }
+    }
+  }
+
+  return all_encodings;
+}
+
+jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector<uint8_t>& buf,
+                                          size_t num_channels, size_t xsize,
+                                          size_t ysize) {
+  jxl::CodecInOut io;
+  io.SetSize(xsize, ysize);
+  io.metadata.m.SetAlphaBits(16);
+  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(
+      /*is_gray=*/num_channels == 1 || num_channels == 2);
+  JxlPixelFormat format = {static_cast<uint32_t>(num_channels), JXL_TYPE_UINT16,
+                           JXL_BIG_ENDIAN, 0};
+  JXL_CHECK(ConvertFromExternal(
+      jxl::Bytes(buf.data(), buf.size()), xsize, ysize,
+      jxl::ColorEncoding::SRGB(/*is_gray=*/num_channels < 3),
+      /*bits_per_sample=*/16, format,
+      /*pool=*/nullptr,
+      /*ib=*/&io.Main()));
+  return io;
+}
+
+bool Near(double expected, double value, double max_dist) {
+  double dist = expected > value ? expected - value : value - expected;
+  return dist <= max_dist;
+}
+
+float LoadLEFloat16(const uint8_t* p) {
+  uint16_t bits16 = LoadLE16(p);
+  return LoadFloat16(bits16);
+}
+
+float LoadBEFloat16(const uint8_t* p) {
+  uint16_t bits16 = LoadBE16(p);
+  return LoadFloat16(bits16);
+}
+
+size_t GetPrecision(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      // Floating point mantissa precision
+      return 24;
+    case JXL_TYPE_FLOAT16:
+      return 11;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+}
+
+size_t GetDataBits(JxlDataType data_type) {
+  switch (data_type) {
+    case JXL_TYPE_UINT8:
+      return 8;
+    case JXL_TYPE_UINT16:
+      return 16;
+    case JXL_TYPE_FLOAT:
+      return 32;
+    case JXL_TYPE_FLOAT16:
+      return 16;
+    default:
+      JXL_ABORT("Unhandled JxlDataType");
+  }
+}
+
+std::vector<double> ConvertToRGBA32(const uint8_t* pixels, size_t xsize,
+                                    size_t ysize, const JxlPixelFormat& format,
+                                    double factor) {
+  std::vector<double> result(xsize * ysize * 4);
+  size_t num_channels = format.num_channels;
+  bool gray = num_channels == 1 || num_channels == 2;
+  bool alpha = num_channels == 2 || num_channels == 4;
+  JxlEndianness endianness = format.endianness;
+  // Compute actual type:
+  if (endianness == JXL_NATIVE_ENDIAN) {
+    endianness = IsLittleEndian() ? JXL_LITTLE_ENDIAN : JXL_BIG_ENDIAN;
+  }
+
+  size_t stride =
+      xsize * jxl::DivCeil(GetDataBits(format.data_type) * num_channels,
+                           jxl::kBitsPerByte);
+  if (format.align > 1) stride = jxl::RoundUpTo(stride, format.align);
+
+  if (format.data_type == JXL_TYPE_UINT8) {
+    // Multiplier to bring to 0-1.0 range
+    double mul = factor > 0.0 ? factor : 1.0 / 255.0;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels;
+        double r = pixels[i];
+        double g = gray ? r : pixels[i + 1];
+        double b = gray ? r : pixels[i + 2];
+        double a = alpha ? pixels[i + num_channels - 1] : 255;
+        result[j + 0] = r * mul;
+        result[j + 1] = g * mul;
+        result[j + 2] = b * mul;
+        result[j + 3] = a * mul;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_UINT16) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    // Multiplier to bring to 0-1.0 range
+    double mul = factor > 0.0 ? factor : 1.0 / 65535.0;
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 2;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = (pixels[i + 0] << 8) + pixels[i + 1];
+          g = gray ? r : (pixels[i + 2] << 8) + pixels[i + 3];
+          b = gray ? r : (pixels[i + 4] << 8) + pixels[i + 5];
+          a = alpha ? (pixels[i + num_channels * 2 - 2] << 8) +
+                          pixels[i + num_channels * 2 - 1]
+                    : 65535;
+        } else {
+          r = (pixels[i + 1] << 8) + pixels[i + 0];
+          g = gray ? r : (pixels[i + 3] << 8) + pixels[i + 2];
+          b = gray ? r : (pixels[i + 5] << 8) + pixels[i + 4];
+          a = alpha ? (pixels[i + num_channels * 2 - 1] << 8) +
+                          pixels[i + num_channels * 2 - 2]
+                    : 65535;
+        }
+        result[j + 0] = r * mul;
+        result[j + 1] = g * mul;
+        result[j + 2] = b * mul;
+        result[j + 3] = a * mul;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 4;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = LoadBEFloat(pixels + i);
+          g = gray ? r : LoadBEFloat(pixels + i + 4);
+          b = gray ? r : LoadBEFloat(pixels + i + 8);
+          a = alpha ? LoadBEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
+        } else {
+          r = LoadLEFloat(pixels + i);
+          g = gray ? r : LoadLEFloat(pixels + i + 4);
+          b = gray ? r : LoadLEFloat(pixels + i + 8);
+          a = alpha ? LoadLEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
+        }
+        result[j + 0] = r;
+        result[j + 1] = g;
+        result[j + 2] = b;
+        result[j + 3] = a;
+      }
+    }
+  } else if (format.data_type == JXL_TYPE_FLOAT16) {
+    JXL_ASSERT(endianness != JXL_NATIVE_ENDIAN);
+    for (size_t y = 0; y < ysize; ++y) {
+      for (size_t x = 0; x < xsize; ++x) {
+        size_t j = (y * xsize + x) * 4;
+        size_t i = y * stride + x * num_channels * 2;
+        double r, g, b, a;
+        if (endianness == JXL_BIG_ENDIAN) {
+          r = LoadBEFloat16(pixels + i);
+          g = gray ? r : LoadBEFloat16(pixels + i + 2);
+          b = gray ? r : LoadBEFloat16(pixels + i + 4);
+          a = alpha ? LoadBEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
+        } else {
+          r = LoadLEFloat16(pixels + i);
+          g = gray ? r : LoadLEFloat16(pixels + i + 2);
+          b = gray ? r : LoadLEFloat16(pixels + i + 4);
+          a = alpha ? LoadLEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
+        }
+        result[j + 0] = r;
+        result[j + 1] = g;
+        result[j + 2] = b;
+        result[j + 3] = a;
+      }
+    }
+  } else {
+    JXL_ASSERT(false);  // Unsupported type
+  }
+  return result;
+}
+
+size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize,
+                     size_t ysize, const JxlPixelFormat& format_a,
+                     const JxlPixelFormat& format_b,
+                     double threshold_multiplier) {
+  // Convert both images to equal full precision for comparison.
+  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format_a);
+  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format_b);
+  bool gray_a = format_a.num_channels < 3;
+  bool gray_b = format_b.num_channels < 3;
+  bool alpha_a = !(format_a.num_channels & 1);
+  bool alpha_b = !(format_b.num_channels & 1);
+  size_t bits_a = GetPrecision(format_a.data_type);
+  size_t bits_b = GetPrecision(format_b.data_type);
+  size_t bits = std::min(bits_a, bits_b);
+  // How much distance is allowed in case of pixels with lower bit depths, given
+  // that the double precision float images use range 0-1.0.
+  // E.g. in case of 1-bit this is 0.5 since 0.499 must map to 0 and 0.501 must
+  // map to 1.
+  double precision = 0.5 * threshold_multiplier / ((1ull << bits) - 1ull);
+  if (format_a.data_type == JXL_TYPE_FLOAT16 ||
+      format_b.data_type == JXL_TYPE_FLOAT16) {
+    // Lower the precision for float16, because it currently looks like the
+    // scalar and wasm implementations of hwy have 1 less bit of precision
+    // than the x86 implementations.
+    // TODO(lode): Set the required precision back to 11 bits when possible.
+    precision = 0.5 * threshold_multiplier / ((1ull << (bits - 1)) - 1ull);
+  }
+  size_t numdiff = 0;
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      size_t i = (y * xsize + x) * 4;
+      bool ok = true;
+      if (gray_a || gray_b) {
+        if (!Near(a_full[i + 0], b_full[i + 0], precision)) ok = false;
+        // If the input was grayscale and the output not, then the output must
+        // have all channels equal.
+        if (gray_a && b_full[i + 0] != b_full[i + 1] &&
+            b_full[i + 2] != b_full[i + 2]) {
+          ok = false;
+        }
+      } else {
+        if (!Near(a_full[i + 0], b_full[i + 0], precision) ||
+            !Near(a_full[i + 1], b_full[i + 1], precision) ||
+            !Near(a_full[i + 2], b_full[i + 2], precision)) {
+          ok = false;
+        }
+      }
+      if (alpha_a && alpha_b) {
+        if (!Near(a_full[i + 3], b_full[i + 3], precision)) ok = false;
+      } else {
+        // If the input had no alpha channel, the output should be opaque
+        // after roundtrip.
+        if (alpha_b && !Near(1.0, b_full[i + 3], precision)) ok = false;
+      }
+      if (!ok) numdiff++;
+    }
+  }
+  return numdiff;
+}
+
+double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize,
+                   size_t ysize, const JxlPixelFormat& format) {
+  // Convert both images to equal full precision for comparison.
+  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format);
+  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format);
+  double sum = 0.0;
+  for (size_t y = 0; y < ysize; y++) {
+    double row_sum = 0.0;
+    for (size_t x = 0; x < xsize; x++) {
+      size_t i = (y * xsize + x) * 4;
+      for (size_t c = 0; c < format.num_channels; ++c) {
+        double diff = a_full[i + c] - b_full[i + c];
+        row_sum += diff * diff;
+      }
+    }
+    sum += row_sum;
+  }
+  sum /= (xsize * ysize);
+  return sqrt(sum);
+}
+
+float ButteraugliDistance(const extras::PackedPixelFile& a,
+                          const extras::PackedPixelFile& b, ThreadPool* pool) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1));
+  // TODO(eustas): simplify?
+  return ButteraugliDistance(io0.frames, io1.frames, ButteraugliParams(),
+                             *JxlGetDefaultCms(),
+                             /*distmap=*/nullptr, pool);
+}
+
+float Butteraugli3Norm(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b, ThreadPool* pool) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, pool, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, pool, &io1));
+  ButteraugliParams ba;
+  ImageF distmap;
+  ButteraugliDistance(io0.frames, io1.frames, ba, *JxlGetDefaultCms(), &distmap,
+                      pool);
+  return ComputeDistanceP(distmap, ba, 3);
+}
+
+float ComputeDistance2(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b) {
+  CodecInOut io0;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(a, nullptr, &io0));
+  CodecInOut io1;
+  JXL_CHECK(ConvertPackedPixelFileToCodecInOut(b, nullptr, &io1));
+  return ComputeDistance2(io0.Main(), io1.Main(), *JxlGetDefaultCms());
+}
+
+bool SameAlpha(const extras::PackedPixelFile& a,
+               const extras::PackedPixelFile& b) {
+  JXL_CHECK(a.info.xsize == b.info.xsize);
+  JXL_CHECK(a.info.ysize == b.info.ysize);
+  JXL_CHECK(a.info.alpha_bits == b.info.alpha_bits);
+  JXL_CHECK(a.info.alpha_exponent_bits == b.info.alpha_exponent_bits);
+  JXL_CHECK(a.info.alpha_bits > 0);
+  JXL_CHECK(a.frames.size() == b.frames.size());
+  for (size_t i = 0; i < a.frames.size(); ++i) {
+    const extras::PackedImage& color_a = a.frames[i].color;
+    const extras::PackedImage& color_b = b.frames[i].color;
+    JXL_CHECK(color_a.format.num_channels == color_b.format.num_channels);
+    JXL_CHECK(color_a.format.data_type == color_b.format.data_type);
+    JXL_CHECK(color_a.format.endianness == color_b.format.endianness);
+    JXL_CHECK(color_a.pixels_size == color_b.pixels_size);
+    size_t pwidth =
+        extras::PackedImage::BitsPerChannel(color_a.format.data_type) / 8;
+    size_t num_color = color_a.format.num_channels < 3 ? 1 : 3;
+    const uint8_t* p_a = reinterpret_cast<const uint8_t*>(color_a.pixels());
+    const uint8_t* p_b = reinterpret_cast<const uint8_t*>(color_b.pixels());
+    for (size_t y = 0; y < a.info.ysize; ++y) {
+      for (size_t x = 0; x < a.info.xsize; ++x) {
+        size_t idx =
+            ((y * a.info.xsize + x) * color_a.format.num_channels + num_color) *
+            pwidth;
+        if (memcmp(&p_a[idx], &p_b[idx], pwidth) != 0) {
+          return false;
+        }
+      }
+    }
+  }
+  return true;
+}
+
+bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b) {
+  JXL_CHECK(a.xsize == b.xsize);
+  JXL_CHECK(a.ysize == b.ysize);
+  JXL_CHECK(a.format.num_channels == b.format.num_channels);
+  JXL_CHECK(a.format.data_type == b.format.data_type);
+  JXL_CHECK(a.format.endianness == b.format.endianness);
+  JXL_CHECK(a.pixels_size == b.pixels_size);
+  const uint8_t* p_a = reinterpret_cast<const uint8_t*>(a.pixels());
+  const uint8_t* p_b = reinterpret_cast<const uint8_t*>(b.pixels());
+  for (size_t y = 0; y < a.ysize; ++y) {
+    for (size_t x = 0; x < a.xsize; ++x) {
+      size_t idx = (y * a.xsize + x) * a.pixel_stride();
+      if (memcmp(&p_a[idx], &p_b[idx], a.pixel_stride()) != 0) {
+        printf("Mismatch at row %" PRIuS " col %" PRIuS "\n", y, x);
+        printf("  a: ");
+        for (size_t j = 0; j < a.pixel_stride(); ++j) {
+          printf(" %3u", p_a[idx + j]);
+        }
+        printf("\n  b: ");
+        for (size_t j = 0; j < a.pixel_stride(); ++j) {
+          printf(" %3u", p_b[idx + j]);
+        }
+        printf("\n");
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool SamePixels(const extras::PackedPixelFile& a,
+                const extras::PackedPixelFile& b) {
+  JXL_CHECK(a.info.xsize == b.info.xsize);
+  JXL_CHECK(a.info.ysize == b.info.ysize);
+  JXL_CHECK(a.info.bits_per_sample == b.info.bits_per_sample);
+  JXL_CHECK(a.info.exponent_bits_per_sample == b.info.exponent_bits_per_sample);
+  JXL_CHECK(a.frames.size() == b.frames.size());
+  for (size_t i = 0; i < a.frames.size(); ++i) {
+    const auto& frame_a = a.frames[i];
+    const auto& frame_b = b.frames[i];
+    if (!SamePixels(frame_a.color, frame_b.color)) {
+      return false;
+    }
+    JXL_CHECK(frame_a.extra_channels.size() == frame_b.extra_channels.size());
+    for (size_t j = 0; j < frame_a.extra_channels.size(); ++j) {
+      if (!SamePixels(frame_a.extra_channels[i], frame_b.extra_channels[i])) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+Status ReadICC(BitReader* JXL_RESTRICT reader,
+               std::vector<uint8_t>* JXL_RESTRICT icc, size_t output_limit) {
+  icc->clear();
+  ICCReader icc_reader;
+  PaddedBytes icc_buffer;
+  JXL_RETURN_IF_ERROR(icc_reader.Init(reader, output_limit));
+  JXL_RETURN_IF_ERROR(icc_reader.Process(reader, &icc_buffer));
+  Bytes(icc_buffer).AppendTo(icc);
+  return true;
+}
+
+namespace {  // For EncodeFile
+Status PrepareCodecMetadataFromIO(const CompressParams& cparams,
+                                  const CodecInOut* io,
+                                  CodecMetadata* metadata) {
+  *metadata = io->metadata;
+  size_t ups = 1;
+  if (cparams.already_downsampled) ups = cparams.resampling;
+
+  JXL_RETURN_IF_ERROR(metadata->size.Set(io->xsize() * ups, io->ysize() * ups));
+
+  // Keep ICC profile in lossless modes because a reconstructed profile may be
+  // slightly different (quantization).
+  // Also keep ICC in JPEG reconstruction mode as we need byte-exact profiles.
+  if (!cparams.IsLossless() && !io->Main().IsJPEG() && cparams.cms_set) {
+    metadata->m.color_encoding.DecideIfWantICC(cparams.cms);
+  }
+
+  metadata->m.xyb_encoded =
+      cparams.color_transform == ColorTransform::kXYB ? true : false;
+
+  // TODO(firsching): move this EncodeFile to test_utils / re-implement this
+  // using API functions
+  return true;
+}
+
+Status EncodePreview(const CompressParams& cparams, const ImageBundle& ib,
+                     const CodecMetadata* metadata, const JxlCmsInterface& cms,
+                     ThreadPool* pool, BitWriter* JXL_RESTRICT writer) {
+  BitWriter preview_writer;
+  // TODO(janwas): also support generating preview by downsampling
+  if (ib.HasColor()) {
+    AuxOut aux_out;
+    PassesEncoderState passes_enc_state;
+    // TODO(lode): check if we want all extra channels and matching xyb_encoded
+    // for the preview, such that using the main ImageMetadata object for
+    // encoding this frame is warrented.
+    FrameInfo frame_info;
+    frame_info.is_preview = true;
+    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, frame_info, metadata, ib,
+                                    &passes_enc_state, cms, pool,
+                                    &preview_writer, &aux_out));
+    preview_writer.ZeroPadToByte();
+  }
+
+  if (preview_writer.BitsWritten() != 0) {
+    writer->ZeroPadToByte();
+    writer->AppendByteAligned(preview_writer);
+  }
+
+  return true;
+}
+
+}  // namespace
+
+Status EncodeFile(const CompressParams& params, const CodecInOut* io,
+                  PassesEncoderState* passes_enc_state,
+                  std::vector<uint8_t>* compressed, ThreadPool* pool) {
+  compressed->clear();
+  const JxlCmsInterface& cms = *JxlGetDefaultCms();
+  io->CheckMetadata();
+  BitWriter writer;
+
+  CompressParams cparams = params;
+  if (io->Main().color_transform != ColorTransform::kNone) {
+    // Set the color transform to YCbCr or XYB if the original image is such.
+    cparams.color_transform = io->Main().color_transform;
+  }
+
+  JXL_RETURN_IF_ERROR(ParamsPostInit(&cparams));
+
+  std::unique_ptr<CodecMetadata> metadata = jxl::make_unique<CodecMetadata>();
+  JXL_RETURN_IF_ERROR(PrepareCodecMetadataFromIO(cparams, io, metadata.get()));
+  JXL_RETURN_IF_ERROR(
+      WriteCodestreamHeaders(metadata.get(), &writer, /*aux_out*/ nullptr));
+
+  // Only send ICC (at least several hundred bytes) if fields aren't enough.
+  if (metadata->m.color_encoding.WantICC()) {
+    JXL_RETURN_IF_ERROR(WriteICC(metadata->m.color_encoding.ICC(), &writer,
+                                 kLayerHeader, /* aux_out */ nullptr));
+  }
+
+  if (metadata->m.have_preview) {
+    JXL_RETURN_IF_ERROR(EncodePreview(cparams, io->preview_frame,
+                                      metadata.get(), cms, pool, &writer));
+  }
+
+  // Each frame should start on byte boundaries.
+  BitWriter::Allotment allotment(&writer, 8);
+  writer.ZeroPadToByte();
+  allotment.ReclaimAndCharge(&writer, kLayerHeader, /* aux_out */ nullptr);
+
+  for (size_t i = 0; i < io->frames.size(); i++) {
+    FrameInfo info;
+    info.is_last = i == io->frames.size() - 1;
+    if (io->frames[i].use_for_next_frame) {
+      info.save_as_reference = 1;
+    }
+    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, info, metadata.get(),
+                                    io->frames[i], passes_enc_state, cms, pool,
+                                    &writer, /* aux_out */ nullptr));
+  }
+
+  // Clean up passes_enc_state in case it gets reused.
+  for (size_t i = 0; i < 4; i++) {
+    passes_enc_state->shared.dc_frames[i] = Image3F();
+    passes_enc_state->shared.reference_frames[i].frame = ImageBundle();
+  }
+
+  PaddedBytes output = std::move(writer).TakeBytes();
+  Bytes(output).AppendTo(compressed);
+  return true;
+}
+
+}  // namespace test
+
+bool operator==(const jxl::Bytes& a, const jxl::Bytes& b) {
+  if (a.size() != b.size()) return false;
+  if (memcmp(a.data(), b.data(), a.size()) != 0) return false;
+  return true;
+}
+
+// Allow using EXPECT_EQ on jxl::Bytes
+bool operator!=(const jxl::Bytes& a, const jxl::Bytes& b) { return !(a == b); }
+
+}  // namespace jxl
index b55cc3d..40de66b 100644 (file)
 #ifndef LIB_JXL_TEST_UTILS_H_
 #define LIB_JXL_TEST_UTILS_H_
 
+// TODO(eustas): reduce includes (move to .cc)
+
 // Macros and functions useful for tests.
 
-// gmock unconditionally redefines those macros (to wrong values).
-// Lets include it only here and mitigate the problem.
-#pragma push_macro("PRIdS")
-#pragma push_macro("PRIuS")
-#include "gmock/gmock.h"
-#pragma pop_macro("PRIuS")
-#pragma pop_macro("PRIdS")
-
-#include "gtest/gtest.h"
-#include "jxl/codestream_header.h"
-#include "jxl/encode.h"
+#include <jxl/codestream_header.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <vector>
+
+#include "lib/extras/dec/decode.h"
 #include "lib/extras/dec/jxl.h"
-#include "lib/extras/packed_image_convert.h"
-#include "lib/jxl/aux_out_fwd.h"
+#include "lib/extras/enc/jxl.h"
+#include "lib/extras/packed_image.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/random.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_external_image.h"
-#include "lib/jxl/enc_file.h"
 #include "lib/jxl/enc_params.h"
-#include "lib/jxl/test_image.h"
-
-#ifdef JXL_DISABLE_SLOW_TESTS
-#define JXL_SLOW_TEST(X) DISABLED_##X
-#else
-#define JXL_SLOW_TEST(X) X
-#endif  // JXL_DISABLE_SLOW_TESTS
-
-#if JPEGXL_ENABLE_TRANSCODE_JPEG
-#define JXL_TRANSCODE_JPEG_TEST(X) X
-#else
-#define JXL_TRANSCODE_JPEG_TEST(X) DISABLED_##X
-#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
-
-#ifdef THREAD_SANITIZER
-#define JXL_TSAN_SLOW_TEST(X) DISABLED_##X
-#else
-#define JXL_TSAN_SLOW_TEST(X) X
-#endif  // THREAD_SANITIZER
-
-// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
-// used INSTANTIATE_TEST_CASE_P which is now deprecated.
-#ifdef INSTANTIATE_TEST_SUITE_P
-#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
-#else
-#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
-#endif
-
-// Ensures that we don't make our test bounds too lax, effectively disabling the
-// tests.
-MATCHER_P(IsSlightlyBelow, max, "") { return max * 0.75 <= arg && arg <= max; }
 
-namespace jxl {
-namespace test {
+#define TEST_LIBJPEG_SUPPORT()                                              \
+  do {                                                                      \
+    if (!jxl::extras::CanDecode(jxl::extras::Codec::kJPG)) {                \
+      fprintf(stderr, "Skipping test because of missing libjpeg codec.\n"); \
+      return;                                                               \
+    }                                                                       \
+  } while (0)
 
-void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
-                                    const JxlPixelFormat* pixel_format) {
-  JxlEncoderInitBasicInfo(basic_info);
-  switch (pixel_format->data_type) {
-    case JXL_TYPE_FLOAT:
-      basic_info->bits_per_sample = 32;
-      basic_info->exponent_bits_per_sample = 8;
-      break;
-    case JXL_TYPE_FLOAT16:
-      basic_info->bits_per_sample = 16;
-      basic_info->exponent_bits_per_sample = 5;
-      break;
-    case JXL_TYPE_UINT8:
-      basic_info->bits_per_sample = 8;
-      basic_info->exponent_bits_per_sample = 0;
-      break;
-    case JXL_TYPE_UINT16:
-      basic_info->bits_per_sample = 16;
-      basic_info->exponent_bits_per_sample = 0;
-      break;
-    default:
-      JXL_ABORT("Unhandled JxlDataType");
-  }
-  if (pixel_format->num_channels < 3) {
-    basic_info->num_color_channels = 1;
-  } else {
-    basic_info->num_color_channels = 3;
-  }
-  if (pixel_format->num_channels == 2 || pixel_format->num_channels == 4) {
-    basic_info->alpha_exponent_bits = basic_info->exponent_bits_per_sample;
-    basic_info->alpha_bits = basic_info->bits_per_sample;
-    basic_info->num_extra_channels = 1;
-  } else {
-    basic_info->alpha_exponent_bits = 0;
-    basic_info->alpha_bits = 0;
-  }
-}
-
-MATCHER_P(MatchesPrimariesAndTransferFunction, color_encoding, "") {
-  return (arg.ICC() == color_encoding.ICC() ||
-          (arg.primaries == color_encoding.primaries &&
-           arg.tf.IsSame(color_encoding.tf)));
-}
-
-MATCHER(MatchesPrimariesAndTransferFunction, "") {
-  return testing::ExplainMatchResult(
-      MatchesPrimariesAndTransferFunction(std::get<1>(arg)), std::get<0>(arg),
-      result_listener);
-}
-
-template <typename Source>
-Status DecodeFile(extras::JXLDecompressParams dparams, const Source& file,
-                  CodecInOut* JXL_RESTRICT io, ThreadPool* pool) {
-  if (pool && !dparams.runner_opaque) {
-    dparams.runner = pool->runner();
-    dparams.runner_opaque = pool->runner_opaque();
-  }
-  extras::PackedPixelFile ppf;
-  JXL_RETURN_IF_ERROR(DecodeImageJXL(file.data(), file.size(), dparams,
-                                     /*decoded_bytes=*/nullptr, &ppf));
-  JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
-  return true;
-}
+namespace jxl {
 
-// Returns compressed size [bytes].
-size_t Roundtrip(const CodecInOut* io, const CompressParams& cparams,
-                 extras::JXLDecompressParams dparams, ThreadPool* pool,
-                 CodecInOut* JXL_RESTRICT io2, AuxOut* aux_out = nullptr) {
-  PaddedBytes compressed;
-
-  std::vector<ColorEncoding> original_metadata_encodings;
-  std::vector<ColorEncoding> original_current_encodings;
-  for (const ImageBundle& ib : io->frames) {
-    // Remember original encoding, will be returned by decoder.
-    original_metadata_encodings.push_back(ib.metadata()->color_encoding);
-    // c_current should not change during encoding.
-    original_current_encodings.push_back(ib.c_current());
-  }
+struct AuxOut;
+class CodecInOut;
+class PaddedBytes;
+struct PassesEncoderState;
+class ThreadPool;
 
-  std::unique_ptr<PassesEncoderState> enc_state =
-      jxl::make_unique<PassesEncoderState>();
-  EXPECT_TRUE(EncodeFile(cparams, io, enc_state.get(), &compressed, GetJxlCms(),
-                         aux_out, pool));
+namespace test {
 
-  std::vector<ColorEncoding> metadata_encodings_1;
-  for (const ImageBundle& ib1 : io->frames) {
-    metadata_encodings_1.push_back(ib1.metadata()->color_encoding);
-  }
+std::string GetTestDataPath(const std::string& filename);
+std::vector<uint8_t> ReadTestData(const std::string& filename);
 
-  // Should still be in the same color space after encoding.
-  EXPECT_THAT(metadata_encodings_1,
-              testing::Pointwise(MatchesPrimariesAndTransferFunction(),
-                                 original_metadata_encodings));
+void JxlBasicInfoSetFromPixelFormat(JxlBasicInfo* basic_info,
+                                    const JxlPixelFormat* pixel_format);
 
-  EXPECT_TRUE(DecodeFile(dparams, compressed, io2, pool));
+void DefaultAcceptedFormats(extras::JXLDecompressParams& dparams);
 
-  std::vector<ColorEncoding> metadata_encodings_2;
-  std::vector<ColorEncoding> current_encodings_2;
-  for (const ImageBundle& ib2 : io2->frames) {
-    metadata_encodings_2.push_back(ib2.metadata()->color_encoding);
-    current_encodings_2.push_back(ib2.c_current());
+template <typename Params>
+void SetThreadParallelRunner(Params params, ThreadPool* pool) {
+  if (pool && !params.runner_opaque) {
+    params.runner = pool->runner();
+    params.runner_opaque = pool->runner_opaque();
   }
+}
 
-  EXPECT_THAT(io2->frames, testing::SizeIs(io->frames.size()));
-  // We always produce the original color encoding if a color transform hook is
-  // set.
-  EXPECT_THAT(current_encodings_2,
-              testing::Pointwise(MatchesPrimariesAndTransferFunction(),
-                                 original_current_encodings));
-
-  // Decoder returns the originals passed to the encoder.
-  EXPECT_THAT(metadata_encodings_2,
-              testing::Pointwise(MatchesPrimariesAndTransferFunction(),
-                                 original_metadata_encodings));
+Status DecodeFile(extras::JXLDecompressParams dparams,
+                  const Span<const uint8_t> file, CodecInOut* JXL_RESTRICT io,
+                  ThreadPool* pool = nullptr);
 
-  return compressed.size();
-}
+bool Roundtrip(const CodecInOut* io, const CompressParams& cparams,
+               extras::JXLDecompressParams dparams,
+               CodecInOut* JXL_RESTRICT io2, std::stringstream& failures,
+               size_t* compressed_size = nullptr, ThreadPool* pool = nullptr);
 
-void CoalesceGIFAnimationWithAlpha(CodecInOut* io) {
-  ImageBundle canvas = io->frames[0].Copy();
-  for (size_t i = 1; i < io->frames.size(); i++) {
-    const ImageBundle& frame = io->frames[i];
-    ImageBundle rendered = canvas.Copy();
-    for (size_t y = 0; y < frame.ysize(); y++) {
-      float* row0 =
-          rendered.color()->PlaneRow(0, frame.origin.y0 + y) + frame.origin.x0;
-      float* row1 =
-          rendered.color()->PlaneRow(1, frame.origin.y0 + y) + frame.origin.x0;
-      float* row2 =
-          rendered.color()->PlaneRow(2, frame.origin.y0 + y) + frame.origin.x0;
-      float* rowa =
-          rendered.alpha()->Row(frame.origin.y0 + y) + frame.origin.x0;
-      const float* row0f = frame.color().PlaneRow(0, y);
-      const float* row1f = frame.color().PlaneRow(1, y);
-      const float* row2f = frame.color().PlaneRow(2, y);
-      const float* rowaf = frame.alpha().Row(y);
-      for (size_t x = 0; x < frame.xsize(); x++) {
-        if (rowaf[x] != 0) {
-          row0[x] = row0f[x];
-          row1[x] = row1f[x];
-          row2[x] = row2f[x];
-          rowa[x] = rowaf[x];
-        }
-      }
-    }
-    if (frame.use_for_next_frame) {
-      canvas = rendered.Copy();
-    }
-    io->frames[i] = std::move(rendered);
-  }
-}
+// Returns compressed size [bytes].
+size_t Roundtrip(const extras::PackedPixelFile& ppf_in,
+                 extras::JXLCompressParams cparams,
+                 extras::JXLDecompressParams dparams, ThreadPool* pool,
+                 extras::PackedPixelFile* ppf_out);
 
 // A POD descriptor of a ColorEncoding. Only used in tests as the return value
 // of AllEncodings().
@@ -228,17 +88,7 @@ struct ColorEncodingDescriptor {
   RenderingIntent rendering_intent;
 };
 
-static inline ColorEncoding ColorEncodingFromDescriptor(
-    const ColorEncodingDescriptor& desc) {
-  ColorEncoding c;
-  c.SetColorSpace(desc.color_space);
-  c.white_point = desc.white_point;
-  c.primaries = desc.primaries;
-  c.tf.SetTransferFunction(desc.tf);
-  c.rendering_intent = desc.rendering_intent;
-  JXL_CHECK(c.CreateICC());
-  return c;
-}
+ColorEncoding ColorEncodingFromDescriptor(const ColorEncodingDescriptor& desc);
 
 // Define the operator<< for tests.
 static inline ::std::ostream& operator<<(::std::ostream& os,
@@ -249,152 +99,23 @@ static inline ::std::ostream& operator<<(::std::ostream& os,
 // Returns ColorEncodingDescriptors, which are only used in tests. To obtain a
 // ColorEncoding object call ColorEncodingFromDescriptor and then call
 // ColorEncoding::CreateProfile() on that object to generate a profile.
-std::vector<ColorEncodingDescriptor> AllEncodings() {
-  std::vector<ColorEncodingDescriptor> all_encodings;
-  all_encodings.reserve(300);
-  ColorEncoding c;
-
-  for (ColorSpace cs : Values<ColorSpace>()) {
-    if (cs == ColorSpace::kUnknown || cs == ColorSpace::kXYB) continue;
-    c.SetColorSpace(cs);
-
-    for (WhitePoint wp : Values<WhitePoint>()) {
-      if (wp == WhitePoint::kCustom) continue;
-      if (c.ImplicitWhitePoint() && c.white_point != wp) continue;
-      c.white_point = wp;
-
-      for (Primaries primaries : Values<Primaries>()) {
-        if (primaries == Primaries::kCustom) continue;
-        if (!c.HasPrimaries()) continue;
-        c.primaries = primaries;
-
-        for (TransferFunction tf : Values<TransferFunction>()) {
-          if (tf == TransferFunction::kUnknown) continue;
-          if (c.tf.SetImplicit() &&
-              (c.tf.IsGamma() || c.tf.GetTransferFunction() != tf)) {
-            continue;
-          }
-          c.tf.SetTransferFunction(tf);
-
-          for (RenderingIntent ri : Values<RenderingIntent>()) {
-            ColorEncodingDescriptor cdesc;
-            cdesc.color_space = cs;
-            cdesc.white_point = wp;
-            cdesc.primaries = primaries;
-            cdesc.tf = tf;
-            cdesc.rendering_intent = ri;
-            all_encodings.push_back(cdesc);
-          }
-        }
-      }
-    }
-  }
-
-  return all_encodings;
-}
+std::vector<ColorEncodingDescriptor> AllEncodings();
 
 // Returns a CodecInOut based on the buf, xsize, ysize, and the assumption
 // that the buffer was created using `GetSomeTestImage`.
 jxl::CodecInOut SomeTestImageToCodecInOut(const std::vector<uint8_t>& buf,
                                           size_t num_channels, size_t xsize,
-                                          size_t ysize) {
-  jxl::CodecInOut io;
-  io.SetSize(xsize, ysize);
-  io.metadata.m.SetAlphaBits(16);
-  io.metadata.m.color_encoding = jxl::ColorEncoding::SRGB(
-      /*is_gray=*/num_channels == 1 || num_channels == 2);
-  EXPECT_TRUE(ConvertFromExternal(
-      jxl::Span<const uint8_t>(buf.data(), buf.size()), xsize, ysize,
-      jxl::ColorEncoding::SRGB(/*is_gray=*/num_channels < 3), num_channels,
-      /*alpha_is_premultiplied=*/false, /*bits_per_sample=*/16, JXL_BIG_ENDIAN,
-      /*pool=*/nullptr,
-      /*ib=*/&io.Main(), /*float_in=*/false, 0));
-  return io;
-}
-
-bool Near(double expected, double value, double max_dist) {
-  double dist = expected > value ? expected - value : value - expected;
-  return dist <= max_dist;
-}
+                                          size_t ysize);
 
-// Loads a Big-Endian float
-float LoadBEFloat(const uint8_t* p) {
-  uint32_t u = LoadBE32(p);
-  float result;
-  memcpy(&result, &u, 4);
-  return result;
-}
-
-// Loads a Little-Endian float
-float LoadLEFloat(const uint8_t* p) {
-  uint32_t u = LoadLE32(p);
-  float result;
-  memcpy(&result, &u, 4);
-  return result;
-}
-
-// Based on highway scalar implementation, for testing
-float LoadFloat16(uint16_t bits16) {
-  const uint32_t sign = bits16 >> 15;
-  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
-  const uint32_t mantissa = bits16 & 0x3FF;
-
-  // Subnormal or zero
-  if (biased_exp == 0) {
-    const float subnormal = (1.0f / 16384) * (mantissa * (1.0f / 1024));
-    return sign ? -subnormal : subnormal;
-  }
+bool Near(double expected, double value, double max_dist);
 
-  // Normalized: convert the representation directly (faster than ldexp/tables).
-  const uint32_t biased_exp32 = biased_exp + (127 - 15);
-  const uint32_t mantissa32 = mantissa << (23 - 10);
-  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
+float LoadLEFloat16(const uint8_t* p);
 
-  float result;
-  memcpy(&result, &bits32, 4);
-  return result;
-}
+float LoadBEFloat16(const uint8_t* p);
 
-float LoadLEFloat16(const uint8_t* p) {
-  uint16_t bits16 = LoadLE16(p);
-  return LoadFloat16(bits16);
-}
+size_t GetPrecision(JxlDataType data_type);
 
-float LoadBEFloat16(const uint8_t* p) {
-  uint16_t bits16 = LoadBE16(p);
-  return LoadFloat16(bits16);
-}
-
-size_t GetPrecision(JxlDataType data_type) {
-  switch (data_type) {
-    case JXL_TYPE_UINT8:
-      return 8;
-    case JXL_TYPE_UINT16:
-      return 16;
-    case JXL_TYPE_FLOAT:
-      // Floating point mantissa precision
-      return 24;
-    case JXL_TYPE_FLOAT16:
-      return 11;
-    default:
-      JXL_ABORT("Unhandled JxlDataType");
-  }
-}
-
-size_t GetDataBits(JxlDataType data_type) {
-  switch (data_type) {
-    case JXL_TYPE_UINT8:
-      return 8;
-    case JXL_TYPE_UINT16:
-      return 16;
-    case JXL_TYPE_FLOAT:
-      return 32;
-    case JXL_TYPE_FLOAT16:
-      return 16;
-    default:
-      JXL_ABORT("Unhandled JxlDataType");
-  }
-}
+size_t GetDataBits(JxlDataType data_type);
 
 // Procedure to convert pixels to double precision, not efficient, but
 // well-controlled for testing. It uses double, to be able to represent all
@@ -402,114 +123,8 @@ size_t GetDataBits(JxlDataType data_type) {
 // integers, and, single precision float. The values are in range 0-1 for SDR.
 std::vector<double> ConvertToRGBA32(const uint8_t* pixels, size_t xsize,
                                     size_t ysize, const JxlPixelFormat& format,
-                                    double factor = 0.0) {
-  std::vector<double> result(xsize * ysize * 4);
-  size_t num_channels = format.num_channels;
-  bool gray = num_channels == 1 || num_channels == 2;
-  bool alpha = num_channels == 2 || num_channels == 4;
-
-  size_t stride =
-      xsize * jxl::DivCeil(GetDataBits(format.data_type) * num_channels,
-                           jxl::kBitsPerByte);
-  if (format.align > 1) stride = jxl::RoundUpTo(stride, format.align);
-
-  if (format.data_type == JXL_TYPE_UINT8) {
-    // Multiplier to bring to 0-1.0 range
-    double mul = factor > 0.0 ? factor : 1.0 / 255.0;
-    for (size_t y = 0; y < ysize; ++y) {
-      for (size_t x = 0; x < xsize; ++x) {
-        size_t j = (y * xsize + x) * 4;
-        size_t i = y * stride + x * num_channels;
-        double r = pixels[i];
-        double g = gray ? r : pixels[i + 1];
-        double b = gray ? r : pixels[i + 2];
-        double a = alpha ? pixels[i + num_channels - 1] : 255;
-        result[j + 0] = r * mul;
-        result[j + 1] = g * mul;
-        result[j + 2] = b * mul;
-        result[j + 3] = a * mul;
-      }
-    }
-  } else if (format.data_type == JXL_TYPE_UINT16) {
-    // Multiplier to bring to 0-1.0 range
-    double mul = factor > 0.0 ? factor : 1.0 / 65535.0;
-    for (size_t y = 0; y < ysize; ++y) {
-      for (size_t x = 0; x < xsize; ++x) {
-        size_t j = (y * xsize + x) * 4;
-        size_t i = y * stride + x * num_channels * 2;
-        double r, g, b, a;
-        if (format.endianness == JXL_BIG_ENDIAN) {
-          r = (pixels[i + 0] << 8) + pixels[i + 1];
-          g = gray ? r : (pixels[i + 2] << 8) + pixels[i + 3];
-          b = gray ? r : (pixels[i + 4] << 8) + pixels[i + 5];
-          a = alpha ? (pixels[i + num_channels * 2 - 2] << 8) +
-                          pixels[i + num_channels * 2 - 1]
-                    : 65535;
-        } else {
-          r = (pixels[i + 1] << 8) + pixels[i + 0];
-          g = gray ? r : (pixels[i + 3] << 8) + pixels[i + 2];
-          b = gray ? r : (pixels[i + 5] << 8) + pixels[i + 4];
-          a = alpha ? (pixels[i + num_channels * 2 - 1] << 8) +
-                          pixels[i + num_channels * 2 - 2]
-                    : 65535;
-        }
-        result[j + 0] = r * mul;
-        result[j + 1] = g * mul;
-        result[j + 2] = b * mul;
-        result[j + 3] = a * mul;
-      }
-    }
-  } else if (format.data_type == JXL_TYPE_FLOAT) {
-    for (size_t y = 0; y < ysize; ++y) {
-      for (size_t x = 0; x < xsize; ++x) {
-        size_t j = (y * xsize + x) * 4;
-        size_t i = y * stride + x * num_channels * 4;
-        double r, g, b, a;
-        if (format.endianness == JXL_BIG_ENDIAN) {
-          r = LoadBEFloat(pixels + i);
-          g = gray ? r : LoadBEFloat(pixels + i + 4);
-          b = gray ? r : LoadBEFloat(pixels + i + 8);
-          a = alpha ? LoadBEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
-        } else {
-          r = LoadLEFloat(pixels + i);
-          g = gray ? r : LoadLEFloat(pixels + i + 4);
-          b = gray ? r : LoadLEFloat(pixels + i + 8);
-          a = alpha ? LoadLEFloat(pixels + i + num_channels * 4 - 4) : 1.0;
-        }
-        result[j + 0] = r;
-        result[j + 1] = g;
-        result[j + 2] = b;
-        result[j + 3] = a;
-      }
-    }
-  } else if (format.data_type == JXL_TYPE_FLOAT16) {
-    for (size_t y = 0; y < ysize; ++y) {
-      for (size_t x = 0; x < xsize; ++x) {
-        size_t j = (y * xsize + x) * 4;
-        size_t i = y * stride + x * num_channels * 2;
-        double r, g, b, a;
-        if (format.endianness == JXL_BIG_ENDIAN) {
-          r = LoadBEFloat16(pixels + i);
-          g = gray ? r : LoadBEFloat16(pixels + i + 2);
-          b = gray ? r : LoadBEFloat16(pixels + i + 4);
-          a = alpha ? LoadBEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
-        } else {
-          r = LoadLEFloat16(pixels + i);
-          g = gray ? r : LoadLEFloat16(pixels + i + 2);
-          b = gray ? r : LoadLEFloat16(pixels + i + 4);
-          a = alpha ? LoadLEFloat16(pixels + i + num_channels * 2 - 2) : 1.0;
-        }
-        result[j + 0] = r;
-        result[j + 1] = g;
-        result[j + 2] = b;
-        result[j + 3] = a;
-      }
-    }
-  } else {
-    JXL_ASSERT(false);  // Unsupported type
-  }
-  return result;
-}
+                                    double factor = 0.0);
+
 // Returns amount of pixels which differ between the two pictures. Image b is
 // the image after roundtrip after roundtrip, image a before roundtrip. There
 // are more strict requirements for the alpha channel and grayscale values of
@@ -517,94 +132,67 @@ std::vector<double> ConvertToRGBA32(const uint8_t* pixels, size_t xsize,
 size_t ComparePixels(const uint8_t* a, const uint8_t* b, size_t xsize,
                      size_t ysize, const JxlPixelFormat& format_a,
                      const JxlPixelFormat& format_b,
-                     double threshold_multiplier = 1.0) {
-  // Convert both images to equal full precision for comparison.
-  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format_a);
-  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format_b);
-  bool gray_a = format_a.num_channels < 3;
-  bool gray_b = format_b.num_channels < 3;
-  bool alpha_a = !(format_a.num_channels & 1);
-  bool alpha_b = !(format_b.num_channels & 1);
-  size_t bits_a = GetPrecision(format_a.data_type);
-  size_t bits_b = GetPrecision(format_b.data_type);
-  size_t bits = std::min(bits_a, bits_b);
-  // How much distance is allowed in case of pixels with lower bit depths, given
-  // that the double precision float images use range 0-1.0.
-  // E.g. in case of 1-bit this is 0.5 since 0.499 must map to 0 and 0.501 must
-  // map to 1.
-  double precision = 0.5 * threshold_multiplier / ((1ull << bits) - 1ull);
-  if (format_a.data_type == JXL_TYPE_FLOAT16 ||
-      format_b.data_type == JXL_TYPE_FLOAT16) {
-    // Lower the precision for float16, because it currently looks like the
-    // scalar and wasm implementations of hwy have 1 less bit of precision
-    // than the x86 implementations.
-    // TODO(lode): Set the required precision back to 11 bits when possible.
-    precision = 0.5 * threshold_multiplier / ((1ull << (bits - 1)) - 1ull);
-  }
-  size_t numdiff = 0;
-  for (size_t y = 0; y < ysize; y++) {
-    for (size_t x = 0; x < xsize; x++) {
-      size_t i = (y * xsize + x) * 4;
-      bool ok = true;
-      if (gray_a || gray_b) {
-        if (!Near(a_full[i + 0], b_full[i + 0], precision)) ok = false;
-        // If the input was grayscale and the output not, then the output must
-        // have all channels equal.
-        if (gray_a && b_full[i + 0] != b_full[i + 1] &&
-            b_full[i + 2] != b_full[i + 2]) {
-          ok = false;
-        }
-      } else {
-        if (!Near(a_full[i + 0], b_full[i + 0], precision) ||
-            !Near(a_full[i + 1], b_full[i + 1], precision) ||
-            !Near(a_full[i + 2], b_full[i + 2], precision)) {
-          ok = false;
-        }
-      }
-      if (alpha_a && alpha_b) {
-        if (!Near(a_full[i + 3], b_full[i + 3], precision)) ok = false;
-      } else {
-        // If the input had no alpha channel, the output should be opaque
-        // after roundtrip.
-        if (alpha_b && !Near(1.0, b_full[i + 3], precision)) ok = false;
-      }
-      if (!ok) numdiff++;
-    }
-  }
-  return numdiff;
-}
+                     double threshold_multiplier = 1.0);
+
 double DistanceRMS(const uint8_t* a, const uint8_t* b, size_t xsize,
-                   size_t ysize, const JxlPixelFormat& format) {
-  // Convert both images to equal full precision for comparison.
-  std::vector<double> a_full = ConvertToRGBA32(a, xsize, ysize, format);
-  std::vector<double> b_full = ConvertToRGBA32(b, xsize, ysize, format);
-  double sum = 0.0;
-  for (size_t y = 0; y < ysize; y++) {
-    double row_sum = 0.0;
-    for (size_t x = 0; x < xsize; x++) {
-      size_t i = (y * xsize + x) * 4;
-      for (size_t c = 0; c < format.num_channels; ++c) {
-        double diff = a_full[i + c] - b_full[i + c];
-        row_sum += diff * diff;
-      }
-    }
-    sum += row_sum;
+                   size_t ysize, const JxlPixelFormat& format);
+
+float ButteraugliDistance(const extras::PackedPixelFile& a,
+                          const extras::PackedPixelFile& b,
+                          ThreadPool* pool = nullptr);
+
+float Butteraugli3Norm(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b,
+                       ThreadPool* pool = nullptr);
+
+float ComputeDistance2(const extras::PackedPixelFile& a,
+                       const extras::PackedPixelFile& b);
+
+bool SameAlpha(const extras::PackedPixelFile& a,
+               const extras::PackedPixelFile& b);
+
+bool SamePixels(const extras::PackedImage& a, const extras::PackedImage& b);
+
+bool SamePixels(const extras::PackedPixelFile& a,
+                const extras::PackedPixelFile& b);
+
+class ThreadPoolForTests {
+ public:
+  explicit ThreadPoolForTests(int num_threads) {
+    runner_ =
+        JxlThreadParallelRunnerMake(/* memory_manager */ nullptr, num_threads);
+    pool_ =
+        jxl::make_unique<ThreadPool>(JxlThreadParallelRunner, runner_.get());
   }
-  sum /= (xsize * ysize);
-  return sqrt(sum);
-}
+  ThreadPoolForTests(const ThreadPoolForTests&) = delete;
+  ThreadPoolForTests& operator&(const ThreadPoolForTests&) = delete;
+  ThreadPool* operator&() { return pool_.get(); }
+
+ private:
+  JxlThreadParallelRunnerPtr runner_;
+  std::unique_ptr<ThreadPool> pool_;
+};
+
+// `icc` may be empty afterwards - if so, call CreateProfile. Does not append,
+// clears any original data that was in icc.
+// If `output_limit` is not 0, then returns error if resulting profile would be
+// longer than `output_limit`
+Status ReadICC(BitReader* JXL_RESTRICT reader,
+               std::vector<uint8_t>* JXL_RESTRICT icc, size_t output_limit = 0);
+
+// Compresses pixels from `io` (given in any ColorEncoding).
+// `io->metadata.m.original` must be set.
+Status EncodeFile(const CompressParams& params, const CodecInOut* io,
+                  PassesEncoderState* passes_enc_state,
+                  std::vector<uint8_t>* compressed, ThreadPool* pool = nullptr);
+
 }  // namespace test
 
-bool operator==(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
-  if (a.size() != b.size()) return false;
-  if (memcmp(a.data(), b.data(), a.size()) != 0) return false;
-  return true;
-}
+bool operator==(const jxl::Bytes& a, const jxl::Bytes& b);
+
+// Allow using EXPECT_EQ on jxl::Bytes
+bool operator!=(const jxl::Bytes& a, const jxl::Bytes& b);
 
-// Allow using EXPECT_EQ on jxl::PaddedBytes
-bool operator!=(const jxl::PaddedBytes& a, const jxl::PaddedBytes& b) {
-  return !(a == b);
-}
 }  // namespace jxl
 
 #endif  // LIB_JXL_TEST_UTILS_H_
diff --git a/lib/jxl/testdata.h b/lib/jxl/testdata.h
deleted file mode 100644 (file)
index d387219..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_TESTDATA_H_
-#define LIB_JXL_TESTDATA_H_
-
-#ifdef __EMSCRIPTEN__
-#include <emscripten.h>
-#endif
-
-#include <string>
-
-#include "lib/jxl/base/file_io.h"
-
-namespace jxl {
-
-static inline PaddedBytes ReadTestData(const std::string& filename) {
-  std::string full_path = std::string(TEST_DATA_PATH "/") + filename;
-  PaddedBytes data;
-  JXL_CHECK(ReadFile(full_path, &data));
-  return data;
-}
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_TESTDATA_H_
diff --git a/lib/jxl/testing.h b/lib/jxl/testing.h
new file mode 100644 (file)
index 0000000..22a2388
--- /dev/null
@@ -0,0 +1,75 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef LIB_JXL_TESTING_H_
+#define LIB_JXL_TESTING_H_
+
+// GTest/GMock specific macros / wrappers.
+
+// gmock unconditionally redefines those macros (to wrong values).
+// Lets include it only here and mitigate the problem.
+#pragma push_macro("PRIdS")
+#pragma push_macro("PRIuS")
+#include "gmock/gmock.h"
+#pragma pop_macro("PRIuS")
+#pragma pop_macro("PRIdS")
+
+#include <sstream>
+
+// JPEGXL_ENABLE_BOXES, JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "gtest/gtest.h"
+#include "lib/jxl/common.h"
+
+#ifdef JXL_DISABLE_SLOW_TESTS
+#define JXL_SLOW_TEST(X) DISABLED_##X
+#else
+#define JXL_SLOW_TEST(X) X
+#endif  // JXL_DISABLE_SLOW_TESTS
+
+#if JPEGXL_ENABLE_TRANSCODE_JPEG
+#define JXL_TRANSCODE_JPEG_TEST(X) X
+#else
+#define JXL_TRANSCODE_JPEG_TEST(X) DISABLED_##X
+#endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
+
+#if JPEGXL_ENABLE_BOXES
+#define JXL_BOXES_TEST(X) X
+#else
+#define JXL_BOXES_TEST(X) DISABLED_##X
+#endif  // JPEGXL_ENABLE_BOXES
+
+#ifdef THREAD_SANITIZER
+#define JXL_TSAN_SLOW_TEST(X) DISABLED_##X
+#else
+#define JXL_TSAN_SLOW_TEST(X) X
+#endif  // THREAD_SANITIZER
+
+// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
+// used INSTANTIATE_TEST_CASE_P which is now deprecated.
+#ifdef INSTANTIATE_TEST_SUITE_P
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
+#else
+#define JXL_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#endif
+
+// Ensures that we don't make our test bounds too lax, effectively disabling the
+// tests.
+MATCHER_P(IsSlightlyBelow, max, "") {
+  return max * 0.75 <= arg && arg <= max * 1.0;
+}
+
+#define JXL_EXPECT_OK(F)       \
+  {                            \
+    std::stringstream _;       \
+    EXPECT_TRUE(F) << _.str(); \
+  }
+
+#define JXL_ASSERT_OK(F)       \
+  {                            \
+    std::stringstream _;       \
+    ASSERT_TRUE(F) << _.str(); \
+  }
+
+#endif  // LIB_JXL_TESTING_H_
index 9c010d4..e93a936 100644 (file)
@@ -11,7 +11,7 @@
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/transfer_functions-inl.h"
+#include "lib/jxl/cms/transfer_functions-inl.h"
 
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
@@ -43,7 +43,7 @@ namespace {
   state.SetItemsProcessed(kNum* state.iterations() * Lanes(d) * 3); \
   benchmark::DoNotOptimize(sum1 + sum2 + sum3);
 
-#define RUN_BENCHMARK_SCALAR(F)                              \
+#define RUN_BENCHMARK_SCALAR(F, I)                           \
   constexpr size_t kNum = 1 << 12;                           \
   /* Three parallel runs, as this will run on R, G and B. */ \
   float sum1 = 0, sum2 = 0, sum3 = 0;                        \
@@ -53,9 +53,9 @@ namespace {
     float v2 = 1.1e-5;                                       \
     float v3 = 1.2e-5;                                       \
     for (size_t i = 0; i < kNum; i++) {                      \
-      sum1 += F(v1);                                         \
-      sum2 += F(v2);                                         \
-      sum3 += F(v3);                                         \
+      sum1 += F(I, v1);                                      \
+      sum2 += F(I, v2);                                      \
+      sum3 += F(I, v3);                                      \
       v1 += x;                                               \
       v2 += x;                                               \
       v3 += x;                                               \
@@ -74,19 +74,21 @@ HWY_NOINLINE void BM_TFSRGB(benchmark::State& state) {
 }
 
 HWY_NOINLINE void BM_PQDFE(benchmark::State& state) {
-  RUN_BENCHMARK(TF_PQ().DisplayFromEncoded);
+  TF_PQ tf_pq(10000.0);
+  RUN_BENCHMARK(tf_pq.DisplayFromEncoded);
 }
 
 HWY_NOINLINE void BM_PQEFD(benchmark::State& state) {
-  RUN_BENCHMARK(TF_PQ().EncodedFromDisplay);
+  TF_PQ tf_pq(10000.0);
+  RUN_BENCHMARK(tf_pq.EncodedFromDisplay);
 }
 
 HWY_NOINLINE void BM_PQSlowDFE(benchmark::State& state) {
-  RUN_BENCHMARK_SCALAR(TF_PQ().DisplayFromEncoded);
+  RUN_BENCHMARK_SCALAR(TF_PQ_Base::DisplayFromEncoded, 10000.0);
 }
 
 HWY_NOINLINE void BM_PQSlowEFD(benchmark::State& state) {
-  RUN_BENCHMARK_SCALAR(TF_PQ().EncodedFromDisplay);
+  RUN_BENCHMARK_SCALAR(TF_PQ_Base::EncodedFromDisplay, 10000.0);
 }
 }  // namespace
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -105,7 +107,7 @@ HWY_EXPORT(BM_PQEFD);
 HWY_EXPORT(BM_PQSlowDFE);
 HWY_EXPORT(BM_PQSlowEFD);
 
-float SRGB_pow(float x) {
+float SRGB_pow(float _, float x) {
   return x < 0.0031308f ? 12.92f * x : 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
 }
 
@@ -128,7 +130,7 @@ void BM_PQSlowEFD(benchmark::State& state) {
   HWY_DYNAMIC_DISPATCH(BM_PQSlowEFD)(state);
 }
 
-void BM_SRGB_pow(benchmark::State& state) { RUN_BENCHMARK_SCALAR(SRGB_pow); }
+void BM_SRGB_pow(benchmark::State& state) { RUN_BENCHMARK_SCALAR(SRGB_pow, 0); }
 
 BENCHMARK(BM_FastSRGB);
 BENCHMARK(BM_TFSRGB);
index 24cdd02..72c8ac0 100644 (file)
@@ -7,10 +7,9 @@
 
 #include <stdint.h>
 
-#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/coeff_order.h"
 #include "lib/jxl/coeff_order_fwd.h"
-#include "lib/jxl/common.h"
 #include "lib/jxl/fields.h"
 
 namespace jxl {
index 2f3bf5b..712a9a1 100644 (file)
@@ -5,12 +5,12 @@
 
 #include "lib/jxl/toc.h"
 
-#include "gtest/gtest.h"
-#include "lib/jxl/aux_out_fwd.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_toc.h"
+#include "lib/jxl/testing.h"
 
 namespace jxl {
 namespace {
@@ -45,7 +45,7 @@ void Roundtrip(size_t num_entries, bool permute, Rng* rng) {
     }
     writer.ZeroPadToByte();
     AuxOut aux_out;
-    ReclaimAndCharge(&writer, &allotment, 0, &aux_out);
+    allotment.ReclaimAndCharge(&writer, 0, &aux_out);
   }
 
   BitWriter writer;
index 7514f0e..2ee4535 100644 (file)
@@ -4,7 +4,6 @@
 // license that can be found in the LICENSE file.
 
 #include <stdint.h>
-#include <stdio.h>
 
 #include <algorithm>
 #include <vector>
 #define HWY_TARGET_INCLUDE "lib/jxl/xorshift128plus_test.cc"
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
-#include <hwy/tests/test_util-inl.h>
+#include <hwy/tests/hwy_gtest.h>
 
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
 #include "lib/jxl/xorshift128plus-inl.h"
 
 HWY_BEFORE_NAMESPACE();
@@ -286,7 +286,7 @@ void TestSeedChanges() {
 }
 
 void TestFloat() {
-  ThreadPoolInternal pool(8);
+  test::ThreadPoolForTests pool(8);
 
 #ifdef JXL_DISABLE_SLOW_TESTS
   const uint32_t kMaxSeed = 256;
@@ -332,7 +332,7 @@ void TestFloat() {
 
 // Not more than one 64-bit zero
 void TestNotZero() {
-  ThreadPoolInternal pool(8);
+  test::ThreadPoolForTests pool(8);
 
 #ifdef JXL_DISABLE_SLOW_TESTS
   const uint32_t kMaxSeed = 500;
index f0535d7..592298c 100644 (file)
@@ -3,43 +3,21 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-# All files ending in "_gbench.cc" are considered Google benchmark files and
-# should be listed here.
-set(JPEGXL_INTERNAL_SOURCES_GBENCH
-  extras/tone_mapping_gbench.cc
-  jxl/dec_external_image_gbench.cc
-  jxl/enc_external_image_gbench.cc
-  jxl/gauss_blur_gbench.cc
-  jxl/splines_gbench.cc
-  jxl/tf_gbench.cc
-)
-
-# benchmark.h doesn't work in our MINGW set up since it ends up including the
-# wrong stdlib header. We don't run gbench on MINGW targets anyway.
-if(NOT MINGW)
+include(jxl_lists.cmake)
 
 # This is the Google benchmark project (https://github.com/google/benchmark).
 find_package(benchmark QUIET)
 
 if(benchmark_FOUND)
-  if(JPEGXL_STATIC AND NOT MINGW)
-    # benchmark::benchmark hardcodes the librt.so which obviously doesn't
-    # compile in static mode.
-    set_target_properties(benchmark::benchmark PROPERTIES
-      INTERFACE_LINK_LIBRARIES "Threads::Threads;-lrt")
-  endif()
-
   # Compiles all the benchmark files into a single binary. Individual benchmarks
   # can be run with --benchmark_filter.
-  add_executable(jxl_gbench "${JPEGXL_INTERNAL_SOURCES_GBENCH}" gbench_main.cc)
+  add_executable(jxl_gbench "${JPEGXL_INTERNAL_GBENCH_SOURCES}" gbench_main.cc)
 
   target_compile_definitions(jxl_gbench PRIVATE
     -DTEST_DATA_PATH="${JPEGXL_TEST_DATA_PATH}")
   target_link_libraries(jxl_gbench
-    jxl_extras-static
-    jxl-static
+    jxl_extras-internal
+    jxl-internal
     benchmark::benchmark
   )
 endif() # benchmark_FOUND
-
-endif() # MINGW
diff --git a/lib/jxl_cms.cmake b/lib/jxl_cms.cmake
new file mode 100644 (file)
index 0000000..f8812ef
--- /dev/null
@@ -0,0 +1,70 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include(jxl_lists.cmake)
+
+# Headers for exporting/importing public headers
+include(GenerateExportHeader)
+
+add_library(jxl_cms
+  ${JPEGXL_INTERNAL_CMS_SOURCES}
+)
+target_compile_options(jxl_cms PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
+set_target_properties(jxl_cms PROPERTIES POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(jxl_cms PUBLIC jxl_base)
+target_include_directories(jxl_cms PRIVATE
+  ${JXL_HWY_INCLUDE_DIRS}
+)
+generate_export_header(jxl_cms
+  BASE_NAME JXL_CMS
+  EXPORT_FILE_NAME include/jxl/jxl_cms_export.h)
+target_include_directories(jxl_cms PUBLIC
+  "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>")
+
+set(JXL_CMS_PK_LIBS "")
+
+if (JPEGXL_ENABLE_SKCMS)
+  target_link_libraries(jxl_cms PRIVATE skcms)
+else()
+  target_link_libraries(jxl_cms PRIVATE lcms2)
+  if (JPEGXL_FORCE_SYSTEM_LCMS2)
+    set(JXL_CMS_PK_LIBS "-llcms2")
+  endif()
+endif()
+
+target_link_libraries(jxl_cms PRIVATE hwy)
+
+set_target_properties(jxl_cms PROPERTIES
+        VERSION ${JPEGXL_LIBRARY_VERSION}
+        SOVERSION ${JPEGXL_LIBRARY_SOVERSION})
+
+# Check whether the linker support excluding libs
+set(LINKER_EXCLUDE_LIBS_FLAG "-Wl,--exclude-libs=ALL")
+include(CheckCSourceCompiles)
+list(APPEND CMAKE_EXE_LINKER_FLAGS ${LINKER_EXCLUDE_LIBS_FLAG})
+check_c_source_compiles("int main(){return 0;}" LINKER_SUPPORT_EXCLUDE_LIBS)
+list(REMOVE_ITEM CMAKE_EXE_LINKER_FLAGS ${LINKER_EXCLUDE_LIBS_FLAG})
+
+if(LINKER_SUPPORT_EXCLUDE_LIBS)
+  set_property(TARGET jxl_cms APPEND_STRING PROPERTY
+      LINK_FLAGS " ${LINKER_EXCLUDE_LIBS_FLAG}")
+endif()
+
+install(TARGETS jxl_cms
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+if (BUILD_SHARED_LIBS)
+  set(JPEGXL_REQUIRES_TYPE "Requires.private")
+else()
+  set(JPEGXL_REQUIRES_TYPE "Requires")
+endif()
+
+set(JPEGXL_CMS_LIBRARY_REQUIRES "")
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/jxl/libjxl_cms.pc.in"
+               "libjxl_cms.pc" @ONLY)
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libjxl_cms.pc"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
index fd801fc..597f691 100644 (file)
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-set(JPEGXL_EXTRAS_SOURCES
-  extras/codec.cc
-  extras/codec.h
-  extras/dec/color_description.cc
-  extras/dec/color_description.h
-  extras/dec/color_hints.cc
-  extras/dec/color_hints.h
-  extras/dec/decode.cc
-  extras/dec/decode.h
-  extras/dec/jxl.cc
-  extras/dec/jxl.h
-  extras/dec/pgx.cc
-  extras/dec/pgx.h
-  extras/dec/pnm.cc
-  extras/dec/pnm.h
-  extras/enc/encode.cc
-  extras/enc/encode.h
-  extras/enc/npy.cc
-  extras/enc/npy.h
-  extras/enc/pgx.cc
-  extras/enc/pgx.h
-  extras/enc/pnm.cc
-  extras/enc/pnm.h
-  extras/exif.cc
-  extras/exif.h
-  extras/hlg.cc
-  extras/hlg.h
-  extras/packed_image.h
-  extras/packed_image_convert.cc
-  extras/packed_image_convert.h
-  extras/render_hdr.cc
-  extras/render_hdr.h
-  extras/time.cc
-  extras/time.h
-  extras/tone_mapping.cc
-  extras/tone_mapping.h
-)
+include(jxl_lists.cmake)
 
-set(JPEGXL_EXTRAS_CODEC_SOURCES
-  extras/dec/color_description.cc
-  extras/dec/color_description.h
-  extras/dec/color_hints.cc
-  extras/dec/color_hints.h
-  extras/dec/decode.cc
-  extras/dec/decode.h
-  extras/dec/jxl.cc
-  extras/dec/jxl.h
-  extras/dec/pgx.cc
-  extras/dec/pgx.h
-  extras/dec/pnm.cc
-  extras/dec/pnm.h
-  extras/enc/encode.cc
-  extras/enc/encode.h
-  extras/enc/npy.cc
-  extras/enc/npy.h
-  extras/enc/pgx.cc
-  extras/enc/pgx.h
-  extras/enc/pnm.cc
-  extras/enc/pnm.h
-  extras/exif.cc
-  extras/exif.h
-  extras/packed_image.h
-  extras/time.cc
-  extras/time.h
+# Object library for those parts of extras that do not depend on jxl internals
+# or jpegli. We will create two versions of these object files, one with and one
+# without external codec support compiled in.
+list(APPEND JPEGXL_EXTRAS_CORE_SOURCES
+  "${JPEGXL_INTERNAL_EXTRAS_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_APNG_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_EXR_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_JPG_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_JXL_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_PGX_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_PNM_SOURCES}"
+  "${JPEGXL_INTERNAL_CODEC_NPY_SOURCES}"
+  extras/dec/gif.cc
+  extras/dec/gif.h
 )
+foreach(LIB jxl_extras_core-obj jxl_extras_core_nocodec-obj)
+  add_library("${LIB}" OBJECT "${JPEGXL_EXTRAS_CORE_SOURCES}")
+  list(APPEND JXL_EXTRAS_OBJECT_LIBRARIES "${LIB}")
+endforeach()
+list(APPEND JXL_EXTRAS_OBJECTS $<TARGET_OBJECTS:jxl_extras_core-obj>)
 
-add_library(jxl_extras_codec-obj OBJECT "${JPEGXL_EXTRAS_CODEC_SOURCES}")
-target_compile_options(jxl_extras_codec-obj PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
-target_compile_definitions(jxl_extras_codec-obj PRIVATE -DJXL_EXPORT=)
-set_property(TARGET jxl_extras_codec-obj PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_include_directories(jxl_extras_codec-obj PUBLIC
-  ${PROJECT_SOURCE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/include
-  ${CMAKE_CURRENT_BINARY_DIR}/include
-  $<TARGET_PROPERTY:hwy,INTERFACE_INCLUDE_DIRECTORIES>
+# Object library for those parts of extras that depend on jxl internals.
+add_library(jxl_extras_internal-obj OBJECT
+  "${JPEGXL_INTERNAL_EXTRAS_FOR_TOOLS_SOURCES}"
 )
-set(JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES)
-set(JXL_EXTRAS_CODEC_PUBLIC_COMPILE_DEFINITIONS)
+list(APPEND JXL_EXTRAS_OBJECT_LIBRARIES jxl_extras_internal-obj)
+list(APPEND JXL_EXTRAS_OBJECTS $<TARGET_OBJECTS:jxl_extras_internal-obj>)
 
-# We only define a static library for jxl_extras since it uses internal parts
-# of jxl library which are not accessible from outside the library in the
-# shared library case.
-add_library(jxl_extras-static STATIC EXCLUDE_FROM_ALL
-  "${JPEGXL_EXTRAS_SOURCES}")
-target_compile_options(jxl_extras-static PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
-set_property(TARGET jxl_extras-static PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_include_directories(jxl_extras-static PUBLIC "${PROJECT_SOURCE_DIR}")
-target_link_libraries(jxl_extras-static PUBLIC
-  jxl-static
-  jxl_threads-static
-)
+set(JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES)
 
 find_package(GIF 5.1)
 if(GIF_FOUND)
-  target_sources(jxl_extras_codec-obj PRIVATE
-    extras/dec/gif.cc
-    extras/dec/gif.h
-  )
-  target_include_directories(jxl_extras_codec-obj PRIVATE "${GIF_INCLUDE_DIRS}")
+  target_include_directories(jxl_extras_core-obj PRIVATE "${GIF_INCLUDE_DIRS}")
+  target_compile_definitions(jxl_extras_core-obj PRIVATE -DJPEGXL_ENABLE_GIF=1)
   list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES ${GIF_LIBRARIES})
-  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_GIF=1)
   if(JPEGXL_DEP_LICENSE_DIR)
     configure_file("${JPEGXL_DEP_LICENSE_DIR}/libgif-dev/copyright"
                    ${PROJECT_BINARY_DIR}/LICENSE.libgif COPYONLY)
@@ -112,81 +48,53 @@ endif()
 
 find_package(JPEG)
 if(JPEG_FOUND)
-  target_sources(jxl_extras_codec-obj PRIVATE
-    extras/dec/jpg.cc
-    extras/dec/jpg.h
-    extras/enc/jpg.cc
-    extras/enc/jpg.h
-  )
-  target_include_directories(jxl_extras_codec-obj PRIVATE "${JPEG_INCLUDE_DIRS}")
+  target_include_directories(jxl_extras_core-obj PRIVATE "${JPEG_INCLUDE_DIRS}")
+  target_compile_definitions(jxl_extras_core-obj PRIVATE -DJPEGXL_ENABLE_JPEG=1)
   list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES ${JPEG_LIBRARIES})
-  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_JPEG=1)
-  target_sources(jxl_extras-static PRIVATE
-    extras/dec/jpg.cc
-    extras/dec/jpg.h
-    extras/enc/jpg.cc
-    extras/enc/jpg.h
-  )
-  target_include_directories(jxl_extras-static PRIVATE "${JPEG_INCLUDE_DIRS}")
-  target_link_libraries(jxl_extras-static PRIVATE ${JPEG_LIBRARIES})
-  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_JPEG=1)
   if(JPEGXL_DEP_LICENSE_DIR)
     configure_file("${JPEGXL_DEP_LICENSE_DIR}/libjpeg-dev/copyright"
                    ${PROJECT_BINARY_DIR}/LICENSE.libjpeg COPYONLY)
   endif()  # JPEGXL_DEP_LICENSE_DIR
 endif()
 
+if (JPEGXL_ENABLE_SJPEG)
+  target_compile_definitions(jxl_extras_core-obj PRIVATE
+    -DJPEGXL_ENABLE_SJPEG=1)
+  target_include_directories(jxl_extras_core-obj PRIVATE
+    ../third_party/sjpeg/src)
+  list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES sjpeg)
+endif()
+
+if(JPEGXL_ENABLE_JPEGLI)
+  add_library(jxl_extras_jpegli-obj OBJECT
+    "${JPEGXL_INTERNAL_CODEC_JPEGLI_SOURCES}"
+  )
+  target_include_directories(jxl_extras_jpegli-obj PRIVATE
+    "${CMAKE_CURRENT_BINARY_DIR}/include/jpegli"
+  )
+  list(APPEND JXL_EXTRAS_OBJECT_LIBRARIES jxl_extras_jpegli-obj)
+  list(APPEND JXL_EXTRAS_OBJECTS $<TARGET_OBJECTS:jxl_extras_jpegli-obj>)
+endif()
+
 if(NOT JPEGXL_BUNDLE_LIBPNG)
   find_package(PNG)
 endif()
 if(PNG_FOUND)
-  target_sources(jxl_extras_codec-obj PRIVATE
-    extras/dec/apng.cc
-    extras/dec/apng.h
-    extras/enc/apng.cc
-    extras/enc/apng.h
-  )
-  target_include_directories(jxl_extras_codec-obj PRIVATE "${PNG_INCLUDE_DIRS}")
+  target_include_directories(jxl_extras_core-obj PRIVATE "${PNG_INCLUDE_DIRS}")
+  target_compile_definitions(jxl_extras_core-obj PRIVATE -DJPEGXL_ENABLE_APNG=1)
   list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES ${PNG_LIBRARIES})
-  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_APNG=1)
-  target_sources(jxl_extras-static PRIVATE
-    extras/dec/apng.cc
-    extras/dec/apng.h
-    extras/enc/apng.cc
-    extras/enc/apng.h
-  )
-  target_include_directories(jxl_extras-static PUBLIC "${PNG_INCLUDE_DIRS}")
-  target_link_libraries(jxl_extras-static PUBLIC ${PNG_LIBRARIES})
-  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_APNG=1)
   configure_file(extras/LICENSE.apngdis
                  ${PROJECT_BINARY_DIR}/LICENSE.apngdis COPYONLY)
 endif()
 
-if (JPEGXL_ENABLE_SJPEG)
-  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_SJPEG=1)
-  target_link_libraries(jxl_extras-static PRIVATE sjpeg)
-endif ()
-
 if (JPEGXL_ENABLE_OPENEXR)
 pkg_check_modules(OpenEXR IMPORTED_TARGET OpenEXR)
 if (OpenEXR_FOUND)
-  target_sources(jxl_extras_codec-obj PRIVATE
-    extras/dec/exr.cc
-    extras/dec/exr.h
-    extras/enc/exr.cc
-    extras/enc/exr.h
+  target_include_directories(jxl_extras_core-obj PRIVATE
+    "${OpenEXR_INCLUDE_DIRS}"
   )
-  list(APPEND JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS -DJPEGXL_ENABLE_EXR=1)
-  target_include_directories(jxl_extras_codec-obj PRIVATE "${OpenEXR_INCLUDE_DIRS}")
+  target_compile_definitions(jxl_extras_core-obj PRIVATE -DJPEGXL_ENABLE_EXR=1)
   list(APPEND JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES PkgConfig::OpenEXR)
-  target_sources(jxl_extras-static PRIVATE
-    extras/dec/exr.cc
-    extras/dec/exr.h
-    extras/enc/exr.cc
-    extras/enc/exr.h
-  )
-  target_compile_definitions(jxl_extras-static PUBLIC -DJPEGXL_ENABLE_EXR=1)
-  target_link_libraries(jxl_extras-static PRIVATE PkgConfig::OpenEXR)
   if(JPEGXL_DEP_LICENSE_DIR)
     configure_file("${JPEGXL_DEP_LICENSE_DIR}/libopenexr-dev/copyright"
                    ${PROJECT_BINARY_DIR}/LICENSE.libopenexr COPYONLY)
@@ -194,26 +102,68 @@ if (OpenEXR_FOUND)
   # OpenEXR generates exceptions, so we need exception support to catch them.
   # Actually those flags counteract the ones set in JPEGXL_INTERNAL_FLAGS.
   if (NOT WIN32)
-    set_source_files_properties(extras/dec/exr.cc extras/enc/exr.cc PROPERTIES COMPILE_FLAGS -fexceptions)
+    set_source_files_properties(
+      extras/dec/exr.cc extras/enc/exr.cc PROPERTIES COMPILE_FLAGS -fexceptions)
     if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-      set_source_files_properties(extras/dec/exr.cc extras/enc/exr.cc PROPERTIES COMPILE_FLAGS -fcxx-exceptions)
+      set_source_files_properties(
+       extras/dec/exr.cc extras/enc/exr.cc PROPERTIES COMPILE_FLAGS
+       -fcxx-exceptions)
     endif()
   endif()
 endif() # OpenEXR_FOUND
 endif() # JPEGXL_ENABLE_OPENEXR
 
-target_compile_definitions(jxl_extras_codec-obj PRIVATE ${JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS})
+# Common settings for the object libraries.
+foreach(LIB ${JXL_EXTRAS_OBJECT_LIBRARIES})
+  target_compile_options("${LIB}" PRIVATE "${JPEGXL_INTERNAL_FLAGS}")
+  target_compile_definitions("${LIB}" PRIVATE -DJXL_EXPORT=)
+  set_property(TARGET "${LIB}" PROPERTY POSITION_INDEPENDENT_CODE ON)
+  target_include_directories("${LIB}" PRIVATE
+    ${PROJECT_SOURCE_DIR}
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_BINARY_DIR}/include
+    ${JXL_HWY_INCLUDE_DIRS}
+  )
+endforeach()
 
-### Static library.
-add_library(jxl_extras_codec-static STATIC $<TARGET_OBJECTS:jxl_extras_codec-obj>)
-target_compile_definitions(jxl_extras_codec-static PUBLIC ${JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS})
-target_link_libraries(jxl_extras_codec-static PRIVATE ${JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES} jxl)
+# Define an extras library that does not have the image codecs, only the core
+# extras code. This is needed for some of the fuzzers.
+add_library(jxl_extras_nocodec-internal STATIC EXCLUDE_FROM_ALL
+  $<TARGET_OBJECTS:jxl_extras_core_nocodec-obj>
+  $<TARGET_OBJECTS:jxl_extras_internal-obj>
+)
+target_link_libraries(jxl_extras_nocodec-internal PRIVATE jxl_threads)
+target_link_libraries(jxl_extras_nocodec-internal PUBLIC jxl-internal)
 
-### Shared library.
-if (BUILD_SHARED_LIBS)
-add_library(jxl_extras_codec SHARED $<TARGET_OBJECTS:jxl_extras_codec-obj>)
-target_compile_definitions(jxl_extras_codec PUBLIC ${JXL_EXTRAS_CODEC_PUBLIC_DEFINITIONS})
-target_link_libraries(jxl_extras_codec PRIVATE ${JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES} jxl)
-else()
-add_library(jxl_extras_codec ALIAS jxl_extras_codec-static)
-endif()  # BUILD_SHARED_LIBS
+# We only define a static library jxl_extras since it uses internal parts of
+# jxl library which are not accessible from outside the library in the
+# shared library case.
+add_library(jxl_extras-internal STATIC EXCLUDE_FROM_ALL ${JXL_EXTRAS_OBJECTS})
+target_link_libraries(jxl_extras-internal PRIVATE
+  ${JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES}
+  jxl_threads
+)
+target_link_libraries(jxl_extras-internal PUBLIC jxl-internal)
+if(JPEGXL_ENABLE_JPEGLI)
+  target_compile_definitions(jxl_extras-internal PUBLIC -DJPEGXL_ENABLE_JPEGLI=1)
+  target_link_libraries(jxl_extras-internal PRIVATE jpegli-static)
+endif()
+
+### Library that does not depend on internal parts of jxl library.
+### Used by cjxl and djxl binaries.
+add_library(jxl_extras_codec
+  $<TARGET_OBJECTS:jxl_extras_core-obj>
+)
+target_link_libraries(jxl_extras_codec PRIVATE
+  ${JXL_EXTRAS_CODEC_INTERNAL_LIBRARIES}
+)
+target_link_libraries(jxl_extras_codec PUBLIC jxl)
+set_target_properties(jxl_extras_codec PROPERTIES
+  VERSION ${JPEGXL_LIBRARY_VERSION}
+  SOVERSION ${JPEGXL_LIBRARY_SOVERSION}
+)
+install(TARGETS jxl_extras_codec
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
diff --git a/lib/jxl_lists.bzl b/lib/jxl_lists.bzl
new file mode 100644 (file)
index 0000000..ff517ac
--- /dev/null
@@ -0,0 +1,658 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# This file is generated, do not modify by manually.
+# Run `tools/scripts/build_cleaner.py --update` to regenerate it.
+
+libjxl_base_sources = [
+    "jxl/base/arch_macros.h",
+    "jxl/base/bits.h",
+    "jxl/base/byte_order.h",
+    "jxl/base/common.h",
+    "jxl/base/compiler_specific.h",
+    "jxl/base/data_parallel.h",
+    "jxl/base/fast_math-inl.h",
+    "jxl/base/float.h",
+    "jxl/base/iaca.h",
+    "jxl/base/matrix_ops.h",
+    "jxl/base/os_macros.h",
+    "jxl/base/override.h",
+    "jxl/base/printf_macros.h",
+    "jxl/base/random.h",
+    "jxl/base/rational_polynomial-inl.h",
+    "jxl/base/sanitizer_definitions.h",
+    "jxl/base/scope_guard.h",
+    "jxl/base/span.h",
+    "jxl/base/status.h",
+]
+
+libjxl_cms_sources = [
+    "jxl/cms/color_encoding_cms.h",
+    "jxl/cms/jxl_cms.cc",
+    "jxl/cms/jxl_cms_internal.h",
+    "jxl/cms/opsin_params.h",
+    "jxl/cms/tone_mapping-inl.h",
+    "jxl/cms/tone_mapping.h",
+    "jxl/cms/transfer_functions-inl.h",
+    "jxl/cms/transfer_functions.h",
+]
+
+libjxl_codec_apng_sources = [
+    "extras/dec/apng.cc",
+    "extras/dec/apng.h",
+    "extras/enc/apng.cc",
+    "extras/enc/apng.h",
+]
+
+libjxl_codec_exr_sources = [
+    "extras/dec/exr.cc",
+    "extras/dec/exr.h",
+    "extras/enc/exr.cc",
+    "extras/enc/exr.h",
+]
+
+libjxl_codec_gif_sources = [
+    "extras/dec/gif.cc",
+    "extras/dec/gif.h",
+]
+
+libjxl_codec_jpegli_sources = [
+    "extras/dec/jpegli.cc",
+    "extras/dec/jpegli.h",
+    "extras/enc/jpegli.cc",
+    "extras/enc/jpegli.h",
+]
+
+libjxl_codec_jpg_sources = [
+    "extras/dec/jpg.cc",
+    "extras/dec/jpg.h",
+    "extras/enc/jpg.cc",
+    "extras/enc/jpg.h",
+]
+
+libjxl_codec_jxl_sources = [
+    "extras/dec/jxl.cc",
+    "extras/dec/jxl.h",
+    "extras/enc/jxl.cc",
+    "extras/enc/jxl.h",
+]
+
+libjxl_codec_npy_sources = [
+    "extras/enc/npy.cc",
+    "extras/enc/npy.h",
+]
+
+libjxl_codec_pgx_sources = [
+    "extras/dec/pgx.cc",
+    "extras/dec/pgx.h",
+    "extras/enc/pgx.cc",
+    "extras/enc/pgx.h",
+]
+
+libjxl_codec_pnm_sources = [
+    "extras/dec/pnm.cc",
+    "extras/dec/pnm.h",
+    "extras/enc/pnm.cc",
+    "extras/enc/pnm.h",
+]
+
+libjxl_dec_box_sources = [
+    "jxl/box_content_decoder.cc",
+    "jxl/box_content_decoder.h",
+]
+
+libjxl_dec_jpeg_sources = [
+    "jxl/decode_to_jpeg.cc",
+    "jxl/decode_to_jpeg.h",
+    "jxl/jpeg/dec_jpeg_data.cc",
+    "jxl/jpeg/dec_jpeg_data.h",
+    "jxl/jpeg/dec_jpeg_data_writer.cc",
+    "jxl/jpeg/dec_jpeg_data_writer.h",
+    "jxl/jpeg/dec_jpeg_output_chunk.h",
+    "jxl/jpeg/dec_jpeg_serialization_state.h",
+    "jxl/jpeg/jpeg_data.cc",
+    "jxl/jpeg/jpeg_data.h",
+]
+
+libjxl_dec_sources = [
+    "jxl/ac_context.h",
+    "jxl/ac_strategy.cc",
+    "jxl/ac_strategy.h",
+    "jxl/alpha.cc",
+    "jxl/alpha.h",
+    "jxl/ans_common.cc",
+    "jxl/ans_common.h",
+    "jxl/ans_params.h",
+    "jxl/blending.cc",
+    "jxl/blending.h",
+    "jxl/cache_aligned.cc",
+    "jxl/cache_aligned.h",
+    "jxl/chroma_from_luma.cc",
+    "jxl/chroma_from_luma.h",
+    "jxl/codec_in_out.h",
+    "jxl/coeff_order.cc",
+    "jxl/coeff_order.h",
+    "jxl/coeff_order_fwd.h",
+    "jxl/color_encoding_internal.cc",
+    "jxl/color_encoding_internal.h",
+    "jxl/common.h",
+    "jxl/compressed_dc.cc",
+    "jxl/compressed_dc.h",
+    "jxl/convolve-inl.h",
+    "jxl/convolve.h",
+    "jxl/convolve_separable5.cc",
+    "jxl/convolve_separable7.cc",
+    "jxl/convolve_slow.cc",
+    "jxl/convolve_symmetric3.cc",
+    "jxl/convolve_symmetric5.cc",
+    "jxl/dct-inl.h",
+    "jxl/dct_block-inl.h",
+    "jxl/dct_scales.cc",
+    "jxl/dct_scales.h",
+    "jxl/dct_util.h",
+    "jxl/dec_ans.cc",
+    "jxl/dec_ans.h",
+    "jxl/dec_bit_reader.h",
+    "jxl/dec_cache.cc",
+    "jxl/dec_cache.h",
+    "jxl/dec_context_map.cc",
+    "jxl/dec_context_map.h",
+    "jxl/dec_external_image.cc",
+    "jxl/dec_external_image.h",
+    "jxl/dec_frame.cc",
+    "jxl/dec_frame.h",
+    "jxl/dec_group.cc",
+    "jxl/dec_group.h",
+    "jxl/dec_group_border.cc",
+    "jxl/dec_group_border.h",
+    "jxl/dec_huffman.cc",
+    "jxl/dec_huffman.h",
+    "jxl/dec_modular.cc",
+    "jxl/dec_modular.h",
+    "jxl/dec_noise.cc",
+    "jxl/dec_noise.h",
+    "jxl/dec_patch_dictionary.cc",
+    "jxl/dec_patch_dictionary.h",
+    "jxl/dec_transforms-inl.h",
+    "jxl/dec_xyb-inl.h",
+    "jxl/dec_xyb.cc",
+    "jxl/dec_xyb.h",
+    "jxl/decode.cc",
+    "jxl/entropy_coder.cc",
+    "jxl/entropy_coder.h",
+    "jxl/epf.cc",
+    "jxl/epf.h",
+    "jxl/exif.h",
+    "jxl/fast_dct-inl.h",
+    "jxl/fast_dct.cc",
+    "jxl/fast_dct.h",
+    "jxl/fast_dct128-inl.h",
+    "jxl/fast_dct16-inl.h",
+    "jxl/fast_dct256-inl.h",
+    "jxl/fast_dct32-inl.h",
+    "jxl/fast_dct64-inl.h",
+    "jxl/fast_dct8-inl.h",
+    "jxl/field_encodings.h",
+    "jxl/fields.cc",
+    "jxl/fields.h",
+    "jxl/frame_dimensions.h",
+    "jxl/frame_header.cc",
+    "jxl/frame_header.h",
+    "jxl/gauss_blur.cc",
+    "jxl/gauss_blur.h",
+    "jxl/headers.cc",
+    "jxl/headers.h",
+    "jxl/huffman_table.cc",
+    "jxl/huffman_table.h",
+    "jxl/icc_codec.cc",
+    "jxl/icc_codec.h",
+    "jxl/icc_codec_common.cc",
+    "jxl/icc_codec_common.h",
+    "jxl/image.cc",
+    "jxl/image.h",
+    "jxl/image_bundle.cc",
+    "jxl/image_bundle.h",
+    "jxl/image_metadata.cc",
+    "jxl/image_metadata.h",
+    "jxl/image_ops.h",
+    "jxl/inverse_mtf-inl.h",
+    "jxl/lehmer_code.h",
+    "jxl/loop_filter.cc",
+    "jxl/loop_filter.h",
+    "jxl/luminance.cc",
+    "jxl/luminance.h",
+    "jxl/memory_manager_internal.cc",
+    "jxl/memory_manager_internal.h",
+    "jxl/modular/encoding/context_predict.h",
+    "jxl/modular/encoding/dec_ma.cc",
+    "jxl/modular/encoding/dec_ma.h",
+    "jxl/modular/encoding/encoding.cc",
+    "jxl/modular/encoding/encoding.h",
+    "jxl/modular/encoding/ma_common.h",
+    "jxl/modular/modular_image.cc",
+    "jxl/modular/modular_image.h",
+    "jxl/modular/options.h",
+    "jxl/modular/transform/palette.cc",
+    "jxl/modular/transform/palette.h",
+    "jxl/modular/transform/rct.cc",
+    "jxl/modular/transform/rct.h",
+    "jxl/modular/transform/squeeze.cc",
+    "jxl/modular/transform/squeeze.h",
+    "jxl/modular/transform/transform.cc",
+    "jxl/modular/transform/transform.h",
+    "jxl/noise.h",
+    "jxl/opsin_params.cc",
+    "jxl/opsin_params.h",
+    "jxl/pack_signed.h",
+    "jxl/padded_bytes.h",
+    "jxl/passes_state.cc",
+    "jxl/passes_state.h",
+    "jxl/patch_dictionary_internal.h",
+    "jxl/quant_weights.cc",
+    "jxl/quant_weights.h",
+    "jxl/quantizer-inl.h",
+    "jxl/quantizer.cc",
+    "jxl/quantizer.h",
+    "jxl/render_pipeline/low_memory_render_pipeline.cc",
+    "jxl/render_pipeline/low_memory_render_pipeline.h",
+    "jxl/render_pipeline/render_pipeline.cc",
+    "jxl/render_pipeline/render_pipeline.h",
+    "jxl/render_pipeline/render_pipeline_stage.h",
+    "jxl/render_pipeline/simple_render_pipeline.cc",
+    "jxl/render_pipeline/simple_render_pipeline.h",
+    "jxl/render_pipeline/stage_blending.cc",
+    "jxl/render_pipeline/stage_blending.h",
+    "jxl/render_pipeline/stage_chroma_upsampling.cc",
+    "jxl/render_pipeline/stage_chroma_upsampling.h",
+    "jxl/render_pipeline/stage_cms.cc",
+    "jxl/render_pipeline/stage_cms.h",
+    "jxl/render_pipeline/stage_epf.cc",
+    "jxl/render_pipeline/stage_epf.h",
+    "jxl/render_pipeline/stage_from_linear.cc",
+    "jxl/render_pipeline/stage_from_linear.h",
+    "jxl/render_pipeline/stage_gaborish.cc",
+    "jxl/render_pipeline/stage_gaborish.h",
+    "jxl/render_pipeline/stage_noise.cc",
+    "jxl/render_pipeline/stage_noise.h",
+    "jxl/render_pipeline/stage_patches.cc",
+    "jxl/render_pipeline/stage_patches.h",
+    "jxl/render_pipeline/stage_splines.cc",
+    "jxl/render_pipeline/stage_splines.h",
+    "jxl/render_pipeline/stage_spot.cc",
+    "jxl/render_pipeline/stage_spot.h",
+    "jxl/render_pipeline/stage_to_linear.cc",
+    "jxl/render_pipeline/stage_to_linear.h",
+    "jxl/render_pipeline/stage_tone_mapping.cc",
+    "jxl/render_pipeline/stage_tone_mapping.h",
+    "jxl/render_pipeline/stage_upsampling.cc",
+    "jxl/render_pipeline/stage_upsampling.h",
+    "jxl/render_pipeline/stage_write.cc",
+    "jxl/render_pipeline/stage_write.h",
+    "jxl/render_pipeline/stage_xyb.cc",
+    "jxl/render_pipeline/stage_xyb.h",
+    "jxl/render_pipeline/stage_ycbcr.cc",
+    "jxl/render_pipeline/stage_ycbcr.h",
+    "jxl/sanitizers.h",
+    "jxl/simd_util-inl.h",
+    "jxl/simd_util.cc",
+    "jxl/simd_util.h",
+    "jxl/splines.cc",
+    "jxl/splines.h",
+    "jxl/toc.cc",
+    "jxl/toc.h",
+    "jxl/transpose-inl.h",
+    "jxl/xorshift128plus-inl.h",
+]
+
+libjxl_enc_sources = [
+    "jxl/butteraugli/butteraugli.cc",
+    "jxl/butteraugli/butteraugli.h",
+    "jxl/enc_ac_strategy.cc",
+    "jxl/enc_ac_strategy.h",
+    "jxl/enc_adaptive_quantization.cc",
+    "jxl/enc_adaptive_quantization.h",
+    "jxl/enc_ans.cc",
+    "jxl/enc_ans.h",
+    "jxl/enc_ans_params.h",
+    "jxl/enc_ar_control_field.cc",
+    "jxl/enc_ar_control_field.h",
+    "jxl/enc_aux_out.cc",
+    "jxl/enc_aux_out.h",
+    "jxl/enc_bit_writer.cc",
+    "jxl/enc_bit_writer.h",
+    "jxl/enc_butteraugli_comparator.cc",
+    "jxl/enc_butteraugli_comparator.h",
+    "jxl/enc_cache.cc",
+    "jxl/enc_cache.h",
+    "jxl/enc_chroma_from_luma.cc",
+    "jxl/enc_chroma_from_luma.h",
+    "jxl/enc_cluster.cc",
+    "jxl/enc_cluster.h",
+    "jxl/enc_coeff_order.cc",
+    "jxl/enc_coeff_order.h",
+    "jxl/enc_comparator.cc",
+    "jxl/enc_comparator.h",
+    "jxl/enc_context_map.cc",
+    "jxl/enc_context_map.h",
+    "jxl/enc_debug_image.cc",
+    "jxl/enc_debug_image.h",
+    "jxl/enc_detect_dots.cc",
+    "jxl/enc_detect_dots.h",
+    "jxl/enc_dot_dictionary.cc",
+    "jxl/enc_dot_dictionary.h",
+    "jxl/enc_entropy_coder.cc",
+    "jxl/enc_entropy_coder.h",
+    "jxl/enc_external_image.cc",
+    "jxl/enc_external_image.h",
+    "jxl/enc_fast_lossless.cc",
+    "jxl/enc_fast_lossless.h",
+    "jxl/enc_fields.cc",
+    "jxl/enc_fields.h",
+    "jxl/enc_frame.cc",
+    "jxl/enc_frame.h",
+    "jxl/enc_gaborish.cc",
+    "jxl/enc_gaborish.h",
+    "jxl/enc_gamma_correct.h",
+    "jxl/enc_group.cc",
+    "jxl/enc_group.h",
+    "jxl/enc_heuristics.cc",
+    "jxl/enc_heuristics.h",
+    "jxl/enc_huffman.cc",
+    "jxl/enc_huffman.h",
+    "jxl/enc_huffman_tree.cc",
+    "jxl/enc_huffman_tree.h",
+    "jxl/enc_icc_codec.cc",
+    "jxl/enc_icc_codec.h",
+    "jxl/enc_image_bundle.cc",
+    "jxl/enc_image_bundle.h",
+    "jxl/enc_linalg.cc",
+    "jxl/enc_linalg.h",
+    "jxl/enc_modular.cc",
+    "jxl/enc_modular.h",
+    "jxl/enc_noise.cc",
+    "jxl/enc_noise.h",
+    "jxl/enc_optimize.cc",
+    "jxl/enc_optimize.h",
+    "jxl/enc_params.h",
+    "jxl/enc_patch_dictionary.cc",
+    "jxl/enc_patch_dictionary.h",
+    "jxl/enc_photon_noise.cc",
+    "jxl/enc_photon_noise.h",
+    "jxl/enc_progressive_split.cc",
+    "jxl/enc_progressive_split.h",
+    "jxl/enc_quant_weights.cc",
+    "jxl/enc_quant_weights.h",
+    "jxl/enc_splines.cc",
+    "jxl/enc_splines.h",
+    "jxl/enc_toc.cc",
+    "jxl/enc_toc.h",
+    "jxl/enc_transforms-inl.h",
+    "jxl/enc_transforms.cc",
+    "jxl/enc_transforms.h",
+    "jxl/enc_xyb.cc",
+    "jxl/enc_xyb.h",
+    "jxl/encode.cc",
+    "jxl/encode_internal.h",
+    "jxl/jpeg/enc_jpeg_data.cc",
+    "jxl/jpeg/enc_jpeg_data.h",
+    "jxl/jpeg/enc_jpeg_data_reader.cc",
+    "jxl/jpeg/enc_jpeg_data_reader.h",
+    "jxl/jpeg/enc_jpeg_huffman_decode.cc",
+    "jxl/jpeg/enc_jpeg_huffman_decode.h",
+    "jxl/modular/encoding/enc_debug_tree.cc",
+    "jxl/modular/encoding/enc_debug_tree.h",
+    "jxl/modular/encoding/enc_encoding.cc",
+    "jxl/modular/encoding/enc_encoding.h",
+    "jxl/modular/encoding/enc_ma.cc",
+    "jxl/modular/encoding/enc_ma.h",
+    "jxl/modular/transform/enc_palette.cc",
+    "jxl/modular/transform/enc_palette.h",
+    "jxl/modular/transform/enc_rct.cc",
+    "jxl/modular/transform/enc_rct.h",
+    "jxl/modular/transform/enc_squeeze.cc",
+    "jxl/modular/transform/enc_squeeze.h",
+    "jxl/modular/transform/enc_transform.cc",
+    "jxl/modular/transform/enc_transform.h",
+]
+
+libjxl_extras_for_tools_sources = [
+    "extras/codec.cc",
+    "extras/codec.h",
+    "extras/hlg.cc",
+    "extras/hlg.h",
+    "extras/metrics.cc",
+    "extras/metrics.h",
+    "extras/packed_image_convert.cc",
+    "extras/packed_image_convert.h",
+    "extras/tone_mapping.cc",
+    "extras/tone_mapping.h",
+]
+
+libjxl_extras_sources = [
+    "extras/alpha_blend.cc",
+    "extras/alpha_blend.h",
+    "extras/common.cc",
+    "extras/common.h",
+    "extras/dec/color_description.cc",
+    "extras/dec/color_description.h",
+    "extras/dec/color_hints.cc",
+    "extras/dec/color_hints.h",
+    "extras/dec/decode.cc",
+    "extras/dec/decode.h",
+    "extras/enc/encode.cc",
+    "extras/enc/encode.h",
+    "extras/exif.cc",
+    "extras/exif.h",
+    "extras/packed_image.h",
+    "extras/size_constraints.h",
+    "extras/time.cc",
+    "extras/time.h",
+]
+
+libjxl_gbench_sources = [
+    "extras/tone_mapping_gbench.cc",
+    "jxl/dec_external_image_gbench.cc",
+    "jxl/enc_external_image_gbench.cc",
+    "jxl/gauss_blur_gbench.cc",
+    "jxl/splines_gbench.cc",
+    "jxl/tf_gbench.cc",
+]
+
+libjxl_jpegli_lib_version = 62
+
+libjxl_jpegli_libjpeg_helper_files = [
+    "jpegli/libjpeg_test_util.cc",
+    "jpegli/libjpeg_test_util.h",
+]
+
+libjxl_jpegli_sources = [
+    "jpegli/adaptive_quantization.cc",
+    "jpegli/adaptive_quantization.h",
+    "jpegli/bit_writer.cc",
+    "jpegli/bit_writer.h",
+    "jpegli/bitstream.cc",
+    "jpegli/bitstream.h",
+    "jpegli/color_quantize.cc",
+    "jpegli/color_quantize.h",
+    "jpegli/color_transform.cc",
+    "jpegli/color_transform.h",
+    "jpegli/common.cc",
+    "jpegli/common.h",
+    "jpegli/common_internal.h",
+    "jpegli/dct-inl.h",
+    "jpegli/decode.cc",
+    "jpegli/decode.h",
+    "jpegli/decode_internal.h",
+    "jpegli/decode_marker.cc",
+    "jpegli/decode_marker.h",
+    "jpegli/decode_scan.cc",
+    "jpegli/decode_scan.h",
+    "jpegli/destination_manager.cc",
+    "jpegli/downsample.cc",
+    "jpegli/downsample.h",
+    "jpegli/encode.cc",
+    "jpegli/encode.h",
+    "jpegli/encode_finish.cc",
+    "jpegli/encode_finish.h",
+    "jpegli/encode_internal.h",
+    "jpegli/encode_streaming.cc",
+    "jpegli/encode_streaming.h",
+    "jpegli/entropy_coding-inl.h",
+    "jpegli/entropy_coding.cc",
+    "jpegli/entropy_coding.h",
+    "jpegli/error.cc",
+    "jpegli/error.h",
+    "jpegli/huffman.cc",
+    "jpegli/huffman.h",
+    "jpegli/idct.cc",
+    "jpegli/idct.h",
+    "jpegli/input.cc",
+    "jpegli/input.h",
+    "jpegli/memory_manager.cc",
+    "jpegli/memory_manager.h",
+    "jpegli/quant.cc",
+    "jpegli/quant.h",
+    "jpegli/render.cc",
+    "jpegli/render.h",
+    "jpegli/simd.cc",
+    "jpegli/simd.h",
+    "jpegli/source_manager.cc",
+    "jpegli/transpose-inl.h",
+    "jpegli/types.h",
+    "jpegli/upsample.cc",
+    "jpegli/upsample.h",
+]
+
+libjxl_jpegli_testlib_files = [
+    "jpegli/test_params.h",
+    "jpegli/test_utils-inl.h",
+    "jpegli/test_utils.cc",
+    "jpegli/test_utils.h",
+]
+
+libjxl_jpegli_tests = [
+    "jpegli/decode_api_test.cc",
+    "jpegli/encode_api_test.cc",
+    "jpegli/error_handling_test.cc",
+    "jpegli/input_suspension_test.cc",
+    "jpegli/output_suspension_test.cc",
+    "jpegli/source_manager_test.cc",
+    "jpegli/streaming_test.cc",
+    "jpegli/transcode_api_test.cc",
+]
+
+libjxl_jpegli_wrapper_sources = [
+    "jpegli/libjpeg_wrapper.cc",
+]
+
+libjxl_major_version = 0
+
+libjxl_minor_version = 9
+
+libjxl_patch_version = 0
+
+libjxl_public_headers = [
+    "include/jxl/cms.h",
+    "include/jxl/cms_interface.h",
+    "include/jxl/codestream_header.h",
+    "include/jxl/color_encoding.h",
+    "include/jxl/decode.h",
+    "include/jxl/decode_cxx.h",
+    "include/jxl/encode.h",
+    "include/jxl/encode_cxx.h",
+    "include/jxl/memory_manager.h",
+    "include/jxl/parallel_runner.h",
+    "include/jxl/stats.h",
+    "include/jxl/types.h",
+]
+
+libjxl_testlib_files = [
+    "jxl/dct_for_test.h",
+    "jxl/dec_transforms_testonly.cc",
+    "jxl/dec_transforms_testonly.h",
+    "jxl/fake_parallel_runner_testonly.h",
+    "jxl/image_test_utils.h",
+    "jxl/render_pipeline/test_render_pipeline_stages.h",
+    "jxl/test_image.cc",
+    "jxl/test_image.h",
+    "jxl/test_utils.cc",
+    "jxl/test_utils.h",
+]
+
+libjxl_tests = [
+    "extras/codec_test.cc",
+    "extras/dec/color_description_test.cc",
+    "extras/dec/pgx_test.cc",
+    "extras/jpegli_test.cc",
+    "jxl/ac_strategy_test.cc",
+    "jxl/alpha_test.cc",
+    "jxl/ans_common_test.cc",
+    "jxl/ans_test.cc",
+    "jxl/bit_reader_test.cc",
+    "jxl/bits_test.cc",
+    "jxl/blending_test.cc",
+    "jxl/butteraugli/butteraugli_test.cc",
+    "jxl/byte_order_test.cc",
+    "jxl/cms/tone_mapping_test.cc",
+    "jxl/cms/transfer_functions_test.cc",
+    "jxl/coeff_order_test.cc",
+    "jxl/color_encoding_internal_test.cc",
+    "jxl/color_management_test.cc",
+    "jxl/convolve_test.cc",
+    "jxl/data_parallel_test.cc",
+    "jxl/dct_test.cc",
+    "jxl/decode_test.cc",
+    "jxl/enc_external_image_test.cc",
+    "jxl/enc_gaborish_test.cc",
+    "jxl/enc_linalg_test.cc",
+    "jxl/enc_optimize_test.cc",
+    "jxl/enc_photon_noise_test.cc",
+    "jxl/encode_test.cc",
+    "jxl/entropy_coder_test.cc",
+    "jxl/fast_dct_test.cc",
+    "jxl/fast_math_test.cc",
+    "jxl/fields_test.cc",
+    "jxl/gamma_correct_test.cc",
+    "jxl/gauss_blur_test.cc",
+    "jxl/gradient_test.cc",
+    "jxl/iaca_test.cc",
+    "jxl/icc_codec_test.cc",
+    "jxl/image_bundle_test.cc",
+    "jxl/image_ops_test.cc",
+    "jxl/jxl_test.cc",
+    "jxl/lehmer_code_test.cc",
+    "jxl/modular_test.cc",
+    "jxl/opsin_image_test.cc",
+    "jxl/opsin_inverse_test.cc",
+    "jxl/padded_bytes_test.cc",
+    "jxl/passes_test.cc",
+    "jxl/patch_dictionary_test.cc",
+    "jxl/preview_test.cc",
+    "jxl/quant_weights_test.cc",
+    "jxl/quantizer_test.cc",
+    "jxl/rational_polynomial_test.cc",
+    "jxl/render_pipeline/render_pipeline_test.cc",
+    "jxl/roundtrip_test.cc",
+    "jxl/simd_util_test.cc",
+    "jxl/speed_tier_test.cc",
+    "jxl/splines_test.cc",
+    "jxl/toc_test.cc",
+    "jxl/xorshift128plus_test.cc",
+    "threads/thread_parallel_runner_test.cc",
+]
+
+libjxl_threads_public_headers = [
+    "include/jxl/resizable_parallel_runner.h",
+    "include/jxl/resizable_parallel_runner_cxx.h",
+    "include/jxl/thread_parallel_runner.h",
+    "include/jxl/thread_parallel_runner_cxx.h",
+]
+
+libjxl_threads_sources = [
+    "threads/resizable_parallel_runner.cc",
+    "threads/thread_parallel_runner.cc",
+    "threads/thread_parallel_runner_internal.cc",
+    "threads/thread_parallel_runner_internal.h",
+]
diff --git a/lib/jxl_lists.cmake b/lib/jxl_lists.cmake
new file mode 100644 (file)
index 0000000..68c5a4a
--- /dev/null
@@ -0,0 +1,650 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# This file is generated, do not modify by manually.
+# Run `tools/scripts/build_cleaner.py --update` to regenerate it.
+
+set(JPEGXL_INTERNAL_BASE_SOURCES
+  jxl/base/arch_macros.h
+  jxl/base/bits.h
+  jxl/base/byte_order.h
+  jxl/base/common.h
+  jxl/base/compiler_specific.h
+  jxl/base/data_parallel.h
+  jxl/base/fast_math-inl.h
+  jxl/base/float.h
+  jxl/base/iaca.h
+  jxl/base/matrix_ops.h
+  jxl/base/os_macros.h
+  jxl/base/override.h
+  jxl/base/printf_macros.h
+  jxl/base/random.h
+  jxl/base/rational_polynomial-inl.h
+  jxl/base/sanitizer_definitions.h
+  jxl/base/scope_guard.h
+  jxl/base/span.h
+  jxl/base/status.h
+)
+
+set(JPEGXL_INTERNAL_CMS_SOURCES
+  jxl/cms/color_encoding_cms.h
+  jxl/cms/jxl_cms.cc
+  jxl/cms/jxl_cms_internal.h
+  jxl/cms/opsin_params.h
+  jxl/cms/tone_mapping-inl.h
+  jxl/cms/tone_mapping.h
+  jxl/cms/transfer_functions-inl.h
+  jxl/cms/transfer_functions.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_APNG_SOURCES
+  extras/dec/apng.cc
+  extras/dec/apng.h
+  extras/enc/apng.cc
+  extras/enc/apng.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_EXR_SOURCES
+  extras/dec/exr.cc
+  extras/dec/exr.h
+  extras/enc/exr.cc
+  extras/enc/exr.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_GIF_SOURCES
+  extras/dec/gif.cc
+  extras/dec/gif.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_JPEGLI_SOURCES
+  extras/dec/jpegli.cc
+  extras/dec/jpegli.h
+  extras/enc/jpegli.cc
+  extras/enc/jpegli.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_JPG_SOURCES
+  extras/dec/jpg.cc
+  extras/dec/jpg.h
+  extras/enc/jpg.cc
+  extras/enc/jpg.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_JXL_SOURCES
+  extras/dec/jxl.cc
+  extras/dec/jxl.h
+  extras/enc/jxl.cc
+  extras/enc/jxl.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_NPY_SOURCES
+  extras/enc/npy.cc
+  extras/enc/npy.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_PGX_SOURCES
+  extras/dec/pgx.cc
+  extras/dec/pgx.h
+  extras/enc/pgx.cc
+  extras/enc/pgx.h
+)
+
+set(JPEGXL_INTERNAL_CODEC_PNM_SOURCES
+  extras/dec/pnm.cc
+  extras/dec/pnm.h
+  extras/enc/pnm.cc
+  extras/enc/pnm.h
+)
+
+set(JPEGXL_INTERNAL_DEC_BOX_SOURCES
+  jxl/box_content_decoder.cc
+  jxl/box_content_decoder.h
+)
+
+set(JPEGXL_INTERNAL_DEC_JPEG_SOURCES
+  jxl/decode_to_jpeg.cc
+  jxl/decode_to_jpeg.h
+  jxl/jpeg/dec_jpeg_data.cc
+  jxl/jpeg/dec_jpeg_data.h
+  jxl/jpeg/dec_jpeg_data_writer.cc
+  jxl/jpeg/dec_jpeg_data_writer.h
+  jxl/jpeg/dec_jpeg_output_chunk.h
+  jxl/jpeg/dec_jpeg_serialization_state.h
+  jxl/jpeg/jpeg_data.cc
+  jxl/jpeg/jpeg_data.h
+)
+
+set(JPEGXL_INTERNAL_DEC_SOURCES
+  jxl/ac_context.h
+  jxl/ac_strategy.cc
+  jxl/ac_strategy.h
+  jxl/alpha.cc
+  jxl/alpha.h
+  jxl/ans_common.cc
+  jxl/ans_common.h
+  jxl/ans_params.h
+  jxl/blending.cc
+  jxl/blending.h
+  jxl/cache_aligned.cc
+  jxl/cache_aligned.h
+  jxl/chroma_from_luma.cc
+  jxl/chroma_from_luma.h
+  jxl/codec_in_out.h
+  jxl/coeff_order.cc
+  jxl/coeff_order.h
+  jxl/coeff_order_fwd.h
+  jxl/color_encoding_internal.cc
+  jxl/color_encoding_internal.h
+  jxl/common.h
+  jxl/compressed_dc.cc
+  jxl/compressed_dc.h
+  jxl/convolve-inl.h
+  jxl/convolve.h
+  jxl/convolve_separable5.cc
+  jxl/convolve_separable7.cc
+  jxl/convolve_slow.cc
+  jxl/convolve_symmetric3.cc
+  jxl/convolve_symmetric5.cc
+  jxl/dct-inl.h
+  jxl/dct_block-inl.h
+  jxl/dct_scales.cc
+  jxl/dct_scales.h
+  jxl/dct_util.h
+  jxl/dec_ans.cc
+  jxl/dec_ans.h
+  jxl/dec_bit_reader.h
+  jxl/dec_cache.cc
+  jxl/dec_cache.h
+  jxl/dec_context_map.cc
+  jxl/dec_context_map.h
+  jxl/dec_external_image.cc
+  jxl/dec_external_image.h
+  jxl/dec_frame.cc
+  jxl/dec_frame.h
+  jxl/dec_group.cc
+  jxl/dec_group.h
+  jxl/dec_group_border.cc
+  jxl/dec_group_border.h
+  jxl/dec_huffman.cc
+  jxl/dec_huffman.h
+  jxl/dec_modular.cc
+  jxl/dec_modular.h
+  jxl/dec_noise.cc
+  jxl/dec_noise.h
+  jxl/dec_patch_dictionary.cc
+  jxl/dec_patch_dictionary.h
+  jxl/dec_transforms-inl.h
+  jxl/dec_xyb-inl.h
+  jxl/dec_xyb.cc
+  jxl/dec_xyb.h
+  jxl/decode.cc
+  jxl/entropy_coder.cc
+  jxl/entropy_coder.h
+  jxl/epf.cc
+  jxl/epf.h
+  jxl/exif.h
+  jxl/fast_dct-inl.h
+  jxl/fast_dct.cc
+  jxl/fast_dct.h
+  jxl/fast_dct128-inl.h
+  jxl/fast_dct16-inl.h
+  jxl/fast_dct256-inl.h
+  jxl/fast_dct32-inl.h
+  jxl/fast_dct64-inl.h
+  jxl/fast_dct8-inl.h
+  jxl/field_encodings.h
+  jxl/fields.cc
+  jxl/fields.h
+  jxl/frame_dimensions.h
+  jxl/frame_header.cc
+  jxl/frame_header.h
+  jxl/gauss_blur.cc
+  jxl/gauss_blur.h
+  jxl/headers.cc
+  jxl/headers.h
+  jxl/huffman_table.cc
+  jxl/huffman_table.h
+  jxl/icc_codec.cc
+  jxl/icc_codec.h
+  jxl/icc_codec_common.cc
+  jxl/icc_codec_common.h
+  jxl/image.cc
+  jxl/image.h
+  jxl/image_bundle.cc
+  jxl/image_bundle.h
+  jxl/image_metadata.cc
+  jxl/image_metadata.h
+  jxl/image_ops.h
+  jxl/inverse_mtf-inl.h
+  jxl/lehmer_code.h
+  jxl/loop_filter.cc
+  jxl/loop_filter.h
+  jxl/luminance.cc
+  jxl/luminance.h
+  jxl/memory_manager_internal.cc
+  jxl/memory_manager_internal.h
+  jxl/modular/encoding/context_predict.h
+  jxl/modular/encoding/dec_ma.cc
+  jxl/modular/encoding/dec_ma.h
+  jxl/modular/encoding/encoding.cc
+  jxl/modular/encoding/encoding.h
+  jxl/modular/encoding/ma_common.h
+  jxl/modular/modular_image.cc
+  jxl/modular/modular_image.h
+  jxl/modular/options.h
+  jxl/modular/transform/palette.cc
+  jxl/modular/transform/palette.h
+  jxl/modular/transform/rct.cc
+  jxl/modular/transform/rct.h
+  jxl/modular/transform/squeeze.cc
+  jxl/modular/transform/squeeze.h
+  jxl/modular/transform/transform.cc
+  jxl/modular/transform/transform.h
+  jxl/noise.h
+  jxl/opsin_params.cc
+  jxl/opsin_params.h
+  jxl/pack_signed.h
+  jxl/padded_bytes.h
+  jxl/passes_state.cc
+  jxl/passes_state.h
+  jxl/patch_dictionary_internal.h
+  jxl/quant_weights.cc
+  jxl/quant_weights.h
+  jxl/quantizer-inl.h
+  jxl/quantizer.cc
+  jxl/quantizer.h
+  jxl/render_pipeline/low_memory_render_pipeline.cc
+  jxl/render_pipeline/low_memory_render_pipeline.h
+  jxl/render_pipeline/render_pipeline.cc
+  jxl/render_pipeline/render_pipeline.h
+  jxl/render_pipeline/render_pipeline_stage.h
+  jxl/render_pipeline/simple_render_pipeline.cc
+  jxl/render_pipeline/simple_render_pipeline.h
+  jxl/render_pipeline/stage_blending.cc
+  jxl/render_pipeline/stage_blending.h
+  jxl/render_pipeline/stage_chroma_upsampling.cc
+  jxl/render_pipeline/stage_chroma_upsampling.h
+  jxl/render_pipeline/stage_cms.cc
+  jxl/render_pipeline/stage_cms.h
+  jxl/render_pipeline/stage_epf.cc
+  jxl/render_pipeline/stage_epf.h
+  jxl/render_pipeline/stage_from_linear.cc
+  jxl/render_pipeline/stage_from_linear.h
+  jxl/render_pipeline/stage_gaborish.cc
+  jxl/render_pipeline/stage_gaborish.h
+  jxl/render_pipeline/stage_noise.cc
+  jxl/render_pipeline/stage_noise.h
+  jxl/render_pipeline/stage_patches.cc
+  jxl/render_pipeline/stage_patches.h
+  jxl/render_pipeline/stage_splines.cc
+  jxl/render_pipeline/stage_splines.h
+  jxl/render_pipeline/stage_spot.cc
+  jxl/render_pipeline/stage_spot.h
+  jxl/render_pipeline/stage_to_linear.cc
+  jxl/render_pipeline/stage_to_linear.h
+  jxl/render_pipeline/stage_tone_mapping.cc
+  jxl/render_pipeline/stage_tone_mapping.h
+  jxl/render_pipeline/stage_upsampling.cc
+  jxl/render_pipeline/stage_upsampling.h
+  jxl/render_pipeline/stage_write.cc
+  jxl/render_pipeline/stage_write.h
+  jxl/render_pipeline/stage_xyb.cc
+  jxl/render_pipeline/stage_xyb.h
+  jxl/render_pipeline/stage_ycbcr.cc
+  jxl/render_pipeline/stage_ycbcr.h
+  jxl/sanitizers.h
+  jxl/simd_util-inl.h
+  jxl/simd_util.cc
+  jxl/simd_util.h
+  jxl/splines.cc
+  jxl/splines.h
+  jxl/toc.cc
+  jxl/toc.h
+  jxl/transpose-inl.h
+  jxl/xorshift128plus-inl.h
+)
+
+set(JPEGXL_INTERNAL_ENC_SOURCES
+  jxl/butteraugli/butteraugli.cc
+  jxl/butteraugli/butteraugli.h
+  jxl/enc_ac_strategy.cc
+  jxl/enc_ac_strategy.h
+  jxl/enc_adaptive_quantization.cc
+  jxl/enc_adaptive_quantization.h
+  jxl/enc_ans.cc
+  jxl/enc_ans.h
+  jxl/enc_ans_params.h
+  jxl/enc_ar_control_field.cc
+  jxl/enc_ar_control_field.h
+  jxl/enc_aux_out.cc
+  jxl/enc_aux_out.h
+  jxl/enc_bit_writer.cc
+  jxl/enc_bit_writer.h
+  jxl/enc_butteraugli_comparator.cc
+  jxl/enc_butteraugli_comparator.h
+  jxl/enc_cache.cc
+  jxl/enc_cache.h
+  jxl/enc_chroma_from_luma.cc
+  jxl/enc_chroma_from_luma.h
+  jxl/enc_cluster.cc
+  jxl/enc_cluster.h
+  jxl/enc_coeff_order.cc
+  jxl/enc_coeff_order.h
+  jxl/enc_comparator.cc
+  jxl/enc_comparator.h
+  jxl/enc_context_map.cc
+  jxl/enc_context_map.h
+  jxl/enc_debug_image.cc
+  jxl/enc_debug_image.h
+  jxl/enc_detect_dots.cc
+  jxl/enc_detect_dots.h
+  jxl/enc_dot_dictionary.cc
+  jxl/enc_dot_dictionary.h
+  jxl/enc_entropy_coder.cc
+  jxl/enc_entropy_coder.h
+  jxl/enc_external_image.cc
+  jxl/enc_external_image.h
+  jxl/enc_fast_lossless.cc
+  jxl/enc_fast_lossless.h
+  jxl/enc_fields.cc
+  jxl/enc_fields.h
+  jxl/enc_frame.cc
+  jxl/enc_frame.h
+  jxl/enc_gaborish.cc
+  jxl/enc_gaborish.h
+  jxl/enc_gamma_correct.h
+  jxl/enc_group.cc
+  jxl/enc_group.h
+  jxl/enc_heuristics.cc
+  jxl/enc_heuristics.h
+  jxl/enc_huffman.cc
+  jxl/enc_huffman.h
+  jxl/enc_huffman_tree.cc
+  jxl/enc_huffman_tree.h
+  jxl/enc_icc_codec.cc
+  jxl/enc_icc_codec.h
+  jxl/enc_image_bundle.cc
+  jxl/enc_image_bundle.h
+  jxl/enc_linalg.cc
+  jxl/enc_linalg.h
+  jxl/enc_modular.cc
+  jxl/enc_modular.h
+  jxl/enc_noise.cc
+  jxl/enc_noise.h
+  jxl/enc_optimize.cc
+  jxl/enc_optimize.h
+  jxl/enc_params.h
+  jxl/enc_patch_dictionary.cc
+  jxl/enc_patch_dictionary.h
+  jxl/enc_photon_noise.cc
+  jxl/enc_photon_noise.h
+  jxl/enc_progressive_split.cc
+  jxl/enc_progressive_split.h
+  jxl/enc_quant_weights.cc
+  jxl/enc_quant_weights.h
+  jxl/enc_splines.cc
+  jxl/enc_splines.h
+  jxl/enc_toc.cc
+  jxl/enc_toc.h
+  jxl/enc_transforms-inl.h
+  jxl/enc_transforms.cc
+  jxl/enc_transforms.h
+  jxl/enc_xyb.cc
+  jxl/enc_xyb.h
+  jxl/encode.cc
+  jxl/encode_internal.h
+  jxl/jpeg/enc_jpeg_data.cc
+  jxl/jpeg/enc_jpeg_data.h
+  jxl/jpeg/enc_jpeg_data_reader.cc
+  jxl/jpeg/enc_jpeg_data_reader.h
+  jxl/jpeg/enc_jpeg_huffman_decode.cc
+  jxl/jpeg/enc_jpeg_huffman_decode.h
+  jxl/modular/encoding/enc_debug_tree.cc
+  jxl/modular/encoding/enc_debug_tree.h
+  jxl/modular/encoding/enc_encoding.cc
+  jxl/modular/encoding/enc_encoding.h
+  jxl/modular/encoding/enc_ma.cc
+  jxl/modular/encoding/enc_ma.h
+  jxl/modular/transform/enc_palette.cc
+  jxl/modular/transform/enc_palette.h
+  jxl/modular/transform/enc_rct.cc
+  jxl/modular/transform/enc_rct.h
+  jxl/modular/transform/enc_squeeze.cc
+  jxl/modular/transform/enc_squeeze.h
+  jxl/modular/transform/enc_transform.cc
+  jxl/modular/transform/enc_transform.h
+)
+
+set(JPEGXL_INTERNAL_EXTRAS_FOR_TOOLS_SOURCES
+  extras/codec.cc
+  extras/codec.h
+  extras/hlg.cc
+  extras/hlg.h
+  extras/metrics.cc
+  extras/metrics.h
+  extras/packed_image_convert.cc
+  extras/packed_image_convert.h
+  extras/tone_mapping.cc
+  extras/tone_mapping.h
+)
+
+set(JPEGXL_INTERNAL_EXTRAS_SOURCES
+  extras/alpha_blend.cc
+  extras/alpha_blend.h
+  extras/common.cc
+  extras/common.h
+  extras/dec/color_description.cc
+  extras/dec/color_description.h
+  extras/dec/color_hints.cc
+  extras/dec/color_hints.h
+  extras/dec/decode.cc
+  extras/dec/decode.h
+  extras/enc/encode.cc
+  extras/enc/encode.h
+  extras/exif.cc
+  extras/exif.h
+  extras/packed_image.h
+  extras/size_constraints.h
+  extras/time.cc
+  extras/time.h
+)
+
+set(JPEGXL_INTERNAL_GBENCH_SOURCES
+  extras/tone_mapping_gbench.cc
+  jxl/dec_external_image_gbench.cc
+  jxl/enc_external_image_gbench.cc
+  jxl/gauss_blur_gbench.cc
+  jxl/splines_gbench.cc
+  jxl/tf_gbench.cc
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_LIBJPEG_HELPER_FILES
+  jpegli/libjpeg_test_util.cc
+  jpegli/libjpeg_test_util.h
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_SOURCES
+  jpegli/adaptive_quantization.cc
+  jpegli/adaptive_quantization.h
+  jpegli/bit_writer.cc
+  jpegli/bit_writer.h
+  jpegli/bitstream.cc
+  jpegli/bitstream.h
+  jpegli/color_quantize.cc
+  jpegli/color_quantize.h
+  jpegli/color_transform.cc
+  jpegli/color_transform.h
+  jpegli/common.cc
+  jpegli/common.h
+  jpegli/common_internal.h
+  jpegli/dct-inl.h
+  jpegli/decode.cc
+  jpegli/decode.h
+  jpegli/decode_internal.h
+  jpegli/decode_marker.cc
+  jpegli/decode_marker.h
+  jpegli/decode_scan.cc
+  jpegli/decode_scan.h
+  jpegli/destination_manager.cc
+  jpegli/downsample.cc
+  jpegli/downsample.h
+  jpegli/encode.cc
+  jpegli/encode.h
+  jpegli/encode_finish.cc
+  jpegli/encode_finish.h
+  jpegli/encode_internal.h
+  jpegli/encode_streaming.cc
+  jpegli/encode_streaming.h
+  jpegli/entropy_coding-inl.h
+  jpegli/entropy_coding.cc
+  jpegli/entropy_coding.h
+  jpegli/error.cc
+  jpegli/error.h
+  jpegli/huffman.cc
+  jpegli/huffman.h
+  jpegli/idct.cc
+  jpegli/idct.h
+  jpegli/input.cc
+  jpegli/input.h
+  jpegli/memory_manager.cc
+  jpegli/memory_manager.h
+  jpegli/quant.cc
+  jpegli/quant.h
+  jpegli/render.cc
+  jpegli/render.h
+  jpegli/simd.cc
+  jpegli/simd.h
+  jpegli/source_manager.cc
+  jpegli/transpose-inl.h
+  jpegli/types.h
+  jpegli/upsample.cc
+  jpegli/upsample.h
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_TESTLIB_FILES
+  jpegli/test_params.h
+  jpegli/test_utils-inl.h
+  jpegli/test_utils.cc
+  jpegli/test_utils.h
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_TESTS
+  jpegli/decode_api_test.cc
+  jpegli/encode_api_test.cc
+  jpegli/error_handling_test.cc
+  jpegli/input_suspension_test.cc
+  jpegli/output_suspension_test.cc
+  jpegli/source_manager_test.cc
+  jpegli/streaming_test.cc
+  jpegli/transcode_api_test.cc
+)
+
+set(JPEGXL_INTERNAL_JPEGLI_WRAPPER_SOURCES
+  jpegli/libjpeg_wrapper.cc
+)
+
+set(JPEGXL_INTERNAL_PUBLIC_HEADERS
+  include/jxl/cms.h
+  include/jxl/cms_interface.h
+  include/jxl/codestream_header.h
+  include/jxl/color_encoding.h
+  include/jxl/decode.h
+  include/jxl/decode_cxx.h
+  include/jxl/encode.h
+  include/jxl/encode_cxx.h
+  include/jxl/memory_manager.h
+  include/jxl/parallel_runner.h
+  include/jxl/stats.h
+  include/jxl/types.h
+)
+
+set(JPEGXL_INTERNAL_TESTLIB_FILES
+  jxl/dct_for_test.h
+  jxl/dec_transforms_testonly.cc
+  jxl/dec_transforms_testonly.h
+  jxl/fake_parallel_runner_testonly.h
+  jxl/image_test_utils.h
+  jxl/render_pipeline/test_render_pipeline_stages.h
+  jxl/test_image.cc
+  jxl/test_image.h
+  jxl/test_utils.cc
+  jxl/test_utils.h
+)
+
+set(JPEGXL_INTERNAL_TESTS
+  extras/codec_test.cc
+  extras/dec/color_description_test.cc
+  extras/dec/pgx_test.cc
+  extras/jpegli_test.cc
+  jxl/ac_strategy_test.cc
+  jxl/alpha_test.cc
+  jxl/ans_common_test.cc
+  jxl/ans_test.cc
+  jxl/bit_reader_test.cc
+  jxl/bits_test.cc
+  jxl/blending_test.cc
+  jxl/butteraugli/butteraugli_test.cc
+  jxl/byte_order_test.cc
+  jxl/cms/tone_mapping_test.cc
+  jxl/cms/transfer_functions_test.cc
+  jxl/coeff_order_test.cc
+  jxl/color_encoding_internal_test.cc
+  jxl/color_management_test.cc
+  jxl/convolve_test.cc
+  jxl/data_parallel_test.cc
+  jxl/dct_test.cc
+  jxl/decode_test.cc
+  jxl/enc_external_image_test.cc
+  jxl/enc_gaborish_test.cc
+  jxl/enc_linalg_test.cc
+  jxl/enc_optimize_test.cc
+  jxl/enc_photon_noise_test.cc
+  jxl/encode_test.cc
+  jxl/entropy_coder_test.cc
+  jxl/fast_dct_test.cc
+  jxl/fast_math_test.cc
+  jxl/fields_test.cc
+  jxl/gamma_correct_test.cc
+  jxl/gauss_blur_test.cc
+  jxl/gradient_test.cc
+  jxl/iaca_test.cc
+  jxl/icc_codec_test.cc
+  jxl/image_bundle_test.cc
+  jxl/image_ops_test.cc
+  jxl/jxl_test.cc
+  jxl/lehmer_code_test.cc
+  jxl/modular_test.cc
+  jxl/opsin_image_test.cc
+  jxl/opsin_inverse_test.cc
+  jxl/padded_bytes_test.cc
+  jxl/passes_test.cc
+  jxl/patch_dictionary_test.cc
+  jxl/preview_test.cc
+  jxl/quant_weights_test.cc
+  jxl/quantizer_test.cc
+  jxl/rational_polynomial_test.cc
+  jxl/render_pipeline/render_pipeline_test.cc
+  jxl/roundtrip_test.cc
+  jxl/simd_util_test.cc
+  jxl/speed_tier_test.cc
+  jxl/splines_test.cc
+  jxl/toc_test.cc
+  jxl/xorshift128plus_test.cc
+  threads/thread_parallel_runner_test.cc
+)
+
+set(JPEGXL_INTERNAL_THREADS_PUBLIC_HEADERS
+  include/jxl/resizable_parallel_runner.h
+  include/jxl/resizable_parallel_runner_cxx.h
+  include/jxl/thread_parallel_runner.h
+  include/jxl/thread_parallel_runner_cxx.h
+)
+
+set(JPEGXL_INTERNAL_THREADS_SOURCES
+  threads/resizable_parallel_runner.cc
+  threads/thread_parallel_runner.cc
+  threads/thread_parallel_runner_internal.cc
+  threads/thread_parallel_runner_internal.h
+)
diff --git a/lib/jxl_profiler.cmake b/lib/jxl_profiler.cmake
deleted file mode 100644 (file)
index 8faa626..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-set(JPEGXL_PROFILER_SOURCES
-  profiler/profiler.cc
-  profiler/profiler.h
-  profiler/tsc_timer.h
-)
-
-### Static library.
-add_library(jxl_profiler STATIC ${JPEGXL_PROFILER_SOURCES})
-target_link_libraries(jxl_profiler PUBLIC hwy)
-
-target_compile_options(jxl_profiler PRIVATE ${JPEGXL_INTERNAL_FLAGS})
-target_compile_options(jxl_profiler PUBLIC ${JPEGXL_COVERAGE_FLAGS})
-set_property(TARGET jxl_profiler PROPERTY POSITION_INDEPENDENT_CODE ON)
-
-target_include_directories(jxl_profiler
-  PRIVATE "${PROJECT_SOURCE_DIR}")
-
-set_target_properties(jxl_profiler PROPERTIES
-  CXX_VISIBILITY_PRESET hidden
-  VISIBILITY_INLINES_HIDDEN 1
-)
-
-# Make every library linking against the jxl_profiler define this macro to
-# enable the profiler.
-target_compile_definitions(jxl_profiler
-  PUBLIC -DPROFILER_ENABLED=1)
index c858ae9..decd77f 100644 (file)
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-set(TEST_FILES
-  extras/codec_test.cc
-  extras/dec/color_description_test.cc
-  extras/dec/pgx_test.cc
-  jxl/ac_strategy_test.cc
-  jxl/alpha_test.cc
-  jxl/ans_common_test.cc
-  jxl/ans_test.cc
-  jxl/bit_reader_test.cc
-  jxl/bits_test.cc
-  jxl/blending_test.cc
-  jxl/butteraugli_test.cc
-  jxl/byte_order_test.cc
-  jxl/coeff_order_test.cc
-  jxl/color_encoding_internal_test.cc
-  jxl/color_management_test.cc
-  jxl/convolve_test.cc
-  jxl/data_parallel_test.cc
-  jxl/dct_test.cc
-  jxl/decode_test.cc
-  jxl/enc_external_image_test.cc
-  jxl/enc_photon_noise_test.cc
-  jxl/encode_test.cc
-  jxl/entropy_coder_test.cc
-  jxl/fast_dct_test.cc
-  jxl/fast_math_test.cc
-  jxl/fields_test.cc
-  jxl/gaborish_test.cc
-  jxl/gamma_correct_test.cc
-  jxl/gauss_blur_test.cc
-  jxl/gradient_test.cc
-  jxl/iaca_test.cc
-  jxl/icc_codec_test.cc
-  jxl/image_bundle_test.cc
-  jxl/image_ops_test.cc
-  jxl/jxl_test.cc
-  jxl/lehmer_code_test.cc
-  jxl/linalg_test.cc
-  jxl/modular_test.cc
-  jxl/opsin_image_test.cc
-  jxl/opsin_inverse_test.cc
-  jxl/optimize_test.cc
-  jxl/padded_bytes_test.cc
-  jxl/passes_test.cc
-  jxl/patch_dictionary_test.cc
-  jxl/preview_test.cc
-  jxl/quant_weights_test.cc
-  jxl/quantizer_test.cc
-  jxl/rational_polynomial_test.cc
-  jxl/render_pipeline/render_pipeline_test.cc
-  jxl/roundtrip_test.cc
-  jxl/simd_util_test.cc
-  jxl/speed_tier_test.cc
-  jxl/splines_test.cc
-  jxl/toc_test.cc
-  jxl/xorshift128plus_test.cc
-  threads/thread_parallel_runner_test.cc
-  ### Files before this line are handled by build_cleaner.py
-  # TODO(deymo): Move this to tools/
-  ../tools/box/box_test.cc
-  ../tools/djxl_fuzzer_test.cc
-)
+include(jxl_lists.cmake)
 
-# Test-only library code.
-set(TESTLIB_FILES
-  jxl/codec_y4m_testonly.cc
-  jxl/codec_y4m_testonly.h
-  jxl/dct_for_test.h
-  jxl/dec_transforms_testonly.cc
-  jxl/dec_transforms_testonly.h
-  jxl/fake_parallel_runner_testonly.h
-  jxl/image_test_utils.h
-  jxl/test_image.h
-  jxl/test_utils.h
-  jxl/testdata.h
+if(BUILD_TESTING OR JPEGXL_ENABLE_TOOLS)
+# Library with test-only code shared between all tests / fuzzers.
+add_library(jxl_testlib-internal STATIC ${JPEGXL_INTERNAL_TESTLIB_FILES})
+target_compile_options(jxl_testlib-internal PRIVATE
+  ${JPEGXL_INTERNAL_FLAGS}
+  ${JPEGXL_COVERAGE_FLAGS}
 )
-
-find_package(GTest)
-
-# Library with test-only code shared between all tests.
-add_library(jxl_testlib-static STATIC ${TESTLIB_FILES})
-  target_compile_options(jxl_testlib-static PRIVATE
-    ${JPEGXL_INTERNAL_FLAGS}
-    ${JPEGXL_COVERAGE_FLAGS}
-  )
-target_compile_definitions(jxl_testlib-static PUBLIC
+target_compile_definitions(jxl_testlib-internal PUBLIC
   -DTEST_DATA_PATH="${JPEGXL_TEST_DATA_PATH}")
-target_include_directories(jxl_testlib-static PUBLIC
+target_include_directories(jxl_testlib-internal PUBLIC
   "${PROJECT_SOURCE_DIR}"
 )
-target_link_libraries(jxl_testlib-static hwy jxl-static)
+target_link_libraries(jxl_testlib-internal
+  hwy
+  jxl_extras_nocodec-internal
+  jxl-internal
+  jxl_threads
+)
+endif()
+
+if(NOT BUILD_TESTING)
+  return()
+endif()
+
+list(APPEND JPEGXL_INTERNAL_TESTS
+  # TODO(deymo): Move this to tools/
+  ../tools/djxl_fuzzer_test.cc
+)
+
+find_package(GTest)
 
 # Individual test binaries:
 file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
-foreach (TESTFILE IN LISTS TEST_FILES)
+foreach (TESTFILE IN LISTS JPEGXL_INTERNAL_TESTS)
   # The TESTNAME is the name without the extension or directory.
   get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
   if(TESTFILE STREQUAL ../tools/djxl_fuzzer_test.cc)
@@ -103,19 +44,21 @@ foreach (TESTFILE IN LISTS TEST_FILES)
   else()
     add_executable(${TESTNAME} ${TESTFILE})
   endif()
-  if(JPEGXL_EMSCRIPTEN)
+  if(EMSCRIPTEN)
     # The emscripten linking step takes too much memory and crashes during the
     # wasm-opt step when using -O2 optimization level
     set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "\
       -O1 \
       -s USE_LIBPNG=1 \
-      -s TOTAL_MEMORY=1536MB \
+      -s ALLOW_MEMORY_GROWTH=1 \
       -s SINGLE_FILE=1 \
       -s PROXY_TO_PTHREAD \
       -s EXIT_RUNTIME=1 \
       -s USE_PTHREADS=1 \
       -s NODERAWFS=1 \
     ")
+  else()
+    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "${JPEGXL_COVERAGE_LINK_FLAGS}")
   endif()
   target_compile_options(${TESTNAME} PRIVATE
     ${JPEGXL_INTERNAL_FLAGS}
@@ -124,21 +67,17 @@ foreach (TESTFILE IN LISTS TEST_FILES)
     ${JPEGXL_COVERAGE_FLAGS}
   )
   target_link_libraries(${TESTNAME}
-    box
-    jxl_extras-static
-    jxl_testlib-static
     gmock
     GTest::GTest
     GTest::Main
+    jxl_extras-internal
+    jxl_testlib-internal
   )
   # Output test targets in the test directory.
   set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/")
   if (WIN32 AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
     set_target_properties(${TESTNAME} PROPERTIES COMPILE_FLAGS "-Wno-error")
   endif ()
-  if(CMAKE_VERSION VERSION_LESS "3.10.3")
-    gtest_discover_tests(${TESTNAME} TIMEOUT 240)
-  else ()
-    gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 240)
-  endif ()
+  # 240 seconds because some build types (e.g. coverage) can be quite slow.
+  gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 240)
 endforeach ()
index 006e71e..5179e2c 100644 (file)
@@ -5,85 +5,50 @@
 
 find_package(Threads REQUIRED)
 
-set(JPEGXL_THREADS_SOURCES
-  threads/resizable_parallel_runner.cc
-  threads/thread_parallel_runner.cc
-  threads/thread_parallel_runner_internal.cc
-  threads/thread_parallel_runner_internal.h
-)
-
-### Define the jxl_threads shared or static target library. The ${target}
-# parameter should already be created with add_library(), but this function
-# sets all the remaining common properties.
-function(_set_jxl_threads _target)
+include(jxl_lists.cmake)
 
-target_compile_options(${_target} PRIVATE ${JPEGXL_INTERNAL_FLAGS})
-target_compile_options(${_target} PUBLIC ${JPEGXL_COVERAGE_FLAGS})
-set_property(TARGET ${_target} PROPERTY POSITION_INDEPENDENT_CODE ON)
+add_library(jxl_threads ${JPEGXL_INTERNAL_THREADS_SOURCES})
+target_compile_options(jxl_threads PRIVATE ${JPEGXL_INTERNAL_FLAGS})
+target_compile_options(jxl_threads PUBLIC ${JPEGXL_COVERAGE_FLAGS})
+set_property(TARGET jxl_threads PROPERTY POSITION_INDEPENDENT_CODE ON)
 
-target_include_directories(${_target}
+target_include_directories(jxl_threads
   PRIVATE
     "${PROJECT_SOURCE_DIR}"
   PUBLIC
     "${CMAKE_CURRENT_SOURCE_DIR}/include"
     "${CMAKE_CURRENT_BINARY_DIR}/include")
 
-target_link_libraries(${_target}
+target_link_libraries(jxl_threads
   PUBLIC ${JPEGXL_COVERAGE_FLAGS} Threads::Threads
 )
 
-set_target_properties(${_target} PROPERTIES
+set_target_properties(jxl_threads PROPERTIES
   CXX_VISIBILITY_PRESET hidden
   VISIBILITY_INLINES_HIDDEN 1
   DEFINE_SYMBOL JXL_THREADS_INTERNAL_LIBRARY_BUILD
 )
 
-# Always install the library as jxl_threads.{a,so} file without the "-static"
-# suffix, except in Windows.
-if (NOT WIN32 OR MINGW)
-  set_target_properties(${_target} PROPERTIES OUTPUT_NAME "jxl_threads")
-endif()
-install(TARGETS ${_target}
+install(TARGETS jxl_threads
   RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
 
-endfunction()
-
-
-### Static library.
-add_library(jxl_threads-static STATIC ${JPEGXL_THREADS_SOURCES})
-_set_jxl_threads(jxl_threads-static)
-
-# Make jxl_threads symbols neither imported nor exported when using the static
-# library. These will have hidden visibility anyway in the static library case
-# in unix.
-target_compile_definitions(jxl_threads-static
-  PUBLIC -DJXL_THREADS_STATIC_DEFINE)
-
-
-### Public shared library.
-if (BUILD_SHARED_LIBS)
-add_library(jxl_threads SHARED ${JPEGXL_THREADS_SOURCES})
-_set_jxl_threads(jxl_threads)
-
 set_target_properties(jxl_threads PROPERTIES
   VERSION ${JPEGXL_LIBRARY_VERSION}
-  SOVERSION ${JPEGXL_LIBRARY_SOVERSION}
-  LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
-  RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
+  SOVERSION ${JPEGXL_LIBRARY_SOVERSION})
 
-  set_target_properties(jxl_threads PROPERTIES
-      LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl.version)
-  if(APPLE)
+set_target_properties(jxl_threads PROPERTIES
+    LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl.version)
+if(APPLE)
   set_property(TARGET ${target} APPEND_STRING PROPERTY
       LINK_FLAGS "-Wl,-exported_symbols_list,${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl_osx.syms")
-  elseif(WIN32)
-    # Nothing needed here, we use __declspec(dllexport) (jxl_threads_export.h)
-  else()
+elseif(WIN32)
+# Nothing needed here, we use __declspec(dllexport) (jxl_threads_export.h)
+else()
   set_property(TARGET jxl_threads APPEND_STRING PROPERTY
       LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/jxl/jxl.version")
-  endif()  # APPLE
+endif()  # APPLE
 
 # Compile the shared library such that the JXL_THREADS_EXPORT symbols are
 # exported. Users of the library will not set this flag and therefore import
@@ -91,19 +56,9 @@ set_target_properties(jxl_threads PROPERTIES
 target_compile_definitions(jxl_threads
   PRIVATE -DJXL_THREADS_INTERNAL_LIBRARY_BUILD)
 
-# Generate the jxl/jxl_threads_export.h header, we only need to generate it once
-# but we can use it from both libraries.
 generate_export_header(jxl_threads
   BASE_NAME JXL_THREADS
   EXPORT_FILE_NAME include/jxl/jxl_threads_export.h)
-else()
-add_library(jxl_threads ALIAS jxl_threads-static)
-# When not building the shared library generate the jxl_threads_export.h header
-# only based on the static target.
-generate_export_header(jxl_threads-static
-  BASE_NAME JXL_THREADS
-  EXPORT_FILE_NAME include/jxl/jxl_threads_export.h)
-endif()  # BUILD_SHARED_LIBS
 
 
 ### Add a pkg-config file for libjxl_threads.
@@ -121,6 +76,12 @@ else()
     set(PKGCONFIG_TARGET_LIBS "\${exec_prefix}/${CMAKE_INSTALL_LIBDIR}")
 endif()
 
+if (BUILD_SHARED_LIBS)
+  set(JPEGXL_REQUIRES_TYPE "Requires.private")
+else()
+  set(JPEGXL_REQUIRES_TYPE "Requires")
+endif()
+
 set(JPEGXL_THREADS_LIBRARY_REQUIRES "")
 configure_file("${CMAKE_CURRENT_SOURCE_DIR}/threads/libjxl_threads.pc.in"
                "libjxl_threads.pc" @ONLY)
diff --git a/lib/jxl_vars.bzl b/lib/jxl_vars.bzl
new file mode 100644 (file)
index 0000000..f8971d0
--- /dev/null
@@ -0,0 +1,45 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Extra build variables.
+
+libjxl_root_package = "libjxl"
+
+libjxl_deps_brotli = ["@brotli//:brotlidec", "@brotli//:brotlienc"]
+libjxl_deps_gif = ["@gif//:gif"]
+libjxl_deps_gtest = ["@googletest//:gtest_main"]
+libjxl_deps_hwy = ["@highway//:hwy"]
+libjxl_deps_hwy_nanobenchmark = ["@highway//:nanobenchmark"]
+libjxl_deps_hwy_test_util = ["@highway//:hwy_test_util"]
+libjxl_deps_jpeg = ["@libjpeg_turbo//:jpeg"]
+libjxl_deps_exr = ["@openexr//:OpenEXR"]
+libjxl_deps_png = ["@png//:png"]
+libjxl_deps_runfiles = ["@bazel_tools//tools/cpp/runfiles"]
+libjxl_deps_skcms = ["@skcms//:skcms"]
+libjxl_deps_testdata = ["//:testdata"]
+
+libjxl_test_shards = {
+    "jpegli/decode_api_test": 10,
+    "jpegli/encode_api_test": 4,
+    "jpegli/input_suspension_test": 6,
+    "jpegli/output_suspension_test": 2,
+    "jxl/ans_test": 2,
+    "jxl/linalg_test": 2,
+    "jxl/modular_test": 4,
+    "jxl/roundtrip_test": 4,
+    "jxl/xorshift128plus_test": 2,
+    "jxl/ac_strategy_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/dct_test": 32,
+    "jxl/decode_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/fast_dct_test": 8,  # TODO(eustas): separate ultra-heavy shard
+    "jxl/fast_math_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/jxl_test": 10,  # TODO(eustas): separate heavy shard
+    "jxl/render_pipeline/render_pipeline_test": 10,
+}
+
+libjxl_test_timeouts = {
+    "jxl/fast_dct_test": "long",
+    "jxl/dct_test": "long",
+}
deleted file mode 100644 (file)
index 2914de99180e33f879280001c725cb3674892a82..0000000000000000000000000000000000000000
+++ /dev/null
@@ -1,501 +0,0 @@
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-# Source files definitions for GN-based build systems.
-
-# Library version macros
-libjxl_version_defines = [
-    "JPEGXL_MAJOR_VERSION=0",
-    "JPEGXL_MINOR_VERSION=7",
-    "JPEGXL_PATCH_VERSION=0",
-]
-
-libjxl_public_headers = [
-    "include/jxl/butteraugli.h",
-    "include/jxl/butteraugli_cxx.h",
-    "include/jxl/cms_interface.h",
-    "include/jxl/codestream_header.h",
-    "include/jxl/color_encoding.h",
-    "include/jxl/decode.h",
-    "include/jxl/decode_cxx.h",
-    "include/jxl/encode.h",
-    "include/jxl/encode_cxx.h",
-    "include/jxl/memory_manager.h",
-    "include/jxl/parallel_runner.h",
-    "include/jxl/types.h",
-]
-
-libjxl_dec_sources = [
-    "jxl/ac_context.h",
-    "jxl/ac_strategy.cc",
-    "jxl/ac_strategy.h",
-    "jxl/alpha.cc",
-    "jxl/alpha.h",
-    "jxl/ans_common.cc",
-    "jxl/ans_common.h",
-    "jxl/ans_params.h",
-    "jxl/aux_out.cc",
-    "jxl/aux_out.h",
-    "jxl/aux_out_fwd.h",
-    "jxl/base/arch_macros.h",
-    "jxl/base/bits.h",
-    "jxl/base/byte_order.h",
-    "jxl/base/cache_aligned.cc",
-    "jxl/base/cache_aligned.h",
-    "jxl/base/compiler_specific.h",
-    "jxl/base/data_parallel.cc",
-    "jxl/base/data_parallel.h",
-    "jxl/base/file_io.h",
-    "jxl/base/iaca.h",
-    "jxl/base/os_macros.h",
-    "jxl/base/override.h",
-    "jxl/base/padded_bytes.cc",
-    "jxl/base/padded_bytes.h",
-    "jxl/base/printf_macros.h",
-    "jxl/base/profiler.h",
-    "jxl/base/random.cc",
-    "jxl/base/random.h",
-    "jxl/base/sanitizer_definitions.h",
-    "jxl/base/scope_guard.h",
-    "jxl/base/span.h",
-    "jxl/base/status.h",
-    "jxl/base/thread_pool_internal.h",
-    "jxl/blending.cc",
-    "jxl/blending.h",
-    "jxl/box_content_decoder.cc",
-    "jxl/box_content_decoder.h",
-    "jxl/chroma_from_luma.cc",
-    "jxl/chroma_from_luma.h",
-    "jxl/codec_in_out.h",
-    "jxl/coeff_order.cc",
-    "jxl/coeff_order.h",
-    "jxl/coeff_order_fwd.h",
-    "jxl/color_encoding_internal.cc",
-    "jxl/color_encoding_internal.h",
-    "jxl/color_management.cc",
-    "jxl/color_management.h",
-    "jxl/common.h",
-    "jxl/compressed_dc.cc",
-    "jxl/compressed_dc.h",
-    "jxl/convolve-inl.h",
-    "jxl/convolve.h",
-    "jxl/convolve_separable5.cc",
-    "jxl/convolve_separable7.cc",
-    "jxl/convolve_slow.cc",
-    "jxl/convolve_symmetric3.cc",
-    "jxl/convolve_symmetric5.cc",
-    "jxl/dct-inl.h",
-    "jxl/dct_block-inl.h",
-    "jxl/dct_scales.cc",
-    "jxl/dct_scales.h",
-    "jxl/dct_util.h",
-    "jxl/dec_ans.cc",
-    "jxl/dec_ans.h",
-    "jxl/dec_bit_reader.h",
-    "jxl/dec_cache.cc",
-    "jxl/dec_cache.h",
-    "jxl/dec_context_map.cc",
-    "jxl/dec_context_map.h",
-    "jxl/dec_external_image.cc",
-    "jxl/dec_external_image.h",
-    "jxl/dec_frame.cc",
-    "jxl/dec_frame.h",
-    "jxl/dec_group.cc",
-    "jxl/dec_group.h",
-    "jxl/dec_group_border.cc",
-    "jxl/dec_group_border.h",
-    "jxl/dec_huffman.cc",
-    "jxl/dec_huffman.h",
-    "jxl/dec_modular.cc",
-    "jxl/dec_modular.h",
-    "jxl/dec_noise.cc",
-    "jxl/dec_noise.h",
-    "jxl/dec_patch_dictionary.cc",
-    "jxl/dec_patch_dictionary.h",
-    "jxl/dec_tone_mapping-inl.h",
-    "jxl/dec_transforms-inl.h",
-    "jxl/dec_xyb-inl.h",
-    "jxl/dec_xyb.cc",
-    "jxl/dec_xyb.h",
-    "jxl/decode.cc",
-    "jxl/decode_to_jpeg.cc",
-    "jxl/decode_to_jpeg.h",
-    "jxl/enc_bit_writer.cc",
-    "jxl/enc_bit_writer.h",
-    "jxl/entropy_coder.cc",
-    "jxl/entropy_coder.h",
-    "jxl/epf.cc",
-    "jxl/epf.h",
-    "jxl/exif.h",
-    "jxl/fast_dct-inl.h",
-    "jxl/fast_dct.cc",
-    "jxl/fast_dct.h",
-    "jxl/fast_dct128-inl.h",
-    "jxl/fast_dct16-inl.h",
-    "jxl/fast_dct256-inl.h",
-    "jxl/fast_dct32-inl.h",
-    "jxl/fast_dct64-inl.h",
-    "jxl/fast_dct8-inl.h",
-    "jxl/fast_math-inl.h",
-    "jxl/field_encodings.h",
-    "jxl/fields.cc",
-    "jxl/fields.h",
-    "jxl/frame_header.cc",
-    "jxl/frame_header.h",
-    "jxl/gauss_blur.cc",
-    "jxl/gauss_blur.h",
-    "jxl/headers.cc",
-    "jxl/headers.h",
-    "jxl/huffman_table.cc",
-    "jxl/huffman_table.h",
-    "jxl/icc_codec.cc",
-    "jxl/icc_codec.h",
-    "jxl/icc_codec_common.cc",
-    "jxl/icc_codec_common.h",
-    "jxl/image.cc",
-    "jxl/image.h",
-    "jxl/image_bundle.cc",
-    "jxl/image_bundle.h",
-    "jxl/image_metadata.cc",
-    "jxl/image_metadata.h",
-    "jxl/image_ops.h",
-    "jxl/jpeg/dec_jpeg_data.cc",
-    "jxl/jpeg/dec_jpeg_data.h",
-    "jxl/jpeg/dec_jpeg_data_writer.cc",
-    "jxl/jpeg/dec_jpeg_data_writer.h",
-    "jxl/jpeg/dec_jpeg_output_chunk.h",
-    "jxl/jpeg/dec_jpeg_serialization_state.h",
-    "jxl/jpeg/jpeg_data.cc",
-    "jxl/jpeg/jpeg_data.h",
-    "jxl/jxl_inspection.h",
-    "jxl/lehmer_code.h",
-    "jxl/linalg.h",
-    "jxl/loop_filter.cc",
-    "jxl/loop_filter.h",
-    "jxl/luminance.cc",
-    "jxl/luminance.h",
-    "jxl/memory_manager_internal.cc",
-    "jxl/memory_manager_internal.h",
-    "jxl/modular/encoding/context_predict.h",
-    "jxl/modular/encoding/dec_ma.cc",
-    "jxl/modular/encoding/dec_ma.h",
-    "jxl/modular/encoding/encoding.cc",
-    "jxl/modular/encoding/encoding.h",
-    "jxl/modular/encoding/ma_common.h",
-    "jxl/modular/modular_image.cc",
-    "jxl/modular/modular_image.h",
-    "jxl/modular/options.h",
-    "jxl/modular/transform/palette.h",
-    "jxl/modular/transform/rct.cc",
-    "jxl/modular/transform/rct.h",
-    "jxl/modular/transform/squeeze.cc",
-    "jxl/modular/transform/squeeze.h",
-    "jxl/modular/transform/transform.cc",
-    "jxl/modular/transform/transform.h",
-    "jxl/noise.h",
-    "jxl/opsin_params.cc",
-    "jxl/opsin_params.h",
-    "jxl/passes_state.cc",
-    "jxl/passes_state.h",
-    "jxl/patch_dictionary_internal.h",
-    "jxl/quant_weights.cc",
-    "jxl/quant_weights.h",
-    "jxl/quantizer-inl.h",
-    "jxl/quantizer.cc",
-    "jxl/quantizer.h",
-    "jxl/rational_polynomial-inl.h",
-    "jxl/render_pipeline/low_memory_render_pipeline.cc",
-    "jxl/render_pipeline/low_memory_render_pipeline.h",
-    "jxl/render_pipeline/render_pipeline.cc",
-    "jxl/render_pipeline/render_pipeline.h",
-    "jxl/render_pipeline/render_pipeline_stage.h",
-    "jxl/render_pipeline/simple_render_pipeline.cc",
-    "jxl/render_pipeline/simple_render_pipeline.h",
-    "jxl/render_pipeline/stage_blending.cc",
-    "jxl/render_pipeline/stage_blending.h",
-    "jxl/render_pipeline/stage_chroma_upsampling.cc",
-    "jxl/render_pipeline/stage_chroma_upsampling.h",
-    "jxl/render_pipeline/stage_epf.cc",
-    "jxl/render_pipeline/stage_epf.h",
-    "jxl/render_pipeline/stage_from_linear.cc",
-    "jxl/render_pipeline/stage_from_linear.h",
-    "jxl/render_pipeline/stage_gaborish.cc",
-    "jxl/render_pipeline/stage_gaborish.h",
-    "jxl/render_pipeline/stage_noise.cc",
-    "jxl/render_pipeline/stage_noise.h",
-    "jxl/render_pipeline/stage_patches.cc",
-    "jxl/render_pipeline/stage_patches.h",
-    "jxl/render_pipeline/stage_splines.cc",
-    "jxl/render_pipeline/stage_splines.h",
-    "jxl/render_pipeline/stage_spot.cc",
-    "jxl/render_pipeline/stage_spot.h",
-    "jxl/render_pipeline/stage_to_linear.cc",
-    "jxl/render_pipeline/stage_to_linear.h",
-    "jxl/render_pipeline/stage_tone_mapping.cc",
-    "jxl/render_pipeline/stage_tone_mapping.h",
-    "jxl/render_pipeline/stage_upsampling.cc",
-    "jxl/render_pipeline/stage_upsampling.h",
-    "jxl/render_pipeline/stage_write.cc",
-    "jxl/render_pipeline/stage_write.h",
-    "jxl/render_pipeline/stage_xyb.cc",
-    "jxl/render_pipeline/stage_xyb.h",
-    "jxl/render_pipeline/stage_ycbcr.cc",
-    "jxl/render_pipeline/stage_ycbcr.h",
-    "jxl/render_pipeline/test_render_pipeline_stages.h",
-    "jxl/sanitizers.h",
-    "jxl/simd_util-inl.h",
-    "jxl/size_constraints.h",
-    "jxl/splines.cc",
-    "jxl/splines.h",
-    "jxl/toc.cc",
-    "jxl/toc.h",
-    "jxl/transfer_functions-inl.h",
-    "jxl/transpose-inl.h",
-    "jxl/xorshift128plus-inl.h",
-]
-
-libjxl_enc_sources = [
-    "jxl/butteraugli/butteraugli.cc",
-    "jxl/butteraugli/butteraugli.h",
-    "jxl/butteraugli_wrapper.cc",
-    "jxl/enc_ac_strategy.cc",
-    "jxl/enc_ac_strategy.h",
-    "jxl/enc_adaptive_quantization.cc",
-    "jxl/enc_adaptive_quantization.h",
-    "jxl/enc_ans.cc",
-    "jxl/enc_ans.h",
-    "jxl/enc_ans_params.h",
-    "jxl/enc_ar_control_field.cc",
-    "jxl/enc_ar_control_field.h",
-    "jxl/enc_butteraugli_comparator.cc",
-    "jxl/enc_butteraugli_comparator.h",
-    "jxl/enc_butteraugli_pnorm.cc",
-    "jxl/enc_butteraugli_pnorm.h",
-    "jxl/enc_cache.cc",
-    "jxl/enc_cache.h",
-    "jxl/enc_chroma_from_luma.cc",
-    "jxl/enc_chroma_from_luma.h",
-    "jxl/enc_cluster.cc",
-    "jxl/enc_cluster.h",
-    "jxl/enc_coeff_order.cc",
-    "jxl/enc_coeff_order.h",
-    "jxl/enc_color_management.cc",
-    "jxl/enc_color_management.h",
-    "jxl/enc_comparator.cc",
-    "jxl/enc_comparator.h",
-    "jxl/enc_context_map.cc",
-    "jxl/enc_context_map.h",
-    "jxl/enc_detect_dots.cc",
-    "jxl/enc_detect_dots.h",
-    "jxl/enc_dot_dictionary.cc",
-    "jxl/enc_dot_dictionary.h",
-    "jxl/enc_entropy_coder.cc",
-    "jxl/enc_entropy_coder.h",
-    "jxl/enc_external_image.cc",
-    "jxl/enc_external_image.h",
-    "jxl/enc_file.cc",
-    "jxl/enc_file.h",
-    "jxl/enc_frame.cc",
-    "jxl/enc_frame.h",
-    "jxl/enc_gamma_correct.h",
-    "jxl/enc_group.cc",
-    "jxl/enc_group.h",
-    "jxl/enc_heuristics.cc",
-    "jxl/enc_heuristics.h",
-    "jxl/enc_huffman.cc",
-    "jxl/enc_huffman.h",
-    "jxl/enc_icc_codec.cc",
-    "jxl/enc_icc_codec.h",
-    "jxl/enc_image_bundle.cc",
-    "jxl/enc_image_bundle.h",
-    "jxl/enc_jxl_skcms.h",
-    "jxl/enc_modular.cc",
-    "jxl/enc_modular.h",
-    "jxl/enc_noise.cc",
-    "jxl/enc_noise.h",
-    "jxl/enc_params.h",
-    "jxl/enc_patch_dictionary.cc",
-    "jxl/enc_patch_dictionary.h",
-    "jxl/enc_photon_noise.cc",
-    "jxl/enc_photon_noise.h",
-    "jxl/enc_quant_weights.cc",
-    "jxl/enc_quant_weights.h",
-    "jxl/enc_splines.cc",
-    "jxl/enc_splines.h",
-    "jxl/enc_toc.cc",
-    "jxl/enc_toc.h",
-    "jxl/enc_transforms-inl.h",
-    "jxl/enc_transforms.cc",
-    "jxl/enc_transforms.h",
-    "jxl/enc_xyb.cc",
-    "jxl/enc_xyb.h",
-    "jxl/encode.cc",
-    "jxl/encode_internal.h",
-    "jxl/gaborish.cc",
-    "jxl/gaborish.h",
-    "jxl/huffman_tree.cc",
-    "jxl/huffman_tree.h",
-    "jxl/jpeg/enc_jpeg_data.cc",
-    "jxl/jpeg/enc_jpeg_data.h",
-    "jxl/jpeg/enc_jpeg_data_reader.cc",
-    "jxl/jpeg/enc_jpeg_data_reader.h",
-    "jxl/jpeg/enc_jpeg_huffman_decode.cc",
-    "jxl/jpeg/enc_jpeg_huffman_decode.h",
-    "jxl/linalg.cc",
-    "jxl/modular/encoding/enc_debug_tree.cc",
-    "jxl/modular/encoding/enc_debug_tree.h",
-    "jxl/modular/encoding/enc_encoding.cc",
-    "jxl/modular/encoding/enc_encoding.h",
-    "jxl/modular/encoding/enc_ma.cc",
-    "jxl/modular/encoding/enc_ma.h",
-    "jxl/modular/transform/enc_palette.cc",
-    "jxl/modular/transform/enc_palette.h",
-    "jxl/modular/transform/enc_rct.cc",
-    "jxl/modular/transform/enc_rct.h",
-    "jxl/modular/transform/enc_squeeze.cc",
-    "jxl/modular/transform/enc_squeeze.h",
-    "jxl/modular/transform/enc_transform.cc",
-    "jxl/modular/transform/enc_transform.h",
-    "jxl/optimize.cc",
-    "jxl/optimize.h",
-    "jxl/progressive_split.cc",
-    "jxl/progressive_split.h",
-]
-
-libjxl_gbench_sources = [
-    "extras/tone_mapping_gbench.cc",
-    "jxl/dec_external_image_gbench.cc",
-    "jxl/enc_external_image_gbench.cc",
-    "jxl/gauss_blur_gbench.cc",
-    "jxl/splines_gbench.cc",
-    "jxl/tf_gbench.cc",
-]
-
-libjxl_tests_sources = [
-    "jxl/ac_strategy_test.cc",
-    "jxl/alpha_test.cc",
-    "jxl/ans_common_test.cc",
-    "jxl/ans_test.cc",
-    "jxl/bit_reader_test.cc",
-    "jxl/bits_test.cc",
-    "jxl/blending_test.cc",
-    "jxl/butteraugli_test.cc",
-    "jxl/byte_order_test.cc",
-    "jxl/coeff_order_test.cc",
-    "jxl/color_encoding_internal_test.cc",
-    "jxl/color_management_test.cc",
-    "jxl/convolve_test.cc",
-    "jxl/data_parallel_test.cc",
-    "jxl/dct_test.cc",
-    "jxl/decode_test.cc",
-    "jxl/enc_external_image_test.cc",
-    "jxl/enc_photon_noise_test.cc",
-    "jxl/encode_test.cc",
-    "jxl/entropy_coder_test.cc",
-    "jxl/fast_dct_test.cc",
-    "jxl/fast_math_test.cc",
-    "jxl/fields_test.cc",
-    "jxl/gaborish_test.cc",
-    "jxl/gamma_correct_test.cc",
-    "jxl/gauss_blur_test.cc",
-    "jxl/gradient_test.cc",
-    "jxl/iaca_test.cc",
-    "jxl/icc_codec_test.cc",
-    "jxl/image_bundle_test.cc",
-    "jxl/image_ops_test.cc",
-    "jxl/jxl_test.cc",
-    "jxl/lehmer_code_test.cc",
-    "jxl/linalg_test.cc",
-    "jxl/modular_test.cc",
-    "jxl/opsin_image_test.cc",
-    "jxl/opsin_inverse_test.cc",
-    "jxl/optimize_test.cc",
-    "jxl/padded_bytes_test.cc",
-    "jxl/passes_test.cc",
-    "jxl/patch_dictionary_test.cc",
-    "jxl/preview_test.cc",
-    "jxl/quant_weights_test.cc",
-    "jxl/quantizer_test.cc",
-    "jxl/rational_polynomial_test.cc",
-    "jxl/render_pipeline/render_pipeline_test.cc",
-    "jxl/roundtrip_test.cc",
-    "jxl/simd_util_test.cc",
-    "jxl/speed_tier_test.cc",
-    "jxl/splines_test.cc",
-    "jxl/toc_test.cc",
-    "jxl/xorshift128plus_test.cc",
-]
-
-# Test-only library code.
-libjxl_testlib_sources = [
-    "jxl/codec_y4m_testonly.cc",
-    "jxl/codec_y4m_testonly.h",
-    "jxl/dct_for_test.h",
-    "jxl/dec_transforms_testonly.cc",
-    "jxl/dec_transforms_testonly.h",
-    "jxl/fake_parallel_runner_testonly.h",
-    "jxl/image_test_utils.h",
-    "jxl/test_image.h",
-    "jxl/test_utils.h",
-    "jxl/testdata.h",
-]
-
-libjxl_extras_sources = [
-    "extras/codec.cc",
-    "extras/codec.h",
-    "extras/dec/color_description.cc",
-    "extras/dec/color_description.h",
-    "extras/dec/color_hints.cc",
-    "extras/dec/color_hints.h",
-    "extras/dec/decode.cc",
-    "extras/dec/decode.h",
-    "extras/dec/jxl.cc",
-    "extras/dec/jxl.h",
-    "extras/dec/pgx.cc",
-    "extras/dec/pgx.h",
-    "extras/dec/pnm.cc",
-    "extras/dec/pnm.h",
-    "extras/enc/encode.cc",
-    "extras/enc/encode.h",
-    "extras/enc/npy.cc",
-    "extras/enc/npy.h",
-    "extras/enc/pgx.cc",
-    "extras/enc/pgx.h",
-    "extras/enc/pnm.cc",
-    "extras/enc/pnm.h",
-    "extras/exif.cc",
-    "extras/exif.h",
-    "extras/hlg.cc",
-    "extras/hlg.h",
-    "extras/packed_image.h",
-    "extras/packed_image_convert.cc",
-    "extras/packed_image_convert.h",
-    "extras/render_hdr.cc",
-    "extras/render_hdr.h",
-    "extras/time.cc",
-    "extras/time.h",
-    "extras/tone_mapping.cc",
-    "extras/tone_mapping.h",
-]
-
-libjxl_threads_sources = [
-    "threads/resizable_parallel_runner.cc",
-    "threads/thread_parallel_runner.cc",
-    "threads/thread_parallel_runner_internal.cc",
-    "threads/thread_parallel_runner_internal.h",
-]
-
-libjxl_threads_public_headers = [
-    "include/jxl/resizable_parallel_runner.h",
-    "include/jxl/resizable_parallel_runner_cxx.h",
-    "include/jxl/thread_parallel_runner.h",
-    "include/jxl/thread_parallel_runner_cxx.h",
-]
-
-libjxl_profiler_sources = [
-    "profiler/profiler.cc",
-    "profiler/profiler.h",
-    "profiler/tsc_timer.h",
-]
new file mode 120000 (symlink)
index 0000000000000000000000000000000000000000..416aa0c9e401a519834c65b06d33ed17224e2e8c
--- /dev/null
@@ -0,0 +1 @@
+jxl_lists.bzl
\ No newline at end of file
diff --git a/lib/profiler/profiler.cc b/lib/profiler/profiler.cc
deleted file mode 100644 (file)
index c72656e..0000000
+++ /dev/null
@@ -1,536 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/base/profiler.h"
-
-#if PROFILER_ENABLED
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>  // memcpy
-
-#include <algorithm>  // sort
-#include <atomic>
-#include <cinttypes>  // PRIu64
-#include <hwy/cache_control.h>
-#include <limits>
-#include <new>
-
-// Optionally use SIMD in StreamCacheLine if available.
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "lib/profiler/profiler.cc"
-#include <hwy/foreach_target.h>
-#include <hwy/highway.h>
-
-HWY_BEFORE_NAMESPACE();
-namespace profiler {
-namespace HWY_NAMESPACE {
-
-// Overwrites `to` without loading it into cache (read-for-ownership).
-// Copies 64 bytes from/to naturally aligned addresses.
-void StreamCacheLine(const Packet* HWY_RESTRICT from, Packet* HWY_RESTRICT to) {
-#if HWY_TARGET == HWY_SCALAR
-  hwy::CopyBytes<64>(from, to);
-#else
-  const HWY_CAPPED(uint64_t, 2) d;
-  HWY_FENCE;
-  const uint64_t* HWY_RESTRICT from64 = reinterpret_cast<const uint64_t*>(from);
-  const auto v0 = Load(d, from64 + 0);
-  const auto v1 = Load(d, from64 + 2);
-  const auto v2 = Load(d, from64 + 4);
-  const auto v3 = Load(d, from64 + 6);
-  // Fences prevent the compiler from reordering loads/stores, which may
-  // interfere with write-combining.
-  HWY_FENCE;
-  uint64_t* HWY_RESTRICT to64 = reinterpret_cast<uint64_t*>(to);
-  Stream(v0, d, to64 + 0);
-  Stream(v1, d, to64 + 2);
-  Stream(v2, d, to64 + 4);
-  Stream(v3, d, to64 + 6);
-  HWY_FENCE;
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace profiler
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace profiler {
-
-HWY_EXPORT(StreamCacheLine);
-
-namespace {
-
-// How many mebibytes to allocate (if PROFILER_ENABLED) per thread that
-// enters at least one zone. Once this buffer is full, the thread will analyze
-// packets (two per zone), which introduces observer overhead.
-#ifndef PROFILER_THREAD_STORAGE
-#define PROFILER_THREAD_STORAGE 32ULL
-#endif
-
-#define PROFILER_PRINT_OVERHEAD 0
-
-// Upper bounds for fixed-size data structures (guarded via HWY_ASSERT):
-constexpr size_t kMaxDepth = 64;   // Maximum nesting of zones.
-constexpr size_t kMaxZones = 256;  // Total number of zones.
-
-// Stack of active (entered but not exited) zones. POD, uninitialized.
-// Used to deduct child duration from the parent's self time.
-struct ActiveZone {
-  const char* name;
-  uint64_t entry_timestamp;
-  uint64_t child_total;
-};
-
-// Totals for all Zones with the same name. POD, must be zero-initialized.
-struct ZoneTotals {
-  uint64_t total_duration;
-  const char* name;
-  uint64_t num_calls;
-};
-
-template <typename T>
-inline T ClampedSubtract(const T minuend, const T subtrahend) {
-  if (subtrahend > minuend) {
-    return 0;
-  }
-  return minuend - subtrahend;
-}
-
-}  // namespace
-
-// Per-thread call graph (stack) and ZoneTotals for each zone.
-class Results {
- public:
-  Results() {
-    // Zero-initialize all accumulators (avoids a check for num_zones_ == 0).
-    memset(zones_, 0, sizeof(zones_));
-  }
-
-  // Used for computing overhead when this thread encounters its first Zone.
-  // This has no observable effect apart from increasing "analyze_elapsed_".
-  uint64_t ZoneDuration(const Packet* packets) {
-    HWY_ASSERT(depth_ == 0);
-    HWY_ASSERT(num_zones_ == 0);
-    AnalyzePackets(packets, 2);
-    const uint64_t duration = zones_[0].total_duration;
-    zones_[0].num_calls = 0;
-    zones_[0].total_duration = 0;
-    HWY_ASSERT(depth_ == 0);
-    num_zones_ = 0;
-    return duration;
-  }
-
-  void SetSelfOverhead(const uint64_t self_overhead) {
-    self_overhead_ = self_overhead;
-  }
-
-  void SetChildOverhead(const uint64_t child_overhead) {
-    child_overhead_ = child_overhead;
-  }
-
-  // Draw all required information from the packets, which can be discarded
-  // afterwards. Called whenever this thread's storage is full.
-  void AnalyzePackets(const Packet* HWY_RESTRICT packets,
-                      const size_t num_packets) {
-    // Ensures prior weakly-ordered streaming stores are globally visible.
-    hwy::FlushStream();
-
-    const uint64_t t0 = TicksBefore();
-
-    for (size_t i = 0; i < num_packets; ++i) {
-      const uint64_t timestamp = packets[i].timestamp;
-      // Entering a zone
-      if (packets[i].name != nullptr) {
-        HWY_ASSERT(depth_ < kMaxDepth);
-        zone_stack_[depth_].name = packets[i].name;
-        zone_stack_[depth_].entry_timestamp = timestamp;
-        zone_stack_[depth_].child_total = 0;
-        ++depth_;
-        continue;
-      }
-
-      HWY_ASSERT(depth_ != 0);
-      const ActiveZone& active = zone_stack_[depth_ - 1];
-      const uint64_t duration = timestamp - active.entry_timestamp;
-      const uint64_t self_duration = ClampedSubtract(
-          duration, self_overhead_ + child_overhead_ + active.child_total);
-
-      UpdateOrAdd(active.name, 1, self_duration);
-      --depth_;
-
-      // "Deduct" the nested time from its parent's self_duration.
-      if (depth_ != 0) {
-        zone_stack_[depth_ - 1].child_total += duration + child_overhead_;
-      }
-    }
-
-    const uint64_t t1 = TicksAfter();
-    analyze_elapsed_ += t1 - t0;
-  }
-
-  // Incorporates results from another thread. Call after all threads have
-  // exited any zones.
-  void Assimilate(const Results& other) {
-    const uint64_t t0 = TicksBefore();
-    HWY_ASSERT(depth_ == 0);
-    HWY_ASSERT(other.depth_ == 0);
-
-    for (size_t i = 0; i < other.num_zones_; ++i) {
-      const ZoneTotals& zone = other.zones_[i];
-      UpdateOrAdd(zone.name, zone.num_calls, zone.total_duration);
-    }
-    const uint64_t t1 = TicksAfter();
-    analyze_elapsed_ += t1 - t0 + other.analyze_elapsed_;
-  }
-
-  // Single-threaded.
-  void Print() {
-    const uint64_t t0 = TicksBefore();
-    MergeDuplicates();
-
-    // Sort by decreasing total (self) cost.
-    std::sort(zones_, zones_ + num_zones_,
-              [](const ZoneTotals& r1, const ZoneTotals& r2) {
-                return r1.total_duration > r2.total_duration;
-              });
-
-    uint64_t total_visible_duration = 0;
-    for (size_t i = 0; i < num_zones_; ++i) {
-      const ZoneTotals& r = zones_[i];
-      if (r.name[0] != '@') {
-        total_visible_duration += r.total_duration;
-        printf("%-40s: %10" PRIu64 " x %15" PRIu64 "= %15" PRIu64 "\n", r.name,
-               r.num_calls, r.total_duration / r.num_calls, r.total_duration);
-      }
-    }
-
-    const uint64_t t1 = TicksAfter();
-    analyze_elapsed_ += t1 - t0;
-    printf("Total clocks during analysis: %" PRIu64 "\n", analyze_elapsed_);
-    printf("Total clocks measured: %" PRIu64 "\n", total_visible_duration);
-  }
-
-  // Single-threaded. Clears all results as if no zones had been recorded.
-  void Reset() {
-    analyze_elapsed_ = 0;
-    HWY_ASSERT(depth_ == 0);
-    num_zones_ = 0;
-    memset(zone_stack_, 0, sizeof(zone_stack_));
-    memset(zones_, 0, sizeof(zones_));
-  }
-
- private:
-  // Updates ZoneTotals of the same name, or inserts a new one if this thread
-  // has not yet seen that name. Uses a self-organizing list data structure,
-  // which avoids dynamic memory allocations and is faster than unordered_map.
-  void UpdateOrAdd(const char* name, const uint64_t num_calls,
-                   const uint64_t duration) {
-    // Special case for first zone: (maybe) update, without swapping.
-    if (zones_[0].name == name) {
-      zones_[0].total_duration += duration;
-      zones_[0].num_calls += num_calls;
-      return;
-    }
-
-    // Look for a zone with the same name.
-    for (size_t i = 1; i < num_zones_; ++i) {
-      if (zones_[i].name == name) {
-        zones_[i].total_duration += duration;
-        zones_[i].num_calls += num_calls;
-        // Swap with predecessor (more conservative than move to front,
-        // but at least as successful).
-        std::swap(zones_[i - 1], zones_[i]);
-        return;
-      }
-    }
-
-    // Not found; create a new ZoneTotals.
-    HWY_ASSERT(num_zones_ < kMaxZones);
-    ZoneTotals* HWY_RESTRICT zone = zones_ + num_zones_;
-    zone->name = name;
-    zone->num_calls = num_calls;
-    zone->total_duration = duration;
-    ++num_zones_;
-  }
-
-  // Each instantiation of a function template seems to get its own copy of
-  // __func__ and GCC doesn't merge them. An N^2 search for duplicates is
-  // acceptable because we only expect a few dozen zones.
-  void MergeDuplicates() {
-    for (size_t i = 0; i < num_zones_; ++i) {
-      // Add any subsequent duplicates to num_calls and total_duration.
-      for (size_t j = i + 1; j < num_zones_;) {
-        if (!strcmp(zones_[i].name, zones_[j].name)) {
-          zones_[i].num_calls += zones_[j].num_calls;
-          zones_[i].total_duration += zones_[j].total_duration;
-          // Fill hole with last item.
-          zones_[j] = zones_[--num_zones_];
-        } else {  // Name differed, try next ZoneTotals.
-          ++j;
-        }
-      }
-    }
-  }
-
-  uint64_t analyze_elapsed_ = 0;
-  uint64_t self_overhead_ = 0;
-  uint64_t child_overhead_ = 0;
-
-  size_t depth_ = 0;      // Number of active zones <= kMaxDepth.
-  size_t num_zones_ = 0;  // Number of unique zones <= kMaxZones.
-
-  // After other members to avoid large pointer offsets.
-  alignas(64) ActiveZone zone_stack_[kMaxDepth];  // Last = newest
-  alignas(64) ZoneTotals zones_[kMaxZones];       // Self-organizing list
-};
-
-ThreadSpecific::ThreadSpecific()
-    : max_packets_(PROFILER_THREAD_STORAGE << 16),  // MiB / sizeof(Packet)
-      packets_(hwy::AllocateAligned<Packet>(max_packets_)),
-      num_packets_(0),
-      results_(hwy::MakeUniqueAligned<Results>()) {}
-
-ThreadSpecific::~ThreadSpecific() {}
-
-void ThreadSpecific::FlushBuffer() {
-  if (num_packets_ + kBufferCapacity > max_packets_) {
-    results_->AnalyzePackets(packets_.get(), num_packets_);
-    num_packets_ = 0;
-  }
-  // This buffering halves observer overhead and decreases the overall
-  // runtime by about 3%.
-  HWY_DYNAMIC_DISPATCH(StreamCacheLine)
-  (buffer_, packets_.get() + num_packets_);
-  num_packets_ += kBufferCapacity;
-  buffer_size_ = 0;
-}
-
-void ThreadSpecific::AnalyzeRemainingPackets() {
-  // Storage full => empty it.
-  if (num_packets_ + buffer_size_ > max_packets_) {
-    results_->AnalyzePackets(packets_.get(), num_packets_);
-    num_packets_ = 0;
-  }
-
-  // Move buffer to storage
-  memcpy(packets_.get() + num_packets_, buffer_, buffer_size_ * sizeof(Packet));
-  num_packets_ += buffer_size_;
-  buffer_size_ = 0;
-
-  results_->AnalyzePackets(packets_.get(), num_packets_);
-  num_packets_ = 0;
-}
-
-namespace {
-
-class HalfSampleMode {
- public:
-  // Returns mode. "sorted" must be in ascending order.
-  template <typename T>
-  T operator()(const T* const HWY_RESTRICT sorted,
-               const size_t num_values) const {
-    int64_t center = num_values / 2;
-    int64_t width = num_values;
-
-    // Zoom in on modal intervals of decreasing width. Stop before we reach
-    // width=1, i.e. single values, for which there is no "slope".
-    while (width > 2) {
-      // Round up so we can still reach the outer edges of odd widths.
-      width = (width + 1) / 2;
-
-      center = CenterOfIntervalWithMinSlope(sorted, num_values, center, width);
-    }
-
-    return sorted[center];  // mode := middle value in modal interval.
-  }
-
- private:
-  // Returns center of the densest region [c-radius, c+radius].
-  template <typename T>
-  static HWY_INLINE int64_t CenterOfIntervalWithMinSlope(
-      const T* HWY_RESTRICT sorted, const int64_t total_values,
-      const int64_t center, const int64_t width) {
-    const int64_t radius = (width + 1) / 2;
-
-    auto compute_slope = [radius, total_values, sorted](
-                             int64_t c, int64_t* actual_center = nullptr) {
-      // For symmetry, check 2*radius+1 values, i.e. [min, max].
-      const int64_t min = std::max(c - radius, int64_t(0));
-      const int64_t max = std::min(c + radius, total_values - 1);
-      HWY_ASSERT(min < max);
-      HWY_ASSERT(sorted[min] <=
-                 sorted[max] + std::numeric_limits<float>::epsilon());
-      const float dx = max - min + 1;
-      const float slope = (sorted[max] - sorted[min]) / dx;
-
-      if (actual_center != nullptr) {
-        // c may be out of bounds, so return center of the clamped bounds.
-        *actual_center = (min + max + 1) / 2;
-      }
-      return slope;
-    };
-
-    // First find min_slope for all centers.
-    float min_slope = std::numeric_limits<float>::max();
-    for (int64_t c = center - radius; c <= center + radius; ++c) {
-      min_slope = std::min(min_slope, compute_slope(c));
-    }
-
-    // Candidates := centers with slope ~= min_slope.
-    std::vector<int64_t> candidates;
-    for (int64_t c = center - radius; c <= center + radius; ++c) {
-      int64_t actual_center;
-      const float slope = compute_slope(c, &actual_center);
-      if (slope <= min_slope * 1.001f) {
-        candidates.push_back(actual_center);
-      }
-    }
-
-    // Keep the median.
-    HWY_ASSERT(!candidates.empty());
-    if (candidates.size() == 1) return candidates[0];
-    std::nth_element(candidates.begin(),
-                     candidates.begin() + candidates.size() / 2,
-                     candidates.end());
-    return candidates[candidates.size() / 2];
-  }
-};
-
-}  // namespace
-
-void ThreadSpecific::ComputeOverhead() {
-  // Delay after capturing timestamps before/after the actual zone runs. Even
-  // with frequency throttling disabled, this has a multimodal distribution,
-  // including 32, 34, 48, 52, 59, 62.
-  uint64_t self_overhead;
-  {
-    const size_t kNumSamples = 32;
-    uint32_t samples[kNumSamples];
-    for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
-      const size_t kNumDurations = 1024;
-      uint32_t durations[kNumDurations];
-
-      for (size_t idx_duration = 0; idx_duration < kNumDurations;
-           ++idx_duration) {
-        {  //
-          PROFILER_ZONE("Dummy Zone (never shown)");
-        }
-        const uint64_t duration = results_->ZoneDuration(buffer_);
-        buffer_size_ = 0;
-        durations[idx_duration] = static_cast<uint32_t>(duration);
-        HWY_ASSERT(num_packets_ == 0);
-      }
-      std::sort(durations, durations + kNumDurations);
-      samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
-    }
-    // Median.
-    std::sort(samples, samples + kNumSamples);
-    self_overhead = samples[kNumSamples / 2];
-#if PROFILER_PRINT_OVERHEAD
-    printf("Overhead: %" PRIu64 "\n", static_cast<uint64_t>(self_overhead));
-#endif
-    results_->SetSelfOverhead(self_overhead);
-  }
-
-  // Delay before capturing start timestamp / after end timestamp.
-  const size_t kNumSamples = 32;
-  uint32_t samples[kNumSamples];
-  for (size_t idx_sample = 0; idx_sample < kNumSamples; ++idx_sample) {
-    const size_t kNumDurations = 16;
-    uint32_t durations[kNumDurations];
-    for (size_t idx_duration = 0; idx_duration < kNumDurations;
-         ++idx_duration) {
-      const size_t kReps = 10000;
-      // Analysis time should not be included => must fit within buffer.
-      HWY_ASSERT(kReps * 2 < max_packets_);
-      hwy::FlushStream();
-      const uint64_t t0 = TicksBefore();
-      for (size_t i = 0; i < kReps; ++i) {
-        PROFILER_ZONE("Dummy");
-      }
-      hwy::FlushStream();
-      const uint64_t t1 = TicksAfter();
-      HWY_ASSERT(num_packets_ + buffer_size_ == kReps * 2);
-      buffer_size_ = 0;
-      num_packets_ = 0;
-      const uint64_t avg_duration = (t1 - t0 + kReps / 2) / kReps;
-      durations[idx_duration] =
-          static_cast<uint32_t>(ClampedSubtract(avg_duration, self_overhead));
-    }
-    std::sort(durations, durations + kNumDurations);
-    samples[idx_sample] = HalfSampleMode()(durations, kNumDurations);
-  }
-  std::sort(samples, samples + kNumSamples);
-  const uint64_t child_overhead = samples[9 * kNumSamples / 10];
-#if PROFILER_PRINT_OVERHEAD
-  printf("Child overhead: %" PRIu64 "\n",
-         static_cast<uint64_t>(child_overhead));
-#endif
-  results_->SetChildOverhead(child_overhead);
-}
-
-namespace {
-
-// Could be a static member of Zone, but that would expose <atomic> in header.
-std::atomic<ThreadSpecific*>& GetHead() {
-  static std::atomic<ThreadSpecific*> head_{nullptr};  // Owning
-  return head_;
-}
-
-}  // namespace
-
-// Thread-safe.
-ThreadSpecific* Zone::InitThreadSpecific() {
-  ThreadSpecific* thread_specific =
-      hwy::MakeUniqueAligned<ThreadSpecific>().release();
-
-  // Insert into unordered list
-  std::atomic<ThreadSpecific*>& head = GetHead();
-  ThreadSpecific* old_head = head.load(std::memory_order_relaxed);
-  thread_specific->SetNext(old_head);
-  while (!head.compare_exchange_weak(old_head, thread_specific,
-                                     std::memory_order_release,
-                                     std::memory_order_relaxed)) {
-    thread_specific->SetNext(old_head);
-    // TODO(janwas): pause
-  }
-
-  // ComputeOverhead also creates a Zone, so this needs to be set before that
-  // to prevent infinite recursion.
-  GetThreadSpecific() = thread_specific;
-
-  thread_specific->ComputeOverhead();
-  return thread_specific;
-}
-
-// Single-threaded.
-/*static*/ void Zone::PrintResults() {
-  ThreadSpecific* head = GetHead().load(std::memory_order_relaxed);
-  ThreadSpecific* p = head;
-  while (p) {
-    p->AnalyzeRemainingPackets();
-
-    // Combine all threads into a single Result.
-    if (p != head) {
-      head->GetResults().Assimilate(p->GetResults());
-      p->GetResults().Reset();
-    }
-
-    p = p->GetNext();
-  }
-
-  if (head != nullptr) {
-    head->GetResults().Print();
-    head->GetResults().Reset();
-  }
-}
-
-}  // namespace profiler
-
-#endif  // HWY_ONCE
-#endif  // PROFILER_ENABLED
diff --git a/lib/profiler/profiler.h b/lib/profiler/profiler.h
deleted file mode 100644 (file)
index c71f63c..0000000
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_PROFILER_PROFILER_H_
-#define LIB_PROFILER_PROFILER_H_
-
-// High precision, low overhead time measurements. Returns exact call counts and
-// total elapsed time for user-defined 'zones' (code regions, i.e. C++ scopes).
-//
-// Usage: instrument regions of interest: { PROFILER_ZONE("name"); /*code*/ } or
-// void FuncToMeasure() { PROFILER_FUNC; /*code*/ }.
-// After all threads have exited any zones, invoke PROFILER_PRINT_RESULTS() to
-// print call counts and average durations [CPU cycles] to stdout, sorted in
-// descending order of total duration.
-
-// If zero, this file has no effect and no measurements will be recorded.
-#ifndef PROFILER_ENABLED
-#define PROFILER_ENABLED 0
-#endif
-#if PROFILER_ENABLED
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <hwy/aligned_allocator.h>
-#include <hwy/base.h>
-
-#include "lib/profiler/tsc_timer.h"
-
-#if HWY_COMPILER_MSVC
-#define PROFILER_PUBLIC
-#else
-#define PROFILER_PUBLIC __attribute__((visibility("default")))
-#endif
-
-namespace profiler {
-
-// Represents zone entry/exit events. POD.
-#pragma pack(push, 1)
-struct Packet {
-  // Computing a hash or string table is likely too expensive, and offsets
-  // from other libraries' string literals can be too large to combine them and
-  // a full-resolution timestamp into 64 bits.
-  uint64_t timestamp;
-  const char* name;  // nullptr for exit packets
-#if UINTPTR_MAX <= 0xFFFFFFFFu
-  uint32_t padding;
-#endif
-};
-#pragma pack(pop)
-static_assert(sizeof(Packet) == 16, "Wrong Packet size");
-
-class Results;  // pImpl
-
-// Per-thread packet storage, dynamically allocated and aligned.
-class ThreadSpecific {
-  static constexpr size_t kBufferCapacity = 64 / sizeof(Packet);
-
- public:
-  PROFILER_PUBLIC explicit ThreadSpecific();
-  PROFILER_PUBLIC ~ThreadSpecific();
-
-  // Depends on Zone => defined out of line.
-  PROFILER_PUBLIC void ComputeOverhead();
-
-  HWY_INLINE void WriteEntry(const char* name) { Write(name, TicksBefore()); }
-  HWY_INLINE void WriteExit() { Write(nullptr, TicksAfter()); }
-
-  PROFILER_PUBLIC void AnalyzeRemainingPackets();
-
-  // Accessors instead of public member for well-defined data layout.
-  void SetNext(ThreadSpecific* next) { next_ = next; }
-  ThreadSpecific* GetNext() const { return next_; }
-
-  Results& GetResults() { return *results_; }
-
- private:
-  PROFILER_PUBLIC void FlushBuffer();
-
-  // Write packet to buffer/storage, emptying them as needed.
-  void Write(const char* name, const uint64_t timestamp) {
-    if (buffer_size_ == kBufferCapacity) {  // Full
-      FlushBuffer();
-    }
-    buffer_[buffer_size_].name = name;
-    buffer_[buffer_size_].timestamp = timestamp;
-    ++buffer_size_;
-  }
-
-  // Write-combining buffer to avoid cache pollution. Must be the first
-  // non-static member to ensure cache-line alignment.
-  Packet buffer_[kBufferCapacity];
-  size_t buffer_size_ = 0;
-
-  // Contiguous storage for zone enter/exit packets.
-  const size_t max_packets_;
-  hwy::AlignedFreeUniquePtr<Packet[]> packets_;
-  size_t num_packets_;
-
-  // Linked list of all threads.
-  ThreadSpecific* next_ = nullptr;  // Owned, never released.
-
-  hwy::AlignedUniquePtr<Results> results_;
-};
-
-// RAII zone enter/exit recorder constructed by PROFILER_ZONE; also
-// responsible for initializing ThreadSpecific.
-class Zone {
- public:
-  HWY_NOINLINE explicit Zone(const char* name) {
-    HWY_FENCE;
-    ThreadSpecific* HWY_RESTRICT thread_specific = GetThreadSpecific();
-    if (HWY_UNLIKELY(thread_specific == nullptr)) {
-      thread_specific = InitThreadSpecific();
-    }
-
-    thread_specific->WriteEntry(name);
-  }
-
-  HWY_NOINLINE ~Zone() { GetThreadSpecific()->WriteExit(); }
-
-  // Call exactly once after all threads have exited all zones.
-  PROFILER_PUBLIC static void PrintResults();
-
- private:
-  // Returns reference to the thread's ThreadSpecific pointer (initially null).
-  // Function-local static avoids needing a separate definition.
-  static ThreadSpecific*& GetThreadSpecific() {
-    static thread_local ThreadSpecific* thread_specific;
-    return thread_specific;
-  }
-
-  // Non time-critical.
-  PROFILER_PUBLIC ThreadSpecific* InitThreadSpecific();
-};
-
-// Creates a zone starting from here until the end of the current scope.
-// Timestamps will be recorded when entering and exiting the zone.
-// To ensure the name pointer remains valid, we require it to be a string
-// literal (by merging with ""). We also compare strings by address.
-#define PROFILER_ZONE(name)             \
-  HWY_FENCE;                            \
-  const ::profiler::Zone zone("" name); \
-  HWY_FENCE
-
-// Creates a zone for an entire function (when placed at its beginning).
-// Shorter/more convenient than ZONE.
-#define PROFILER_FUNC                    \
-  HWY_FENCE;                             \
-  const ::profiler::Zone zone(__func__); \
-  HWY_FENCE
-
-#define PROFILER_PRINT_RESULTS ::profiler::Zone::PrintResults
-
-}  // namespace profiler
-
-#else  // !PROFILER_ENABLED
-#define PROFILER_ZONE(name)
-#define PROFILER_FUNC
-#define PROFILER_PRINT_RESULTS()
-#endif
-
-#endif  // LIB_PROFILER_PROFILER_H_
diff --git a/lib/profiler/tsc_timer.h b/lib/profiler/tsc_timer.h
deleted file mode 100644 (file)
index 9387f41..0000000
+++ /dev/null
@@ -1,170 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_PROFILER_TSC_TIMER_H_
-#define LIB_PROFILER_TSC_TIMER_H_
-
-// High-resolution (~10 ns) timestamps, using fences to prevent reordering and
-// ensure exactly the desired regions are measured.
-
-#include <stdint.h>
-#include <time.h>  // clock_gettime
-
-#if defined(_WIN32) || defined(_WIN64)
-#ifndef WIN32_LEAN_AND_MEAN
-#define WIN32_LEAN_AND_MEAN
-#endif  // WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif  // NOMINMAX
-#ifndef NOGDI
-#define NOGDI
-#endif  // NOGDI
-#include <windows.h>
-// Undef macros to avoid collisions
-#undef LoadFence
-#endif
-
-#if defined(__APPLE__)
-#include <mach/mach.h>
-#include <mach/mach_time.h>
-#endif
-
-#if defined(__HAIKU__)
-#include <OS.h>
-#endif
-
-#include <ctime>
-#include <hwy/base.h>
-#include <hwy/cache_control.h>  // LoadFence
-
-namespace profiler {
-
-// Ticks := platform-specific timer values (CPU cycles on x86). Must be
-// unsigned to guarantee wraparound on overflow.
-using Ticks = uint64_t;
-
-// TicksBefore/After return absolute timestamps and must be placed immediately
-// before and after the region to measure. We provide separate Before/After
-// functions because they use different fences.
-//
-// Background: RDTSC is not 'serializing'; earlier instructions may complete
-// after it, and/or later instructions may complete before it. 'Fences' ensure
-// regions' elapsed times are independent of such reordering. The only
-// documented unprivileged serializing instruction is CPUID, which acts as a
-// full fence (no reordering across it in either direction). Unfortunately
-// the latency of CPUID varies wildly (perhaps made worse by not initializing
-// its EAX input). Because it cannot reliably be deducted from the region's
-// elapsed time, it must not be included in the region to measure (i.e.
-// between the two RDTSC).
-//
-// The newer RDTSCP is sometimes described as serializing, but it actually
-// only serves as a half-fence with release semantics. Although all
-// instructions in the region will complete before the final timestamp is
-// captured, subsequent instructions may leak into the region and increase the
-// elapsed time. Inserting another fence after the final RDTSCP would prevent
-// such reordering without affecting the measured region.
-//
-// Fortunately, such a fence exists. The LFENCE instruction is only documented
-// to delay later loads until earlier loads are visible. However, Intel's
-// reference manual says it acts as a full fence (waiting until all earlier
-// instructions have completed, and delaying later instructions until it
-// completes). AMD assigns the same behavior to MFENCE.
-//
-// We need a fence before the initial RDTSC to prevent earlier instructions
-// from leaking into the region, and arguably another after RDTSC to avoid
-// region instructions from completing before the timestamp is recorded.
-// When surrounded by fences, the additional RDTSCP half-fence provides no
-// benefit, so the initial timestamp can be recorded via RDTSC, which has
-// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
-// we define Before = LFENCE/RDTSC/LFENCE; After = RDTSCP/LFENCE.
-//
-// Using Before+Before leads to higher variance and overhead than After+After.
-// However, After+After includes an LFENCE in the region measurements, which
-// adds a delay dependent on earlier loads. The combination of Before+After
-// is faster than Before+Before and more consistent than After+After because
-// the first LFENCE already delayed subsequent loads before the measured
-// region. This combination seems not to have been considered in prior work:
-// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
-//
-// Note: performance counters can measure 'exact' instructions-retired or
-// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
-// requires fences. Unfortunately, it is not accessible on all OSes and we
-// prefer to avoid kernel-mode drivers. Performance counters are also affected
-// by several under/over-count errata, so we use the TSC instead.
-
-// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
-// divide by InvariantTicksPerSecond.
-static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksBefore() {
-  Ticks t;
-#if HWY_ARCH_PPC && defined(__GLIBC__)
-  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
-#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
-  hwy::LoadFence();
-  HWY_FENCE;
-  t = __rdtsc();
-  hwy::LoadFence();
-  HWY_FENCE;
-#elif HWY_ARCH_X86_64
-  asm volatile(
-      "lfence\n\t"
-      "rdtsc\n\t"
-      "shl $32, %%rdx\n\t"
-      "or %%rdx, %0\n\t"
-      "lfence"
-      : "=a"(t)
-      :
-      // "memory" avoids reordering. rdx = TSC >> 32.
-      // "cc" = flags modified by SHL.
-      : "rdx", "memory", "cc");
-#elif HWY_ARCH_RVV
-  asm volatile("rdcycle %0" : "=r"(t));
-#elif defined(_WIN32) || defined(_WIN64)
-  LARGE_INTEGER counter;
-  (void)QueryPerformanceCounter(&counter);
-  t = counter.QuadPart;
-#elif defined(__APPLE__)
-  t = mach_absolute_time();
-#elif defined(__HAIKU__)
-  t = system_time_nsecs();  // since boot
-#else  // POSIX
-  timespec ts;
-  clock_gettime(CLOCK_MONOTONIC, &ts);
-  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
-#endif
-  return t;
-}
-
-static HWY_INLINE HWY_MAYBE_UNUSED Ticks TicksAfter() {
-  Ticks t;
-#if HWY_ARCH_PPC && defined(__GLIBC__)
-  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
-#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
-  HWY_FENCE;
-  unsigned aux;
-  t = __rdtscp(&aux);
-  hwy::LoadFence();
-  HWY_FENCE;
-#elif HWY_ARCH_X86_64
-  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
-  asm volatile(
-      "rdtscp\n\t"
-      "shl $32, %%rdx\n\t"
-      "or %%rdx, %0\n\t"
-      "lfence"
-      : "=a"(t)
-      :
-      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
-      // "cc" = flags modified by SHL.
-      : "rcx", "rdx", "memory", "cc");
-#else
-  t = TicksBefore();  // no difference on other platforms.
-#endif
-  return t;
-}
-
-}  // namespace profiler
-
-#endif  // LIB_PROFILER_TSC_TIMER_H_
index 50b937a..dfbaa3f 100644 (file)
@@ -6,7 +6,7 @@ includedir=@PKGCONFIG_TARGET_INCLUDES@
 Name: libjxl_threads
 Description: JPEG XL multi-thread runner using std::threads.
 Version: @JPEGXL_LIBRARY_VERSION@
-Requires.private: @JPEGXL_THREADS_LIBRARY_REQUIRES@
+@JPEGXL_REQUIRES_TYPE@: @JPEGXL_THREADS_LIBRARY_REQUIRES@
 Libs: -L${libdir} -ljxl_threads
 Libs.private: -lm
 Cflags: -I${includedir}
index 1208a38..db27286 100644 (file)
@@ -3,7 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "jxl/resizable_parallel_runner.h"
+#include <jxl/resizable_parallel_runner.h>
 
 #include <algorithm>
 #include <atomic>
index 0d5b962..47b81bd 100644 (file)
@@ -3,8 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "jxl/thread_parallel_runner.h"
-
+#include <jxl/thread_parallel_runner.h>
 #include <string.h>
 
 #include "lib/threads/thread_parallel_runner_internal.h"
index 2b05ad9..cfc7e22 100644 (file)
 #include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
 #endif                                        // defined(*_SANITIZER)
 
-#include "jxl/thread_parallel_runner.h"
-#include "lib/jxl/base/profiler.h"
+#include <jxl/thread_parallel_runner.h>
 
 namespace {
 
+// Important: JXL_ASSERT does not guarantee running the `condition` code,
+// use only for debug mode checks.
+
+#if JXL_ENABLE_ASSERT
 // Exits the program after printing a stack trace when possible.
 bool Abort() {
 #if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \
@@ -34,9 +37,6 @@ bool Abort() {
   __builtin_trap();
 #endif
 }
-
-// Does not guarantee running the code, use only for debug mode checks.
-#if JXL_ENABLE_ASSERT
 #define JXL_ASSERT(condition) \
   do {                        \
     if (!(condition)) {       \
@@ -175,8 +175,6 @@ void ThreadParallelRunner::ThreadFunc(ThreadParallelRunner* self,
 ThreadParallelRunner::ThreadParallelRunner(const int num_worker_threads)
     : num_worker_threads_(num_worker_threads),
       num_threads_(std::max(num_worker_threads, 1)) {
-  PROFILER_ZONE("ThreadParallelRunner ctor");
-
   threads_.reserve(num_worker_threads_);
 
   // Suppress "unused-private-field" warning.
@@ -193,11 +191,6 @@ ThreadParallelRunner::ThreadParallelRunner(const int num_worker_threads)
   if (num_worker_threads_ != 0) {
     WorkersReadyBarrier();
   }
-
-  // Warm up profiler on worker threads so its expensive initialization
-  // doesn't count towards other timer measurements.
-  RunOnEachThread(
-      [](const int task, const int thread) { PROFILER_ZONE("@InitWorkers"); });
 }
 
 ThreadParallelRunner::~ThreadParallelRunner() {
index 372c6a8..199a5f2 100644 (file)
@@ -32,6 +32,8 @@
 #ifndef LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_
 #define LIB_THREADS_THREAD_PARALLEL_RUNNER_INTERNAL_H_
 
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -42,9 +44,6 @@
 #include <thread>              //NOLINT
 #include <vector>
 
-#include "jxl/memory_manager.h"
-#include "jxl/parallel_runner.h"
-
 namespace jpegxl {
 
 // Main helper class implementing the ::JxlParallelRunner interface.
@@ -65,11 +64,6 @@ class ThreadParallelRunner {
   // Waits for all threads to exit.
   ~ThreadParallelRunner();
 
-  // Returns number of worker threads created (some may be sleeping and never
-  // wake up in time to participate in Run). Useful for characterizing
-  // performance; 0 means "run on main thread".
-  size_t NumWorkerThreads() const { return num_worker_threads_; }
-
   // Returns maximum number of main/worker threads that may call Func. Useful
   // for allocating per-thread storage.
   size_t NumThreads() const { return num_threads_; }
index 2293b5c..7c8e602 100644 (file)
@@ -5,9 +5,11 @@
 
 #include <atomic>
 
-#include "gtest/gtest.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/test_utils.h"
+#include "lib/jxl/testing.h"
+
+using jxl::test::ThreadPoolForTests;
 
 namespace jpegxl {
 namespace {
@@ -26,7 +28,7 @@ int PopulationCount(uint64_t bits) {
 // (joining with its threads), num_threads=0 works (runs on current thread).
 TEST(ThreadParallelRunnerTest, TestPool) {
   for (int num_threads = 0; num_threads <= 18; ++num_threads) {
-    jxl::ThreadPoolInternal pool(num_threads);
+    ThreadPoolForTests pool(num_threads);
     for (int num_tasks = 0; num_tasks < 32; ++num_tasks) {
       std::vector<int> mementos(num_tasks);
       for (int begin = 0; begin < 32; ++begin) {
@@ -52,10 +54,9 @@ TEST(ThreadParallelRunnerTest, TestPool) {
 
 // Verify "thread" parameter when processing few tasks.
 TEST(ThreadParallelRunnerTest, TestSmallAssignments) {
-  // WARNING: cumulative total threads must not exceed profiler.h kMaxThreads.
   const int kMaxThreads = 8;
   for (int num_threads = 1; num_threads <= kMaxThreads; ++num_threads) {
-    jxl::ThreadPoolInternal pool(num_threads);
+    ThreadPoolForTests pool(num_threads);
 
     // (Avoid mutex because it may perturb the worker thread scheduling)
     std::atomic<uint64_t> id_bits{0};
@@ -95,7 +96,7 @@ struct Counter {
 
 TEST(ThreadParallelRunnerTest, TestCounter) {
   const int kNumThreads = 12;
-  jxl::ThreadPoolInternal pool(kNumThreads);
+  ThreadPoolForTests pool(kNumThreads);
   alignas(128) Counter counters[kNumThreads];
 
   const int kNumTasks = kNumThreads * 19;
index e56d312..7b53b98 100644 (file)
@@ -6,13 +6,15 @@
 find_package(PkgConfig)
 pkg_check_modules(Gdk-Pixbuf IMPORTED_TARGET gdk-pixbuf-2.0>=2.36)
 
+include(GNUInstallDirs)
+
 if (NOT Gdk-Pixbuf_FOUND)
   message(WARNING "GDK Pixbuf development libraries not found, \
                    the Gdk-Pixbuf plugin will not be built")
   return ()
 endif ()
 
-add_library(pixbufloader-jxl SHARED pixbufloader-jxl.c)
+add_library(pixbufloader-jxl MODULE pixbufloader-jxl.c)
 
 # Mark all symbols as hidden by default. The PkgConfig::Gdk-Pixbuf dependency
 # will cause fill_info and fill_vtable entry points to be made public.
@@ -23,15 +25,15 @@ set_target_properties(pixbufloader-jxl PROPERTIES
 
 # Note: This only needs the decoder library, but we don't install the decoder
 # shared library.
-target_link_libraries(pixbufloader-jxl jxl jxl_threads skcms-interface PkgConfig::Gdk-Pixbuf)
+target_link_libraries(pixbufloader-jxl jxl jxl_threads lcms2 PkgConfig::Gdk-Pixbuf)
 
 execute_process(COMMAND ${PKG_CONFIG_EXECUTABLE} gdk-pixbuf-2.0 --variable gdk_pixbuf_moduledir --define-variable=prefix=${CMAKE_INSTALL_PREFIX} OUTPUT_VARIABLE GDK_PIXBUF_MODULEDIR OUTPUT_STRIP_TRAILING_WHITESPACE)
-install(TARGETS pixbufloader-jxl LIBRARY DESTINATION "${GDK_PIXBUF_MODULEDIR}")
+install(TARGETS pixbufloader-jxl DESTINATION "${GDK_PIXBUF_MODULEDIR}")
 
 # Instead of the following, we might instead add the
 # mime type image/jxl to
 # /usr/share/thumbnailers/gdk-pixbuf-thumbnailer.thumbnailer
-install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/jxl.thumbnailer DESTINATION share/thumbnailers/)
+install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/jxl.thumbnailer DESTINATION "${CMAKE_INSTALL_DATADIR}/thumbnailers/")
 
 if(BUILD_TESTING AND NOT CMAKE_CROSSCOMPILING)
   pkg_check_modules(Gdk IMPORTED_TARGET gdk-2.0)
@@ -65,7 +67,8 @@ if(BUILD_TESTING AND NOT CMAKE_CROSSCOMPILING)
 
     # libX11.so and libgdk-x11-2.0.so are not compiled with MSAN -> report
     # use-of-uninitialized-value for string some internal string value.
-    if (NOT (SANITIZER STREQUAL "msan"))
+    # TODO(eustas): investigate direct memory leak (32 bytes).
+    if (NOT (SANITIZER STREQUAL "msan") AND NOT (SANITIZER STREQUAL "asan"))
       add_test(
         NAME pixbufloader_test_jxl
         COMMAND
index f7174ba..1859194 100644 (file)
@@ -2,7 +2,7 @@
 
 
 The plugin may already have been installed when following the instructions from the
-[Installing section of README.md](../../README.md#installing), in which case it should
+[Installing section of BUILDING.md](../../BUILDING.md#installing), in which case it should
 already be in the correct place, e.g.
 
 ```/usr/lib/x86_64-linux-gnu/gdk-pixbuf-2.0/2.10.0/loaders/libpixbufloader-jxl.so```
index 24bbcf8..bafa57b 100644 (file)
@@ -3,11 +3,11 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "jxl/codestream_header.h"
-#include "jxl/decode.h"
-#include "jxl/resizable_parallel_runner.h"
-#include "jxl/types.h"
-#include "skcms.h"
+#include <jxl/codestream_header.h>
+#include <jxl/decode.h>
+#include <jxl/encode.h>
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/types.h>
 
 #define GDK_PIXBUF_ENABLE_BACKEND
 #include <gdk-pixbuf/gdk-pixbuf.h>
@@ -58,9 +58,7 @@ struct _GdkPixbufJxlAnimation {
   uint64_t tick_duration_us;
   uint64_t repetition_count;  // 0 = loop forever
 
-  // ICC profile, to which `icc` might refer to.
-  gpointer icc_buff;
-  skcms_ICCProfile icc;
+  gchar *icc_base64;
 };
 
 #define GDK_TYPE_PIXBUF_JXL_ANIMATION (gdk_pixbuf_jxl_animation_get_type())
@@ -144,7 +142,7 @@ static void gdk_pixbuf_jxl_animation_finalize(GObject *obj) {
   }
   JxlResizableParallelRunnerDestroy(decoder_state->parallel_runner);
   JxlDecoderDestroy(decoder_state->decoder);
-  g_free(decoder_state->icc_buff);
+  g_free(decoder_state->icc_base64);
 }
 
 static void gdk_pixbuf_jxl_animation_class_init(
@@ -222,7 +220,7 @@ static gboolean gdk_pixbuf_jxl_animation_iter_advance(
     if (total_duration_ms == 0) total_duration_ms = 1;
     uint64_t loop_offset = current_time_ms % total_duration_ms;
     jxl_iter->current_frame = 0;
-    while (true) {
+    while (TRUE) {
       uint64_t duration =
           g_array_index(jxl_iter->animation->frames, GdkPixbufJxlAnimationFrame,
                         jxl_iter->current_frame)
@@ -329,30 +327,6 @@ static gboolean stop_load(gpointer context, GError **error) {
   return TRUE;
 }
 
-static void draw_pixels(void *context, size_t x, size_t y, size_t num_pixels,
-                        const void *pixels) {
-  GdkPixbufJxlAnimation *decoder_state = context;
-  gboolean has_alpha = decoder_state->pixel_format.num_channels == 4;
-
-  GdkPixbuf *output =
-      g_array_index(decoder_state->frames, GdkPixbufJxlAnimationFrame,
-                    decoder_state->frames->len - 1)
-          .data;
-
-  guchar *dst = gdk_pixbuf_get_pixels(output) +
-                decoder_state->pixel_format.num_channels * x +
-                gdk_pixbuf_get_rowstride(output) * y;
-
-  skcms_Transform(
-      pixels,
-      has_alpha ? skcms_PixelFormat_RGBA_ffff : skcms_PixelFormat_RGB_fff,
-      decoder_state->alpha_premultiplied ? skcms_AlphaFormat_PremulAsEncoded
-                                         : skcms_AlphaFormat_Unpremul,
-      &decoder_state->icc, dst,
-      has_alpha ? skcms_PixelFormat_RGBA_8888 : skcms_PixelFormat_RGB_888,
-      skcms_AlphaFormat_Unpremul, skcms_sRGB_profile(), num_pixels);
-}
-
 static gboolean load_increment(gpointer context, const guchar *buf, guint size,
                                GError **error) {
   GdkPixbufJxlAnimation *decoder_state = context;
@@ -422,35 +396,47 @@ static gboolean load_increment(gpointer context, const guchar *buf, guint size,
 
       case JXL_DEC_COLOR_ENCODING: {
         // Get the ICC color profile of the pixel data
+        gpointer icc_buff;
         size_t icc_size;
+        JxlColorEncoding color_encoding;
+        if (JXL_DEC_SUCCESS == JxlDecoderGetColorAsEncodedProfile(
+                                   decoder_state->decoder,
+                                   JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                   &color_encoding)) {
+          // we don't check the return status here because it's not a problem if
+          // this fails
+          JxlDecoderSetPreferredColorProfile(decoder_state->decoder,
+                                             &color_encoding);
+        }
         if (JXL_DEC_SUCCESS != JxlDecoderGetICCProfileSize(
                                    decoder_state->decoder,
-                                   &decoder_state->pixel_format,
                                    JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)) {
           g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
                       "JxlDecoderGetICCProfileSize failed");
           return FALSE;
         }
-        if (!(decoder_state->icc_buff = g_malloc(icc_size))) {
+        if (!(icc_buff = g_malloc(icc_size))) {
           g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
                       "Allocating ICC profile failed");
           return FALSE;
         }
         if (JXL_DEC_SUCCESS !=
             JxlDecoderGetColorAsICCProfile(decoder_state->decoder,
-                                           &decoder_state->pixel_format,
                                            JXL_COLOR_PROFILE_TARGET_DATA,
-                                           decoder_state->icc_buff, icc_size)) {
+                                           icc_buff, icc_size)) {
           g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
                       "JxlDecoderGetColorAsICCProfile failed");
+          g_free(icc_buff);
           return FALSE;
         }
-        if (!skcms_Parse(decoder_state->icc_buff, icc_size,
-                         &decoder_state->icc)) {
+        decoder_state->icc_base64 = g_base64_encode(icc_buff, icc_size);
+        g_free(icc_buff);
+        if (!decoder_state->icc_base64) {
           g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
-                      "Invalid ICC profile from JXL image decoder");
+                      "Allocating ICC profile base64 string failed");
           return FALSE;
         }
+
         break;
       }
 
@@ -479,8 +465,11 @@ static gboolean load_increment(gpointer context, const guchar *buf, guint size,
                         "Failed to allocate output pixel buffer");
             return FALSE;
           }
+          gdk_pixbuf_set_option(frame.data, "icc-profile",
+                                decoder_state->icc_base64);
           decoder_state->pixel_format.align =
               gdk_pixbuf_get_rowstride(frame.data);
+          decoder_state->pixel_format.data_type = JXL_TYPE_UINT8;
           g_array_append_val(decoder_state->frames, frame);
         }
         if (decoder_state->pixbuf_prepared_callback &&
@@ -497,12 +486,19 @@ static gboolean load_increment(gpointer context, const guchar *buf, guint size,
       }
 
       case JXL_DEC_NEED_IMAGE_OUT_BUFFER: {
-        if (JXL_DEC_SUCCESS !=
-            JxlDecoderSetImageOutCallback(decoder_state->decoder,
-                                          &decoder_state->pixel_format,
-                                          draw_pixels, decoder_state)) {
+        GdkPixbuf *output =
+            g_array_index(decoder_state->frames, GdkPixbufJxlAnimationFrame,
+                          decoder_state->frames->len - 1)
+                .data;
+        decoder_state->pixel_format.align = gdk_pixbuf_get_rowstride(output);
+        guchar *dst = gdk_pixbuf_get_pixels(output);
+        size_t num_pixels = decoder_state->xsize * decoder_state->ysize;
+        size_t size = num_pixels * decoder_state->pixel_format.num_channels;
+        if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(
+                                   decoder_state->decoder,
+                                   &decoder_state->pixel_format, dst, size)) {
           g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
-                      "JxlDecoderSetImageOutCallback failed");
+                      "JxlDecoderSetImageOutBuffer failed");
           return FALSE;
         }
         break;
@@ -540,11 +536,230 @@ static gboolean load_increment(gpointer context, const guchar *buf, guint size,
   return TRUE;
 }
 
+static gboolean jxl_is_save_option_supported(const gchar *option_key) {
+  if (g_strcmp0(option_key, "quality") == 0) {
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+static gboolean jxl_image_saver(FILE *f, GdkPixbuf *pixbuf, gchar **keys,
+                                gchar **values, GError **error) {
+  long quality = 90; /* default; must be between 0 and 100 */
+  double distance;
+  gboolean save_alpha;
+  JxlEncoder *encoder;
+  void *parallel_runner;
+  JxlEncoderFrameSettings *frame_settings;
+  JxlBasicInfo output_info;
+  JxlPixelFormat pixel_format;
+  JxlColorEncoding color_profile;
+  JxlEncoderStatus status;
+
+  GByteArray *compressed;
+  size_t offset = 0;
+  uint8_t *next_out;
+  size_t avail_out;
+
+  if (f == NULL || pixbuf == NULL) {
+    return FALSE;
+  }
+
+  if (keys && *keys) {
+    gchar **kiter = keys;
+    gchar **viter = values;
+
+    while (*kiter) {
+      if (strcmp(*kiter, "quality") == 0) {
+        char *endptr = NULL;
+        quality = strtol(*viter, &endptr, 10);
+
+        if (endptr == *viter) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_BAD_OPTION,
+                      "JXL quality must be a value between 0 and 100; value "
+                      "\"%s\" could not be parsed.",
+                      *viter);
+
+          return FALSE;
+        }
+
+        if (quality < 0 || quality > 100) {
+          g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_BAD_OPTION,
+                      "JXL quality must be a value between 0 and 100; value "
+                      "\"%ld\" is not allowed.",
+                      quality);
+
+          return FALSE;
+        }
+      } else {
+        g_warning("Unrecognized parameter (%s) passed to JXL saver.", *kiter);
+      }
+
+      ++kiter;
+      ++viter;
+    }
+  }
+
+  if (gdk_pixbuf_get_bits_per_sample(pixbuf) != 8) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_UNKNOWN_TYPE,
+                "Sorry, only 8bit images are supported by this JXL saver");
+    return FALSE;
+  }
+
+  JxlEncoderInitBasicInfo(&output_info);
+  output_info.have_container = JXL_FALSE;
+  output_info.xsize = gdk_pixbuf_get_width(pixbuf);
+  output_info.ysize = gdk_pixbuf_get_height(pixbuf);
+  output_info.bits_per_sample = 8;
+  output_info.orientation = JXL_ORIENT_IDENTITY;
+  output_info.num_color_channels = 3;
+
+  if (output_info.xsize == 0 || output_info.ysize == 0) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_CORRUPT_IMAGE,
+                "Empty image, nothing to save");
+    return FALSE;
+  }
+
+  save_alpha = gdk_pixbuf_get_has_alpha(pixbuf);
+
+  pixel_format.data_type = JXL_TYPE_UINT8;
+  pixel_format.endianness = JXL_NATIVE_ENDIAN;
+  pixel_format.align = gdk_pixbuf_get_rowstride(pixbuf);
+
+  if (save_alpha) {
+    if (gdk_pixbuf_get_n_channels(pixbuf) != 4) {
+      g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_UNKNOWN_TYPE,
+                  "Unsupported number of channels");
+      return FALSE;
+    }
+
+    output_info.num_extra_channels = 1;
+    output_info.alpha_bits = 8;
+    pixel_format.num_channels = 4;
+  } else {
+    if (gdk_pixbuf_get_n_channels(pixbuf) != 3) {
+      g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_UNKNOWN_TYPE,
+                  "Unsupported number of channels");
+      return FALSE;
+    }
+
+    output_info.num_extra_channels = 0;
+    output_info.alpha_bits = 0;
+    pixel_format.num_channels = 3;
+  }
+
+  encoder = JxlEncoderCreate(NULL);
+  if (!encoder) {
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the JXL encoder failed");
+    return FALSE;
+  }
+
+  parallel_runner = JxlResizableParallelRunnerCreate(NULL);
+  if (!parallel_runner) {
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "Creation of the JXL decoder failed");
+    return FALSE;
+  }
+
+  JxlResizableParallelRunnerSetThreads(
+      parallel_runner, JxlResizableParallelRunnerSuggestThreads(
+                           output_info.xsize, output_info.ysize));
+
+  status = JxlEncoderSetParallelRunner(encoder, JxlResizableParallelRunner,
+                                       parallel_runner);
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlDecoderSetParallelRunner failed: %x", status);
+    return FALSE;
+  }
+
+  if (quality > 99) {
+    output_info.uses_original_profile = JXL_TRUE;
+    distance = 0;
+  } else {
+    output_info.uses_original_profile = JXL_FALSE;
+    distance = JxlEncoderDistanceFromQuality((float)quality);
+  }
+
+  status = JxlEncoderSetBasicInfo(encoder, &output_info);
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlEncoderSetBasicInfo failed: %x", status);
+    return FALSE;
+  }
+
+  JxlColorEncodingSetToSRGB(&color_profile, JXL_FALSE);
+  status = JxlEncoderSetColorEncoding(encoder, &color_profile);
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlEncoderSetColorEncoding failed: %x", status);
+    return FALSE;
+  }
+
+  frame_settings = JxlEncoderFrameSettingsCreate(encoder, NULL);
+  JxlEncoderSetFrameDistance(frame_settings, distance);
+  JxlEncoderSetFrameLossless(frame_settings, output_info.uses_original_profile);
+
+  status = JxlEncoderAddImageFrame(frame_settings, &pixel_format,
+                                   gdk_pixbuf_read_pixels(pixbuf),
+                                   gdk_pixbuf_get_byte_length(pixbuf));
+  if (status != JXL_ENC_SUCCESS) {
+    JxlResizableParallelRunnerDestroy(parallel_runner);
+    JxlEncoderDestroy(encoder);
+    g_set_error(error, GDK_PIXBUF_ERROR, GDK_PIXBUF_ERROR_FAILED,
+                "JxlEncoderAddImageFrame failed: %x", status);
+    return FALSE;
+  }
+
+  JxlEncoderCloseInput(encoder);
+
+  compressed = g_byte_array_sized_new(4096);
+  g_byte_array_set_size(compressed, 4096);
+  do {
+    next_out = compressed->data + offset;
+    avail_out = compressed->len - offset;
+    status = JxlEncoderProcessOutput(encoder, &next_out, &avail_out);
+
+    if (status == JXL_ENC_NEED_MORE_OUTPUT) {
+      offset = next_out - compressed->data;
+      g_byte_array_set_size(compressed, compressed->len * 2);
+    } else if (status == JXL_ENC_ERROR) {
+      JxlResizableParallelRunnerDestroy(parallel_runner);
+      JxlEncoderDestroy(encoder);
+      g_set_error(error, G_FILE_ERROR, 0, "JxlEncoderProcessOutput failed: %x",
+                  status);
+      return FALSE;
+    }
+  } while (status != JXL_ENC_SUCCESS);
+
+  JxlResizableParallelRunnerDestroy(parallel_runner);
+  JxlEncoderDestroy(encoder);
+
+  g_byte_array_set_size(compressed, next_out - compressed->data);
+  if (compressed->len > 0) {
+    fwrite(compressed->data, 1, compressed->len, f);
+    g_byte_array_free(compressed, TRUE);
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
 void fill_vtable(GdkPixbufModule *module) {
   module->begin_load = begin_load;
   module->stop_load = stop_load;
   module->load_increment = load_increment;
-  // TODO(veluca): implement saving.
+  module->is_save_option_supported = jxl_is_save_option_supported;
+  module->save = jxl_image_saver;
 }
 
 void fill_info(GdkPixbufFormat *info) {
@@ -563,7 +778,6 @@ void fill_info(GdkPixbufFormat *info) {
   info->description = "JPEG XL image";
   info->mime_types = mime_types;
   info->extensions = extensions;
-  // TODO(veluca): add writing support.
-  info->flags = GDK_PIXBUF_FORMAT_THREADSAFE;
+  info->flags = GDK_PIXBUF_FORMAT_WRITABLE | GDK_PIXBUF_FORMAT_THREADSAFE;
   info->license = "BSD-3";
 }
index 95c51bf..3fe63c1 100644 (file)
@@ -23,8 +23,8 @@
 #undef MIN
 #undef CLAMP
 
-#include "jxl/resizable_parallel_runner.h"
-#include "jxl/resizable_parallel_runner_cxx.h"
+#include <jxl/resizable_parallel_runner.h>
+#include <jxl/resizable_parallel_runner_cxx.h>
 
 namespace jxl {
 
index b1d1f15..4796c17 100644 (file)
@@ -5,17 +5,45 @@
 
 #include "plugins/gimp/file-jxl-load.h"
 
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+
 #define _PROFILE_ORIGIN_ JXL_COLOR_PROFILE_TARGET_ORIGINAL
 #define _PROFILE_TARGET_ JXL_COLOR_PROFILE_TARGET_DATA
 #define LOAD_PROC "file-jxl-load"
 
 namespace jxl {
 
+bool SetJpegXlOutBuffer(
+    std::unique_ptr<JxlDecoderStruct, JxlDecoderDestroyStruct> *dec,
+    JxlPixelFormat *format, size_t *buffer_size, gpointer *pixels_buffer_1) {
+  if (JXL_DEC_SUCCESS !=
+      JxlDecoderImageOutBufferSize(dec->get(), format, buffer_size)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderImageOutBufferSize failed\n");
+    return false;
+  }
+  *pixels_buffer_1 = g_malloc(*buffer_size);
+  if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(dec->get(), format,
+                                                     *pixels_buffer_1,
+                                                     *buffer_size)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSetImageOutBuffer failed\n");
+    return false;
+  }
+  return true;
+}
+
 bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
+  bool stop_processing = false;
+  JxlDecoderStatus status = JXL_DEC_NEED_MORE_INPUT;
   std::vector<uint8_t> icc_profile;
   GimpColorProfile *profile_icc = nullptr;
   GimpColorProfile *profile_int = nullptr;
   bool is_linear = false;
+  unsigned long xsize = 0, ysize = 0;
+  long crop_x0 = 0, crop_y0 = 0;
+  size_t layer_idx = 0;
+  uint32_t frame_duration = 0;
+  double tps_denom = 1.f, tps_numer = 1.f;
 
   gint32 layer;
 
@@ -28,6 +56,10 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
   GimpPrecision precision = GIMP_PRECISION_U16_GAMMA;
   JxlBasicInfo info = {};
   JxlPixelFormat format = {};
+  JxlAnimationHeader animation = {};
+  JxlBlendMode blend_mode = JXL_BLEND_BLEND;
+  char *frame_name = nullptr;  // will be realloced
+  size_t frame_name_len = 0;
 
   format.num_channels = 4;
   format.data_type = JXL_TYPE_FLOAT;
@@ -53,9 +85,10 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
 
   auto dec = JxlDecoderMake(nullptr);
   if (JXL_DEC_SUCCESS !=
-      JxlDecoderSubscribeEvents(dec.get(), JXL_DEC_BASIC_INFO |
-                                               JXL_DEC_COLOR_ENCODING |
-                                               JXL_DEC_FULL_IMAGE)) {
+      JxlDecoderSubscribeEvents(
+          dec.get(), JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING |
+                         JXL_DEC_FULL_IMAGE | JXL_DEC_FRAME_PROGRESSION |
+                         JXL_DEC_FRAME)) {
     g_printerr(LOAD_PROC " Error: JxlDecoderSubscribeEvents failed\n");
     return false;
   }
@@ -66,14 +99,26 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
     g_printerr(LOAD_PROC " Error: JxlDecoderSetParallelRunner failed\n");
     return false;
   }
+  // TODO(user): make this work with coalescing set to false, while handling
+  // frames with duration 0 and references to earlier frames correctly.
+  if (JXL_DEC_SUCCESS != JxlDecoderSetCoalescing(dec.get(), JXL_TRUE)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSetCoalescing failed\n");
+    return false;
+  }
 
   // grand decode loop...
   JxlDecoderSetInput(dec.get(), compressed.data(), compressed.size());
 
+  if (JXL_DEC_SUCCESS != JxlDecoderSetProgressiveDetail(
+                             dec.get(), JxlProgressiveDetail::kPasses)) {
+    g_printerr(LOAD_PROC " Error: JxlDecoderSetProgressiveDetail failed\n");
+    return false;
+  }
+
   while (true) {
     gimp_load_progress.update();
 
-    JxlDecoderStatus status = JxlDecoderProcessInput(dec.get());
+    if (!stop_processing) status = JxlDecoderProcessInput(dec.get());
 
     if (status == JXL_DEC_BASIC_INFO) {
       if (JXL_DEC_SUCCESS != JxlDecoderGetBasicInfo(dec.get(), &info)) {
@@ -81,20 +126,26 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
         return false;
       }
 
+      xsize = info.xsize;
+      ysize = info.ysize;
+      if (info.have_animation) {
+        animation = info.animation;
+        tps_denom = animation.tps_denominator;
+        tps_numer = animation.tps_numerator;
+      }
+
       JxlResizableParallelRunnerSetThreads(
-          runner.get(),
-          JxlResizableParallelRunnerSuggestThreads(info.xsize, info.ysize));
+          runner.get(), JxlResizableParallelRunnerSuggestThreads(xsize, ysize));
     } else if (status == JXL_DEC_COLOR_ENCODING) {
       // check for ICC profile
       size_t icc_size = 0;
       JxlColorEncoding color_encoding;
       if (JXL_DEC_SUCCESS !=
-          JxlDecoderGetColorAsEncodedProfile(
-              dec.get(), &format, _PROFILE_ORIGIN_, &color_encoding)) {
+          JxlDecoderGetColorAsEncodedProfile(dec.get(), _PROFILE_ORIGIN_,
+                                             &color_encoding)) {
         // Attempt to load ICC profile when no internal color encoding
-        if (JXL_DEC_SUCCESS != JxlDecoderGetICCProfileSize(dec.get(), &format,
-                                                           _PROFILE_ORIGIN_,
-                                                           &icc_size)) {
+        if (JXL_DEC_SUCCESS != JxlDecoderGetICCProfileSize(
+                                   dec.get(), _PROFILE_ORIGIN_, &icc_size)) {
           g_printerr(LOAD_PROC
                      " Warning: JxlDecoderGetICCProfileSize failed\n");
         }
@@ -102,7 +153,7 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
         if (icc_size > 0) {
           icc_profile.resize(icc_size);
           if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
-                                     dec.get(), &format, _PROFILE_ORIGIN_,
+                                     dec.get(), _PROFILE_ORIGIN_,
                                      icc_profile.data(), icc_profile.size())) {
             g_printerr(LOAD_PROC
                        " Warning: JxlDecoderGetColorAsICCProfile failed\n");
@@ -125,8 +176,8 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
 
       // Internal color profile detection...
       if (JXL_DEC_SUCCESS ==
-          JxlDecoderGetColorAsEncodedProfile(
-              dec.get(), &format, _PROFILE_TARGET_, &color_encoding)) {
+          JxlDecoderGetColorAsEncodedProfile(dec.get(), _PROFILE_TARGET_,
+                                             &color_encoding)) {
         g_printerr(LOAD_PROC " Info: Internal color encoding detected.\n");
 
         // figure out linearity of internal profile
@@ -279,11 +330,11 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
 
       // create new image
       if (is_linear) {
-        *image_id = gimp_image_new_with_precision(
-            info.xsize, info.ysize, image_type, GIMP_PRECISION_FLOAT_LINEAR);
+        *image_id = gimp_image_new_with_precision(xsize, ysize, image_type,
+                                                  GIMP_PRECISION_FLOAT_LINEAR);
       } else {
-        *image_id = gimp_image_new_with_precision(
-            info.xsize, info.ysize, image_type, GIMP_PRECISION_FLOAT_GAMMA);
+        *image_id = gimp_image_new_with_precision(xsize, ysize, image_type,
+                                                  GIMP_PRECISION_FLOAT_GAMMA);
       }
 
       if (profile_int) {
@@ -294,22 +345,40 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
     } else if (status == JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
       // get image from decoder in FLOAT
       format.data_type = JXL_TYPE_FLOAT;
-      if (JXL_DEC_SUCCESS !=
-          JxlDecoderImageOutBufferSize(dec.get(), &format, &buffer_size)) {
-        g_printerr(LOAD_PROC " Error: JxlDecoderImageOutBufferSize failed\n");
+      if (!SetJpegXlOutBuffer(&dec, &format, &buffer_size, &pixels_buffer_1))
         return false;
-      }
-      pixels_buffer_1 = g_malloc(buffer_size);
-      if (JXL_DEC_SUCCESS != JxlDecoderSetImageOutBuffer(dec.get(), &format,
-                                                         pixels_buffer_1,
-                                                         buffer_size)) {
-        g_printerr(LOAD_PROC " Error: JxlDecoderSetImageOutBuffer failed\n");
-        return false;
-      }
-    } else if (status == JXL_DEC_FULL_IMAGE || status == JXL_DEC_FRAME) {
+    } else if (status == JXL_DEC_FULL_IMAGE) {
       // create and insert layer
-      layer = gimp_layer_new(*image_id, "Background", info.xsize, info.ysize,
-                             layer_type, /*opacity=*/100,
+      gchar *layer_name;
+      if (layer_idx == 0 && !info.have_animation) {
+        layer_name = g_strdup_printf("Background");
+      } else {
+        const GString *blend_null_flag = g_string_new("");
+        const GString *blend_replace_flag = g_string_new(" (replace)");
+        const GString *blend_combine_flag = g_string_new(" (combine)");
+        GString *blend;
+        if (blend_mode == JXL_BLEND_REPLACE) {
+          blend = (GString *)blend_replace_flag;
+        } else if (blend_mode == JXL_BLEND_BLEND) {
+          blend = (GString *)blend_combine_flag;
+        } else {
+          blend = (GString *)blend_null_flag;
+        }
+        char *temp_frame_name = nullptr;
+        bool must_free_frame_name = false;
+        if (frame_name_len == 0) {
+          temp_frame_name = g_strdup_printf("Frame %lu", layer_idx + 1);
+          must_free_frame_name = true;
+        } else {
+          temp_frame_name = frame_name;
+        }
+        double fduration = frame_duration * 1000.f * tps_denom / tps_numer;
+        layer_name = g_strdup_printf("%s (%.15gms)%s", temp_frame_name,
+                                     fduration, blend->str);
+        if (must_free_frame_name) free(temp_frame_name);
+      }
+      layer = gimp_layer_new(*image_id, layer_name, xsize, ysize, layer_type,
+                             /*opacity=*/100,
                              gimp_image_get_default_new_layer_mode(*image_id));
 
       gimp_image_insert_layer(*image_id, layer, /*parent_id=*/-1,
@@ -333,18 +402,57 @@ bool LoadJpegXlImage(const gchar *const filename, gint32 *const image_id) {
       const Babl *source_format = babl_format(babl_format_str.c_str());
 
       babl_process(babl_fish(source_format, destination_format),
-                   pixels_buffer_1, pixels_buffer_2, info.xsize * info.ysize);
+                   pixels_buffer_1, pixels_buffer_2, xsize * ysize);
 
-      gegl_buffer_set(buffer, GEGL_RECTANGLE(0, 0, info.xsize, info.ysize), 0,
-                      nullptr, pixels_buffer_2, GEGL_AUTO_ROWSTRIDE);
+      gegl_buffer_set(buffer, GEGL_RECTANGLE(0, 0, xsize, ysize), 0, nullptr,
+                      pixels_buffer_2, GEGL_AUTO_ROWSTRIDE);
+      gimp_item_transform_translate(layer, crop_x0, crop_y0);
 
       g_clear_object(&buffer);
+      g_free(pixels_buffer_1);
+      g_free(pixels_buffer_2);
+      if (stop_processing) status = JXL_DEC_SUCCESS;
+      g_free(layer_name);
+      layer_idx++;
+    } else if (status == JXL_DEC_FRAME) {
+      JxlFrameHeader frame_header;
+      if (JxlDecoderGetFrameHeader(dec.get(), &frame_header) !=
+          JXL_DEC_SUCCESS) {
+        g_printerr(LOAD_PROC " Error: JxlDecoderSetImageOutBuffer failed\n");
+        return false;
+      }
+      xsize = frame_header.layer_info.xsize;
+      ysize = frame_header.layer_info.ysize;
+      crop_x0 = frame_header.layer_info.crop_x0;
+      crop_y0 = frame_header.layer_info.crop_y0;
+      frame_duration = frame_header.duration;
+      blend_mode = frame_header.layer_info.blend_info.blendmode;
+      if (blend_mode != JXL_BLEND_BLEND && blend_mode != JXL_BLEND_REPLACE) {
+        g_printerr(
+            LOAD_PROC
+            " Warning: JxlDecoderGetFrameHeader: Unhandled blend mode: %d\n",
+            blend_mode);
+      }
+      if ((frame_name_len = frame_header.name_length) > 0) {
+        frame_name = (char *)realloc(frame_name, frame_name_len);
+        if (JXL_DEC_SUCCESS !=
+            JxlDecoderGetFrameName(dec.get(), frame_name, frame_name_len)) {
+          g_printerr(LOAD_PROC "Error: JxlDecoderGetFrameName failed");
+          return false;
+        };
+      }
     } else if (status == JXL_DEC_SUCCESS) {
       // All decoding successfully finished.
       // It's not required to call JxlDecoderReleaseInput(dec.get())
       // since the decoder will be destroyed.
       break;
-    } else if (status == JXL_DEC_NEED_MORE_INPUT) {
+    } else if (status == JXL_DEC_NEED_MORE_INPUT ||
+               status == JXL_DEC_FRAME_PROGRESSION) {
+      stop_processing = status != JXL_DEC_FRAME_PROGRESSION;
+      if (JxlDecoderFlushImage(dec.get()) == JXL_DEC_SUCCESS) {
+        status = JXL_DEC_FULL_IMAGE;
+        continue;
+      }
       g_printerr(LOAD_PROC " Error: Already provided all input\n");
       return false;
     } else if (status == JXL_DEC_ERROR) {
index c9ca6d9..ef5b92f 100644 (file)
@@ -6,8 +6,6 @@
 #ifndef PLUGINS_GIMP_FILE_JXL_LOAD_H_
 #define PLUGINS_GIMP_FILE_JXL_LOAD_H_
 
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
 #include "plugins/gimp/common.h"
 
 namespace jxl {
index 5eb1412..45aaa1f 100644 (file)
@@ -5,7 +5,11 @@
 
 #include "plugins/gimp/file-jxl-save.h"
 
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+
 #include <cmath>
+#include <utility>
 
 #include "gobject/gsignal.h"
 
@@ -116,8 +120,7 @@ bool JpegXlSaveGui::GuiOnChangeQuality(GtkAdjustment* adj_qual,
   g_clear_signal_handler(&self->handle_toggle_lossless, self->toggle_lossless);
 
   GtkAdjustment* adj_dist = self->entry_distance;
-  jxl_save_opts.quality = gtk_adjustment_get_value(adj_qual);
-  jxl_save_opts.UpdateDistance();
+  jxl_save_opts.SetQuality(gtk_adjustment_get_value(adj_qual));
   gtk_adjustment_set_value(adj_dist, jxl_save_opts.distance);
 
   self->handle_toggle_lossless = g_signal_connect(
@@ -140,8 +143,7 @@ bool JpegXlSaveGui::GuiOnChangeDistance(GtkAdjustment* adj_dist,
   g_clear_signal_handler(&self->handle_entry_quality, self->entry_quality);
   g_clear_signal_handler(&self->handle_toggle_lossless, self->toggle_lossless);
 
-  jxl_save_opts.distance = gtk_adjustment_get_value(adj_dist);
-  jxl_save_opts.UpdateQuality();
+  jxl_save_opts.SetDistance(gtk_adjustment_get_value(adj_dist));
   gtk_adjustment_set_value(adj_qual, jxl_save_opts.quality);
 
   if (!(jxl_save_opts.distance < 0.001)) {
@@ -356,8 +358,6 @@ bool JpegXlSaveGui::SaveDialog() {
   gtk_widget_show(separator);
 
   // Advanced Settings Frame
-  std::vector<GtkWidget*> advanced_opts;
-
   frame_advanced = gtk_frame_new("Advanced Settings");
   gimp_help_set_help_data(frame_advanced,
                           "Some advanced settings may produce malformed files.",
@@ -519,8 +519,8 @@ bool JpegXlSaveOpts::UpdateQuality() {
 
   if (distance < 0.1) {
     qual = 100;
-  } else if (distance > 6.56) {
-    qual = 30 - 5 * log(abs(6.25 * distance - 40)) / log(2.5);
+  } else if (distance > 6.4) {
+    qual = -5.0 / 53.0 * sqrt(6360.0 * distance - 39975.0) + 1725.0 / 53.0;
     lossless = false;
   } else {
     qual = 100 - (distance - 0.1) / 0.09;
@@ -539,15 +539,10 @@ bool JpegXlSaveOpts::UpdateQuality() {
 }
 
 bool JpegXlSaveOpts::UpdateDistance() {
-  float dist;
-  if (quality >= 30) {
-    dist = 0.1 + (100 - quality) * 0.09;
-  } else {
-    dist = 6.4 + pow(2.5, (30 - quality) / 5.0) / 6.25;
-  }
+  float dist = JxlEncoderDistanceFromQuality(quality);
 
-  if (dist > 15) {
-    distance = 15;
+  if (dist > 25) {
+    distance = 25;
   } else {
     distance = dist;
   }
@@ -602,12 +597,12 @@ bool JpegXlSaveOpts::UpdateBablFormat() {
 }
 
 bool JpegXlSaveOpts::SetBablModel(std::string model) {
-  babl_model_str = model;
+  babl_model_str = std::move(model);
   return UpdateBablFormat();
 }
 
 bool JpegXlSaveOpts::SetBablType(std::string type) {
-  babl_type_str = type;
+  babl_type_str = std::move(type);
   return UpdateBablFormat();
 }
 
@@ -727,6 +722,15 @@ bool SaveJpegXlImage(const gint32 image_id, const gint32 drawable_id,
     return false;
   }
 
+  // this sets some basic_info properties
+  jxl_save_opts.SetModel(jxl_save_opts.is_linear);
+
+  if (JXL_ENC_SUCCESS !=
+      JxlEncoderSetBasicInfo(enc.get(), &jxl_save_opts.basic_info)) {
+    g_printerr(SAVE_PROC " Error: JxlEncoderSetBasicInfo failed\n");
+    return false;
+  }
+
   // try to use ICC profile
   if (!icc.empty() && !jxl_save_opts.is_gray) {
     if (JXL_ENC_SUCCESS ==
@@ -785,15 +789,6 @@ bool SaveJpegXlImage(const gint32 image_id, const gint32 drawable_id,
     JxlEncoderSetFrameDistance(frame_settings, jxl_save_opts.distance);
   }
 
-  // this sets some basic_info properties
-  jxl_save_opts.SetModel(jxl_save_opts.is_linear);
-
-  if (JXL_ENC_SUCCESS !=
-      JxlEncoderSetBasicInfo(enc.get(), &jxl_save_opts.basic_info)) {
-    g_printerr(SAVE_PROC " Error: JxlEncoderSetBasicInfo failed\n");
-    return false;
-  }
-
   // convert precision and colorspace
   if (jxl_save_opts.is_linear &&
       jxl_save_opts.basic_info.bits_per_sample < 32) {
@@ -832,11 +827,7 @@ bool SaveJpegXlImage(const gint32 image_id, const gint32 drawable_id,
     g_clear_object(&buffer);
 
     // use babl to fix gamma mismatch issues
-    if (jxl_save_opts.icc_attached) {
-      jxl_save_opts.SetModel(jxl_save_opts.is_linear);
-    } else {
-      jxl_save_opts.SetModel(!jxl_save_opts.is_linear);
-    }
+    jxl_save_opts.SetModel(jxl_save_opts.is_linear);
     jxl_save_opts.pixel_format.data_type = JXL_TYPE_FLOAT;
     jxl_save_opts.SetBablType("float");
     const Babl* destination_format =
index 9dfa45c..c9d0e80 100644 (file)
@@ -6,8 +6,6 @@
 #ifndef PLUGINS_GIMP_FILE_JXL_SAVE_H_
 #define PLUGINS_GIMP_FILE_JXL_SAVE_H_
 
-#include "jxl/encode.h"
-#include "jxl/encode_cxx.h"
 #include "plugins/gimp/common.h"
 
 namespace jxl {
index 4b5373c..4d398c7 100644 (file)
@@ -1,6 +1,23 @@
+## :warning: Not needed anymore
+
+As `image/jxl` is now supported by [shared-mine-info 2.2](https://gitlab.freedesktop.org/xdg/shared-mime-info/-/releases/2.2), it should not be necessary anymore to install this plugin.
+
+You can test if your system correctly understand the MIME type of JPEG XL image by obtaining a JPEG XL image, e.g. with
+```bash
+wget https://raw.githubusercontent.com/libjxl/conformance/master/testcases/bicycles/input.jxl
+```
+and with that sample JPEG XL file `input.jxl` (or any other valid JPEG XL file), run any of the following commands:
+```bash
+xdg-mime query filetype input.jxl
+file --mime-type input.jxl
+mimetype input.jxl
+```
+If the output contains `image/jxl` you are all set!
+
+
 ## JPEG XL MIME type
 
-If not already installed by the [Installing section of README.md](../../README.md#installing), then it can be done manually:
+If not already installed by the [Installing section of BUILDING.md](../../BUILDING.md#installing), then it can be done manually:
 
 ### Install
 ```bash
index 50cc72c..ea22103 100644 (file)
@@ -21,6 +21,7 @@ set(HWY_SYSTEM_GTEST ON CACHE INTERNAL "")
 set(HWY_FORCE_STATIC_LIBS ON CACHE INTERNAL "")
 set(HWY_ENABLE_CONTRIB OFF CACHE INTERNAL "")
 set(HWY_ENABLE_EXAMPLES OFF CACHE INTERNAL "")
+set(HWY_ENABLE_TESTS OFF CACHE INTERNAL "")
 if((SANITIZER STREQUAL "asan") OR (SANITIZER STREQUAL "msan"))
   set(HWY_ENABLE_INSTALL OFF CACHE INTERNAL "")
 endif()
@@ -30,7 +31,7 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/highway/CMakeLists.txt" AND
   configure_file("${CMAKE_CURRENT_SOURCE_DIR}/highway/LICENSE"
                  ${PROJECT_BINARY_DIR}/LICENSE.highway COPYONLY)
 else()
-  find_package(HWY 0.15.0)
+  find_package(HWY 1.0.7)
   if (NOT HWY_FOUND)
     message(FATAL_ERROR
         "Highway library (hwy) not found. Install libhwy-dev or download it "
@@ -68,16 +69,33 @@ else()
   add_subdirectory(brotli)
   configure_file("${CMAKE_CURRENT_SOURCE_DIR}/brotli/LICENSE"
                  ${PROJECT_BINARY_DIR}/LICENSE.brotli COPYONLY)
-  if(BROTLI_EMSCRIPTEN)
-    # Brotli only defines the -static targets when using emscripten.
-    foreach(brlib IN ITEMS brotlienc brotlidec brotlicommon)
-      add_library(${brlib} ALIAS ${brlib}-static)
-    endforeach()
-  endif()  # BROTLI_EMSCRIPTEN
+  if(APPLE)
+    if(NOT DEFINED CMAKE_MACOSX_RPATH)
+      # Use @rpath in install_name when CMAKE_MACOSX_RPATH is not set.
+      set_property(TARGET brotlienc PROPERTY MACOSX_RPATH TRUE)
+      set_property(TARGET brotlidec PROPERTY MACOSX_RPATH TRUE)
+      set_property(TARGET brotlicommon PROPERTY MACOSX_RPATH TRUE)
+    endif()
+    if((NOT DEFINED CMAKE_MACOSX_RPATH) OR CMAKE_MACOSX_RPATH)
+      # Set library search path when @rpath is used.
+      if(NOT DEFINED CMAKE_INSTALL_RPATH)
+        set_property(TARGET brotlienc PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlidec PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlicommon PROPERTY INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
+      endif()
+    else()
+      # Set conventional install_name when @rpath is not used.
+      if(NOT DEFINED CMAKE_INSTALL_NAME_DIR)
+        set_property(TARGET brotlienc PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlidec PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+        set_property(TARGET brotlicommon PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+      endif()
+    endif()
+  endif()  # APPLE
 endif()
 
 # *cms
-if (JPEGXL_ENABLE_SKCMS OR JPEGXL_ENABLE_PLUGINS)
+if (JPEGXL_ENABLE_SKCMS)
   if( NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/skcms/skcms.h" )
     message(FATAL_ERROR "Please run ${PROJECT_SOURCE_DIR}/deps.sh to fetch the "
             "build dependencies.")
@@ -86,9 +104,9 @@ if (JPEGXL_ENABLE_SKCMS OR JPEGXL_ENABLE_PLUGINS)
   configure_file("${CMAKE_CURRENT_SOURCE_DIR}/skcms/LICENSE"
                  ${PROJECT_BINARY_DIR}/LICENSE.skcms COPYONLY)
 endif ()
-if (JPEGXL_ENABLE_VIEWERS OR NOT JPEGXL_ENABLE_SKCMS)
+if (JPEGXL_ENABLE_VIEWERS OR NOT JPEGXL_ENABLE_SKCMS OR JPEGXL_ENABLE_PLUGINS)
   if( NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/lcms/.git" OR JPEGXL_FORCE_SYSTEM_LCMS2 )
-    find_package(LCMS2 2.13)
+    find_package(LCMS2 2.12)
     if ( NOT LCMS2_FOUND )
       message(FATAL_ERROR "Please install lcms2 or run git submodule update --init")
     endif ()
@@ -100,15 +118,15 @@ if (JPEGXL_ENABLE_VIEWERS OR NOT JPEGXL_ENABLE_SKCMS)
 endif()
 
 # libpng
-if (JPEGXL_EMSCRIPTEN)
+if (JPEGXL_BUNDLE_LIBPNG AND EMSCRIPTEN)
   if (NOT EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/libpng/CMakeLists.txt")
   message(FATAL_ERROR "Please run ${PROJECT_SOURCE_DIR}/deps.sh to fetch the "
           "build dependencies.")
   endif()
   file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/libpng/scripts/pnglibconf.h.prebuilt" DESTINATION "${CMAKE_CURRENT_SOURCE_DIR}/libpng")
   file(RENAME "${CMAKE_CURRENT_SOURCE_DIR}/libpng/pnglibconf.h.prebuilt" "${CMAKE_CURRENT_SOURCE_DIR}/libpng/pnglibconf.h")
-  set(ZLIB_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/zlib/")
-  set(ZLIB_LIBRARY "")
+  set(ZLIB_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/zlib/")
+  set(ZLIB_LIBRARIES "")
   set(PNG_FOUND YES PARENT_SCOPE)
   set(PNG_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libpng/" PARENT_SCOPE)
   set(PNG_LIBRARIES "" PARENT_SCOPE)
@@ -123,8 +141,8 @@ elseif (JPEGXL_BUNDLE_LIBPNG)
   set(PNG_BUILD_ZLIB ON CACHE BOOL "")
   set(PNG_TESTS OFF CACHE BOOL "")
   set(SKIP_INSTALL_ALL ON CACHE BOOL "")
-  set(ZLIB_INCLUDE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/zlib/")
-  set(ZLIB_LIBRARY zlibstatic)
+  set(ZLIB_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/zlib/")
+  set(ZLIB_LIBRARIES zlibstatic)
   add_subdirectory(libpng EXCLUDE_FROM_ALL)
   set(PNG_FOUND YES PARENT_SCOPE)
   set(PNG_INCLUDE_DIRS "${CMAKE_CURRENT_SOURCE_DIR}/libpng/" PARENT_SCOPE)
diff --git a/third_party/highway/.github/workflows/build_test.yml b/third_party/highway/.github/workflows/build_test.yml
deleted file mode 100644 (file)
index bab1630..0000000
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright 2021 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: Build / test
-on: [push, pull_request]
-jobs:
-  cmake:
-    name: Build and test ${{ matrix.name }}
-    runs-on: ubuntu-18.04
-    strategy:
-      matrix:
-        include:
-          - name: Clang-5.0
-            extra_deps: clang-5.0
-            c_compiler: clang-5.0
-            cxx_compiler: clang++-5.0
-
-          - name: Clang-6.0
-            extra_deps: clang-6.0
-            c_compiler: clang-6.0
-            cxx_compiler: clang++-6.0
-
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Install deps
-        run: sudo apt-get install ${{ matrix.extra_deps }}
-
-      - name: Build and test
-        run: |
-          export CMAKE_BUILD_PARALLEL_LEVEL=2
-          export CTEST_PARALLEL_LEVEL=2
-          CXXFLAGS=-Werror CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -B out .
-          cmake --build out
-          ctest --test-dir out
-
-  bazel:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: bazelbuild/setup-bazelisk@v1
-      - uses: actions/cache@v2
-        with:
-          path: ~/.cache/bazel
-          key: bazel-${{ runner.os }}
-      - run: bazel build //...
diff --git a/third_party/highway/BUILD b/third_party/highway/BUILD
deleted file mode 100644 (file)
index 136550d..0000000
+++ /dev/null
@@ -1,411 +0,0 @@
-load("@bazel_skylib//lib:selects.bzl", "selects")
-
-load("@rules_cc//cc:defs.bzl", "cc_test")
-package(
-    default_visibility = ["//visibility:public"],
-)
-
-licenses(["notice"])
-
-exports_files(["LICENSE"])
-
-# Detect compiler:
-config_setting(
-    name = "compiler_clang",
-    flag_values = {"@bazel_tools//tools/cpp:compiler": "clang"},
-)
-
-config_setting(
-    name = "compiler_clangcl",
-    flag_values = {"@bazel_tools//tools/cpp:compiler": "lexan"},
-)
-
-config_setting(
-    name = "compiler_msvc_actual",
-    flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"},
-)
-
-# The above is insufficient for Bazel on Windows, which does not seem to
-# detect/set a compiler flag. This workaround prevents compile errors due to
-# passing clang-only warning flags to MSVC.
-config_setting(
-    name = "compiler_msvc_cpu",
-    values = {
-        "cpu": "x64_windows",
-    },
-)
-
-selects.config_setting_group(
-    name = "compiler_msvc",
-    match_any = [
-        ":compiler_msvc_actual",
-        ":compiler_msvc_cpu",
-    ],
-)
-
-config_setting(
-    name = "compiler_emscripten",
-    values = {"cpu": "wasm32"},
-)
-
-# See https://github.com/bazelbuild/bazel/issues/12707
-config_setting(
-    name = "compiler_gcc_bug",
-    flag_values = {
-        "@bazel_tools//tools/cpp:compiler": "compiler",
-    },
-)
-
-config_setting(
-    name = "compiler_gcc_actual",
-    flag_values = {
-        "@bazel_tools//tools/cpp:compiler": "gcc",
-    },
-)
-
-selects.config_setting_group(
-    name = "compiler_gcc",
-    match_any = [
-        ":compiler_gcc_bug",
-        ":compiler_gcc_actual",
-    ],
-)
-
-# Additional warnings for Clang OR GCC (skip for MSVC)
-CLANG_GCC_COPTS = [
-    "-Wunused-parameter",
-    "-Wunused-variable",
-    "-Wextra-semi",
-    "-Wunreachable-code",
-]
-
-# Warnings supported by Clang and Clang-cl
-CLANG_OR_CLANGCL_OPTS = CLANG_GCC_COPTS + [
-    "-Wfloat-overflow-conversion",
-    "-Wfloat-zero-conversion",
-    "-Wfor-loop-analysis",
-    "-Wgnu-redeclared-enum",
-    "-Winfinite-recursion",
-    "-Wliteral-conversion",
-    "-Wno-c++98-compat",
-    "-Wno-unused-command-line-argument",
-    "-Wprivate-header",
-    "-Wself-assign",
-    "-Wstring-conversion",
-    "-Wtautological-overlap-compare",
-    "-Wthread-safety-analysis",
-    "-Wundefined-func-template",
-    "-Wunused-comparison",
-]
-
-# Warnings only supported by Clang, but not Clang-cl
-CLANG_ONLY_COPTS = CLANG_OR_CLANGCL_OPTS + [
-    # Do not treat the third_party headers as system headers when building
-    # highway - the errors are pertinent.
-    "--no-system-header-prefix=third_party/highway",
-]
-
-COPTS = select({
-    ":compiler_msvc": [],
-    ":compiler_gcc": CLANG_GCC_COPTS,
-    ":compiler_clangcl": CLANG_OR_CLANGCL_OPTS,
-    # Default to clang because compiler detection only works in Bazel
-    "//conditions:default": CLANG_ONLY_COPTS,
-}) + select({
-    "@platforms//cpu:riscv64": [
-        "-march=rv64gcv1p0",
-        "-menable-experimental-extensions",
-    ],
-    "//conditions:default": [
-    ],
-})
-
-DEFINES = select({
-    ":compiler_msvc": ["HWY_SHARED_DEFINE"],
-    ":compiler_clangcl": ["HWY_SHARED_DEFINE"],
-    "//conditions:default": [],
-})
-
-# Unused on Bazel builds, where this is not defined/known; Copybara replaces
-# usages with an empty list.
-COMPAT = [
-    "//buildenv/target:non_prod",  # includes mobile/vendor.
-]
-
-# WARNING: changing flags such as HWY_DISABLED_TARGETS may break users without
-# failing integration tests, if the machine running tests does not support the
-# newly enabled instruction set, or the failure is only caught by sanitizers
-# which do not run in CI.
-
-cc_library(
-    name = "hwy",
-    srcs = [
-        "hwy/aligned_allocator.cc",
-        "hwy/per_target.cc",
-        "hwy/print.cc",
-        "hwy/targets.cc",
-    ],
-    # Normal headers with include guards
-    hdrs = [
-        "hwy/aligned_allocator.h",
-        "hwy/base.h",
-        "hwy/cache_control.h",
-        "hwy/detect_compiler_arch.h",  # private
-        "hwy/print.h",
-    ],
-    compatible_with = [],
-    copts = COPTS,
-    defines = DEFINES,
-    local_defines = ["hwy_EXPORTS"],
-    textual_hdrs = [
-        # These are textual because config macros influence them:
-        "hwy/detect_targets.h",  # private
-        "hwy/targets.h",
-        # End of list
-        "hwy/highway.h",  # public
-        "hwy/foreach_target.h",  # public
-        "hwy/per_target.h",  # public
-        "hwy/print-inl.h",  # public
-        "hwy/highway_export.h",  # public
-        "hwy/ops/arm_neon-inl.h",
-        "hwy/ops/arm_sve-inl.h",
-        "hwy/ops/emu128-inl.h",
-        "hwy/ops/generic_ops-inl.h",
-        "hwy/ops/scalar-inl.h",
-        "hwy/ops/set_macros-inl.h",
-        "hwy/ops/shared-inl.h",
-        "hwy/ops/x86_128-inl.h",
-        "hwy/ops/x86_256-inl.h",
-        "hwy/ops/x86_512-inl.h",
-        # Select avoids recompiling native arch if only non-native changed
-    ] + select({
-        ":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"],
-        "//conditions:default": [],
-    }) + select({
-        "@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"],
-        "//conditions:default": [],
-    }),
-)
-
-cc_library(
-    name = "algo",
-    compatible_with = [],
-    copts = COPTS,
-    textual_hdrs = [
-        "hwy/contrib/algo/copy-inl.h",
-        "hwy/contrib/algo/find-inl.h",
-        "hwy/contrib/algo/transform-inl.h",
-    ],
-    deps = [
-        ":hwy",
-    ],
-)
-
-cc_library(
-    name = "dot",
-    compatible_with = [],
-    copts = COPTS,
-    textual_hdrs = [
-        "hwy/contrib/dot/dot-inl.h",
-    ],
-    deps = [
-        ":hwy",
-    ],
-)
-
-cc_library(
-    name = "image",
-    srcs = [
-        "hwy/contrib/image/image.cc",
-    ],
-    hdrs = [
-        "hwy/contrib/image/image.h",
-    ],
-    compatible_with = [],
-    copts = COPTS,
-    local_defines = ["hwy_contrib_EXPORTS"],
-    deps = [
-        ":hwy",
-    ],
-)
-
-cc_library(
-    name = "math",
-    compatible_with = [],
-    copts = COPTS,
-    textual_hdrs = [
-        "hwy/contrib/math/math-inl.h",
-    ],
-    deps = [
-        ":hwy",
-    ],
-)
-
-# Everything required for tests that use Highway.
-cc_library(
-    name = "hwy_test_util",
-    srcs = ["hwy/tests/test_util.cc"],
-    hdrs = ["hwy/tests/test_util.h"],
-    compatible_with = [],
-    copts = COPTS,
-    local_defines = ["hwy_test_EXPORTS"],
-    textual_hdrs = [
-        "hwy/tests/test_util-inl.h",
-        "hwy/tests/hwy_gtest.h",
-    ],
-    # Must not depend on a gtest variant, which can conflict with the
-    # GUNIT_INTERNAL_BUILD_MODE defined by the test.
-    deps = [
-        ":hwy",
-    ],
-)
-
-cc_library(
-    name = "nanobenchmark",
-    srcs = ["hwy/nanobenchmark.cc"],
-    hdrs = ["hwy/nanobenchmark.h"],
-    compatible_with = [],
-    copts = COPTS,
-    local_defines = ["hwy_EXPORTS"],
-    deps = [":hwy"],
-)
-
-cc_binary(
-    name = "benchmark",
-    srcs = ["hwy/examples/benchmark.cc"],
-    copts = COPTS,
-    deps = [
-        ":hwy",
-        ":nanobenchmark",
-    ],
-)
-
-cc_library(
-    name = "skeleton",
-    srcs = ["hwy/examples/skeleton.cc"],
-    hdrs = ["hwy/examples/skeleton.h"],
-    copts = COPTS,
-    local_defines = ["hwy_EXPORTS"],
-    textual_hdrs = ["hwy/examples/skeleton-inl.h"],
-    deps = [
-        ":hwy",
-    ],
-)
-
-cc_binary(
-    name = "list_targets",
-    srcs = ["hwy/tests/list_targets.cc"],
-    deps = [":hwy"],
-)
-
-# path, name
-HWY_TESTS = [
-    ("hwy/contrib/algo/", "copy_test"),
-    ("hwy/contrib/algo/", "find_test"),
-    ("hwy/contrib/algo/", "transform_test"),
-    ("hwy/contrib/dot/", "dot_test"),
-    ("hwy/contrib/image/", "image_test"),
-    ("hwy/contrib/math/", "math_test"),
-    # contrib/sort has its own BUILD, we add it to GUITAR_TESTS.
-    ("hwy/examples/", "skeleton_test"),
-    ("hwy/", "nanobenchmark_test"),
-    ("hwy/", "aligned_allocator_test"),
-    ("hwy/", "base_test"),
-    ("hwy/", "highway_test"),
-    ("hwy/", "targets_test"),
-    ("hwy/tests/", "arithmetic_test"),
-    ("hwy/tests/", "blockwise_test"),
-    ("hwy/tests/", "blockwise_shift_test"),
-    ("hwy/tests/", "combine_test"),
-    ("hwy/tests/", "compare_test"),
-    ("hwy/tests/", "compress_test"),
-    ("hwy/tests/", "convert_test"),
-    ("hwy/tests/", "crypto_test"),
-    ("hwy/tests/", "demote_test"),
-    ("hwy/tests/", "float_test"),
-    ("hwy/tests/", "if_test"),
-    ("hwy/tests/", "interleaved_test"),
-    ("hwy/tests/", "logical_test"),
-    ("hwy/tests/", "mask_test"),
-    ("hwy/tests/", "mask_mem_test"),
-    ("hwy/tests/", "memory_test"),
-    ("hwy/tests/", "mul_test"),
-    ("hwy/tests/", "reduction_test"),
-    ("hwy/tests/", "reverse_test"),
-    ("hwy/tests/", "shift_test"),
-    ("hwy/tests/", "swizzle_test"),
-    ("hwy/tests/", "test_util_test"),
-]
-
-HWY_TEST_COPTS = select({
-    ":compiler_msvc": [],
-    "//conditions:default": [
-        # gTest triggers this warning (which is enabled by the
-        # extra-semi in COPTS), so we need to disable it here,
-        # but it's still enabled for :hwy.
-        "-Wno-c++98-compat-extra-semi",
-    ],
-})
-
-HWY_TEST_DEPS = [
-    ":algo",
-    ":dot",
-    ":hwy",
-    ":hwy_test_util",
-    ":image",
-    ":math",
-    ":nanobenchmark",
-    ":skeleton",
-    "//hwy/contrib/sort:vqsort",
-    "@com_google_googletest//:gtest_main",
-]
-
-[
-    [
-        cc_test(
-            name = test,
-            size = "medium",
-            timeout = "long",  # default moderate is not enough for math_test
-            srcs = [
-                subdir + test + ".cc",
-            ],
-            copts = COPTS + HWY_TEST_COPTS,
-            features = select({
-                "@platforms//cpu:riscv64": ["fully_static_link"],
-                "//conditions:default": [],
-            }),
-            linkopts = select({
-                ":compiler_emscripten": [
-                    "-s ASSERTIONS=2",
-                    "-s ENVIRONMENT=node,shell,web",
-                    "-s ERROR_ON_UNDEFINED_SYMBOLS=1",
-                    "-s DEMANGLE_SUPPORT=1",
-                    "-s EXIT_RUNTIME=1",
-                    "-s ALLOW_MEMORY_GROWTH=1",
-                    "--pre-js $(location :preamble.js.lds)",
-                ],
-                "//conditions:default": [],
-            }),
-            linkstatic = select({
-                "@platforms//cpu:riscv64": True,
-                "//conditions:default": False,
-            }),
-            local_defines = ["HWY_IS_TEST"],
-            # for test_suite.
-            tags = ["hwy_ops_test"],
-            deps = HWY_TEST_DEPS + select({
-                ":compiler_emscripten": [":preamble.js.lds"],
-                "//conditions:default": [],
-            }),
-        ),
-    ]
-    for subdir, test in HWY_TESTS
-]
-
-# For manually building the tests we define here (:all does not work in --config=msvc)
-test_suite(
-    name = "hwy_ops_tests",
-    tags = ["hwy_ops_test"],
-)
-
-# Placeholder for integration test, do not remove
diff --git a/third_party/highway/CMakeLists.txt b/third_party/highway/CMakeLists.txt
deleted file mode 100644 (file)
index 81361b7..0000000
+++ /dev/null
@@ -1,555 +0,0 @@
-# Copyright 2019 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cmake_minimum_required(VERSION 3.10)
-
-# Set PIE flags for POSITION_INDEPENDENT_CODE targets, added in 3.14.
-if(POLICY CMP0083)
-  cmake_policy(SET CMP0083 NEW)
-endif()
-
-project(hwy VERSION 1.0.1)  # Keep in sync with highway.h version
-
-# Directly define the ABI version from the cmake project() version values:
-set(LIBRARY_VERSION "${hwy_VERSION}")
-set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR})
-
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-# Enabled PIE binaries by default if supported.
-include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED)
-if(CHECK_PIE_SUPPORTED)
-  check_pie_supported(LANGUAGES CXX)
-  if(CMAKE_CXX_LINK_PIE_SUPPORTED)
-    set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
-  endif()
-endif()
-
-include(GNUInstallDirs)
-
-if (NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE RelWithDebInfo)
-endif()
-
-set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?")
-
-# Unconditionally adding -Werror risks breaking the build when new warnings
-# arise due to compiler/platform changes. Enable this in CI/tests.
-set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?")
-
-set(HWY_ENABLE_CONTRIB ON CACHE BOOL "Include contrib/")
-set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples")
-set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library")
-
-include(CheckCXXSourceCompiles)
-check_cxx_source_compiles(
-   "int main() {
-      #if !defined(__EMSCRIPTEN__)
-      static_assert(false, \"__EMSCRIPTEN__ is not defined\");
-      #endif
-      return 0;
-    }"
-  HWY_EMSCRIPTEN
-)
-
-check_cxx_source_compiles(
-   "int main() {
-      #if !defined(__riscv)
-      static_assert(false, \"__riscv is not defined\");
-      #endif
-      return 0;
-    }"
-  HWY_RISCV
-)
-
-if (HWY_ENABLE_CONTRIB)
-# Glob all the traits so we don't need to modify this file when adding
-# additional special cases.
-file(GLOB HWY_CONTRIB_SOURCES "hwy/contrib/sort/vqsort_*.cc")
-list(APPEND HWY_CONTRIB_SOURCES
-    hwy/contrib/dot/dot-inl.h
-    hwy/contrib/image/image.cc
-    hwy/contrib/image/image.h
-    hwy/contrib/math/math-inl.h
-    hwy/contrib/sort/shared-inl.h
-    hwy/contrib/sort/sorting_networks-inl.h
-    hwy/contrib/sort/traits-inl.h
-    hwy/contrib/sort/traits128-inl.h
-    hwy/contrib/sort/vqsort-inl.h
-    hwy/contrib/sort/vqsort.cc
-    hwy/contrib/sort/vqsort.h
-    hwy/contrib/algo/copy-inl.h
-    hwy/contrib/algo/find-inl.h
-    hwy/contrib/algo/transform-inl.h
-)
-endif()  # HWY_ENABLE_CONTRIB
-
-set(HWY_SOURCES
-    hwy/aligned_allocator.cc
-    hwy/aligned_allocator.h
-    hwy/base.h
-    hwy/cache_control.h
-    hwy/detect_compiler_arch.h  # private
-    hwy/detect_targets.h  # private
-    hwy/foreach_target.h
-    hwy/highway.h
-    hwy/highway_export.h
-    hwy/nanobenchmark.cc
-    hwy/nanobenchmark.h
-    hwy/ops/arm_neon-inl.h
-    hwy/ops/arm_sve-inl.h
-    hwy/ops/emu128-inl.h
-    hwy/ops/generic_ops-inl.h
-    hwy/ops/scalar-inl.h
-    hwy/ops/set_macros-inl.h
-    hwy/ops/shared-inl.h
-    hwy/ops/wasm_128-inl.h
-    hwy/ops/x86_128-inl.h
-    hwy/ops/x86_256-inl.h
-    hwy/ops/x86_512-inl.h
-    hwy/per_target.cc
-    hwy/per_target.h
-    hwy/print-inl.h
-    hwy/print.cc
-    hwy/print.h
-    hwy/targets.cc
-    hwy/targets.h
-)
-
-set(HWY_TEST_SOURCES
-    hwy/tests/hwy_gtest.h
-    hwy/tests/test_util-inl.h
-    hwy/tests/test_util.cc
-    hwy/tests/test_util.h
-)
-
-if (MSVC)
-  set(HWY_FLAGS
-    # fix build error C1128 in blockwise*_test & arithmetic_test
-    /bigobj
-  )
-else()
-  set(HWY_FLAGS
-    # Avoid changing binaries based on the current time and date.
-    -Wno-builtin-macro-redefined
-    -D__DATE__="redacted"
-    -D__TIMESTAMP__="redacted"
-    -D__TIME__="redacted"
-
-    # Optimizations
-    -fmerge-all-constants
-
-    # Warnings
-    -Wall
-    -Wextra
-    # These are not included in Wall nor Wextra:
-    -Wconversion
-    -Wsign-conversion
-    -Wvla
-    -Wnon-virtual-dtor
-  )
-
-  if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-    list(APPEND HWY_FLAGS
-      -Wfloat-overflow-conversion
-      -Wfloat-zero-conversion
-      -Wfor-loop-analysis
-      -Wgnu-redeclared-enum
-      -Winfinite-recursion
-      -Wself-assign
-      -Wstring-conversion
-      -Wtautological-overlap-compare
-      -Wthread-safety-analysis
-      -Wundefined-func-template
-
-      -fno-cxx-exceptions
-      -fno-slp-vectorize
-      -fno-vectorize
-
-      # Use color in messages
-      -fdiagnostics-show-option -fcolor-diagnostics
-    )
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 6.0)
-      list(APPEND HWY_FLAGS -Wc++2a-extensions)
-    endif()
-  endif()
-
-  if (WIN32)
-    if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-      list(APPEND HWY_FLAGS
-        -Wno-global-constructors
-        -Wno-language-extension-token
-        -Wno-used-but-marked-unused
-        -Wno-shadow-field-in-constructor
-        -Wno-unused-member-function
-        -Wno-unused-template
-        -Wno-c++98-compat-pedantic
-        -Wno-used-but-marked-unused
-        -Wno-zero-as-null-pointer-constant
-      )
-    endif()
-
-    list(APPEND HWY_FLAGS
-      -Wno-cast-align
-      -Wno-double-promotion
-      -Wno-float-equal
-      -Wno-format-nonliteral
-      -Wno-shadow
-      -Wno-sign-conversion
-    )
-  else()
-    list(APPEND HWY_FLAGS
-      -fmath-errno
-      -fno-exceptions
-    )
-  endif()  # WIN32
-
-  if (HWY_CMAKE_ARM7)
-    list(APPEND HWY_FLAGS
-      -march=armv7-a
-      -mfpu=neon-vfpv4
-      -mfloat-abi=hard  # must match the toolchain specified as CXX=
-      -mfp16-format=ieee  # required for vcvt_f32_f16
-    )
-  endif()  # HWY_CMAKE_ARM7
-
-  if(HWY_RISCV)
-    list(APPEND HWY_FLAGS -march=rv64gcv1p0)
-    if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-      list(APPEND HWY_FLAGS -menable-experimental-extensions)
-    endif()
-  endif()
-
-  if (HWY_WARNINGS_ARE_ERRORS)
-    list(APPEND HWY_FLAGS -Werror)
-  endif()
-
-  # Prevent "wasm-ld: error: --shared-memory is disallowed by targets.cc.o
-  # because it was not compiled with 'atomics' or 'bulk-memory' features."
-  if (HWY_EMSCRIPTEN)
-    list(APPEND HWY_FLAGS -matomics)
-  endif()
-
-endif()  # !MSVC
-
-# By default prefer STATIC build (legacy behavior)
-option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
-option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF)
-# only expose shared/static options to advanced users:
-mark_as_advanced(BUILD_SHARED_LIBS)
-mark_as_advanced(HWY_FORCE_STATIC_LIBS)
-# Define visibility settings globally:
-set(CMAKE_CXX_VISIBILITY_PRESET hidden)
-set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
-
-# Copy-cat "add_library" logic + add override.
-set(HWY_LIBRARY_TYPE "SHARED")
-if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS)
-  set(HWY_LIBRARY_TYPE "STATIC")
-endif()
-
-# This preprocessor define will drive the build, also used in the *.pc files:
-if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED")
-  set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE")
-else()
-  set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE")
-endif()
-
-add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES})
-target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}")
-target_compile_options(hwy PRIVATE ${HWY_FLAGS})
-set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
-target_include_directories(hwy PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-target_compile_features(hwy PUBLIC cxx_std_11)
-set_target_properties(hwy PROPERTIES
-  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
-if(UNIX AND NOT APPLE)
-  if(NOT HWY_EMSCRIPTEN)
-    # For GCC __atomic_store_8, see #887
-    target_link_libraries(hwy atomic)
-  endif()
-  # not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
-  set_property(TARGET hwy APPEND_STRING PROPERTY
-    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
-endif()
-
-if (HWY_ENABLE_CONTRIB)
-add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES})
-target_link_libraries(hwy_contrib hwy)
-target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS})
-set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
-target_include_directories(hwy_contrib PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-target_compile_features(hwy_contrib PUBLIC cxx_std_11)
-set_target_properties(hwy_contrib PROPERTIES
-  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
-# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
-if(UNIX AND NOT APPLE)
-  set_property(TARGET hwy_contrib APPEND_STRING PROPERTY
-    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
-endif()
-endif()  # HWY_ENABLE_CONTRIB
-
-add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES})
-target_link_libraries(hwy_test hwy)
-target_compile_options(hwy_test PRIVATE ${HWY_FLAGS})
-set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION})
-target_include_directories(hwy_test PUBLIC
-    $<BUILD_INTERFACE:${CMAKE_CURRENT_LIST_DIR}>
-    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-target_compile_features(hwy_test PUBLIC cxx_std_11)
-set_target_properties(hwy_test PROPERTIES
-  LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version)
-# not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations)
-if(UNIX AND NOT APPLE)
-  set_property(TARGET hwy_test APPEND_STRING PROPERTY
-    LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version")
-endif()
-
-# -------------------------------------------------------- hwy_list_targets
-# Generate a tool to print the compiled-in targets as defined by the current
-# flags. This tool will print to stderr at build time, after building hwy.
-add_executable(hwy_list_targets hwy/tests/list_targets.cc)
-target_compile_options(hwy_list_targets PRIVATE ${HWY_FLAGS})
-target_link_libraries(hwy_list_targets hwy)
-target_include_directories(hwy_list_targets PRIVATE
-  $<TARGET_PROPERTY:hwy,INCLUDE_DIRECTORIES>)
-# TARGET_FILE always returns the path to executable
-# Naked target also not always could be run (due to the lack of '.\' prefix)
-# Thus effective command to run should contain the full path
-# and emulator prefix (if any).
-if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR)
-add_custom_command(TARGET hwy_list_targets POST_BUILD
-    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0))
-endif()
-
-# --------------------------------------------------------
-# Allow skipping the following sections for projects that do not need them:
-# tests, examples, benchmarks and installation.
-
-# -------------------------------------------------------- install library
-if (HWY_ENABLE_INSTALL)
-
-install(TARGETS hwy
-  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
-# Install all the headers keeping the relative path to the current directory
-# when installing them.
-foreach (source ${HWY_SOURCES})
-  if ("${source}" MATCHES "\.h$")
-    get_filename_component(dirname "${source}" DIRECTORY)
-    install(FILES "${source}"
-        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
-  endif()
-endforeach()
-
-if (HWY_ENABLE_CONTRIB)
-install(TARGETS hwy_contrib
-  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
-# Install all the headers keeping the relative path to the current directory
-# when installing them.
-foreach (source ${HWY_CONTRIB_SOURCES})
-  if ("${source}" MATCHES "\.h$")
-    get_filename_component(dirname "${source}" DIRECTORY)
-    install(FILES "${source}"
-        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
-  endif()
-endforeach()
-endif()  # HWY_ENABLE_CONTRIB
-
-install(TARGETS hwy_test
-  LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-  ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-  RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
-# Install all the headers keeping the relative path to the current directory
-# when installing them.
-foreach (source ${HWY_TEST_SOURCES})
-  if ("${source}" MATCHES "\.h$")
-    get_filename_component(dirname "${source}" DIRECTORY)
-    install(FILES "${source}"
-        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${dirname}")
-  endif()
-endforeach()
-
-# Add a pkg-config file for libhwy and the contrib/test libraries.
-set(HWY_LIBRARY_VERSION "${CMAKE_PROJECT_VERSION}")
-set(HWY_PC_FILES libhwy.pc libhwy-test.pc)
-if (HWY_ENABLE_CONTRIB)
-list(APPEND HWY_PC_FILES libhwy-contrib.pc)
-endif()  # HWY_ENABLE_CONTRIB
-foreach (pc ${HWY_PC_FILES})
-  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${pc}.in" "${pc}" @ONLY)
-  install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${pc}"
-      DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
-endforeach()
-
-endif()  # HWY_ENABLE_INSTALL
-# -------------------------------------------------------- Examples
-if (HWY_ENABLE_EXAMPLES)
-
-# Avoids mismatch between GTest's static CRT and our dynamic.
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
-# Programming exercise with integrated benchmark
-add_executable(hwy_benchmark hwy/examples/benchmark.cc)
-target_sources(hwy_benchmark PRIVATE
-    hwy/nanobenchmark.h)
-# Try adding one of -DHWY_COMPILE_ONLY_SCALAR, -DHWY_COMPILE_ONLY_EMU128 or
-# -DHWY_COMPILE_ONLY_STATIC to observe the difference in targets printed.
-target_compile_options(hwy_benchmark PRIVATE ${HWY_FLAGS})
-target_link_libraries(hwy_benchmark hwy)
-set_target_properties(hwy_benchmark
-    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/")
-
-endif()  # HWY_ENABLE_EXAMPLES
-# -------------------------------------------------------- Tests
-
-include(CTest)
-
-if(BUILD_TESTING)
-enable_testing()
-include(GoogleTest)
-
-set(HWY_SYSTEM_GTEST OFF CACHE BOOL "Use pre-installed googletest?")
-if(HWY_SYSTEM_GTEST)
-find_package(GTest REQUIRED)
-else()
-# Download and unpack googletest at configure time
-configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
-execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
-  RESULT_VARIABLE result
-  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
-if(result)
-  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
-endif()
-execute_process(COMMAND ${CMAKE_COMMAND} --build .
-  RESULT_VARIABLE result
-  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
-if(result)
-  message(FATAL_ERROR "Build step for googletest failed: ${result}")
-endif()
-
-# Prevent overriding the parent project's compiler/linker
-# settings on Windows
-set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
-# Add googletest directly to our build. This defines
-# the gtest and gtest_main targets.
-add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
-                 ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
-                 EXCLUDE_FROM_ALL)
-
-# The gtest/gtest_main targets carry header search path
-# dependencies automatically when using CMake 2.8.11 or
-# later. Otherwise we have to add them here ourselves.
-if (CMAKE_VERSION VERSION_LESS 2.8.11)
-  include_directories("${gtest_SOURCE_DIR}/include")
-endif()
-endif()  # HWY_SYSTEM_GTEST
-
-set(HWY_TEST_FILES
-  hwy/contrib/algo/copy_test.cc
-  hwy/contrib/algo/find_test.cc
-  hwy/contrib/algo/transform_test.cc
-  hwy/aligned_allocator_test.cc
-  hwy/base_test.cc
-  hwy/highway_test.cc
-  hwy/nanobenchmark_test.cc
-  hwy/targets_test.cc
-  hwy/examples/skeleton_test.cc
-  hwy/tests/arithmetic_test.cc
-  hwy/tests/blockwise_test.cc
-  hwy/tests/blockwise_shift_test.cc
-  hwy/tests/combine_test.cc
-  hwy/tests/compare_test.cc
-  hwy/tests/compress_test.cc
-  hwy/tests/convert_test.cc
-  hwy/tests/crypto_test.cc
-  hwy/tests/demote_test.cc
-  hwy/tests/float_test.cc
-  hwy/tests/if_test.cc
-  hwy/tests/interleaved_test.cc
-  hwy/tests/logical_test.cc
-  hwy/tests/mask_test.cc
-  hwy/tests/mask_mem_test.cc
-  hwy/tests/memory_test.cc
-  hwy/tests/mul_test.cc
-  hwy/tests/reduction_test.cc
-  hwy/tests/reverse_test.cc
-  hwy/tests/shift_test.cc
-  hwy/tests/swizzle_test.cc
-  hwy/tests/test_util_test.cc
-)
-
-set(HWY_TEST_LIBS hwy hwy_test)
-
-if (HWY_ENABLE_CONTRIB)
-list(APPEND HWY_TEST_LIBS hwy_contrib)
-
-list(APPEND HWY_TEST_FILES
-  hwy/contrib/dot/dot_test.cc
-  hwy/contrib/image/image_test.cc
-  # Disabled due to SIGILL in clang7 debug build during gtest discovery phase,
-  # not reproducible locally. Still tested via bazel build.
-  # hwy/contrib/math/math_test.cc
-  hwy/contrib/sort/sort_test.cc
-)
-endif()  # HWY_ENABLE_CONTRIB
-
-if(HWY_SYSTEM_GTEST)
-  set(HWY_GTEST_LIBS GTest::GTest GTest::Main)
-else()
-  set(HWY_GTEST_LIBS gtest gtest_main)
-endif()
-
-file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tests)
-foreach (TESTFILE IN LISTS HWY_TEST_FILES)
-  # The TESTNAME is the name without the extension or directory.
-  get_filename_component(TESTNAME ${TESTFILE} NAME_WE)
-  add_executable(${TESTNAME} ${TESTFILE})
-  target_compile_options(${TESTNAME} PRIVATE ${HWY_FLAGS})
-  # Test all targets, not just the best/baseline. This changes the default
-  # policy to all-attainable; note that setting -DHWY_COMPILE_* directly can
-  # cause compile errors because only one may be set, and other CMakeLists.txt
-  # that include us may set them.
-  target_compile_options(${TESTNAME} PRIVATE -DHWY_IS_TEST=1)
-
-  target_link_libraries(${TESTNAME} ${HWY_TEST_LIBS} ${HWY_GTEST_LIBS})
-  # Output test targets in the test directory.
-  set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests")
-
-  if (HWY_EMSCRIPTEN)
-    set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1")
-  endif()
-
-  if(${CMAKE_VERSION} VERSION_LESS "3.10.3")
-    gtest_discover_tests(${TESTNAME} TIMEOUT 60)
-  else ()
-    gtest_discover_tests(${TESTNAME} DISCOVERY_TIMEOUT 60)
-  endif ()
-endforeach ()
-
-# The skeleton test uses the skeleton library code.
-target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc)
-
-endif()  # BUILD_TESTING
diff --git a/third_party/highway/CMakeLists.txt.in b/third_party/highway/CMakeLists.txt.in
deleted file mode 100644 (file)
index a0260b8..0000000
+++ /dev/null
@@ -1,15 +0,0 @@
-cmake_minimum_required(VERSION 2.8.12)
-
-project(googletest-download NONE)
-
-include(ExternalProject)
-ExternalProject_Add(googletest
-  GIT_REPOSITORY    https://github.com/google/googletest.git
-  GIT_TAG           43efa0a4efd40c78b9210d15373112081899a97c
-  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
-  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
diff --git a/third_party/highway/CONTRIBUTING b/third_party/highway/CONTRIBUTING
deleted file mode 100644 (file)
index 8b7d4d2..0000000
+++ /dev/null
@@ -1,33 +0,0 @@
-# How to Contribute
-
-We'd love to accept your patches and contributions to this project. There are
-just a few small guidelines you need to follow.
-
-## Contributor License Agreement
-
-Contributions to this project must be accompanied by a Contributor License
-Agreement. You (or your employer) retain the copyright to your contribution;
-this simply gives us permission to use and redistribute your contributions as
-part of the project. Head over to <https://cla.developers.google.com/> to see
-your current agreements on file or to sign a new one.
-
-You generally only need to submit a CLA once, so if you've already submitted one
-(even if it was for a different project), you probably don't need to do it
-again.
-
-## Code reviews
-
-All submissions, including submissions by project members, require review. We
-use GitHub pull requests for this purpose. Consult
-[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
-information on using pull requests.
-
-## Testing
-
-This repository is used by JPEG XL, so major API changes will require
-coordination. Please get in touch with us beforehand, e.g. by raising an issue.
-
-## Community Guidelines
-
-This project follows
-[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/third_party/highway/LICENSE b/third_party/highway/LICENSE
deleted file mode 100644 (file)
index f49a4e1..0000000
+++ /dev/null
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
\ No newline at end of file
diff --git a/third_party/highway/README.md b/third_party/highway/README.md
deleted file mode 100644 (file)
index ae89d8a..0000000
+++ /dev/null
@@ -1,297 +0,0 @@
-# Efficient and performance-portable vector software
-
-[//]: # (placeholder, do not remove)
-
-Highway is a C++ library that provides portable SIMD/vector intrinsics.
-
-## Why
-
-We are passionate about high-performance software. We see major untapped
-potential in CPUs (servers, mobile, desktops). Highway is for engineers who want
-to reliably and economically push the boundaries of what is possible in
-software.
-
-## How
-
-CPUs provide SIMD/vector instructions that apply the same operation to multiple
-data items. This can reduce energy usage e.g. *fivefold* because fewer
-instructions are executed. We also often see *5-10x* speedups.
-
-Highway makes SIMD/vector programming practical and workable according to these
-guiding principles:
-
-**Does what you expect**: Highway is a C++ library with carefully-chosen
-functions that map well to CPU instructions without extensive compiler
-transformations. The resulting code is more predictable and robust to code
-changes/compiler updates than autovectorization.
-
-**Works on widely-used platforms**: Highway supports four architectures; the
-same application code can target eight instruction sets, including those with
-'scalable' vectors (size unknown at compile time). Highway only requires C++11
-and supports four families of compilers. If you would like to use Highway on
-other platforms, please raise an issue.
-
-**Flexible to deploy**: Applications using Highway can run on heterogeneous
-clouds or client devices, choosing the best available instruction set at
-runtime. Alternatively, developers may choose to target a single instruction set
-without any runtime overhead. In both cases, the application code is the same
-except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one
-line of code.
-
-**Suitable for a variety of domains**: Highway provides an extensive set of
-operations, used for image processing (floating-point), compression, video
-analysis, linear algebra, cryptography, sorting and random generation. We
-recognise that new use-cases may require additional ops and are happy to add
-them where it makes sense (e.g. no performance cliffs on some architectures). If
-you would like to discuss, please file an issue.
-
-**Rewards data-parallel design**: Highway provides tools such as Gather,
-MaskedLoad, and FixedTag to enable speedups for legacy data structures. However,
-the biggest gains are unlocked by designing algorithms and data structures for
-scalable vectors. Helpful techniques include batching, structure-of-array
-layouts, and aligned/padded allocations.
-
-## Examples
-
-Online demos using Compiler Explorer:
-
--   [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended)
--   [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG)
-
-Projects using Highway: (to add yours, feel free to raise an issue or contact us
-via the below email)
-
-*   [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp)
-*   [JPEG XL image codec](https://github.com/libjxl/libjxl)
-*   [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok)
-*   [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) ([paper](https://arxiv.org/abs/2205.05982))
-
-## Current status
-
-### Targets
-
-Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake,
-requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2,
-WASM SIMD, RISC-V V.
-
-SVE was initially tested using farm_sve (see acknowledgments).
-
-### Versioning
-
-Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH),
-incrementing MINOR after backward-compatible additions and PATCH after
-backward-compatible fixes. We recommend using releases (rather than the Git tip)
-because they are tested more extensively, see below.
-
-The current version 1.0 signals an increased focus on backwards compatibility.
-Applications using documented functionality will remain compatible with future
-updates that have the same major version number.
-
-### Testing
-
-Continuous integration tests build with a recent version of Clang (running on
-native x86, or QEMU for RVV and ARM) and MSVC 2019 (v19.28, running on native
-x86).
-
-Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via GCC
-cross-compile. See the [testing process](g3doc/release_testing_process.md) for
-details.
-
-### Related modules
-
-The `contrib` directory contains SIMD-related utilities: an image class with
-aligned rows, a math library (16 functions already implemented, mostly
-trigonometry), and functions for computing dot products and sorting.
-
-## Installation
-
-This project uses CMake to generate and build. In a Debian-based system you can
-install it via:
-
-```bash
-sudo apt install cmake
-```
-
-Highway's unit tests use [googletest](https://github.com/google/googletest).
-By default, Highway's CMake downloads this dependency at configuration time.
-You can disable this by setting the `HWY_SYSTEM_GTEST` CMake variable to ON and
-installing gtest separately:
-
-```bash
-sudo apt install libgtest-dev
-```
-
-To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS),
-the standard CMake workflow can be used:
-
-```bash
-mkdir -p build && cd build
-cmake ..
-make -j && make test
-```
-
-Or you can run `run_tests.sh` (`run_tests.bat` on Windows).
-
-Bazel is also supported for building, but it is not as widely used/tested.
-
-## Quick start
-
-You can use the `benchmark` inside examples/ as a starting point.
-
-A [quick-reference page](g3doc/quick_reference.md) briefly lists all operations
-and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf)
-indicates the number of instructions per operation.
-
-We recommend using full SIMD vectors whenever possible for maximum performance
-portability. To obtain them, pass a `ScalableTag<float>` (or equivalently
-`HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two
-alternatives for use-cases requiring an upper bound on the lanes:
-
--   For up to `N` lanes, specify `CappedTag<T, N>` or the equivalent
-    `HWY_CAPPED(T, N)`. The actual number of lanes will be `N` rounded down to
-    the nearest power of two, such as 4 if `N` is 5, or 8 if `N` is 8. This is
-    useful for data structures such as a narrow matrix. A loop is still required
-    because vectors may actually have fewer than `N` lanes.
-
--   For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest
-    supported `N` depends on the target, but is guaranteed to be at least
-    `16/sizeof(T)`.
-
-Due to ADL restrictions, user code calling Highway ops must either:
-*   Reside inside `namespace hwy { namespace HWY_NAMESPACE {`; or
-*   prefix each op with an alias such as `namespace hn = hwy::HWY_NAMESPACE;
-    hn::Add()`; or
-*   add using-declarations for each op used: `using hwy::HWY_NAMESPACE::Add;`.
-
-Additionally, each function that calls Highway ops must either be prefixed with
-`HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and
-`HWY_AFTER_NAMESPACE()`. Lambda functions currently require `HWY_ATTR` before
-their opening brace.
-
-The entry points into code using Highway differ slightly depending on whether
-they use static or dynamic dispatch.
-
-*   For static dispatch, `HWY_TARGET` will be the best available target among
-    `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see
-    [quick-reference](g3doc/quick_reference.md)). Functions inside
-    `HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within
-    the same module they are defined in. You can call the function from other
-    modules by wrapping it in a regular function and declaring the regular
-    function in a header.
-
-*   For dynamic dispatch, a table of function pointers is generated via the
-    `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to
-    call the best function pointer for the current CPU's supported targets. A
-    module is automatically compiled for each target in `HWY_TARGETS` (see
-    [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is
-    defined and `foreach_target.h` is included.
-
-## Compiler flags
-
-Applications should be compiled with optimizations enabled - without inlining,
-SIMD code may slow down by factors of 10 to 100. For clang and GCC, `-O2` is
-generally sufficient.
-
-For MSVC, we recommend compiling with `/Gv` to allow non-inlined functions to
-pass vector arguments in registers. If intending to use the AVX2 target together
-with half-width vectors (e.g. for `PromoteTo`), it is also important to compile
-with `/arch:AVX2`. This seems to be the only way to generate VEX-encoded SSE4
-instructions on MSVC. Otherwise, mixing VEX-encoded AVX2 instructions and
-non-VEX SSE4 may cause severe performance degradation. Unfortunately, the
-resulting binary will then require AVX2. Note that no such flag is needed for
-clang and GCC because they support target-specific attributes, which we use to
-ensure proper VEX code generation for AVX2 targets.
-
-## Strip-mining loops
-
-To vectorize a loop, "strip-mining" transforms it into an outer loop and inner
-loop with number of iterations matching the preferred vector width.
-
-In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count`
-the number of elements to process, and `N = Lanes(d)` the number of lanes in a
-full vector. Assume the loop body is given as a function `template<bool partial,
-class D> void LoopBody(D d, size_t index, size_t max_n)`.
-
-Highway offers several ways to express loops where `N` need not divide `count`:
-
-*   Ensure all inputs/outputs are padded. Then the loop is simply
-
-    ```
-    for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0);
-    ```
-    Here, the template parameter and second function argument are not needed.
-
-    This is the preferred option, unless `N` is in the thousands and vector
-    operations are pipelined with long latencies. This was the case for
-    supercomputers in the 90s, but nowadays ALUs are cheap and we see most
-    implementations split vectors into 1, 2 or 4 parts, so there is little cost
-    to processing entire vectors even if we do not need all their lanes. Indeed
-    this avoids the (potentially large) cost of predication or partial
-    loads/stores on older targets, and does not duplicate code.
-
-*   Use the `Transform*` functions in hwy/contrib/algo/transform-inl.h. This
-    takes care of the loop and remainder handling and you simply define a
-    generic lambda function (C++14) or functor which receives the current vector
-    from the input/output array, plus optionally vectors from up to two extra
-    input arrays, and returns the value to write to the input/output array.
-
-    Here is an example implementing the BLAS function SAXPY (`alpha * x + y`):
-
-    ```
-    Transform1(d, x, n, y, [](auto d, const auto v, const auto v1) HWY_ATTR {
-      return MulAdd(Set(d, alpha), v, v1);
-    });
-    ```
-
-*   Process whole vectors as above, followed by a scalar loop:
-
-    ```
-    size_t i = 0;
-    for (; i + N <= count; i += N) LoopBody<false>(d, i, 0);
-    for (; i < count; ++i) LoopBody<false>(CappedTag<T, 1>(), i, 0);
-    ```
-    The template parameter and second function arguments are again not needed.
-
-    This avoids duplicating code, and is reasonable if `count` is large.
-    If `count` is small, the second loop may be slower than the next option.
-
-*   Process whole vectors as above, followed by a single call to a modified
-    `LoopBody` with masking:
-
-    ```
-    size_t i = 0;
-    for (; i + N <= count; i += N) {
-      LoopBody<false>(d, i, 0);
-    }
-    if (i < count) {
-      LoopBody<true>(d, i, count - i);
-    }
-    ```
-    Now the template parameter and third function argument can be used inside
-    `LoopBody` to non-atomically 'blend' the first `num_remaining` lanes of `v`
-    with the previous contents of memory at subsequent locations:
-    `BlendedStore(v, FirstN(d, num_remaining), d, pointer);`. Similarly,
-    `MaskedLoad(FirstN(d, num_remaining), d, pointer)` loads the first
-    `num_remaining` elements and returns zero in other lanes.
-
-    This is a good default when it is infeasible to ensure vectors are padded,
-    but is only safe `#if !HWY_MEM_OPS_MIGHT_FAULT`!
-    In contrast to the scalar loop, only a single final iteration is needed.
-    The increased code size from two loop bodies is expected to be worthwhile
-    because it avoids the cost of masking in all but the final iteration.
-
-## Additional resources
-
-*   [Highway introduction (slides)](g3doc/highway_intro.pdf)
-*   [Overview of instructions per operation on different architectures](g3doc/instruction_matrix.pdf)
-*   [Design philosophy and comparison](g3doc/design_philosophy.md)
-*   [Implementation details](g3doc/impl_details.md)
-
-## Acknowledgments
-
-We have used [farm-sve](https://gitlab.inria.fr/bramas/farm-sve) by Berenger
-Bramas; it has proved useful for checking the SVE port on an x86 development
-machine.
-
-This is not an officially supported Google product.
-Contact: janwas@google.com
diff --git a/third_party/highway/WORKSPACE b/third_party/highway/WORKSPACE
deleted file mode 100644 (file)
index 6df1f62..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-workspace(name = "highway")
-
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-
-http_archive(
-  name = "com_google_googletest",
-  urls = ["https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip"],
-  sha256 = "5cf189eb6847b4f8fc603a3ffff3b0771c08eec7dd4bd961bfd45477dd13eb73",
-  strip_prefix = "googletest-609281088cfefc76f9d0ce82e1ff6c30cc3591e5",
-)
-
-# See https://google.github.io/googletest/quickstart-bazel.html
-http_archive(
-  name = "rules_cc",
-  urls = ["https://github.com/bazelbuild/rules_cc/archive/40548a2974f1aea06215272d9c2b47a14a24e556.zip"],
-  sha256 = "56ac9633c13d74cb71e0546f103ce1c58810e4a76aa8325da593ca4277908d72",
-  strip_prefix = "rules_cc-40548a2974f1aea06215272d9c2b47a14a24e556",
-)
-
-# Need recent version for config_setting_group
-http_archive(
-    name = "bazel_skylib",
-    urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.9.0/bazel_skylib-0.9.0.tar.gz"],
-)
diff --git a/third_party/highway/debian/changelog b/third_party/highway/debian/changelog
deleted file mode 100644 (file)
index 1db18df..0000000
+++ /dev/null
@@ -1,142 +0,0 @@
-highway (1.0.1-1) UNRELEASED; urgency=medium
-
-* Add Eq128, i64 Mul, unsigned->float ConvertTo
-* Faster sort for few unique keys, more robust pivot selection
-* Fix: floating-point generator for sort tests, Min/MaxOfLanes for i16
-* Fix: avoid always_inline in debug, link atomic
-* GCC warnings: string.h, maybe-uninitialized, ignored-attributes
-* GCC warnings: preprocessor int overflow, spurious use-after-free/overflow
-* Doc: <=HWY_AVX3, Full32/64/128, how to use generic-inl
-
- -- Jan Wassenberg <janwas@google.com>  Tue, 23 Aug 2022 10:00:00 +0200
-
-highway (1.0.0-1) UNRELEASED; urgency=medium
-
-* ABI change: 64-bit target values, more room for expansion
-* Add CompressBlocksNot, CompressNot, Lt128Upper, Min/Max128Upper, TruncateTo
-* Add HWY_SVE2_128 target
-* Sort speedups especially for 128-bit
-* Documentation clarifications
-* Faster NEON CountTrue/FindFirstTrue/AllFalse/AllTrue
-* Improved SVE codegen
-* Fix u16x8 ConcatEven/Odd, SSSE3 i64 Lt
-* MSVC 2017 workarounds
-* Support for runtime dispatch on Arm/GCC/Linux
-
- -- Jan Wassenberg <janwas@google.com>  Wed, 27 Jul 2022 10:00:00 +0200
-
-highway (0.17.0-1) UNRELEASED; urgency=medium
-
-* Add ExtractLane, InsertLane, IsInf, IsFinite, IsNaN
-* Add StoreInterleaved2, LoadInterleaved2/3/4, BlendedStore, SafeFillN
-* Add MulFixedPoint15, Or3
-* Add Copy[If], Find[If], Generate, Replace[If] algos
-* Add HWY_EMU128 target (replaces HWY_SCALAR)
-* HWY_RVV is feature-complete
-* Add HWY_ENABLE_CONTRIB build flag, HWY_NATIVE_FMA, HWY_WANT_SSSE3/SSE4 macros
-* Extend ConcatOdd/Even and StoreInterleaved* to all types
-* Allow CappedTag<T, nonPowerOfTwo>
-* Sort speedups: 2x for AVX2, 1.09x for AVX3; avoid x86 malloc
-* Expand documentation
-* Fix RDTSCP crash in nanobenchmark
-* Fix XCR0 check (was ignoring AVX3 on ICL)
-* Support Arm/RISC-V timers
-
- -- Jan Wassenberg <janwas@google.com>  Fri, 20 May 2022 10:00:00 +0200
-
-highway (0.16.0-1) UNRELEASED; urgency=medium
-
-  * Add contrib/sort (vectorized quicksort)
-  * Add IfNegativeThenElse, IfVecThenElse
-  * Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound
-  * Add OrAnd, Min128, Max128, Lt128, SumsOf8
-  * Support capped/partial vectors on RVV/SVE, int64 in WASM
-  * Support SVE2, shared library build
-  * Remove deprecated overloads without the required d arg (UpperHalf etc.)
-
- -- Jan Wassenberg <janwas@google.com>  Thu, 03 Feb 2022 11:00:00 +0100
-
-highway (0.15.0-1) UNRELEASED; urgency=medium
-
-  * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec
-  * New ops: OddEvenBlocks, SwapAdjacentBlocks, Reverse, RotateRight
-  * Add bf16, unsigned comparisons, more lane types for Reverse/TableLookupLanes
-  * Contrib: add sort(ing network) and dot(product)
-  * Targets: update RVV for LLVM, add experimental WASM2
-  * Separate library hwy_test for test utils
-  * Add non-macro Simd<> aliases
-  * Fixes: const V& for GCC, AVX3 BZHI, POPCNT with AVX on MSVC, avoid %zu
-
- -- Jan Wassenberg <janwas@google.com>  Wed, 10 Nov 2021 10:00:00 +0100
-
-highway (0.14.2-1) UNRELEASED; urgency=medium
-
-  * Add MaskedLoad
-  * Fix non-glibc PPC, Windows GCC, MSVC 19.14
-  * Opt-in for -Werror; separate design_philosophy.md
-
- -- Jan Wassenberg <janwas@google.com>  Tue, 24 Aug 2021 15:00:00 +0200
-
-highway (0.14.1-1) UNRELEASED; urgency=medium
-
-  * Add LoadMaskBits, CompressBits[Store]
-  * Fix CPU feature check (AES/F16C) and warnings
-  * Improved DASSERT - disabled in optimized builds
-
- -- Jan Wassenberg <janwas@google.com>  Tue, 17 Aug 2021 14:00:00 +0200
-
-highway (0.14.0-1) UNRELEASED; urgency=medium
-
-  * Add SVE, S-SSE3, AVX3_DL targets
-  * Support partial vectors in all ops
-  * Add PopulationCount, FindFirstTrue, Ne, TableLookupBytesOr0
-  * Add AESRound, CLMul, MulOdd, HWY_CAP_FLOAT16
-
- -- Jan Wassenberg <janwas@google.com>  Thu, 29 Jul 2021 15:00:00 +0200
-
-highway (0.12.2-1) UNRELEASED; urgency=medium
-
-  * fix scalar-only test and Windows macro conflict with Load/StoreFence
-  * replace deprecated wasm intrinsics
-
- -- Jan Wassenberg <janwas@google.com>  Mon, 31 May 2021 16:00:00 +0200
-
-highway (0.12.1-1) UNRELEASED; urgency=medium
-
-  * doc updates, ARM GCC support, fix s390/ppc, complete partial vectors
-  * fix warnings, faster ARM div/sqrt, separate hwy_contrib library
-  * add Abs(i64)/FirstN/Pause, enable AVX2 on MSVC
-
- -- Jan Wassenberg <janwas@google.com>  Wed, 19 May 2021 15:00:00 +0200
-
-highway (0.12.0-1) UNRELEASED; urgency=medium
-
-  * Add Shift*8, Compress16, emulated Scatter/Gather, StoreInterleaved3/4
-  * Remove deprecated HWY_*_LANES, deprecate HWY_GATHER_LANES
-  * Proper IEEE rounding, reduce libstdc++ usage, inlined math
-
- -- Jan Wassenberg <janwas@google.com>  Thu, 15 Apr 2021 20:00:00 +0200
-
-highway (0.11.1-1) UNRELEASED; urgency=medium
-
-  * Fix clang7 asan error, finish f16 conversions and add test
-
- -- Jan Wassenberg <janwas@google.com>  Thu, 25 Feb 2021 16:00:00 +0200
-
-highway (0.11.0-1) UNRELEASED; urgency=medium
-
-  * Add RVV+mask logical ops, allow Shl/ShiftLeftSame on all targets, more math
-
- -- Jan Wassenberg <janwas@google.com>  Thu, 18 Feb 2021 20:00:00 +0200
-
-highway (0.7.0-1) UNRELEASED; urgency=medium
-
-  * Added API stability notice, Compress[Store], contrib/, SignBit, CopySign
-
- -- Jan Wassenberg <janwas@google.com>  Tue, 5 Jan 2021 17:00:00 +0200
-
-highway (0.1-1) UNRELEASED; urgency=medium
-
-  * Initial debian package.
-
- -- Alex Deymo <deymo@google.com>  Mon, 19 Oct 2020 16:48:07 +0200
diff --git a/third_party/highway/debian/compat b/third_party/highway/debian/compat
deleted file mode 100644 (file)
index f599e28..0000000
+++ /dev/null
@@ -1 +0,0 @@
-10
diff --git a/third_party/highway/debian/control b/third_party/highway/debian/control
deleted file mode 100644 (file)
index 7c60ebc..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-Source: highway
-Maintainer: JPEG XL Maintainers <jpegxl@google.com>
-Section: misc
-Priority: optional
-Standards-Version: 3.9.8
-Build-Depends: cmake,
-               debhelper (>= 9),
-               libgtest-dev
-Homepage: https://github.com/google/highway
-
-Package: libhwy-dev
-Architecture: any
-Section: libdevel
-Depends: ${misc:Depends}
-Description: Efficient and performance-portable SIMD wrapper (developer files)
- This library provides type-safe and source-code portable wrappers over
- existing platform-specific intrinsics. Its design aims for simplicity,
- reliable efficiency across platforms, and immediate usability with current
- compilers.
- .
- This package installs the development files. There's no runtime library
- since most of Highway is implemented in headers and only a very small
- static library is needed.
diff --git a/third_party/highway/debian/copyright b/third_party/highway/debian/copyright
deleted file mode 100644 (file)
index 53ea57a..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
-Upstream-Name: highway
-
-Files: *
-Copyright: 2020 Google LLC
-License: Apache-2.0
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
- .
-      http://www.apache.org/licenses/LICENSE-2.0
- .
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- .
- On Debian systems, the complete text of the Apache License, Version 2
- can be found in "/usr/share/common-licenses/Apache-2.0".
diff --git a/third_party/highway/debian/rules b/third_party/highway/debian/rules
deleted file mode 100755 (executable)
index 969fc12..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/make -f
-%:
-       dh $@ --buildsystem=cmake
-
-override_dh_auto_configure:
-       dh_auto_configure -- -DHWY_SYSTEM_GTEST=ON
diff --git a/third_party/highway/debian/source/format b/third_party/highway/debian/source/format
deleted file mode 100644 (file)
index 163aaf8..0000000
+++ /dev/null
@@ -1 +0,0 @@
-3.0 (quilt)
diff --git a/third_party/highway/g3doc/design_philosophy.md b/third_party/highway/g3doc/design_philosophy.md
deleted file mode 100644 (file)
index 10fff8e..0000000
+++ /dev/null
@@ -1,186 +0,0 @@
-# Design philosophy
-
-*   Performance is important but not the sole consideration. Anyone who goes to
-    the trouble of using SIMD clearly cares about speed. However, portability,
-    maintainability and readability also matter, otherwise we would write in
-    assembly. We aim for performance within 10-20% of a hand-written assembly
-    implementation on the development platform. There is no performance gap vs.
-    intrinsics: Highway code can do anything they can. If necessary, you can use
-    platform-specific instructions inside `#if HWY_TARGET == HWY_NEON` etc.
-
-*   The guiding principles of C++ are "pay only for what you use" and "leave no
-    room for a lower-level language below C++". We apply these by defining a
-    SIMD API that ensures operation costs are visible, predictable and minimal.
-
-*   Performance portability is important, i.e. the API should be efficient on
-    all target platforms. Unfortunately, common idioms for one platform can be
-    inefficient on others. For example: summing lanes horizontally versus
-    shuffling. Documenting which operations are expensive does not prevent their
-    use, as evidenced by widespread use of `HADDPS`. Performance acceptance
-    tests may detect large regressions, but do not help choose the approach
-    during initial development. Analysis tools can warn about some potential
-    inefficiencies, but likely not all. We instead provide [a carefully chosen
-    set of vector types and operations that are efficient on all target
-    platforms](instruction_matrix.pdf) (PPC8, SSE4/AVX2+, ARMv8).
-
-*   Future SIMD hardware features are difficult to predict. For example, AVX2
-    came with surprising semantics (almost no interaction between 128-bit
-    blocks) and AVX-512 added two kinds of predicates (writemask and zeromask).
-    To ensure the API reflects hardware realities, we suggest a flexible
-    approach that adds new operations as they become commonly available, with
-    fallback implementations where necessary.
-
-*   Masking/predication differs between platforms, and it is not clear how
-    important the use cases are beyond the ternary operator `IfThenElse`.
-    AVX-512/ARM SVE zeromasks are useful, but not supported by P0214R5. We
-    provide `IfThen[Zero]Else[Zero]` variants.
-
-*   "Width-agnostic" SIMD is more future-proof than user-specified fixed sizes.
-    For example, valarray-like code can iterate over a 1D array with a
-    library-specified vector width. This will result in better code when vector
-    sizes increase, and matches the direction taken by
-    [ARM SVE](https://alastairreid.github.io/papers/sve-ieee-micro-2017.pdf) and
-    RiscV V as well as Agner Fog's
-    [ForwardCom instruction set proposal](https://goo.gl/CFizWu). However, some
-    applications may require fixed sizes, so we also guarantee support for <=
-    128-bit vectors in each instruction set.
-
-*   The API and its implementation should be usable and efficient with commonly
-    used compilers, including MSVC. For example, we write `ShiftLeft<3>(v)`
-    instead of `v << 3` because MSVC 2017 (ARM64) does not propagate the literal
-    (https://godbolt.org/g/rKx5Ga). Highway requires function-specific target
-    attributes, supported by GCC 4.9 / Clang 3.9 / MSVC 2015.
-
-*   Efficient and safe runtime dispatch is important. Modules such as image or
-    video codecs are typically embedded into larger applications such as
-    browsers, so they cannot require separate binaries for each CPU. Libraries
-    also cannot predict whether the application already uses AVX2 (and pays the
-    frequency throttling cost), so this decision must be left to the
-    application. Using only the lowest-common denominator instructions
-    sacrifices too much performance. Therefore, we provide code paths for
-    multiple instruction sets and choose the most suitable at runtime. To reduce
-    overhead, dispatch should be hoisted to higher layers instead of checking
-    inside every low-level function. Highway supports inlining functions in the
-    same file or in `*-inl.h` headers. We generate all code paths from the same
-    source to reduce implementation- and debugging cost.
-
-*   Not every CPU need be supported. For example, pre-SSSE3 CPUs are
-    increasingly rare and the AVX instruction set is limited to floating-point
-    operations. To reduce code size and compile time, we provide specializations
-    for S-SSE3, SSE4, AVX2 and AVX-512 instruction sets on x86, plus a scalar
-    fallback.
-
-*   Access to platform-specific intrinsics is necessary for acceptance in
-    performance-critical projects. We provide conversions to and from intrinsics
-    to allow utilizing specialized platform-specific functionality, and simplify
-    incremental porting of existing code.
-
-*   The core API should be compact and easy to learn; we provide a [concise
-    reference](quick_reference.md).
-
-## Prior API designs
-
-The author has been writing SIMD code since 2002: first via assembly language,
-then intrinsics, later Intel's `F32vec4` wrapper, followed by three generations
-of custom vector classes. The first used macros to generate the classes, which
-reduces duplication but also readability. The second used templates instead.
-The third (used in highwayhash and PIK) added support for AVX2 and runtime
-dispatch. The current design (used in JPEG XL) enables code generation for
-multiple platforms and/or instruction sets from the same source, and improves
-runtime dispatch.
-
-## Overloaded function API
-
-Most C++ vector APIs rely on class templates. However, the ARM SVE vector type
-is sizeless and cannot be wrapped in a class. We instead rely on overloaded
-functions. Overloading based on vector types is also undesirable because SVE
-vectors cannot be default-constructed. We instead use a dedicated tag type
-`Simd` for overloading, abbreviated to `D` for template arguments and `d` in
-lvalues.
-
-Note that generic function templates are possible (see generic_ops-inl.h).
-
-## Masks
-
-AVX-512 introduced a major change to the SIMD interface: special mask registers
-(one bit per lane) that serve as predicates. It would be expensive to force
-AVX-512 implementations to conform to the prior model of full vectors with lanes
-set to all one or all zero bits. We instead provide a Mask type that emulates
-a subset of this functionality on other platforms at zero cost.
-
-Masks are returned by comparisons and `TestBit`; they serve as the input to
-`IfThen*`. We provide conversions between masks and vector lanes. For clarity
-and safety, we use FF..FF as the definition of true. To also benefit from
-x86 instructions that only require the sign bit of floating-point inputs to be
-set, we provide a special `ZeroIfNegative` function.
-
-## Differences vs. [P0214R5](https://goo.gl/zKW4SA) / std::experimental::simd
-
-1.  Allowing the use of built-in vector types by relying on non-member
-    functions. By contrast, P0214R5 requires a wrapper class, which does not
-    work for sizeless vector types currently used by ARM SVE and Risc-V.
-
-1.  Adding widely used and portable operations such as `AndNot`, `AverageRound`,
-    bit-shift by immediates and `IfThenElse`.
-
-1.  Designing the API to avoid or minimize overhead on AVX2/AVX-512 caused by
-    crossing 128-bit 'block' boundaries.
-
-1.  Avoiding the need for non-native vectors. By contrast, P0214R5's `simd_cast`
-    returns `fixed_size<>` vectors which are more expensive to access because
-    they reside on the stack. We can avoid this plus additional overhead on
-    ARM/AVX2 by defining width-expanding operations as functions of a vector
-    part, e.g. promoting half a vector of `uint8_t` lanes to one full vector of
-    `uint16_t`, or demoting full vectors to half vectors with half-width lanes.
-
-1.  Guaranteeing access to the underlying intrinsic vector type. This ensures
-    all platform-specific capabilities can be used. P0214R5 instead only
-    'encourages' implementations to provide access.
-
-1.  Enabling safe runtime dispatch and inlining in the same binary. P0214R5 is
-    based on the Vc library, which does not provide assistance for linking
-    multiple instruction sets into the same binary. The Vc documentation
-    suggests compiling separate executables for each instruction set or using
-    GCC's ifunc (indirect functions). The latter is compiler-specific and risks
-    crashes due to ODR violations when compiling the same function with
-    different compiler flags. We solve this problem via target-specific
-    namespaces and attributes (see HOWTO section below). We also permit a mix of
-    static target selection and runtime dispatch for hotspots that may benefit
-    from newer instruction sets if available.
-
-1.  Omitting inefficient or non-performance-portable operations such as `hmax`,
-    `operator[]`, and unsupported integer comparisons. Applications can often
-    replace these operations at lower cost than emulating that exact behavior.
-
-1.  Omitting `long double` types: these are not commonly available in hardware.
-
-1.  Ensuring signed integer overflow has well-defined semantics (wraparound).
-
-1.  Simple header-only implementation and a fraction of the size of the
-    Vc library from which P0214 was derived (39K, vs. 92K lines in
-    https://github.com/VcDevel/Vc according to the gloc Chrome extension).
-
-1.  Avoiding hidden performance costs. P0214R5 allows implicit conversions from
-    integer to float, which costs 3-4 cycles on x86. We make these conversions
-    explicit to ensure their cost is visible.
-
-## Other related work
-
-*   [Neat SIMD](http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7568423)
-    adopts a similar approach with interchangeable vector/scalar types and
-    a compact interface. It allows access to the underlying intrinsics, but
-    does not appear to be designed for other platforms than x86.
-
-*   UME::SIMD ([code](https://goo.gl/yPeVZx), [paper](https://goo.gl/2xpZrk))
-    also adopts an explicit vectorization model with vector classes.
-    However, it exposes the union of all platform capabilities, which makes the
-    API harder to learn (209-page spec) and implement (the estimated LOC count
-    is [500K](https://goo.gl/1THFRi)). The API is less performance-portable
-    because it allows applications to use operations that are inefficient on
-    other platforms.
-
-*   Inastemp ([code](https://goo.gl/hg3USM), [paper](https://goo.gl/YcTU7S))
-    is a vector library for scientific computing with some innovative features:
-    automatic FLOPS counting, and "if/else branches" using lambda functions.
-    It supports IBM Power8, but only provides float and double types and does
-    not support SVE without assuming the runtime vector size.
diff --git a/third_party/highway/g3doc/highway_intro.pdf b/third_party/highway/g3doc/highway_intro.pdf
deleted file mode 100644 (file)
index e051a2c..0000000
Binary files a/third_party/highway/g3doc/highway_intro.pdf and /dev/null differ
diff --git a/third_party/highway/g3doc/impl_details.md b/third_party/highway/g3doc/impl_details.md
deleted file mode 100644 (file)
index 624d0d1..0000000
+++ /dev/null
@@ -1,221 +0,0 @@
-# Highway implementation details
-
-[TOC]
-
-## Introduction
-
-This doc explains some of the Highway implementation details; understanding them
-is mainly useful for extending the library. Bear in mind that Highway is a thin
-wrapper over 'intrinsic functions' provided by the compiler.
-
-## Vectors vs. tags
-
-The key to understanding Highway is to differentiate between vectors and
-zero-sized tag arguments. The former store actual data and are mapped by the
-compiler to vector registers. The latter (`Simd<>` and `SizeTag<>`) are only
-used to select among the various overloads of functions such as `Set`. This
-allows Highway to use builtin vector types without a class wrapper.
-
-Class wrappers are problematic for SVE and RVV because LLVM (or at least Clang)
-does not allow member variables whose type is 'sizeless' (in particular,
-built-in vectors). To our knowledge, Highway is the only C++ vector library that
-supports SVE and RISC-V without compiler flags that indicate what the runtime
-vector length will be. Such flags allow the compiler to convert the previously
-sizeless vectors to known-size vector types, which can then be wrapped in
-classes, but this only makes sense for use-cases where the exact hardware is
-known and rarely changes (e.g. supercomputers). By contrast, Highway can run on
-unknown hardware such as heterogeneous clouds or client devices without
-requiring a recompile, nor multiple binaries.
-
-Note that Highway does use class wrappers where possible, in particular NEON,
-WASM and x86. The wrappers (e.g. Vec128) are in fact required on some platforms
-(x86 and perhaps WASM) because Highway assumes the vector arguments passed e.g.
-to `Add` provide sufficient type information to identify the appropriate
-intrinsic. By contrast, x86's loosely typed `__m128i` built-in type could
-actually refer to any integer lane type. Because some targets use wrappers and
-others do not, incorrect user code may compile on some platforms but not others.
-This is because passing class wrappers as arguments triggers argument-dependent
-lookup, which would find the `Add` function even without namespace qualifiers
-because it resides in the same namespace as the wrapper. Correct user code
-qualifies each call to a Highway op, e.g. with a namespace alias `hn`, so
-`hn::Add`. This works for both wrappers and built-in vector types.
-
-## Adding a new target
-
-Adding a target requires updating about ten locations: adding a macro constant
-to identify it, hooking it into static and dynamic dispatch, detecting support
-at runtime, and identifying the target name. The easiest and safest way to do
-this is to search for one of the target identifiers such as `HWY_AVX3_DL`, and
-add corresponding logic for your new target. Note the upper limits on the number
-of targets per platform imposed by `HWY_MAX_DYNAMIC_TARGETS`.
-
-## When to use -inl.h
-
-By convention, files whose name ends with `-inl.h` contain vector code in the
-form of inlined function templates. In order to support the multiple compilation
-required for dynamic dispatch on platforms which provide several targets, such
-files generally begin with a 'per-target include guard' of the form:
-
-```
-#if defined(HWY_PATH_NAME_INL_H_) == defined(HWY_TARGET_TOGGLE)
-#ifdef HWY_PATH_NAME_INL_H_
-#undef HWY_PATH_NAME_INL_H_
-#else
-#define HWY_PATH_NAME_INL_H_
-#endif
-// contents to include once per target
-#endif  // HWY_PATH_NAME_INL_H_
-```
-
-This toggles the include guard between defined and undefined, which is
-sufficient to 'reset' the include guard when beginning a new 'compilation pass'
-for the next target. This is accomplished by simply re-#including the user's
-translation unit, which may in turn `#include` one or more `-inl.h` files. As an
-exception, `hwy/ops/*-inl.h` do not require include guards because they are all
-included from highway.h, which takes care of this in a single location. Note
-that platforms such as RISC-V which currently only offer a single target do not
-require multiple compilation, but the same mechanism is used without actually
-re-#including. For both of those platforms, it is possible that additional
-targets will later be added, which means this mechanism will then be required.
-
-Instead of a -inl.h file, you can also use a normal .cc/.h component, where the
-vector code is hidden inside the .cc file, and the header only declares a normal
-non-template function whose implementation does `HWY_DYNAMIC_DISPATCH` into the
-vector code. For an example of this, see
-[vqsort.cc](../hwy/contrib/sort/vqsort.cc).
-
-Considerations for choosing between these alternatives are similar to those for
-regular headers. Inlining and thus `-inl.h` makes sense for short functions, or
-when the function must support many input types and is defined as a template.
-Conversely, non-inline `.cc` files make sense when the function is very long
-(such that call overhead does not matter), and/or is only required for a small
-set of input types. [Math functions](../hwy/contrib/math/math-inl.h)
-can fall into either case, hence we provide both inline functions and `Call*`
-wrappers.
-
-## Use of macros
-
-Highway ops are implemented for up to 12 lane types, which can make for
-considerable repetition - even more so for RISC-V, which can have seven times as
-many variants (one per LMUL in `[1/8, 8]`). The various backends
-(implementations of one or more targets) differ in their strategies for handling
-this, in increasing order of macro complexity:
-
-*   `x86_*` and `wasm_*` simply write out all the overloads, which is
-    straightforward but results in 4K-6K line files.
-
-*   [arm_sve-inl.h](../hwy/ops/arm_sve-inl.h) defines 'type list'
-    macros `HWY_SVE_FOREACH*` to define all overloads for most ops in a single
-    line. Such an approach makes sense because SVE ops are quite orthogonal
-    (i.e. generally defined for all types and consistent).
-
-*   [arm_neon-inl.h](../hwy/ops/arm_neon-inl.h) also uses type list
-    macros, but with a more general 'function builder' which helps to define
-    custom function templates required for 'unusual' ops such as `ShiftLeft`.
-
-*   [rvv-inl.h](../hwy/ops/rvv-inl.h) has the most complex system
-    because it deals with both type lists and LMUL, plus support for widening or
-    narrowing operations. The type lists thus have additional arguments, and
-    there are also additional lists for LMUL which can be extended or truncated.
-
-## Code reuse across targets
-
-The set of Highway ops is carefully chosen such that most of them map to a
-single platform-specific intrinsic. However, there are some important functions
-such as `AESRound` which may require emulation, and are non-trivial enough that
-we don't want to copy them into each target's implementation. Instead, we
-implement such functions in
-[generic_ops-inl.h](../hwy/ops/generic_ops-inl.h), which is included
-into every backend. To allow some targets to override these functions, we use
-the same per-target include guard mechanism, e.g. `HWY_NATIVE_AES`.
-
-The functions there are typically templated on the vector and/or tag types. This
-is necessary because the vector type depends on the target. Although `Vec128` is
-available on most targets, `HWY_SCALAR`, `HWY_RVV` and `HWY_SVE*` lack this
-type. To enable specialized overloads (e.g. only for signed integers), we use
-the `HWY_IF` SFINAE helpers. Example: `template <class V, class D = DFromV<V>,
-HWY_IF_SIGNED_D(D)>`. Note that there is a limited set of `HWY_IF` that work
-directly with vectors, identified by their `_V` suffix. However, the functions
-likely use a `D` type anyway, thus it is convenient to obtain one in the
-template arguments and also use that for `HWY_IF_*_D`.
-
-For x86, we also avoid some duplication by implementing only once the functions
-which are shared between all targets. They reside in
-[x86_128-inl.h](../hwy/ops/x86_128-inl.h) and are also templated on the
-vector type.
-
-## Adding a new op
-
-Adding an op consists of three steps, listed below. As an example, consider
-https://github.com/google/highway/commit/6c285d64ae50e0f48866072ed3a476fc12df5ab6.
-
-1) Document the new op in `g3doc/quick_reference.md` with its function signature
-and a description of what the op does.
-
-2) Implement the op in each `ops/*-inl.h` header. There are two exceptions,
-detailed in the previous section: first, `generic_ops-inl.h` is not changed in
-the common case where the op has a unique definition for every target. Second,
-if the op's definition would be duplicated in `x86_256-inl.h` and
-`x86_512-inl.h`, it may be expressed as a template in `x86_128-inl.h` with a
-`class V` template argument, e.g. `TableLookupBytesOr0`.
-
-3) Pick the appropriate `hwy/tests/*_test.cc` and add a test. This is also a
-three step process: first define a functor that implements the test logic (e.g.
-`TestPlusMinus`), then a function (e.g. `TestAllPlusMinus`) that invokes this
-functor for all lane types the op supports, and finally a line near the end of
-the file that invokes the function for all targets:
-`HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);`. Note the naming
-convention that the function has the same name as the functor except for the
-`TestAll` prefix.
-
-## Documentation of platform-specific intrinsics
-
-When adding a new op, it is often necessary to consult the reference for each
-platform's intrinsics.
-
-For x86 targets `HWY_SSSE3`, `HWY_SSE4`, `HWY_AVX2`, `HWY_AVX3`, `HWY_AVX3_DL`
-Intel provides a
-[searchable reference](https://www.intel.com/content/www/us/en/docs/intrinsics-guide).
-
-For Arm targets `HWY_NEON`, `HWY_SVE` (plus its specialization for 256-bit
-vectors `HWY_SVE_256`), `HWY_SVE2` (plus its specialization for 128-bit vectors
-`HWY_SVE2_128`), Arm provides a
-[searchable reference](https://developer.arm.com/architectures/instruction-sets/intrinsics).
-
-For RISC-V target `HWY_RVV`, we refer to the assembly language
-[specification](https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc)
-plus the separate
-[intrinsics specification](https://github.com/riscv-non-isa/rvv-intrinsic-doc).
-
-For WebAssembly target `HWY_WASM`, we recommend consulting the
-[intrinsics header](https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/wasm_simd128.h).
-There is also an unofficial
-[searchable list of intrinsics](https://nemequ.github.io/waspr/intrinsics).
-
-## Why scalar target
-
-There can be various reasons to avoid using vector intrinsics:
-
-*   The current CPU may not support any instruction sets generated by Highway
-    (on x86, we only target S-SSE3 or newer because its predecessor SSE3 was
-    introduced in 2004 and it seems unlikely that many users will want to
-    support such old CPUs);
-*   The compiler may crash or emit incorrect code for certain intrinsics or
-    instruction sets;
-*   We may want to estimate the speedup from the vector implementation compared
-    to scalar code.
-
-Highway provides either the `HWY_SCALAR` or the `HWY_EMU128` target for such
-use-cases. Both implement ops using standard C++ instead of intrinsics. They
-differ in the vector size: the former always uses single-lane vectors and thus
-cannot implement ops such as `AESRound` or `TableLookupBytes`. The latter
-guarantees 16-byte vectors are available like all other Highway targets, and
-supports all ops. Both of these alternatives are slower than native vector code,
-but they allow testing your code even when actual vectors are unavailable.
-
-One of the above targets is used if the CPU does not support any actual SIMD
-target. To avoid compiling any intrinsics, define `HWY_COMPILE_ONLY_EMU128`.
-
-`HWY_SCALAR` is only enabled/used `#ifdef HWY_COMPILE_ONLY_SCALAR` (or `#if
-HWY_BROKEN_EMU128`). Projects that intend to use it may require `#if HWY_TARGET
-!= HWY_SCALAR` around the ops it does not support to prevent compile errors.
diff --git a/third_party/highway/g3doc/instruction_matrix.pdf b/third_party/highway/g3doc/instruction_matrix.pdf
deleted file mode 100644 (file)
index 23608f3..0000000
Binary files a/third_party/highway/g3doc/instruction_matrix.pdf and /dev/null differ
diff --git a/third_party/highway/g3doc/quick_reference.md b/third_party/highway/g3doc/quick_reference.md
deleted file mode 100644 (file)
index e529eb8..0000000
+++ /dev/null
@@ -1,1472 +0,0 @@
-# API synopsis / quick reference
-
-[[_TOC_]]
-
-## High-level overview
-
-Highway is a collection of 'ops': platform-agnostic pure functions that operate
-on tuples (multiple values of the same type). These functions are implemented
-using platform-specific intrinsics, which map to SIMD/vector instructions.
-`hwy/contrib` also includes higher-level algorithms such as `FindIf` or `Sorter`
-implemented using these ops.
-
-Highway can use dynamic dispatch, which chooses the best available
-implementation at runtime, or static dispatch which has no runtime overhead.
-Dynamic dispatch works by compiling your code once per target CPU and then
-selecting (via indirect call) at runtime.
-
-Examples of both are provided in examples/. Dynamic dispatch uses the same
-source code as static, plus `#define HWY_TARGET_INCLUDE`, `#include
-"third_party/highway/hwy/foreach_target.h"` (which must come before any
-inclusion of highway.h) and `HWY_DYNAMIC_DISPATCH`.
-
-## Headers
-
-The public headers are:
-
-*   hwy/highway.h: main header, included from source AND/OR header files that
-    use vector types. Note that including in headers may increase compile time,
-    but allows declaring functions implemented out of line.
-
-*   hwy/base.h: included from headers that only need compiler/platform-dependent
-    definitions (e.g. `PopCount`) without the full highway.h.
-
-*   hwy/foreach_target.h: re-includes the translation unit (specified by
-    `HWY_TARGET_INCLUDE`) once per enabled target to generate code from the same
-    source code. highway.h must still be included.
-
-*   hwy/aligned_allocator.h: defines functions for allocating memory with
-    alignment suitable for `Load`/`Store`.
-
-*   hwy/cache_control.h: defines stand-alone functions to control caching (e.g.
-    prefetching), independent of actual SIMD.
-
-*   hwy/nanobenchmark.h: library for precisely measuring elapsed time (under
-    varying inputs) for benchmarking small/medium regions of code.
-
-*   hwy/print-inl.h: defines Print() for writing vector lanes to stderr.
-
-*   hwy/tests/test_util-inl.h: defines macros for invoking tests on all
-    available targets, plus per-target functions useful in tests.
-
-SIMD implementations must be preceded and followed by the following:
-
-```
-#include "hwy/highway.h"
-HWY_BEFORE_NAMESPACE();  // at file scope
-namespace project {  // optional
-namespace HWY_NAMESPACE {
-
-// implementation
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace project - optional
-HWY_AFTER_NAMESPACE();
-```
-
-## Notation in this doc
-
-*   `T` denotes the type of a vector lane (integer or floating-point);
-*   `N` is a size_t value that governs (but is not necessarily identical to) the
-    number of lanes;
-*   `D` is shorthand for a zero-sized tag type `Simd<T, N, kPow2>`, used to
-    select the desired overloaded function (see next section). Use aliases such
-    as `ScalableTag` instead of referring to this type directly;
-*   `d` is an lvalue of type `D`, passed as a function argument e.g. to Zero;
-*   `V` is the type of a vector, which may be a class or built-in type.
-
-## Vector and tag types
-
-Highway vectors consist of one or more 'lanes' of the same built-in type
-`uint##_t, int##_t` for `## = 8, 16, 32, 64`, plus `float##_t` for `## = 16, 32,
-64` and `bfloat16_t`.
-
-Beware that `char` may differ from these types, and is not supported directly.
-If your code loads from/stores to `char*`, use `T=uint8_t` for Highway's `d`
-tags (see below) or `T=int8_t` (which may enable faster less-than/greater-than
-comparisons), and cast your `char*` pointers to your `T*`.
-
-In Highway, `float16_t` (an IEEE binary16 half-float) and `bfloat16_t` (the
-upper 16 bits of an IEEE binary32 float) only support load, store, and
-conversion to/from `float32_t`. The behavior of infinity and NaN in `float16_t`
-is implementation-defined due to ARMv7.
-
-On RVV/SVE, vectors are sizeless and cannot be wrapped inside a class. The
-Highway API allows using built-in types as vectors because operations are
-expressed as overloaded functions. Instead of constructors, overloaded
-initialization functions such as `Set` take a zero-sized tag argument called `d`
-of type `D` and return an actual vector of unspecified type.
-
-`T` is one of the lane types above, and may be retrieved via `TFromD<D>`.
-
-The actual lane count (used to increment loop counters etc.) can be obtained via
-`Lanes(d)`. This value might not be known at compile time, thus storage for
-vectors should be dynamically allocated, e.g. via `AllocateAligned(Lanes(d))`.
-
-Note that `Lanes(d)` could potentially change at runtime. This is currently
-unlikely, and will not be initiated by Highway without user action, but could
-still happen in other circumstances:
-
-*   upon user request in future via special CPU instructions (switching to
-    'streaming SVE' mode for Arm SME), or
-*   via system software (`prctl(PR_SVE_SET_VL` on Linux for Arm SVE). When the
-    vector length is changed using this mechanism, all but the lower 128 bits of
-    vector registers are invalidated.
-
-Thus we discourage caching the result; it is typically used inside a function or
-basic block. If the application anticipates that one of the above circumstances
-could happen, it should ensure by some out-of-band mechanism that such changes
-will not happen during the critical section (the vector code which uses the
-result of the previously obtained `Lanes(d)`).
-
-`MaxLanes(d)` returns a (potentially loose) upper bound on `Lanes(d)`, and is
-implemented as a constexpr function.
-
-The actual lane count is guaranteed to be a power of two, even on SVE hardware
-where vectors can be a multiple of 128 bits (there, the extra lanes remain
-unused). This simplifies alignment: remainders can be computed as `count &
-(Lanes(d) - 1)` instead of an expensive modulo. It also ensures loop trip counts
-that are a large power of two (at least `MaxLanes`) are evenly divisible by the
-lane count, thus avoiding the need for a second loop to handle remainders.
-
-`d` lvalues (a tag, NOT actual vector) are obtained using aliases:
-
-*   Most common: `ScalableTag<T[, kPow2=0]> d;` or the macro form `HWY_FULL(T[,
-    LMUL=1]) d;`. With the default value of the second argument, these both
-    select full vectors which utilize all available lanes.
-
-    Only for targets (e.g. RVV) that support register groups, the kPow2 (-3..3)
-    and LMUL argument (1, 2, 4, 8) specify `LMUL`, the number of registers in
-    the group. This effectively multiplies the lane count in each operation by
-    `LMUL`, or left-shifts by `kPow2` (negative values are understood as
-    right-shifting by the absolute value). These arguments will eventually be
-    optional hints that may improve performance on 1-2 wide machines (at the
-    cost of reducing the effective number of registers), but RVV target does not
-    yet support fractional `LMUL`. Thus, mixed-precision code (e.g. demoting
-    float to uint8_t) currently requires `LMUL` to be at least the ratio of the
-    sizes of the largest and smallest type, and smaller `d` to be obtained via
-    `Half<DLarger>`.
-
-*   Less common: `CappedTag<T, kCap> d` or the macro form `HWY_CAPPED(T, kCap)
-    d;`. These select vectors or masks where *no more than* the largest power of
-    two not exceeding `kCap` lanes have observable effects such as
-    loading/storing to memory, or being counted by `CountTrue`. The number of
-    lanes may also be less; for the `HWY_SCALAR` target, vectors always have a
-    single lane. For example, `CappedTag<T, 3>` will use up to two lanes.
-
-*   For applications that require fixed-size vectors: `FixedTag<T, kCount> d;`
-    will select vectors where exactly `kCount` lanes have observable effects.
-    These may be implemented using full vectors plus additional runtime cost for
-    masking in `Load` etc. `kCount` must be a power of two not exceeding
-    `HWY_LANES(T)`, which is one for `HWY_SCALAR`. This tag can be used when the
-    `HWY_SCALAR` target is anyway disabled (superseded by a higher baseline) or
-    unusable (due to use of ops such as `TableLookupBytes`). As a convenience,
-    we also provide `Full128<T>`, `Full64<T>` and `Full32<T>` aliases which are
-    equivalent to `FixedTag<T, 16 / sizeof(T)>`, `FixedTag<T, 8 / sizeof(T)>`
-    and `FixedTag<T, 4 / sizeof(T)>`.
-
-*   The result of `UpperHalf`/`LowerHalf` has half the lanes. To obtain a
-    corresponding `d`, use `Half<decltype(d)>`; the opposite is `Twice<>`.
-
-User-specified lane counts or tuples of vectors could cause spills on targets
-with fewer or smaller vectors. By contrast, Highway encourages vector-length
-agnostic code, which is more performance-portable.
-
-For mixed-precision code (e.g. `uint8_t` lanes promoted to `float`), tags for
-the smaller types must be obtained from those of the larger type (e.g. via
-`Rebind<uint8_t, ScalableTag<float>>`).
-
-## Using unspecified vector types
-
-Vector types are unspecified and depend on the target. User code could define
-them as `auto`, but it is more readable (due to making the type visible) to use
-an alias such as `Vec<D>`, or `decltype(Zero(d))`. Similarly, the mask type can
-be obtained via `Mask<D>`.
-
-Vectors are sizeless types on RVV/SVE. Therefore, vectors must not be used in
-arrays/STL containers (use the lane type `T` instead), class members,
-static/thread_local variables, new-expressions (use `AllocateAligned` instead),
-and sizeof/pointer arithmetic (increment `T*` by `Lanes(d)` instead).
-
-Initializing constants requires a tag type `D`, or an lvalue `d` of that type.
-The `D` can be passed as a template argument or obtained from a vector type `V`
-via `DFromV<V>`. `TFromV<V>` is equivalent to `TFromD<DFromV<V>>`.
-
-**Note**: Let `DV = DFromV<V>`. For builtin `V` (currently necessary on
-RVV/SVE), `DV` might not be the same as the `D` used to create `V`. In
-particular, `DV` must not be passed to `Load/Store` functions because it may
-lack the limit on `N` established by the original `D`. However, `Vec<DV>` is the
-same as `V`.
-
-Thus a template argument `V` suffices for generic functions that do not load
-from/store to memory: `template<class V> V Mul4(V v) { return v *
-Set(DFromV<V>(), 4); }`.
-
-Example of mixing partial vectors with generic functions:
-
-```
-CappedTag<int16_t, 2> d2;
-auto v = Mul4(Set(d2, 2));
-Store(v, d2, ptr);  // Use d2, NOT DFromV<decltype(v)>()
-```
-
-## Targets
-
-Let `Target` denote an instruction set, one of
-`SCALAR/EMU128/SSSE3/SSE4/AVX2/AVX3/AVX3_DL/NEON/SVE/SVE2/WASM/RVV`. Each of
-these is represented by a `HWY_Target` (for example, `HWY_SSE4`) macro which
-expands to a unique power-of-two value.
-
-Note that x86 CPUs are segmented into dozens of feature flags and capabilities,
-which are often used together because they were introduced in the same CPU
-(example: AVX2 and FMA). To keep the number of targets and thus compile time and
-code size manageable, we define targets as 'clusters' of related features. To
-use `HWY_AVX2`, it is therefore insufficient to pass -mavx2. For definitions of
-the clusters, see `kGroup*` in `targets.cc`. The corresponding Clang/GCC
-compiler options to enable them (without -m prefix) are defined by
-`HWY_TARGET_STR*` in `set_macros-inl.h`.
-
-Targets are only used if enabled (i.e. not broken nor disabled). Baseline
-targets are those for which the compiler is unconditionally allowed to generate
-instructions (implying the target CPU must support them).
-
-*   `HWY_STATIC_TARGET` is the best enabled baseline `HWY_Target`, and matches
-    `HWY_TARGET` in static dispatch mode. This is useful even in dynamic
-    dispatch mode for deducing and printing the compiler flags.
-
-*   `HWY_TARGETS` indicates which targets to generate for dynamic dispatch, and
-    which headers to include. It is determined by configuration macros and
-    always includes `HWY_STATIC_TARGET`.
-
-*   `HWY_SUPPORTED_TARGETS` is the set of targets available at runtime. Expands
-    to a literal if only a single target is enabled, or SupportedTargets().
-
-*   `HWY_TARGET`: which `HWY_Target` is currently being compiled. This is
-    initially identical to `HWY_STATIC_TARGET` and remains so in static dispatch
-    mode. For dynamic dispatch, this changes before each re-inclusion and
-    finally reverts to `HWY_STATIC_TARGET`. Can be used in `#if` expressions to
-    provide an alternative to functions which are not supported by `HWY_SCALAR`.
-
-    In particular, for x86 we sometimes wish to specialize functions for AVX-512
-    because it provides many new instructions. This can be accomplished via `#if
-    HWY_TARGET <= HWY_AVX3`, which means AVX-512 or better (e.g. `HWY_AVX3_DL`).
-    This is because numerically lower targets are better, and no other platform
-    has targets numerically less than those of x86.
-
-*   `HWY_WANT_SSSE3`, `HWY_WANT_SSE4`: add SSSE3 and SSE4 to the baseline even
-    if they are not marked as available by the compiler. On MSVC, the only ways
-    to enable SSSE3 and SSE4 are defining these, or enabling AVX.
-
-*   `HWY_WANT_AVX3_DL`: opt-in for dynamic dispatch to `HWY_AVX3_DL`. This is
-    unnecessary if the baseline already includes AVX3_DL.
-
-## Operations
-
-In the following, the argument or return type `V` denotes a vector with `N`
-lanes, and `M` a mask. Operations limited to certain vector types begin with a
-constraint of the form `V`: `{prefixes}[{bits}]`. The prefixes `u,i,f` denote
-unsigned, signed, and floating-point types, and bits indicates the number of
-bits per lane: 8, 16, 32, or 64. Any combination of the specified prefixes and
-bits are allowed. Abbreviations of the form `u32 = {u}{32}` may also be used.
-
-Note that Highway functions reside in `hwy::HWY_NAMESPACE`, whereas user-defined
-functions reside in `project::[nested]::HWY_NAMESPACE`. Highway functions
-generally take either a `D` or vector/mask argument. For targets where vectors
-and masks are defined in namespace `hwy`, the functions will be found via
-Argument-Dependent Lookup. However, this does not work for function templates,
-and RVV and SVE both use builtin vectors. There are three options for portable
-code, in descending order of preference:
-
--   `namespace hn = hwy::HWY_NAMESPACE;` alias used to prefix ops, e.g.
-    `hn::LoadDup128(..)`;
--   `using hwy::HWY_NAMESPACE::LoadDup128;` declarations for each op used;
--   `using hwy::HWY_NAMESPACE;` directive. This is generally discouraged,
-    especially for SIMD code residing in a header.
-
-Note that overloaded operators are not yet supported on RVV and SVE. Until that
-is resolved, code that wishes to run on all targets must use the corresponding
-equivalents mentioned in the description of each overloaded operator, for
-example `Lt` instead of `operator<`.
-
-### Initialization
-
-*   <code>V **Zero**(D)</code>: returns N-lane vector with all bits set to 0.
-*   <code>V **Set**(D, T)</code>: returns N-lane vector with all lanes equal to
-    the given value of type `T`.
-*   <code>V **Undefined**(D)</code>: returns uninitialized N-lane vector, e.g.
-    for use as an output parameter.
-*   <code>V **Iota**(D, T)</code>: returns N-lane vector where the lane with
-    index `i` has the given value of type `T` plus `i`. The least significant
-    lane has index 0. This is useful in tests for detecting lane-crossing bugs.
-*   <code>V **SignBit**(D, T)</code>: returns N-lane vector with all lanes set
-    to a value whose representation has only the most-significant bit set.
-
-### Getting/setting lanes
-
-*   <code>T **GetLane**(V)</code>: returns lane 0 within `V`. This is useful for
-    extracting `SumOfLanes` results.
-
-The following may be slow on some platforms (e.g. x86) and should not be used in
-time-critical code:
-
-*   <code>T **ExtractLane**(V, size_t i)</code>: returns lane `i` within `V`.
-    `i` must be in `[0, Lanes(DFromV<V>()))`. Potentially slow, it may be better
-    to store an entire vector to an array and then operate on its elements.
-
-*   <code>V **InsertLane**(V, size_t i, T t)</code>: returns a copy of V whose
-    lane `i` is set to `t`. `i` must be in `[0, Lanes(DFromV<V>()))`.
-    Potentially slow, it may be better set all elements of an aligned array and
-    then `Load` it.
-
-### Printing
-
-*   <code>V **Print**(D, const char* caption, V [, size_t lane][, size_t
-    max_lanes])</code>: prints `caption` followed by up to `max_lanes`
-    comma-separated lanes from the vector argument, starting at index `lane`.
-    Defined in hwy/print-inl.h, also available if hwy/tests/test_util-inl.h has
-    been included.
-
-### Arithmetic
-
-*   <code>V **operator+**(V a, V b)</code>: returns `a[i] + b[i]` (mod 2^bits).
-    Currently unavailable on SVE/RVV; use the equivalent `Add` instead.
-*   <code>V **operator-**(V a, V b)</code>: returns `a[i] - b[i]` (mod 2^bits).
-    Currently unavailable on SVE/RVV; use the equivalent `Sub` instead.
-
-*   `V`: `{i,f}` \
-    <code>V **Neg**(V a)</code>: returns `-a[i]`.
-
-*   `V`: `{i,f}` \
-    <code>V **Abs**(V a)</code> returns the absolute value of `a[i]`; for
-    integers, `LimitsMin()` maps to `LimitsMax() + 1`.
-
-*   `V`: `f32` \
-    <code>V **AbsDiff**(V a, V b)</code>: returns `|a[i] - b[i]|` in each lane.
-
-*   `V`: `u8` \
-    <code>VU64 **SumsOf8**(V v)</code> returns the sums of 8 consecutive u8
-    lanes, zero-extending each sum into a u64 lane. This is slower on RVV/WASM.
-
-*   `V`: `{u,i}{8,16}` \
-    <code>V **SaturatedAdd**(V a, V b)</code> returns `a[i] + b[i]` saturated to
-    the minimum/maximum representable value.
-
-*   `V`: `{u,i}{8,16}` \
-    <code>V **SaturatedSub**(V a, V b)</code> returns `a[i] - b[i]` saturated to
-    the minimum/maximum representable value.
-
-*   `V`: `{u}{8,16}` \
-    <code>V **AverageRound**(V a, V b)</code> returns `(a[i] + b[i] + 1) / 2`.
-
-*   <code>V **Clamp**(V a, V lo, V hi)</code>: returns `a[i]` clamped to
-    `[lo[i], hi[i]]`.
-
-*   `V`: `{f}` \
-    <code>V **operator/**(V a, V b)</code>: returns `a[i] / b[i]` in each lane.
-    Currently unavailable on SVE/RVV; use the equivalent `Div` instead.
-
-*   `V`: `{f}` \
-    <code>V **Sqrt**(V a)</code>: returns `sqrt(a[i])`.
-
-*   `V`: `f32` \
-    <code>V **ApproximateReciprocalSqrt**(V a)</code>: returns an approximation
-    of `1.0 / sqrt(a[i])`. `sqrt(a) ~= ApproximateReciprocalSqrt(a) * a`. x86
-    and PPC provide 12-bit approximations but the error on ARM is closer to 1%.
-
-*   `V`: `f32` \
-    <code>V **ApproximateReciprocal**(V a)</code>: returns an approximation of
-    `1.0 / a[i]`.
-
-#### Min/Max
-
-**Note**: Min/Max corner cases are target-specific and may change. If either
-argument is qNaN, x86 SIMD returns the second argument, ARMv7 Neon returns NaN,
-Wasm is supposed to return NaN but does not always, but other targets actually
-uphold IEEE 754-2019 minimumNumber: returning the other argument if exactly one
-is qNaN, and NaN if both are.
-
-*   <code>V **Min**(V a, V b)</code>: returns `min(a[i], b[i])`.
-
-*   <code>V **Max**(V a, V b)</code>: returns `max(a[i], b[i])`.
-
-All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   `V`: `u64` \
-    <code>M **Min128**(D, V a, V b)</code>: returns the minimum of unsigned
-    128-bit values, each stored as an adjacent pair of 64-bit lanes (e.g.
-    indices 1 and 0, where 0 is the least-significant 64-bits).
-
-*   `V`: `u64` \
-    <code>M **Max128**(D, V a, V b)</code>: returns the maximum of unsigned
-    128-bit values, each stored as an adjacent pair of 64-bit lanes (e.g.
-    indices 1 and 0, where 0 is the least-significant 64-bits).
-
-*   `V`: `u64` \
-    <code>M **Min128Upper**(D, V a, V b)</code>: for each 128-bit key-value
-    pair, returns `a` if it is considered less than `b` by Lt128Upper, else `b`.
-
-*   `V`: `u64` \
-    <code>M **Max128Upper**(D, V a, V b)</code>: for each 128-bit key-value
-    pair, returns `a` if it is considered > `b` by Lt128Upper, else `b`.
-
-#### Multiply
-
-*   `V`: `{u,i}{16,32,64}` \
-    <code>V <b>operator*</b>(V a, V b)</code>: returns the lower half of `a[i] *
-    b[i]` in each lane. Currently unavailable on SVE/RVV; use the equivalent
-    `Mul` instead.
-
-*   `V`: `{f}` \
-    <code>V <b>operator*</b>(V a, V b)</code>: returns `a[i] * b[i]` in each
-    lane. Currently unavailable on SVE/RVV; use the equivalent `Mul` instead.
-
-*   `V`: `i16` \
-    <code>V **MulHigh**(V a, V b)</code>: returns the upper half of `a[i] *
-    b[i]` in each lane.
-
-*   `V`: `i16` \
-    <code>V **MulFixedPoint15**(V a, V b)</code>: returns the result of
-    multiplying two 1.15 fixed-point numbers. This corresponds to doubling the
-    multiplication result and storing the upper half. Results are
-    implementation-defined iff both inputs are -32768.
-
-*   `V`: `{u,i}{32},u64` \
-    <code>V2 **MulEven**(V a, V b)</code>: returns double-wide result of `a[i] *
-    b[i]` for every even `i`, in lanes `i` (lower) and `i + 1` (upper). `V2` is
-    a vector with double-width lanes, or the same as `V` for 64-bit inputs
-    (which are only supported if `HWY_TARGET != HWY_SCALAR`).
-
-*   `V`: `u64` \
-    <code>V **MulOdd**(V a, V b)</code>: returns double-wide result of `a[i] *
-    b[i]` for every odd `i`, in lanes `i - 1` (lower) and `i` (upper). Only
-    supported if `HWY_TARGET != HWY_SCALAR`.
-
-*   `V`: `bf16`; `D`: `f32` \
-    <code>Vec<D> **ReorderWidenMulAccumulate**(D d, V a, V b, Vec<D> sum0,
-    Vec<D>& sum1)</code>: widens `a` and `b` to `TFromD<D>`, then adds `a[i] *
-    b[i]` to either `sum1[j]` or lane `j` of the return value, where `j = P(i)`
-    and `P` is a permutation. The only guarantee is that `SumOfLanes(d,
-    Add(return_value, sum1))` is the sum of all `a[i] * b[i]`. This is useful
-    for computing dot products and the L2 norm.
-
-#### Fused multiply-add
-
-When implemented using special instructions, these functions are more precise
-and faster than separate multiplication followed by addition. The `*Sub`
-variants are somewhat slower on ARM; it is preferable to replace them with
-`MulAdd` using a negated constant.
-
-*   `V`: `{f}` \
-    <code>V **MulAdd**(V a, V b, V c)</code>: returns `a[i] * b[i] + c[i]`.
-
-*   `V`: `{f}` \
-    <code>V **NegMulAdd**(V a, V b, V c)</code>: returns `-a[i] * b[i] + c[i]`.
-
-*   `V`: `{f}` \
-    <code>V **MulSub**(V a, V b, V c)</code>: returns `a[i] * b[i] - c[i]`.
-
-*   `V`: `{f}` \
-    <code>V **NegMulSub**(V a, V b, V c)</code>: returns `-a[i] * b[i] - c[i]`.
-
-#### Shifts
-
-**Note**: Counts not in `[0, sizeof(T)*8)` yield implementation-defined results.
-Left-shifting signed `T` and right-shifting positive signed `T` is the same as
-shifting `MakeUnsigned<T>` and casting to `T`. Right-shifting negative signed
-`T` is the same as an unsigned shift, except that 1-bits are shifted in.
-
-Compile-time constant shifts: the amount must be in [0, sizeof(T)*8). Generally
-the most efficient variant, but 8-bit shifts are potentially slower than other
-lane sizes, and `RotateRight` is often emulated with shifts:
-
-*   `V`: `{u,i}` \
-    <code>V **ShiftLeft**&lt;int&gt;(V a)</code> returns `a[i] << int`.
-
-*   `V`: `{u,i}` \
-    <code>V **ShiftRight**&lt;int&gt;(V a)</code> returns `a[i] >> int`.
-
-*   `V`: `{u}{32,64}` \
-    <code>V **RotateRight**&lt;int&gt;(V a)</code> returns `(a[i] >> int) |
-    (a[i] << (sizeof(T)*8 - int))`.
-
-Shift all lanes by the same (not necessarily compile-time constant) amount:
-
-*   `V`: `{u,i}` \
-    <code>V **ShiftLeftSame**(V a, int bits)</code> returns `a[i] << bits`.
-
-*   `V`: `{u,i}` \
-    <code>V **ShiftRightSame**(V a, int bits)</code> returns `a[i] >> bits`.
-
-Per-lane variable shifts (slow if SSSE3/SSE4, or 16-bit, or Shr i64 on AVX2):
-
-*   `V`: `{u,i}{16,32,64}` \
-    <code>V **operator<<**(V a, V b)</code> returns `a[i] << b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Shl` instead.
-
-*   `V`: `{u,i}{16,32,64}` \
-    <code>V **operator>>**(V a, V b)</code> returns `a[i] >> b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Shr` instead.
-
-#### Floating-point rounding
-
-*   `V`: `{f}` \
-    <code>V **Round**(V v)</code>: returns `v[i]` rounded towards the nearest
-    integer, with ties to even.
-
-*   `V`: `{f}` \
-    <code>V **Trunc**(V v)</code>: returns `v[i]` rounded towards zero
-    (truncate).
-
-*   `V`: `{f}` \
-    <code>V **Ceil**(V v)</code>: returns `v[i]` rounded towards positive
-    infinity (ceiling).
-
-*   `V`: `{f}` \
-    <code>V **Floor**(V v)</code>: returns `v[i]` rounded towards negative
-    infinity.
-
-#### Floating-point classification
-
-*   `V`: `{f}` \
-    <code>M **IsNaN**(V v)</code>: returns mask indicating whether `v[i]` is
-    "not a number" (unordered).
-
-*   `V`: `{f}` \
-    <code>M **IsInf**(V v)</code>: returns mask indicating whether `v[i]` is
-    positive or negative infinity.
-
-*   `V`: `{f}` \
-    <code>M **IsFinite**(V v)</code>: returns mask indicating whether `v[i]` is
-    neither NaN nor infinity, i.e. normal, subnormal or zero. Equivalent to
-    `Not(Or(IsNaN(v), IsInf(v)))`.
-
-### Logical
-
-*   `V`: `{u,i}` \
-    <code>V **PopulationCount**(V a)</code>: returns the number of 1-bits in
-    each lane, i.e. `PopCount(a[i])`.
-
-The following operate on individual bits within each lane. Note that the
-non-operator functions (`And` instead of `&`) must be used for floating-point
-types, and on SVE/RVV.
-
-*   `V`: `{u,i}` \
-    <code>V **operator&**(V a, V b)</code>: returns `a[i] & b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `And` instead.
-
-*   `V`: `{u,i}` \
-    <code>V **operator|**(V a, V b)</code>: returns `a[i] | b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Or` instead.
-
-*   `V`: `{u,i}` \
-    <code>V **operator^**(V a, V b)</code>: returns `a[i] ^ b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Xor` instead.
-
-*   `V`: `{u,i}` \
-    <code>V **Not**(V v)</code>: returns `~v[i]`.
-
-*   <code>V **AndNot**(V a, V b)</code>: returns `~a[i] & b[i]`.
-
-The following three-argument functions may be more efficient than assembling
-them from 2-argument functions:
-
-*   <code>V **Or3**(V o1, V o2, V o3)</code>: returns `o1[i] | o2[i] | o3[i]`.
-*   <code>V **OrAnd**(V o, V a1, V a2)</code>: returns `o[i] | (a1[i] & a2[i])`.
-
-Special functions for signed types:
-
-*   `V`: `{f}` \
-    <code>V **CopySign**(V a, V b)</code>: returns the number with the magnitude
-    of `a` and sign of `b`.
-
-*   `V`: `{f}` \
-    <code>V **CopySignToAbs**(V a, V b)</code>: as above, but potentially
-    slightly more efficient; requires the first argument to be non-negative.
-
-*   `V`: `i32/64` \
-    <code>V **BroadcastSignBit**(V a)</code> returns `a[i] < 0 ? -1 : 0`.
-
-*   `V`: `{f}` \
-    <code>V **ZeroIfNegative**(V v)</code>: returns `v[i] < 0 ? 0 : v[i]`.
-
-*   `V`: `{i,f}` \
-    <code>V **IfNegativeThenElse**(V v, V yes, V no)</code>: returns `v[i] < 0 ?
-    yes[i] : no[i]`. This may be more efficient than `IfThenElse(Lt..)`.
-
-### Masks
-
-Let `M` denote a mask capable of storing a logical true/false for each lane (the
-encoding depends on the platform).
-
-#### Creation
-
-*   <code>M **FirstN**(D, size_t N)</code>: returns mask with the first `N`
-    lanes (those with index `< N`) true. `N >= Lanes(D())` results in an
-    all-true mask. `N` must not exceed
-    `LimitsMax<SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TFromD<D>))>>()`.
-    Useful for implementing "masked" stores by loading `prev` followed by
-    `IfThenElse(FirstN(d, N), what_to_store, prev)`.
-
-*   <code>M **MaskFromVec**(V v)</code>: returns false in lane `i` if `v[i] ==
-    0`, or true if `v[i]` has all bits set. The result is
-    *implementation-defined* if `v[i]` is neither zero nor all bits set.
-
-*   <code>M **LoadMaskBits**(D, const uint8_t* p)</code>: returns a mask
-    indicating whether the i-th bit in the array is set. Loads bytes and bits in
-    ascending order of address and index. At least 8 bytes of `p` must be
-    readable, but only `(Lanes(D()) + 7) / 8` need be initialized. Any unused
-    bits (happens if `Lanes(D()) < 8`) are treated as if they were zero.
-
-#### Conversion
-
-*   <code>M1 **RebindMask**(D, M2 m)</code>: returns same mask bits as `m`, but
-    reinterpreted as a mask for lanes of type `TFromD<D>`. `M1` and `M2` must
-    have the same number of lanes.
-
-*   <code>V **VecFromMask**(D, M m)</code>: returns 0 in lane `i` if `m[i] ==
-    false`, otherwise all bits set.
-
-*   <code>size_t **StoreMaskBits**(D, M m, uint8_t* p)</code>: stores a bit
-    array indicating whether `m[i]` is true, in ascending order of `i`, filling
-    the bits of each byte from least to most significant, then proceeding to the
-    next byte. Returns the number of bytes written: `(Lanes(D()) + 7) / 8`. At
-    least 8 bytes of `p` must be writable.
-
-#### Testing
-
-*   <code>bool **AllTrue**(D, M m)</code>: returns whether all `m[i]` are true.
-
-*   <code>bool **AllFalse**(D, M m)</code>: returns whether all `m[i]` are
-    false.
-
-*   <code>size_t **CountTrue**(D, M m)</code>: returns how many of `m[i]` are
-    true [0, N]. This is typically more expensive than AllTrue/False.
-
-*   <code>intptr_t **FindFirstTrue**(D, M m)</code>: returns the index of the
-    first (i.e. lowest index) `m[i]` that is true, or -1 if none are.
-
-#### Ternary operator
-
-For `IfThen*`, masks must adhere to the invariant established by `MaskFromVec`:
-false is zero, true has all bits set:
-
-*   <code>V **IfThenElse**(M mask, V yes, V no)</code>: returns `mask[i] ?
-    yes[i] : no[i]`.
-
-*   <code>V **IfThenElseZero**(M mask, V yes)</code>: returns `mask[i] ?
-    yes[i] : 0`.
-
-*   <code>V **IfThenZeroElse**(M mask, V no)</code>: returns `mask[i] ? 0 :
-    no[i]`.
-
-*   <code>V **IfVecThenElse**(V mask, V yes, V no)</code>: equivalent to and
-    possibly faster than `IfVecThenElse(MaskFromVec(mask), yes, no)`. The result
-    is *implementation-defined* if `mask[i]` is neither zero nor all bits set.
-
-#### Logical
-
-*   <code>M **Not**(M m)</code>: returns mask of elements indicating whether the
-    input mask element was false.
-
-*   <code>M **And**(M a, M b)</code>: returns mask of elements indicating
-    whether both input mask elements were true.
-
-*   <code>M **AndNot**(M not_a, M b)</code>: returns mask of elements indicating
-    whether not_a is false and b is true.
-
-*   <code>M **Or**(M a, M b)</code>: returns mask of elements indicating whether
-    either input mask element was true.
-
-*   <code>M **Xor**(M a, M b)</code>: returns mask of elements indicating
-    whether exactly one input mask element was true.
-
-#### Compress
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>V **Compress**(V v, M m)</code>: returns `r` such that `r[n]` is
-    `v[i]`, with `i` the n-th lane index (starting from 0) where `m[i]` is true.
-    Compacts lanes whose mask is true into the lower lanes. For targets and lane
-    type `T` where `CompressIsPartition<T>::value` is true, the upper lanes are
-    those whose mask is false (thus `Compress` corresponds to partitioning
-    according to the mask). Otherwise, the upper lanes are
-    implementation-defined. Slow with 16-bit lanes. Use this form when the input
-    is already a mask, e.g. returned by a comparison.
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>V **CompressNot**(V v, M m)</code>: equivalent to `Compress(v,
-    Not(m))` but possibly faster if `CompressIsPartition<T>::value` is true.
-
-*   `V`: `u64` \
-    <code>V **CompressBlocksNot**(V v, M m)</code>: equivalent to
-    `CompressNot(v, m)` when `m` is structured as adjacent pairs (both true or
-    false), e.g. as returned by `Lt128`. This is a no-op for 128 bit vectors.
-    Unavailable if `HWY_TARGET == HWY_SCALAR`.
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>size_t **CompressStore**(V v, M m, D d, T* p)</code>: writes lanes
-    whose mask `m` is true into `p`, starting from lane 0. Returns `CountTrue(d,
-    m)`, the number of valid lanes. May be implemented as `Compress` followed by
-    `StoreU`; lanes after the valid ones may still be overwritten! Slower for
-    16-bit lanes.
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>size_t **CompressBlendedStore**(V v, M m, D d, T* p)</code>: writes
-    only lanes whose mask `m` is true into `p`, starting from lane 0. Returns
-    `CountTrue(d, m)`, the number of lanes written. Does not modify subsequent
-    lanes, but there is no guarantee of atomicity because this may be
-    implemented as `Compress, LoadU, IfThenElse(FirstN), StoreU`.
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>V **CompressBits**(V v, const uint8_t* HWY_RESTRICT bits)</code>:
-    Equivalent to, but often faster than `Compress(v, LoadMaskBits(d, bits))`.
-    `bits` is as specified for `LoadMaskBits`. If called multiple times, the
-    `bits` pointer passed to this function must also be marked `HWY_RESTRICT` to
-    avoid repeated work. Note that if the vector has less than 8 elements,
-    incrementing `bits` will not work as intended for packed bit arrays. As with
-    `Compress`, `CompressIsPartition` indicates the mask=false lanes are moved
-    to the upper lanes; this op is also slow for 16-bit lanes.
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>size_t **CompressBitsStore**(V v, const uint8_t* HWY_RESTRICT bits, D
-    d, T* p)</code>: combination of `CompressStore` and `CompressBits`, see
-    remarks there.
-
-### Comparisons
-
-These return a mask (see above) indicating whether the condition is true.
-
-*   <code>M **operator==**(V a, V b)</code>: returns `a[i] == b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Eq` instead.
-*   <code>M **operator!=**(V a, V b)</code>: returns `a[i] != b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Ne` instead.
-
-*   <code>M **operator&lt;**(V a, V b)</code>: returns `a[i] < b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Lt` instead.
-
-*   <code>M **operator&gt;**(V a, V b)</code>: returns `a[i] > b[i]`. Currently
-    unavailable on SVE/RVV; use the equivalent `Gt` instead.
-
-*   `V`: `{f}` \
-    <code>M **operator&lt;=**(V a, V b)</code>: returns `a[i] <= b[i]`.
-    Currently unavailable on SVE/RVV; use the equivalent `Le` instead.
-
-*   `V`: `{f}` \
-    <code>M **operator&gt;=**(V a, V b)</code>: returns `a[i] >= b[i]`.
-    Currently unavailable on SVE/RVV; use the equivalent `Ge` instead.
-
-*   `V`: `{u,i}` \
-    <code>M **TestBit**(V v, V bit)</code>: returns `(v[i] & bit[i]) == bit[i]`.
-    `bit[i]` must have exactly one bit set.
-
-*   `V`: `u64` \
-    <code>M **Lt128**(D, V a, V b)</code>: for each adjacent pair of 64-bit
-    lanes (e.g. indices 1,0), returns whether `a[1]:a[0]` concatenated to an
-    unsigned 128-bit integer (least significant bits in `a[0]`) is less than
-    `b[1]:b[0]`. For each pair, the mask lanes are either both true or both
-    false. Unavailable if `HWY_TARGET == HWY_SCALAR`.
-
-*   `V`: `u64` \
-    <code>M **Lt128Upper**(D, V a, V b)</code>: for each adjacent pair of 64-bit
-    lanes (e.g. indices 1,0), returns whether `a[1]` is less than `b[1]`. For
-    each pair, the mask lanes are either both true or both false. This is useful
-    for comparing 64-bit keys alongside 64-bit values. Only available if
-    `HWY_TARGET != HWY_SCALAR`.
-
-*   `V`: `u64` \
-    <code>M **Eq128**(D, V a, V b)</code>: for each adjacent pair of 64-bit
-    lanes (e.g. indices 1,0), returns whether `a[1]:a[0]` concatenated to an
-    unsigned 128-bit integer (least significant bits in `a[0]`) equals
-    `b[1]:b[0]`. For each pair, the mask lanes are either both true or both
-    false. Unavailable if `HWY_TARGET == HWY_SCALAR`.
-
-*   `V`: `u64` \
-    <code>M **Eq128Upper**(D, V a, V b)</code>: for each adjacent pair of 64-bit
-    lanes (e.g. indices 1,0), returns whether `a[1]` equals `b[1]`. For each
-    pair, the mask lanes are either both true or both false. This is useful for
-    comparing 64-bit keys alongside 64-bit values. Only available if `HWY_TARGET
-    != HWY_SCALAR`.
-
-### Memory
-
-Memory operands are little-endian, otherwise their order would depend on the
-lane configuration. Pointers are the addresses of `N` consecutive `T` values,
-either `aligned` (address is a multiple of the vector size) or possibly
-unaligned (denoted `p`).
-
-Even unaligned addresses must still be a multiple of `sizeof(T)`, otherwise
-`StoreU` may crash on some platforms (e.g. RVV and ARMv7). Note that C++ ensures
-automatic (stack) and dynamically allocated (via `new` or `malloc`) variables of
-type `T` are aligned to `sizeof(T)`, hence such addresses are suitable for
-`StoreU`. However, casting pointers to `char*` and adding arbitrary offsets (not
-a multiple of `sizeof(T)`) can violate this requirement.
-
-**Note**: computations with low arithmetic intensity (FLOP/s per memory traffic
-bytes), e.g. dot product, can be *1.5 times as fast* when the memory operands
-are aligned to the vector size. An unaligned access may require two load ports.
-
-#### Load
-
-*   <code>Vec&lt;D&gt; **Load**(D, const T* aligned)</code>: returns
-    `aligned[i]`. May fault if the pointer is not aligned to the vector size
-    (using aligned_allocator.h is safe). Using this whenever possible improves
-    codegen on SSSE3/SSE4: unlike `LoadU`, `Load` can be fused into a memory
-    operand, which reduces register pressure.
-
-Requires only *element-aligned* vectors (e.g. from malloc/std::vector, or
-aligned memory at indices which are not a multiple of the vector length):
-
-*   <code>Vec&lt;D&gt; **LoadU**(D, const T* p)</code>: returns `p[i]`.
-
-*   <code>Vec&lt;D&gt; **LoadDup128**(D, const T* p)</code>: returns one 128-bit
-    block loaded from `p` and broadcasted into all 128-bit block\[s\]. This may
-    be faster than broadcasting single values, and is more convenient than
-    preparing constants for the actual vector length. Only available if
-    `HWY_TARGET != HWY_SCALAR`.
-
-*   <code>Vec&lt;D&gt; **MaskedLoad**(M mask, D, const T* p)</code>: returns
-    `p[i]` or zero if the `mask` governing element `i` is false. May fault even
-    where `mask` is false `#if HWY_MEM_OPS_MIGHT_FAULT`. If `p` is aligned,
-    faults cannot happen unless the entire vector is inaccessible. Equivalent
-    to, and potentially more efficient than, `IfThenElseZero(mask, Load(D(),
-    aligned))`.
-
-*   <code>void **LoadInterleaved2**(D, const T* p, Vec&lt;D&gt;&amp; v0,
-    Vec&lt;D&gt;&amp; v1)</code>: equivalent to `LoadU` into `v0, v1` followed
-    by shuffling, such that `v0[0] == p[0], v1[0] == p[1]`.
-
-*   <code>void **LoadInterleaved3**(D, const T* p, Vec&lt;D&gt;&amp; v0,
-    Vec&lt;D&gt;&amp; v1, Vec&lt;D&gt;&amp; v2)</code>: as above, but for three
-    vectors (e.g. RGB samples).
-
-*   <code>void **LoadInterleaved4**(D, const T* p, Vec&lt;D&gt;&amp; v0,
-    Vec&lt;D&gt;&amp; v1, Vec&lt;D&gt;&amp; v2, Vec&lt;D&gt;&amp; v3)</code>: as
-    above, but for four vectors (e.g. RGBA).
-
-#### Scatter/Gather
-
-**Note**: Offsets/indices are of type `VI = Vec<RebindToSigned<D>>` and need not
-be unique. The results are implementation-defined if any are negative.
-
-**Note**: Where possible, applications should `Load/Store/TableLookup*` entire
-vectors, which is much faster than `Scatter/Gather`. Otherwise, code of the form
-`dst[tbl[i]] = F(src[i])` should when possible be transformed to `dst[i] =
-F(src[tbl[i]])` because `Scatter` is more expensive than `Gather`.
-
-*   `D`: `{u,i,f}{32,64}` \
-    <code>void **ScatterOffset**(Vec&lt;D&gt; v, D, const T* base, VI
-    offsets)</code>: stores `v[i]` to the base address plus *byte* `offsets[i]`.
-
-*   `D`: `{u,i,f}{32,64}` \
-    <code>void **ScatterIndex**(Vec&lt;D&gt; v, D, const T* base, VI
-    indices)</code>: stores `v[i]` to `base[indices[i]]`.
-
-*   `D`: `{u,i,f}{32,64}` \
-    <code>Vec&lt;D&gt; **GatherOffset**(D, const T* base, VI offsets)</code>:
-    returns elements of base selected by *byte* `offsets[i]`.
-
-*   `D`: `{u,i,f}{32,64}` \
-    <code>Vec&lt;D&gt; **GatherIndex**(D, const T* base, VI indices)</code>:
-    returns vector of `base[indices[i]]`.
-
-#### Store
-
-*   <code>void **Store**(Vec&lt;D&gt; v, D, T* aligned)</code>: copies `v[i]`
-    into `aligned[i]`, which must be aligned to the vector size. Writes exactly
-    `N * sizeof(T)` bytes.
-
-*   <code>void **StoreU**(Vec&lt;D&gt; v, D, T* p)</code>: as `Store`, but the
-    alignment requirement is relaxed to element-aligned (multiple of
-    `sizeof(T)`).
-
-*   <code>void **BlendedStore**(Vec&lt;D&gt; v, M m, D d, T* p)</code>: as
-    `StoreU`, but only updates `p` where `m` is true. May fault even where
-    `mask` is false `#if HWY_MEM_OPS_MIGHT_FAULT`. If `p` is aligned, faults
-    cannot happen unless the entire vector is inaccessible. Equivalent to, and
-    potentially more efficient than, `StoreU(IfThenElse(m, v, LoadU(d, p)), d,
-    p)`. "Blended" indicates this may not be atomic; other threads must not
-    concurrently update `[p, p + Lanes(d))` without synchronization.
-
-*   <code>void **SafeFillN**(size_t num, T value, D d, T* HWY_RESTRICT
-    to)</code>: Sets `to[0, num)` to `value`. If `num` exceeds `Lanes(d)`, the
-    behavior is target-dependent (either filling all, or no more than one
-    vector). Potentially more efficient than a scalar loop, but will not fault,
-    unlike `BlendedStore`. No alignment requirement. Potentially non-atomic,
-    like `BlendedStore`.
-
-*   <code>void **SafeCopyN**(size_t num, D d, const T* HWY_RESTRICT from, T*
-    HWY_RESTRICT to)</code>: Copies `from[0, num)` to `to`. If `num` exceeds
-    `Lanes(d)`, the behavior is target-dependent (either copying all, or no more
-    than one vector). Potentially more efficient than a scalar loop, but will
-    not fault, unlike `BlendedStore`. No alignment requirement. Potentially
-    non-atomic, like `BlendedStore`.
-
-*   <code>void **StoreInterleaved2**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1, D, T*
-    p)</code>: equivalent to shuffling `v0, v1` followed by two `StoreU()`, such
-    that `p[0] == v0[0], p[1] == v1[0]`.
-
-*   <code>void **StoreInterleaved3**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
-    Vec&lt;D&gt; v2, D, T* p)</code>: as above, but for three vectors (e.g. RGB
-    samples).
-
-*   <code>void **StoreInterleaved4**(Vec&lt;D&gt; v0, Vec&lt;D&gt; v1,
-    Vec&lt;D&gt; v2, Vec&lt;D&gt; v3, D, T* p)</code>: as above, but for four
-    vectors (e.g. RGBA samples).
-
-### Cache control
-
-All functions except `Stream` are defined in cache_control.h.
-
-*   <code>void **Stream**(Vec&lt;D&gt; a, D d, const T* aligned)</code>: copies
-    `a[i]` into `aligned[i]` with non-temporal hint if available (useful for
-    write-only data; avoids cache pollution). May be implemented using a
-    CPU-internal buffer. To avoid partial flushes and unpredictable interactions
-    with atomics (for example, see Intel SDM Vol 4, Sec. 8.1.2.2), call this
-    consecutively for an entire cache line (typically 64 bytes, aligned to its
-    size). Each call may write a multiple of `HWY_STREAM_MULTIPLE` bytes, which
-    can exceed `Lanes(d) * sizeof(T)`. The new contents of `aligned` may not be
-    visible until `FlushStream` is called.
-
-*   <code>void **FlushStream**()</code>: ensures values written by previous
-    `Stream` calls are visible on the current core. This is NOT sufficient for
-    synchronizing across cores; when `Stream` outputs are to be consumed by
-    other core(s), the producer must publish availability (e.g. via mutex or
-    atomic_flag) after `FlushStream`.
-
-*   <code>void **FlushCacheline**(const void* p)</code>: invalidates and flushes
-    the cache line containing "p", if possible.
-
-*   <code>void **Prefetch**(const T* p)</code>: optionally begins loading the
-    cache line containing "p" to reduce latency of subsequent actual loads.
-
-*   <code>void **Pause**()</code>: when called inside a spin-loop, may reduce
-    power consumption.
-
-### Type conversion
-
-*   <code>Vec&lt;D&gt; **BitCast**(D, V)</code>: returns the bits of `V`
-    reinterpreted as type `Vec<D>`.
-
-*   `V`,`D`: (`u8,u16`), (`u16,u32`), (`u8,u32`), (`u32,u64`), (`u8,i16`), \
-    (`u8,i32`), (`u16,i32`), (`i8,i16`), (`i8,i32`), (`i16,i32`), (`i32,i64`)
-    <code>Vec&lt;D&gt; **PromoteTo**(D, V part)</code>: returns `part[i]` zero-
-    or sign-extended to the integer type `MakeWide<T>`.
-
-*   `V`,`D`: (`f16,f32`), (`bf16,f32`), (`f32,f64`) \
-    <code>Vec&lt;D&gt; **PromoteTo**(D, V part)</code>: returns `part[i]`
-    widened to the floating-point type `MakeWide<T>`.
-
-*   `V`,`D`: \
-    <code>Vec&lt;D&gt; **PromoteTo**(D, V part)</code>: returns `part[i]`
-    converted to 64-bit floating point.
-
-*   `V`,`D`: (`bf16,f32`) <code>Vec&lt;D&gt; **PromoteLowerTo**(D, V v)</code>:
-    returns `v[i]` widened to `MakeWide<T>`, for i in `[0, Lanes(D()))`. Note
-    that `V` has twice as many lanes as `D` and the return value.
-
-*   `V`,`D`: (`bf16,f32`) <code>Vec&lt;D&gt; **PromoteUpperTo**(D, V v)</code>:
-    returns `v[i]` widened to `MakeWide<T>`, for i in `[Lanes(D()), 2 *
-    Lanes(D()))`. Note that `V` has twice as many lanes as `D` and the return
-    value.
-
-*   `V`,`V8`: (`u32,u8`) \
-    <code>V8 **U8FromU32**(V)</code>: special-case `u32` to `u8` conversion when
-    all lanes of `V` are already clamped to `[0, 256)`.
-
-*   `D`,`V`: (`u64,u32`), (`u64,u16`), (`u64,u8`), (`u32,u16`), (`u32,u8`), \
-    (`u16,u8`) <code>Vec&lt;D&gt; **TruncateTo**(D, V v)</code>: returns `v[i]`
-    truncated to the smaller type indicated by `T = TFromD<D>`, with the same
-    result as if the more-signficant input bits that do not fit in `T` had been
-    zero. Example: ```
-ScalableTag<uint32_t> du32;
-Rebind<uint8_t> du8;
-TruncateTo(du8, Set(du32, 0xF08F))
-    ``` is the same as `Set(du8, 0x8F)`.
-
-`DemoteTo` and float-to-int `ConvertTo` return the closest representable value
-if the input exceeds the destination range.
-
-*   `V`,`D`: (`i16,i8`), (`i32,i8`), (`i32,i16`), (`i16,u8`), (`i32,u8`),
-    (`i32,u16`), (`f64,f32`) \
-    <code>Vec&lt;D&gt; **DemoteTo**(D, V a)</code>: returns `a[i]` after packing
-    with signed/unsigned saturation to `MakeNarrow<T>`.
-
-*   `V`,`D`: `f64,i32` \
-    <code>Vec&lt;D&gt; **DemoteTo**(D, V a)</code>: rounds floating point
-    towards zero and converts the value to 32-bit integers.
-
-*   `V`,`D`: (`f32,f16`), (`f32,bf16`) \
-    <code>Vec&lt;D&gt; **DemoteTo**(D, V a)</code>: narrows float to half (for
-    bf16, it is unspecified whether this truncates or rounds).
-
-*   `V`,`D`: (`f32,bf16`) \
-    <code>Vec&lt;D&gt; **ReorderDemote2To**(D, V a, V b)</code>: as above, but
-    converts two inputs, `D` and the output have twice as many lanes as `V`, and
-    the output order is some permutation of the inputs. Only available if
-    `HWY_TARGET != HWY_SCALAR`.
-
-*   `V`,`D`: (`i32`,`f32`), (`i64`,`f64`) \
-    <code>Vec&lt;D&gt; **ConvertTo**(D, V)</code>: converts an integer value to
-    same-sized floating point.
-
-*   `V`,`D`: (`f32`,`i32`), (`f64`,`i64`) \
-    <code>Vec&lt;D&gt; **ConvertTo**(D, V)</code>: rounds floating point towards
-    zero and converts the value to same-sized integer.
-
-*   `V`: `f32`; `Ret`: `i32` \
-    <code>Ret **NearestInt**(V a)</code>: returns the integer nearest to `a[i]`;
-    results are undefined for NaN.
-
-### Combine
-
-*   <code>V2 **LowerHalf**([D, ] V)</code>: returns the lower half of the vector
-    `V`. The optional `D` (provided for consistency with `UpperHalf`) is
-    `Half<DFromV<V>>`.
-
-All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   <code>V2 **UpperHalf**(D, V)</code>: returns upper half of the vector `V`,
-    where `D` is `Half<DFromV<V>>`.
-
-*   <code>V **ZeroExtendVector**(D, V2)</code>: returns vector whose `UpperHalf`
-    is zero and whose `LowerHalf` is the argument; `D` is `Twice<DFromV<V2>>`.
-
-*   <code>V **Combine**(D, V2, V2)</code>: returns vector whose `UpperHalf` is
-    the first argument and whose `LowerHalf` is the second argument; `D` is
-    `Twice<DFromV<V2>>`.
-
-**Note**: the following operations cross block boundaries, which is typically
-more expensive on AVX2/AVX-512 than per-block operations.
-
-*   <code>V **ConcatLowerLower**(D, V hi, V lo)</code>: returns the
-    concatenation of the lower halves of `hi` and `lo` without splitting into
-    blocks. `D` is `DFromV<V>`.
-
-*   <code>V **ConcatUpperUpper**(D, V hi, V lo)</code>: returns the
-    concatenation of the upper halves of `hi` and `lo` without splitting into
-    blocks. `D` is `DFromV<V>`.
-
-*   <code>V **ConcatLowerUpper**(D, V hi, V lo)</code>: returns the inner half
-    of the concatenation of `hi` and `lo` without splitting into blocks. Useful
-    for swapping the two blocks in 256-bit vectors. `D` is `DFromV<V>`.
-
-*   <code>V **ConcatUpperLower**(D, V hi, V lo)</code>: returns the outer
-    quarters of the concatenation of `hi` and `lo` without splitting into
-    blocks. Unlike the other variants, this does not incur a block-crossing
-    penalty on AVX2/3. `D` is `DFromV<V>`.
-
-*   <code>V **ConcatOdd**(D, V hi, V lo)</code>: returns the concatenation of
-    the odd lanes of `hi` and the odd lanes of `lo`.
-
-*   <code>V **ConcatEven**(D, V hi, V lo)</code>: returns the concatenation of
-    the even lanes of `hi` and the even lanes of `lo`.
-
-### Blockwise
-
-**Note**: if vectors are larger than 128 bits, the following operations split
-their operands into independently processed 128-bit *blocks*.
-
-*   `V`: `{u,i}{16,32,64}, {f}` \
-    <code>V **Broadcast**&lt;int i&gt;(V)</code>: returns individual *blocks*,
-    each with lanes set to `input_block[i]`, `i = [0, 16/sizeof(T))`.
-
-All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   `V`: `{u,i}` \
-    <code>VI **TableLookupBytes**(V bytes, VI indices)</code>: returns
-    `bytes[indices[i]]`. Uses byte lanes regardless of the actual vector types.
-    Results are implementation-defined if `indices[i] < 0` or `indices[i] >=
-    HWY_MIN(Lanes(DFromV<V>()), 16)`. `VI` are integers, possibly of a different
-    type than those in `V`. The number of lanes in `V` and `VI` may differ, e.g.
-    a full-length table vector loaded via `LoadDup128`, plus partial vector `VI`
-    of 4-bit indices.
-
-*   `V`: `{u,i}` \
-    <code>VI **TableLookupBytesOr0**(V bytes, VI indices)</code>: returns
-    `bytes[indices[i]]`, or 0 if `indices[i] & 0x80`. Uses byte lanes regardless
-    of the actual vector types. Results are implementation-defined for
-    `indices[i] < 0` or in `[HWY_MIN(Lanes(DFromV<V>()), 16), 0x80)`. The
-    zeroing behavior has zero cost on x86 and ARM. For vectors of >= 256 bytes
-    (can happen on SVE and RVV), this will set all lanes after the first 128
-    to 0. `VI` are integers, possibly of a different type than those in `V`. The
-    number of lanes in `V` and `VI` may differ.
-
-#### Interleave
-
-Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   <code>V **InterleaveLower**([D, ] V a, V b)</code>: returns *blocks* with
-    alternating lanes from the lower halves of `a` and `b` (`a[0]` in the
-    least-significant lane). The optional `D` (provided for consistency with
-    `InterleaveUpper`) is `DFromV<V>`.
-
-*   <code>V **InterleaveUpper**(D, V a, V b)</code>: returns *blocks* with
-    alternating lanes from the upper halves of `a` and `b` (`a[N/2]` in the
-    least-significant lane). `D` is `DFromV<V>`.
-
-#### Zip
-
-*   `Ret`: `MakeWide<T>`; `V`: `{u,i}{8,16,32}` \
-    <code>Ret **ZipLower**([D, ] V a, V b)</code>: returns the same bits as
-    `InterleaveLower`, but repartitioned into double-width lanes (required in
-    order to use this operation with scalars). The optional `D` (provided for
-    consistency with `ZipUpper`) is `RepartitionToWide<DFromV<V>>`.
-
-*   `Ret`: `MakeWide<T>`; `V`: `{u,i}{8,16,32}` \
-    <code>Ret **ZipUpper**(D, V a, V b)</code>: returns the same bits as
-    `InterleaveUpper`, but repartitioned into double-width lanes (required in
-    order to use this operation with scalars). `D` is
-    `RepartitionToWide<DFromV<V>>`. Only available if `HWY_TARGET !=
-    HWY_SCALAR`.
-
-#### Shift
-
-Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   `V`: `{u,i}` \
-    <code>V **ShiftLeftBytes**&lt;int&gt;([D, ] V)</code>: returns the result of
-    shifting independent *blocks* left by `int` bytes \[1, 15\]. The optional
-    `D` (provided for consistency with `ShiftRightBytes`) is `DFromV<V>`.
-
-*   <code>V **ShiftLeftLanes**&lt;int&gt;([D, ] V)</code>: returns the result of
-    shifting independent *blocks* left by `int` lanes. The optional `D`
-    (provided for consistency with `ShiftRightLanes`) is `DFromV<V>`.
-
-*   `V`: `{u,i}` \
-    <code>V **ShiftRightBytes**&lt;int&gt;(D, V)</code>: returns the result of
-    shifting independent *blocks* right by `int` bytes \[1, 15\], shifting in
-    zeros even for partial vectors. `D` is `DFromV<V>`.
-
-*   <code>V **ShiftRightLanes**&lt;int&gt;(D, V)</code>: returns the result of
-    shifting independent *blocks* right by `int` lanes, shifting in zeros even
-    for partial vectors. `D` is `DFromV<V>`.
-
-*   `V`: `{u,i}` \
-    <code>V **CombineShiftRightBytes**&lt;int&gt;(D, V hi, V lo)</code>: returns
-    a vector of *blocks* each the result of shifting two concatenated *blocks*
-    `hi[i] || lo[i]` right by `int` bytes \[1, 16). `D` is `DFromV<V>`.
-
-*   <code>V **CombineShiftRightLanes**&lt;int&gt;(D, V hi, V lo)</code>: returns
-    a vector of *blocks* each the result of shifting two concatenated *blocks*
-    `hi[i] || lo[i]` right by `int` lanes \[1, 16/sizeof(T)). `D` is
-    `DFromV<V>`.
-
-#### Shuffle
-
-Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   `V`: `{u,i,f}{32}` \
-    <code>V **Shuffle1032**(V)</code>: returns *blocks* with 64-bit halves
-    swapped.
-
-*   `V`: `{u,i,f}{32}` \
-    <code>V **Shuffle0321**(V)</code>: returns *blocks* rotated right (toward
-    the lower end) by 32 bits.
-
-*   `V`: `{u,i,f}{32}` \
-    <code>V **Shuffle2103**(V)</code>: returns *blocks* rotated left (toward the
-    upper end) by 32 bits.
-
-The following are equivalent to `Reverse2` or `Reverse4`, which should be used
-instead because they are more general:
-
-*   `V`: `{u,i,f}{32}` \
-    <code>V **Shuffle2301**(V)</code>: returns *blocks* with 32-bit halves
-    swapped inside 64-bit halves.
-
-*   `V`: `{u,i,f}{64}` \
-    <code>V **Shuffle01**(V)</code>: returns *blocks* with 64-bit halves
-    swapped.
-
-*   `V`: `{u,i,f}{32}` \
-    <code>V **Shuffle0123**(V)</code>: returns *blocks* with lanes in reverse
-    order.
-
-### Swizzle
-
-*   <code>V **OddEven**(V a, V b)</code>: returns a vector whose odd lanes are
-    taken from `a` and the even lanes from `b`.
-
-*   <code>V **OddEvenBlocks**(V a, V b)</code>: returns a vector whose odd
-    blocks are taken from `a` and the even blocks from `b`. Returns `b` if the
-    vector has no more than one block (i.e. is 128 bits or scalar).
-
-*   `V`: `{u,i,f}{32,64}` \
-    <code>V **DupEven**(V v)</code>: returns `r`, the result of copying even
-    lanes to the next higher-indexed lane. For each even lane index `i`,
-    `r[i] == v[i]` and `r[i + 1] == v[i]`.
-
-*   <code>V **ReverseBlocks**(V v)</code>: returns a vector with blocks in
-    reversed order.
-
-*   `V`: `{u,i,f}{32,64}` \
-    <code>V **TableLookupLanes**(V a, unspecified)</code> returns a vector
-    of `a[indices[i]]`, where `unspecified` is the return value of
-    `SetTableIndices(D, &indices[0])` or `IndicesFromVec`. The indices are
-    not limited to blocks, hence this is slower than `TableLookupBytes*` on
-    AVX2/AVX-512. Results are implementation-defined unless `0 <= indices[i]
-    < Lanes(D())`. `indices` are always integers, even if `V` is a
-    floating-point type.
-
-*   `D`: `{u,i}{32,64}` \
-    <code>unspecified **IndicesFromVec**(D d, V idx)</code> prepares for
-    `TableLookupLanes` with integer indices in `idx`, which must be the same bit
-    width as `TFromD<D>` and in the range `[0, Lanes(d))`, but need not be
-    unique.
-
-*   `D`: `{u,i}{32,64}` \
-    <code>unspecified **SetTableIndices**(D d, TI* idx)</code> prepares for
-    `TableLookupLanes` by loading `Lanes(d)` integer indices from `idx`, which
-    must be in the range `[0, Lanes(d))` but need not be unique. The index type
-    `TI` must be an integer of the same size as `TFromD<D>`.
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>V **Reverse**(D, V a)</code> returns a vector with lanes in reversed
-    order (`out[i] == a[Lanes(D()) - 1 - i]`).
-
-The following `ReverseN` must not be called if `Lanes(D()) < N`:
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>V **Reverse2**(D, V a)</code> returns a vector with each group of 2
-    contiguous lanes in reversed order (`out[i] == a[i ^ 1]`).
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>V **Reverse4**(D, V a)</code> returns a vector with each group of 4
-    contiguous lanes in reversed order (`out[i] == a[i ^ 3]`).
-
-*   `V`: `{u,i,f}{16,32,64}` \
-    <code>V **Reverse8**(D, V a)</code> returns a vector with each group of 8
-    contiguous lanes in reversed order (`out[i] == a[i ^ 7]`).
-
-All other ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   `V`: `{u,i,f}{32,64}` \
-    <code>V **DupOdd**(V v)</code>: returns `r`, the result of copying odd lanes
-    to the previous lower-indexed lane. For each odd lane index `i`, `r[i] ==
-    v[i]` and `r[i - 1] == v[i]`.
-
-*   <code>V **SwapAdjacentBlocks**(V v)</code>: returns a vector where blocks of
-    index `2*i` and `2*i+1` are swapped. Results are undefined for vectors with
-    less than two blocks; callers must first check that via `Lanes`.
-
-### Reductions
-
-**Note**: these 'reduce' all lanes to a single result (e.g. sum), which is
-broadcasted to all lanes. To obtain a scalar, you can call `GetLane`.
-
-Being a horizontal operation (across lanes of the same vector), these are slower
-than normal SIMD operations and are typically used outside critical loops.
-
-*   `V`: `{u,i,f}{32,64}` \
-    <code>V **SumOfLanes**(D, V v)</code>: returns the sum of all lanes in each
-    lane.
-
-*   `V`: `{u,i,f}{32,64},{u,i}{16}` \
-    <code>V **MinOfLanes**(D, V v)</code>: returns the minimum-valued lane in
-    each lane.
-
-*   `V`: `{u,i,f}{32,64},{u,i}{16}` \
-    <code>V **MaxOfLanes**(D, V v)</code>: returns the maximum-valued lane in
-    each lane.
-
-### Crypto
-
-Ops in this section are only available if `HWY_TARGET != HWY_SCALAR`:
-
-*   `V`: `u8` \
-    <code>V **AESRound**(V state, V round_key)</code>: one round of AES
-    encryption: `MixColumns(SubBytes(ShiftRows(state))) ^ round_key`. This
-    matches x86 AES-NI. The latency is independent of the input values.
-
-*   `V`: `u8` \
-    <code>V **AESLastRound**(V state, V round_key)</code>: the last round of AES
-    encryption: `SubBytes(ShiftRows(state)) ^ round_key`. This matches x86
-    AES-NI. The latency is independent of the input values.
-
-*   `V`: `u64` \
-    <code>V **CLMulLower**(V a, V b)</code>: carryless multiplication of the
-    lower 64 bits of each 128-bit block into a 128-bit product. The latency is
-    independent of the input values (assuming that is true of normal integer
-    multiplication) so this can safely be used in crypto. Applications that wish
-    to multiply upper with lower halves can `Shuffle01` one of the operands; on
-    x86 that is expected to be latency-neutral.
-
-*   `V`: `u64` \
-    <code>V **CLMulUpper**(V a, V b)</code>: as CLMulLower, but multiplies the
-    upper 64 bits of each 128-bit block.
-
-## Preprocessor macros
-
-*   `HWY_ALIGN`: Prefix for stack-allocated (i.e. automatic storage duration)
-    arrays to ensure they have suitable alignment for Load()/Store(). This is
-    specific to `HWY_TARGET` and should only be used inside `HWY_NAMESPACE`.
-
-    Arrays should also only be used for partial (<= 128-bit) vectors, or
-    `LoadDup128`, because full vectors may be too large for the stack and should
-    be heap-allocated instead (see aligned_allocator.h).
-
-    Example: `HWY_ALIGN float lanes[4];`
-
-*   `HWY_ALIGN_MAX`: as `HWY_ALIGN`, but independent of `HWY_TARGET` and may be
-    used outside `HWY_NAMESPACE`.
-
-## Advanced macros
-
-*   `HWY_IDE` is 0 except when parsed by IDEs; adding it to conditions such as
-    `#if HWY_TARGET != HWY_SCALAR || HWY_IDE` avoids code appearing greyed out.
-
-The following indicate support for certain lane types and expand to 1 or 0:
-
-*   `HWY_HAVE_INTEGER64`: support for 64-bit signed/unsigned integer lanes.
-*   `HWY_HAVE_FLOAT16`: support for 16-bit floating-point lanes.
-*   `HWY_HAVE_FLOAT64`: support for double-precision floating-point lanes.
-
-The above were previously known as `HWY_CAP_INTEGER64`, `HWY_CAP_FLOAT16`, and
-`HWY_CAP_FLOAT64`, respectively. Those `HWY_CAP_*` names are DEPRECATED.
-
-*   `HWY_HAVE_SCALABLE` indicates vector sizes are unknown at compile time, and
-    determined by the CPU.
-
-*   `HWY_MEM_OPS_MIGHT_FAULT` is 1 iff `MaskedLoad` may trigger a (page) fault
-    when attempting to load lanes from unmapped memory, even if the
-    corresponding mask element is false. This is the case on ASAN/MSAN builds,
-    AMD x86 prior to AVX-512, and ARM NEON. If so, users can prevent faults by
-    ensuring memory addresses are aligned to the vector size or at least padded
-    (allocation size increased by at least `Lanes(d)`.
-
-*   `HWY_NATIVE_FMA` expands to 1 if the `MulAdd` etc. ops use native fused
-    multiply-add. Otherwise, `MulAdd(f, m, a)` is implemented as `Add(Mul(f, m),
-    a)`. Checking this can be useful for increasing the tolerance of expected
-    results (around 1E-5 or 1E-6).
-
-The following were used to signal the maximum number of lanes for certain
-operations, but this is no longer necessary (nor possible on SVE/RVV), so they
-are DEPRECATED:
-
-*   `HWY_CAP_GE256`: the current target supports vectors of >= 256 bits.
-*   `HWY_CAP_GE512`: the current target supports vectors of >= 512 bits.
-
-## Detecting supported targets
-
-`SupportedTargets()` returns a non-cached (re-initialized on each call) bitfield
-of the targets supported on the current CPU, detected using CPUID on x86 or
-equivalent. This may include targets that are not in `HWY_TARGETS`, and vice
-versa. If there is no overlap the binary will likely crash. This can only happen
-if:
-
-*   the specified baseline is not supported by the current CPU, which
-    contradicts the definition of baseline, so the configuration is invalid; or
-*   the baseline does not include the enabled/attainable target(s), which are
-    also not supported by the current CPU, and baseline targets (in particular
-    `HWY_SCALAR`) were explicitly disabled.
-
-## Advanced configuration macros
-
-The following macros govern which targets to generate. Unless specified
-otherwise, they may be defined per translation unit, e.g. to disable >128 bit
-vectors in modules that do not benefit from them (if bandwidth-limited or only
-called occasionally). This is safe because `HWY_TARGETS` always includes at
-least one baseline target which `HWY_EXPORT` can use.
-
-*   `HWY_DISABLE_CACHE_CONTROL` makes the cache-control functions no-ops.
-*   `HWY_DISABLE_BMI2_FMA` prevents emitting BMI/BMI2/FMA instructions. This
-    allows using AVX2 in VMs that do not support the other instructions, but
-    only if defined for all translation units.
-
-The following `*_TARGETS` are zero or more `HWY_Target` bits and can be defined
-as an expression, e.g. `-DHWY_DISABLED_TARGETS=(HWY_SSE4|HWY_AVX3)`.
-
-*   `HWY_BROKEN_TARGETS` defaults to a blocklist of known compiler bugs.
-    Defining to 0 disables the blocklist.
-
-*   `HWY_DISABLED_TARGETS` defaults to zero. This allows explicitly disabling
-    targets without interfering with the blocklist.
-
-*   `HWY_BASELINE_TARGETS` defaults to the set whose predefined macros are
-    defined (i.e. those for which the corresponding flag, e.g. -mavx2, was
-    passed to the compiler). If specified, this should be the same for all
-    translation units, otherwise the safety check in SupportedTargets (that all
-    enabled baseline targets are supported) may be inaccurate.
-
-Zero or one of the following macros may be defined to replace the default
-policy for selecting `HWY_TARGETS`:
-
-*   `HWY_COMPILE_ONLY_EMU128` selects only `HWY_EMU128`, which avoids intrinsics
-    but implements all ops using standard C++.
-*   `HWY_COMPILE_ONLY_SCALAR` selects only `HWY_SCALAR`, which implements
-    single-lane-only ops using standard C++.
-*   `HWY_COMPILE_ONLY_STATIC` selects only `HWY_STATIC_TARGET`, which
-    effectively disables dynamic dispatch.
-*   `HWY_COMPILE_ALL_ATTAINABLE` selects all attainable targets (i.e. enabled
-    and permitted by the compiler, independently of autovectorization), which
-    maximizes coverage in tests.
-
-At most one `HWY_COMPILE_ONLY_*` may be defined. `HWY_COMPILE_ALL_ATTAINABLE`
-may also be defined even if one of `HWY_COMPILE_ONLY_*` is, but will then be
-ignored.
-
-If none are defined, but `HWY_IS_TEST` is defined, the default is
-`HWY_COMPILE_ALL_ATTAINABLE`. Otherwise, the default is to select all attainable
-targets except any non-best baseline (typically `HWY_SCALAR`), which reduces
-code size.
-
-## Compiler support
-
-Clang and GCC require e.g. -mavx2 flags in order to use SIMD intrinsics.
-However, this enables AVX2 instructions in the entire translation unit, which
-may violate the one-definition rule and cause crashes. Instead, we use
-target-specific attributes introduced via #pragma. Function using SIMD must
-reside between `HWY_BEFORE_NAMESPACE` and `HWY_AFTER_NAMESPACE`. Alternatively,
-individual functions or lambdas may be prefixed with `HWY_ATTR`.
-
-Immediates (compile-time constants) are specified as template arguments to avoid
-constant-propagation issues with Clang on ARM.
-
-## Type traits
-
-*   `IsFloat<T>()` returns true if the `T` is a floating-point type.
-*   `IsSigned<T>()` returns true if the `T` is a signed or floating-point type.
-*   `LimitsMin/Max<T>()` return the smallest/largest value representable in
-    integer `T`.
-*   `SizeTag<N>` is an empty struct, used to select overloaded functions
-    appropriate for `N` bytes.
-
-*   `MakeUnsigned<T>` is an alias for an unsigned type of the same size as `T`.
-
-*   `MakeSigned<T>` is an alias for a signed type of the same size as `T`.
-
-*   `MakeFloat<T>` is an alias for a floating-point type of the same size as
-    `T`.
-
-*   `MakeWide<T>` is an alias for a type with twice the size of `T` and the same
-    category (unsigned/signed/float).
-
-*   `MakeNarrow<T>` is an alias for a type with half the size of `T` and the
-    same category (unsigned/signed/float).
-
-## Memory allocation
-
-`AllocateAligned<T>(items)` returns a unique pointer to newly allocated memory
-for `items` elements of POD type `T`. The start address is aligned as required
-by `Load/Store`. Furthermore, successive allocations are not congruent modulo a
-platform-specific alignment. This helps prevent false dependencies or cache
-conflicts. The memory allocation is analogous to using `malloc()` and `free()`
-with a `std::unique_ptr` since the returned items are *not* initialized or
-default constructed and it is released using `FreeAlignedBytes()` without
-calling `~T()`.
-
-`MakeUniqueAligned<T>(Args&&... args)` creates a single object in newly
-allocated aligned memory as above but constructed passing the `args` argument to
-`T`'s constructor and returning a unique pointer to it. This is analogous to
-using `std::make_unique` with `new` but for aligned memory since the object is
-constructed and later destructed when the unique pointer is deleted. Typically
-this type `T` is a struct containing multiple members with `HWY_ALIGN` or
-`HWY_ALIGN_MAX`, or arrays whose lengths are known to be a multiple of the
-vector size.
-
-`MakeUniqueAlignedArray<T>(size_t items, Args&&... args)` creates an array of
-objects in newly allocated aligned memory as above and constructs every element
-of the new array using the passed constructor parameters, returning a unique
-pointer to the array. Note that only the first element is guaranteed to be
-aligned to the vector size; because there is no padding between elements,
-the alignment of the remaining elements depends on the size of `T`.
diff --git a/third_party/highway/g3doc/release_testing_process.md b/third_party/highway/g3doc/release_testing_process.md
deleted file mode 100644 (file)
index 589f816..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-## Release testing process
-
-We run the following before a release:
-
-### Windows x86
-
-```
-run_tests.bat
-```
-
-### Linux x86
-
-#### Clang, GCC, ARM cross compile
-
-```
-./run_tests.sh
-```
-
-#### JPEG XL clang (debug, asan, msan)
-
-```
-for VER in 7 8 9 10 11; do
-  rm -rf build_debug$VER && CC=clang-$VER CXX=clang++-$VER BUILD_DIR=build_debug$VER SKIP_TEST=1 ./ci.sh debug && ./ci.sh test -R PassesTest && rm -rf build_debug$VER
-  rm -rf build_asan$VER  && CC=clang-$VER CXX=clang++-$VER BUILD_DIR=build_asan$VER  ./ci.sh asan  && rm -rf build_asan$VER
-  rm -rf build_msan$VER  && CC=clang-$VER CXX=clang++-$VER BUILD_DIR=build_msan$VER  ./ci.sh msan  && rm -rf build_msan$VER
-done
-```
-
-#### JPEG XL tests
-
-```
-git -C third_party/highway pull -r origin master
-git diff
-vi deps.sh
-git commit -a -m"Highway test"
-git push git@github.com:$USER/libjxl.git HEAD:main --force
-```
diff --git a/third_party/highway/hwy/aligned_allocator.cc b/third_party/highway/hwy/aligned_allocator.cc
deleted file mode 100644 (file)
index 7b99479..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/aligned_allocator.h"
-
-#include <stdarg.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>  // malloc
-
-#include <atomic>
-#include <limits>
-
-#include "hwy/base.h"
-
-namespace hwy {
-namespace {
-
-#if HWY_ARCH_RVV && defined(__riscv_vector)
-// Not actually an upper bound on the size, but this value prevents crossing a
-// 4K boundary (relevant on Andes).
-constexpr size_t kAlignment = HWY_MAX(HWY_ALIGNMENT, 4096);
-#else
-constexpr size_t kAlignment = HWY_ALIGNMENT;
-#endif
-
-#if HWY_ARCH_X86
-// On x86, aliasing can only occur at multiples of 2K, but that's too wasteful
-// if this is used for single-vector allocations. 256 is more reasonable.
-constexpr size_t kAlias = kAlignment * 4;
-#else
-constexpr size_t kAlias = kAlignment;
-#endif
-
-#pragma pack(push, 1)
-struct AllocationHeader {
-  void* allocated;
-  size_t payload_size;
-};
-#pragma pack(pop)
-
-// Returns a 'random' (cyclical) offset for AllocateAlignedBytes.
-size_t NextAlignedOffset() {
-  static std::atomic<uint32_t> next{0};
-  constexpr uint32_t kGroups = kAlias / kAlignment;
-  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups;
-  const size_t offset = kAlignment * group;
-  HWY_DASSERT((offset % kAlignment == 0) && offset <= kAlias);
-  return offset;
-}
-
-}  // namespace
-
-HWY_DLLEXPORT void* AllocateAlignedBytes(const size_t payload_size,
-                                         AllocPtr alloc_ptr, void* opaque_ptr) {
-  HWY_ASSERT(payload_size != 0);  // likely a bug in caller
-  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
-    HWY_DASSERT(false && "payload_size too large");
-    return nullptr;
-  }
-
-  size_t offset = NextAlignedOffset();
-
-  // What: | misalign | unused | AllocationHeader |payload
-  // Size: |<= kAlias | offset                    |payload_size
-  //       ^allocated.^aligned.^header............^payload
-  // The header must immediately precede payload, which must remain aligned.
-  // To avoid wasting space, the header resides at the end of `unused`,
-  // which therefore cannot be empty (offset == 0).
-  if (offset == 0) {
-    offset = kAlignment;  // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
-    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
-  }
-
-  const size_t allocated_size = kAlias + offset + payload_size;
-  void* allocated;
-  if (alloc_ptr == nullptr) {
-    allocated = malloc(allocated_size);
-  } else {
-    allocated = (*alloc_ptr)(opaque_ptr, allocated_size);
-  }
-  if (allocated == nullptr) return nullptr;
-  // Always round up even if already aligned - we already asked for kAlias
-  // extra bytes and there's no way to give them back.
-  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
-  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
-  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
-  aligned &= ~(kAlias - 1);
-
-  const uintptr_t payload = aligned + offset;  // still aligned
-
-  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
-  // The allocated_size can be reconstructed from the payload_size.
-  AllocationHeader* header = reinterpret_cast<AllocationHeader*>(payload) - 1;
-  header->allocated = allocated;
-  header->payload_size = payload_size;
-
-  return HWY_ASSUME_ALIGNED(reinterpret_cast<void*>(payload), kAlignment);
-}
-
-HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
-                                    FreePtr free_ptr, void* opaque_ptr) {
-  if (aligned_pointer == nullptr) return;
-
-  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
-  HWY_DASSERT(payload % kAlignment == 0);
-  const AllocationHeader* header =
-      reinterpret_cast<const AllocationHeader*>(payload) - 1;
-
-  if (free_ptr == nullptr) {
-    free(header->allocated);
-  } else {
-    (*free_ptr)(opaque_ptr, header->allocated);
-  }
-}
-
-// static
-HWY_DLLEXPORT void AlignedDeleter::DeleteAlignedArray(void* aligned_pointer,
-                                                      FreePtr free_ptr,
-                                                      void* opaque_ptr,
-                                                      ArrayDeleter deleter) {
-  if (aligned_pointer == nullptr) return;
-
-  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
-  HWY_DASSERT(payload % kAlignment == 0);
-  const AllocationHeader* header =
-      reinterpret_cast<const AllocationHeader*>(payload) - 1;
-
-  if (deleter) {
-    (*deleter)(aligned_pointer, header->payload_size);
-  }
-
-  if (free_ptr == nullptr) {
-    free(header->allocated);
-  } else {
-    (*free_ptr)(opaque_ptr, header->allocated);
-  }
-}
-
-}  // namespace hwy
diff --git a/third_party/highway/hwy/aligned_allocator.h b/third_party/highway/hwy/aligned_allocator.h
deleted file mode 100644 (file)
index f6bfca1..0000000
+++ /dev/null
@@ -1,212 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
-#define HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
-
-// Memory allocator with support for alignment and offsets.
-
-#include <stddef.h>
-
-#include <memory>
-
-#include "hwy/highway_export.h"
-
-namespace hwy {
-
-// Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
-// requires a literal. This matches typical L1 cache line sizes, which prevents
-// false sharing.
-#define HWY_ALIGNMENT 64
-
-// Pointers to functions equivalent to malloc/free with an opaque void* passed
-// to them.
-using AllocPtr = void* (*)(void* opaque, size_t bytes);
-using FreePtr = void (*)(void* opaque, void* memory);
-
-// Returns null or a pointer to at least `payload_size` (which can be zero)
-// bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
-// the vector size. Calls `alloc` with the passed `opaque` pointer to obtain
-// memory or malloc() if it is null.
-HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size,
-                                         AllocPtr alloc_ptr, void* opaque_ptr);
-
-// Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
-// must have been returned from a previous call to `AllocateAlignedBytes`.
-// Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
-// `free_ptr` function is null, uses the default free().
-HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer,
-                                    FreePtr free_ptr, void* opaque_ptr);
-
-// Class that deletes the aligned pointer passed to operator() calling the
-// destructor before freeing the pointer. This is equivalent to the
-// std::default_delete but for aligned objects. For a similar deleter equivalent
-// to free() for aligned memory see AlignedFreer().
-class AlignedDeleter {
- public:
-  AlignedDeleter() : free_(nullptr), opaque_ptr_(nullptr) {}
-  AlignedDeleter(FreePtr free_ptr, void* opaque_ptr)
-      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
-
-  template <typename T>
-  void operator()(T* aligned_pointer) const {
-    return DeleteAlignedArray(aligned_pointer, free_, opaque_ptr_,
-                              TypedArrayDeleter<T>);
-  }
-
- private:
-  template <typename T>
-  static void TypedArrayDeleter(void* ptr, size_t size_in_bytes) {
-    size_t elems = size_in_bytes / sizeof(T);
-    for (size_t i = 0; i < elems; i++) {
-      // Explicitly call the destructor on each element.
-      (static_cast<T*>(ptr) + i)->~T();
-    }
-  }
-
-  // Function prototype that calls the destructor for each element in a typed
-  // array. TypeArrayDeleter<T> would match this prototype.
-  using ArrayDeleter = void (*)(void* t_ptr, size_t t_size);
-
-  HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer,
-                                               FreePtr free_ptr,
-                                               void* opaque_ptr,
-                                               ArrayDeleter deleter);
-
-  FreePtr free_;
-  void* opaque_ptr_;
-};
-
-// Unique pointer to T with custom aligned deleter. This can be a single
-// element U or an array of element if T is a U[]. The custom aligned deleter
-// will call the destructor on U or each element of a U[] in the array case.
-template <typename T>
-using AlignedUniquePtr = std::unique_ptr<T, AlignedDeleter>;
-
-// Aligned memory equivalent of make_unique<T> using the custom allocators
-// alloc/free with the passed `opaque` pointer. This function calls the
-// constructor with the passed Args... and calls the destructor of the object
-// when the AlignedUniquePtr is destroyed.
-template <typename T, typename... Args>
-AlignedUniquePtr<T> MakeUniqueAlignedWithAlloc(AllocPtr alloc, FreePtr free,
-                                               void* opaque, Args&&... args) {
-  T* ptr = static_cast<T*>(AllocateAlignedBytes(sizeof(T), alloc, opaque));
-  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
-                             AlignedDeleter(free, opaque));
-}
-
-// Similar to MakeUniqueAlignedWithAlloc but using the default alloc/free
-// functions.
-template <typename T, typename... Args>
-AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) {
-  T* ptr = static_cast<T*>(AllocateAlignedBytes(
-      sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr));
-  return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...),
-                             AlignedDeleter());
-}
-
-// Helpers for array allocators (avoids overflow)
-namespace detail {
-
-// Returns x such that 1u << x == n (if n is a power of two).
-static inline constexpr size_t ShiftCount(size_t n) {
-  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
-}
-
-template <typename T>
-T* AllocateAlignedItems(size_t items, AllocPtr alloc_ptr, void* opaque_ptr) {
-  constexpr size_t size = sizeof(T);
-
-  constexpr bool is_pow2 = (size & (size - 1)) == 0;
-  constexpr size_t bits = ShiftCount(size);
-  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");
-
-  const size_t bytes = is_pow2 ? items << bits : items * size;
-  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
-  if (check != items) {
-    return nullptr;  // overflowed
-  }
-  return static_cast<T*>(AllocateAlignedBytes(bytes, alloc_ptr, opaque_ptr));
-}
-
-}  // namespace detail
-
-// Aligned memory equivalent of make_unique<T[]> for array types using the
-// custom allocators alloc/free. This function calls the constructor with the
-// passed Args... on every created item. The destructor of each element will be
-// called when the AlignedUniquePtr is destroyed.
-template <typename T, typename... Args>
-AlignedUniquePtr<T[]> MakeUniqueAlignedArrayWithAlloc(
-    size_t items, AllocPtr alloc, FreePtr free, void* opaque, Args&&... args) {
-  T* ptr = detail::AllocateAlignedItems<T>(items, alloc, opaque);
-  if (ptr != nullptr) {
-    for (size_t i = 0; i < items; i++) {
-      new (ptr + i) T(std::forward<Args>(args)...);
-    }
-  }
-  return AlignedUniquePtr<T[]>(ptr, AlignedDeleter(free, opaque));
-}
-
-template <typename T, typename... Args>
-AlignedUniquePtr<T[]> MakeUniqueAlignedArray(size_t items, Args&&... args) {
-  return MakeUniqueAlignedArrayWithAlloc<T, Args...>(
-      items, nullptr, nullptr, nullptr, std::forward<Args>(args)...);
-}
-
-// Custom deleter for std::unique_ptr equivalent to using free() as a deleter
-// but for aligned memory.
-class AlignedFreer {
- public:
-  // Pass address of this to ctor to skip deleting externally-owned memory.
-  static void DoNothing(void* /*opaque*/, void* /*aligned_pointer*/) {}
-
-  AlignedFreer() : free_(nullptr), opaque_ptr_(nullptr) {}
-  AlignedFreer(FreePtr free_ptr, void* opaque_ptr)
-      : free_(free_ptr), opaque_ptr_(opaque_ptr) {}
-
-  template <typename T>
-  void operator()(T* aligned_pointer) const {
-    // TODO(deymo): assert that we are using a POD type T.
-    FreeAlignedBytes(aligned_pointer, free_, opaque_ptr_);
-  }
-
- private:
-  FreePtr free_;
-  void* opaque_ptr_;
-};
-
-// Unique pointer to single POD, or (if T is U[]) an array of POD. For non POD
-// data use AlignedUniquePtr.
-template <typename T>
-using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;
-
-// Allocate an aligned and uninitialized array of POD values as a unique_ptr.
-// Upon destruction of the unique_ptr the aligned array will be freed.
-template <typename T>
-AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items, AllocPtr alloc,
-                                          FreePtr free, void* opaque) {
-  return AlignedFreeUniquePtr<T[]>(
-      detail::AllocateAlignedItems<T>(items, alloc, opaque),
-      AlignedFreer(free, opaque));
-}
-
-// Same as previous AllocateAligned(), using default allocate/free functions.
-template <typename T>
-AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
-  return AllocateAligned<T>(items, nullptr, nullptr, nullptr);
-}
-
-}  // namespace hwy
-#endif  // HIGHWAY_HWY_ALIGNED_ALLOCATOR_H_
diff --git a/third_party/highway/hwy/aligned_allocator_test.cc b/third_party/highway/hwy/aligned_allocator_test.cc
deleted file mode 100644 (file)
index ced08e7..0000000
+++ /dev/null
@@ -1,278 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/aligned_allocator.h"
-
-#include <stddef.h>
-
-#include <array>
-#include <new>
-#include <random>
-#include <vector>
-
-#include "gtest/gtest.h"
-
-namespace {
-
-// Sample object that keeps track on an external counter of how many times was
-// the explicit constructor and destructor called.
-template <size_t N>
-class SampleObject {
- public:
-  SampleObject() { data_[0] = 'a'; }
-  explicit SampleObject(int* counter) : counter_(counter) {
-    if (counter) (*counter)++;
-    data_[0] = 'b';
-  }
-
-  ~SampleObject() {
-    if (counter_) (*counter_)--;
-  }
-
-  static_assert(N > sizeof(int*), "SampleObject size too small.");
-  int* counter_ = nullptr;
-  char data_[N - sizeof(int*)];
-};
-
-class FakeAllocator {
- public:
-  // static AllocPtr and FreePtr member to be used with the alligned
-  // allocator. These functions calls the private non-static members.
-  static void* StaticAlloc(void* opaque, size_t bytes) {
-    return reinterpret_cast<FakeAllocator*>(opaque)->Alloc(bytes);
-  }
-  static void StaticFree(void* opaque, void* memory) {
-    return reinterpret_cast<FakeAllocator*>(opaque)->Free(memory);
-  }
-
-  // Returns the number of pending allocations to be freed.
-  size_t PendingAllocs() { return allocs_.size(); }
-
- private:
-  void* Alloc(size_t bytes) {
-    void* ret = malloc(bytes);
-    allocs_.insert(ret);
-    return ret;
-  }
-  void Free(void* memory) {
-    if (!memory) return;
-    EXPECT_NE(allocs_.end(), allocs_.find(memory));
-    allocs_.erase(memory);
-    free(memory);
-  }
-
-  std::set<void*> allocs_;
-};
-
-}  // namespace
-
-namespace hwy {
-
-class AlignedAllocatorTest : public testing::Test {};
-
-TEST(AlignedAllocatorTest, FreeNullptr) {
-  // Calling free with a nullptr is always ok.
-  FreeAlignedBytes(/*aligned_pointer=*/nullptr, /*free_ptr=*/nullptr,
-                   /*opaque_ptr=*/nullptr);
-}
-
-TEST(AlignedAllocatorTest, Log2) {
-  EXPECT_EQ(0u, detail::ShiftCount(1));
-  EXPECT_EQ(1u, detail::ShiftCount(2));
-  EXPECT_EQ(3u, detail::ShiftCount(8));
-}
-
-// Allocator returns null when it detects overflow of items * sizeof(T).
-TEST(AlignedAllocatorTest, Overflow) {
-  constexpr size_t max = ~size_t(0);
-  constexpr size_t msb = (max >> 1) + 1;
-  using Size5 = std::array<uint8_t, 5>;
-  using Size10 = std::array<uint8_t, 10>;
-  EXPECT_EQ(nullptr,
-            detail::AllocateAlignedItems<uint32_t>(max / 2, nullptr, nullptr));
-  EXPECT_EQ(nullptr,
-            detail::AllocateAlignedItems<uint32_t>(max / 3, nullptr, nullptr));
-  EXPECT_EQ(nullptr,
-            detail::AllocateAlignedItems<Size5>(max / 4, nullptr, nullptr));
-  EXPECT_EQ(nullptr,
-            detail::AllocateAlignedItems<uint16_t>(msb, nullptr, nullptr));
-  EXPECT_EQ(nullptr,
-            detail::AllocateAlignedItems<double>(msb + 1, nullptr, nullptr));
-  EXPECT_EQ(nullptr,
-            detail::AllocateAlignedItems<Size10>(msb / 4, nullptr, nullptr));
-}
-
-TEST(AlignedAllocatorTest, AllocDefaultPointers) {
-  const size_t kSize = 7777;
-  void* ptr = AllocateAlignedBytes(kSize, /*alloc_ptr=*/nullptr,
-                                   /*opaque_ptr=*/nullptr);
-  ASSERT_NE(nullptr, ptr);
-  // Make sure the pointer is actually aligned.
-  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
-  char* p = static_cast<char*>(ptr);
-  size_t ret = 0;
-  for (size_t i = 0; i < kSize; i++) {
-    // Performs a computation using p[] to prevent it being optimized away.
-    p[i] = static_cast<char>(i & 0x7F);
-    if (i) ret += static_cast<size_t>(p[i] * p[i - 1]);
-  }
-  EXPECT_NE(0U, ret);
-  FreeAlignedBytes(ptr, /*free_ptr=*/nullptr, /*opaque_ptr=*/nullptr);
-}
-
-TEST(AlignedAllocatorTest, EmptyAlignedUniquePtr) {
-  AlignedUniquePtr<SampleObject<32>> ptr(nullptr, AlignedDeleter());
-  AlignedUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedDeleter());
-}
-
-TEST(AlignedAllocatorTest, EmptyAlignedFreeUniquePtr) {
-  AlignedFreeUniquePtr<SampleObject<32>> ptr(nullptr, AlignedFreer());
-  AlignedFreeUniquePtr<SampleObject<32>[]> arr(nullptr, AlignedFreer());
-}
-
-TEST(AlignedAllocatorTest, CustomAlloc) {
-  FakeAllocator fake_alloc;
-
-  const size_t kSize = 7777;
-  void* ptr =
-      AllocateAlignedBytes(kSize, &FakeAllocator::StaticAlloc, &fake_alloc);
-  ASSERT_NE(nullptr, ptr);
-  // We should have only requested one alloc from the allocator.
-  EXPECT_EQ(1U, fake_alloc.PendingAllocs());
-  // Make sure the pointer is actually aligned.
-  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr) % HWY_ALIGNMENT);
-  FreeAlignedBytes(ptr, &FakeAllocator::StaticFree, &fake_alloc);
-  EXPECT_EQ(0U, fake_alloc.PendingAllocs());
-}
-
-TEST(AlignedAllocatorTest, MakeUniqueAlignedDefaultConstructor) {
-  {
-    auto ptr = MakeUniqueAligned<SampleObject<24>>();
-    // Default constructor sets the data_[0] to 'a'.
-    EXPECT_EQ('a', ptr->data_[0]);
-    EXPECT_EQ(nullptr, ptr->counter_);
-  }
-}
-
-TEST(AlignedAllocatorTest, MakeUniqueAligned) {
-  int counter = 0;
-  {
-    // Creates the object, initializes it with the explicit constructor and
-    // returns an unique_ptr to it.
-    auto ptr = MakeUniqueAligned<SampleObject<24>>(&counter);
-    EXPECT_EQ(1, counter);
-    // Custom constructor sets the data_[0] to 'b'.
-    EXPECT_EQ('b', ptr->data_[0]);
-  }
-  EXPECT_EQ(0, counter);
-}
-
-TEST(AlignedAllocatorTest, MakeUniqueAlignedArray) {
-  int counter = 0;
-  {
-    // Creates the array of objects and initializes them with the explicit
-    // constructor.
-    auto arr = MakeUniqueAlignedArray<SampleObject<24>>(7, &counter);
-    EXPECT_EQ(7, counter);
-    for (size_t i = 0; i < 7; i++) {
-      // Custom constructor sets the data_[0] to 'b'.
-      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
-    }
-  }
-  EXPECT_EQ(0, counter);
-}
-
-TEST(AlignedAllocatorTest, AllocSingleInt) {
-  auto ptr = AllocateAligned<uint32_t>(1);
-  ASSERT_NE(nullptr, ptr.get());
-  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
-  // Force delete of the unique_ptr now to check that it doesn't crash.
-  ptr.reset(nullptr);
-  EXPECT_EQ(nullptr, ptr.get());
-}
-
-TEST(AlignedAllocatorTest, AllocMultipleInt) {
-  const size_t kSize = 7777;
-  auto ptr = AllocateAligned<uint32_t>(kSize);
-  ASSERT_NE(nullptr, ptr.get());
-  EXPECT_EQ(0U, reinterpret_cast<uintptr_t>(ptr.get()) % HWY_ALIGNMENT);
-  // ptr[i] is actually (*ptr.get())[i] which will use the operator[] of the
-  // underlying type chosen by AllocateAligned() for the std::unique_ptr.
-  EXPECT_EQ(&(ptr[0]) + 1, &(ptr[1]));
-
-  size_t ret = 0;
-  for (size_t i = 0; i < kSize; i++) {
-    // Performs a computation using ptr[] to prevent it being optimized away.
-    ptr[i] = static_cast<uint32_t>(i);
-    if (i) ret += ptr[i] * ptr[i - 1];
-  }
-  EXPECT_NE(0U, ret);
-}
-
-TEST(AlignedAllocatorTest, AllocateAlignedObjectWithoutDestructor) {
-  int counter = 0;
-  {
-    // This doesn't call the constructor.
-    auto obj = AllocateAligned<SampleObject<24>>(1);
-    obj[0].counter_ = &counter;
-  }
-  // Destroying the unique_ptr shouldn't have called the destructor of the
-  // SampleObject<24>.
-  EXPECT_EQ(0, counter);
-}
-
-TEST(AlignedAllocatorTest, MakeUniqueAlignedArrayWithCustomAlloc) {
-  FakeAllocator fake_alloc;
-  int counter = 0;
-  {
-    // Creates the array of objects and initializes them with the explicit
-    // constructor.
-    auto arr = MakeUniqueAlignedArrayWithAlloc<SampleObject<24>>(
-        7, FakeAllocator::StaticAlloc, FakeAllocator::StaticFree, &fake_alloc,
-        &counter);
-    ASSERT_NE(nullptr, arr.get());
-    // An array should still only call a single allocation.
-    EXPECT_EQ(1u, fake_alloc.PendingAllocs());
-    EXPECT_EQ(7, counter);
-    for (size_t i = 0; i < 7; i++) {
-      // Custom constructor sets the data_[0] to 'b'.
-      EXPECT_EQ('b', arr[i].data_[0]) << "Where i = " << i;
-    }
-  }
-  EXPECT_EQ(0, counter);
-  EXPECT_EQ(0u, fake_alloc.PendingAllocs());
-}
-
-TEST(AlignedAllocatorTest, DefaultInit) {
-  // The test is whether this compiles. Default-init is useful for output params
-  // and per-thread storage.
-  std::vector<AlignedUniquePtr<int[]>> ptrs;
-  std::vector<AlignedFreeUniquePtr<double[]>> free_ptrs;
-  ptrs.resize(128);
-  free_ptrs.resize(128);
-  // The following is to prevent elision of the pointers.
-  std::mt19937 rng(129);  // Emscripten lacks random_device.
-  std::uniform_int_distribution<size_t> dist(0, 127);
-  ptrs[dist(rng)] = MakeUniqueAlignedArray<int>(123);
-  free_ptrs[dist(rng)] = AllocateAligned<double>(456);
-  // "Use" pointer without resorting to printf. 0 == 0. Can't shift by 64.
-  const auto addr1 = reinterpret_cast<uintptr_t>(ptrs[dist(rng)].get());
-  const auto addr2 = reinterpret_cast<uintptr_t>(free_ptrs[dist(rng)].get());
-  constexpr size_t kBits = sizeof(uintptr_t) * 8;
-  EXPECT_EQ((addr1 >> (kBits - 1)) >> (kBits - 1),
-            (addr2 >> (kBits - 1)) >> (kBits - 1));
-}
-
-}  // namespace hwy
diff --git a/third_party/highway/hwy/base.h b/third_party/highway/hwy/base.h
deleted file mode 100644 (file)
index c87fa96..0000000
+++ /dev/null
@@ -1,924 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_BASE_H_
-#define HIGHWAY_HWY_BASE_H_
-
-// For SIMD module implementations and their callers, target-independent.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/detect_compiler_arch.h"
-#include "hwy/highway_export.h"
-
-#if HWY_COMPILER_MSVC
-#include <string.h>  // memcpy
-#endif
-#if HWY_ARCH_X86
-#include <atomic>
-#endif
-
-//------------------------------------------------------------------------------
-// Compiler-specific definitions
-
-#define HWY_STR_IMPL(macro) #macro
-#define HWY_STR(macro) HWY_STR_IMPL(macro)
-
-#if HWY_COMPILER_MSVC
-
-#include <intrin.h>
-
-#define HWY_RESTRICT __restrict
-#define HWY_INLINE __forceinline
-#define HWY_NOINLINE __declspec(noinline)
-#define HWY_FLATTEN
-#define HWY_NORETURN __declspec(noreturn)
-#define HWY_LIKELY(expr) (expr)
-#define HWY_UNLIKELY(expr) (expr)
-#define HWY_PRAGMA(tokens) __pragma(tokens)
-#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
-#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
-#define HWY_MAYBE_UNUSED
-#define HWY_HAS_ASSUME_ALIGNED 0
-#if (_MSC_VER >= 1700)
-#define HWY_MUST_USE_RESULT _Check_return_
-#else
-#define HWY_MUST_USE_RESULT
-#endif
-
-#else
-
-#define HWY_RESTRICT __restrict__
-// force inlining without optimization enabled creates very inefficient code
-// that can cause compiler timeout
-#ifdef __OPTIMIZE__
-#define HWY_INLINE inline __attribute__((always_inline))
-#else
-#define HWY_INLINE inline
-#endif
-#define HWY_NOINLINE __attribute__((noinline))
-#define HWY_FLATTEN __attribute__((flatten))
-#define HWY_NORETURN __attribute__((noreturn))
-#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
-#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
-#define HWY_PRAGMA(tokens) _Pragma(#tokens)
-#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
-#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
-// Encountered "attribute list cannot appear here" when using the C++17
-// [[maybe_unused]], so only use the old style attribute for now.
-#define HWY_MAYBE_UNUSED __attribute__((unused))
-#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))
-
-#endif  // !HWY_COMPILER_MSVC
-
-//------------------------------------------------------------------------------
-// Builtin/attributes
-
-// Enables error-checking of format strings.
-#if HWY_HAS_ATTRIBUTE(__format__)
-#define HWY_FORMAT(idx_fmt, idx_arg) \
-  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
-#else
-#define HWY_FORMAT(idx_fmt, idx_arg)
-#endif
-
-// Returns a void* pointer which the compiler then assumes is N-byte aligned.
-// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
-//
-// The assignment semantics are required by GCC/Clang. ICC provides an in-place
-// __assume_aligned, whereas MSVC's __assume appears unsuitable.
-#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
-#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
-#else
-#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
-#endif
-
-// Clang and GCC require attributes on each function into which SIMD intrinsics
-// are inlined. Support both per-function annotation (HWY_ATTR) for lambdas and
-// automatic annotation via pragmas.
-#if HWY_COMPILER_CLANG
-#define HWY_PUSH_ATTRIBUTES(targets_str)                                \
-  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
-                                  apply_to = function))
-#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
-#elif HWY_COMPILER_GCC
-#define HWY_PUSH_ATTRIBUTES(targets_str) \
-  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
-#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
-#else
-#define HWY_PUSH_ATTRIBUTES(targets_str)
-#define HWY_POP_ATTRIBUTES
-#endif
-
-//------------------------------------------------------------------------------
-// Macros
-
-#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED
-
-#define HWY_CONCAT_IMPL(a, b) a##b
-#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)
-
-#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
-#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#if HWY_COMPILER_GCC_ACTUAL
-// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
-#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
-#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
-#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
-#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
-#define HWY_DEFAULT_UNROLL HWY_UNROLL()
-#else
-#define HWY_UNROLL(factor)
-#define HWY_DEFAULT_UNROLL HWY_UNROLL()
-#endif
-
-
-// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
-// typical asm volatile("" : : : "memory") has no effect, whereas atomic fence
-// does, without generating code.
-#if HWY_ARCH_X86
-#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
-#else
-// TODO(janwas): investigate alternatives. On ARM, the above generates barriers.
-#define HWY_FENCE
-#endif
-
-// 4 instances of a given literal value, useful as input to LoadDup128.
-#define HWY_REP4(literal) literal, literal, literal, literal
-
-#define HWY_ABORT(format, ...) \
-  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)
-
-// Always enabled.
-#define HWY_ASSERT(condition)             \
-  do {                                    \
-    if (!(condition)) {                   \
-      HWY_ABORT("Assert %s", #condition); \
-    }                                     \
-  } while (0)
-
-#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
-#define HWY_IS_MSAN 1
-#else
-#define HWY_IS_MSAN 0
-#endif
-
-#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
-#define HWY_IS_ASAN 1
-#else
-#define HWY_IS_ASAN 0
-#endif
-
-#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
-#define HWY_IS_TSAN 1
-#else
-#define HWY_IS_TSAN 0
-#endif
-
-// MSAN may cause lengthy build times or false positives e.g. in AVX3 DemoteTo.
-// You can disable MSAN by adding this attribute to the function that fails.
-#if HWY_IS_MSAN
-#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
-#else
-#define HWY_ATTR_NO_MSAN
-#endif
-
-// For enabling HWY_DASSERT and shortening tests in slower debug builds
-#if !defined(HWY_IS_DEBUG_BUILD)
-// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
-// MSVC defines NDEBUG (if not, could instead check _DEBUG).
-#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
-    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
-#define HWY_IS_DEBUG_BUILD 1
-#else
-#define HWY_IS_DEBUG_BUILD 0
-#endif
-#endif  // HWY_IS_DEBUG_BUILD
-
-#if HWY_IS_DEBUG_BUILD
-#define HWY_DASSERT(condition) HWY_ASSERT(condition)
-#else
-#define HWY_DASSERT(condition) \
-  do {                         \
-  } while (0)
-#endif
-
-namespace hwy {
-
-//------------------------------------------------------------------------------
-// kMaxVectorSize (undocumented, pending removal)
-
-#if HWY_ARCH_X86
-static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
-#elif HWY_ARCH_RVV && defined(__riscv_vector)
-// Not actually an upper bound on the size.
-static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
-#else
-static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
-#endif
-
-//------------------------------------------------------------------------------
-// Alignment
-
-// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
-// should be allocated dynamically via aligned_allocator.h because Lanes() may
-// exceed the stack size.
-#if HWY_ARCH_X86
-#define HWY_ALIGN_MAX alignas(64)
-#elif HWY_ARCH_RVV && defined(__riscv_vector)
-#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
-#else
-#define HWY_ALIGN_MAX alignas(16)
-#endif
-
-//------------------------------------------------------------------------------
-// Lane types
-
-// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
-// by concatenating base type and bits.
-
-#pragma pack(push, 1)
-
-// ACLE (https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html):
-// always supported on aarch64, for v7 only if -mfp16-format is given.
-#if ((HWY_ARCH_ARM_A64 || (__ARM_FP & 2)) && HWY_COMPILER_GCC)
-using float16_t = __fp16;
-// C11 extension ISO/IEC TS 18661-3:2015 but not supported on all targets.
-// Required for Clang RVV if the float16 extension is used.
-#elif HWY_ARCH_RVV && HWY_COMPILER_CLANG && defined(__riscv_zvfh)
-using float16_t = _Float16;
-// Otherwise emulate
-#else
-struct float16_t {
-  uint16_t bits;
-};
-#endif
-
-struct bfloat16_t {
-  uint16_t bits;
-};
-
-#pragma pack(pop)
-
-using float32_t = float;
-using float64_t = double;
-
-#pragma pack(push, 1)
-
-// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
-// https://reviews.llvm.org/D86310
-struct alignas(16) uint128_t {
-  uint64_t lo;  // little-endian layout
-  uint64_t hi;
-};
-
-// 64 bit key plus 64 bit value. Faster than using uint128_t when only the key
-// field is to be compared (Lt128Upper instead of Lt128).
-struct alignas(16) K64V64 {
-  uint64_t value;  // little-endian layout
-  uint64_t key;
-};
-
-#pragma pack(pop)
-
-static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
-                                              const uint128_t& b) {
-  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
-}
-// Required for std::greater.
-static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
-                                              const uint128_t& b) {
-  return b < a;
-}
-
-static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
-                                              const K64V64& b) {
-  return a.key < b.key;
-}
-// Required for std::greater.
-static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
-                                              const K64V64& b) {
-  return b < a;
-}
-
-//------------------------------------------------------------------------------
-// Controlling overload resolution (SFINAE)
-
-template <bool Condition>
-struct EnableIfT {};
-template <>
-struct EnableIfT<true> {
-  using type = void;
-};
-
-template <bool Condition>
-using EnableIf = typename EnableIfT<Condition>::type;
-
-template <typename T, typename U>
-struct IsSameT {
-  enum { value = 0 };
-};
-
-template <typename T>
-struct IsSameT<T, T> {
-  enum { value = 1 };
-};
-
-template <typename T, typename U>
-HWY_API constexpr bool IsSame() {
-  return IsSameT<T, U>::value;
-}
-
-// Insert into template/function arguments to enable this overload only for
-// vectors of AT MOST this many bits.
-//
-// Note that enabling for exactly 128 bits is unnecessary because a function can
-// simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
-// sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
-#define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr
-#define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr
-#define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr
-#define HWY_IF_GE32(T, N) hwy::EnableIf<N * sizeof(T) >= 4>* = nullptr
-#define HWY_IF_GE64(T, N) hwy::EnableIf<N * sizeof(T) >= 8>* = nullptr
-#define HWY_IF_GE128(T, N) hwy::EnableIf<N * sizeof(T) >= 16>* = nullptr
-#define HWY_IF_GT128(T, N) hwy::EnableIf<(N * sizeof(T) > 16)>* = nullptr
-
-#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
-#define HWY_IF_SIGNED(T) \
-  hwy::EnableIf<IsSigned<T>() && !IsFloat<T>()>* = nullptr
-#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
-#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
-
-#define HWY_IF_LANE_SIZE(T, bytes) \
-  hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
-#define HWY_IF_NOT_LANE_SIZE(T, bytes) \
-  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
-#define HWY_IF_LANE_SIZE_LT(T, bytes) \
-  hwy::EnableIf<sizeof(T) < (bytes)>* = nullptr
-
-#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
-  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr
-
-// Empty struct used as a size tag type.
-template <size_t N>
-struct SizeTag {};
-
-template <class T>
-struct RemoveConstT {
-  using type = T;
-};
-template <class T>
-struct RemoveConstT<const T> {
-  using type = T;
-};
-
-template <class T>
-using RemoveConst = typename RemoveConstT<T>::type;
-
-//------------------------------------------------------------------------------
-// Type relations
-
-namespace detail {
-
-template <typename T>
-struct Relations;
-template <>
-struct Relations<uint8_t> {
-  using Unsigned = uint8_t;
-  using Signed = int8_t;
-  using Wide = uint16_t;
-  enum { is_signed = 0 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<int8_t> {
-  using Unsigned = uint8_t;
-  using Signed = int8_t;
-  using Wide = int16_t;
-  enum { is_signed = 1 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<uint16_t> {
-  using Unsigned = uint16_t;
-  using Signed = int16_t;
-  using Wide = uint32_t;
-  using Narrow = uint8_t;
-  enum { is_signed = 0 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<int16_t> {
-  using Unsigned = uint16_t;
-  using Signed = int16_t;
-  using Wide = int32_t;
-  using Narrow = int8_t;
-  enum { is_signed = 1 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<uint32_t> {
-  using Unsigned = uint32_t;
-  using Signed = int32_t;
-  using Float = float;
-  using Wide = uint64_t;
-  using Narrow = uint16_t;
-  enum { is_signed = 0 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<int32_t> {
-  using Unsigned = uint32_t;
-  using Signed = int32_t;
-  using Float = float;
-  using Wide = int64_t;
-  using Narrow = int16_t;
-  enum { is_signed = 1 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<uint64_t> {
-  using Unsigned = uint64_t;
-  using Signed = int64_t;
-  using Float = double;
-  using Wide = uint128_t;
-  using Narrow = uint32_t;
-  enum { is_signed = 0 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<int64_t> {
-  using Unsigned = uint64_t;
-  using Signed = int64_t;
-  using Float = double;
-  using Narrow = int32_t;
-  enum { is_signed = 1 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<uint128_t> {
-  using Unsigned = uint128_t;
-  using Narrow = uint64_t;
-  enum { is_signed = 0 };
-  enum { is_float = 0 };
-};
-template <>
-struct Relations<float16_t> {
-  using Unsigned = uint16_t;
-  using Signed = int16_t;
-  using Float = float16_t;
-  using Wide = float;
-  enum { is_signed = 1 };
-  enum { is_float = 1 };
-};
-template <>
-struct Relations<bfloat16_t> {
-  using Unsigned = uint16_t;
-  using Signed = int16_t;
-  using Wide = float;
-  enum { is_signed = 1 };
-  enum { is_float = 1 };
-};
-template <>
-struct Relations<float> {
-  using Unsigned = uint32_t;
-  using Signed = int32_t;
-  using Float = float;
-  using Wide = double;
-  using Narrow = float16_t;
-  enum { is_signed = 1 };
-  enum { is_float = 1 };
-};
-template <>
-struct Relations<double> {
-  using Unsigned = uint64_t;
-  using Signed = int64_t;
-  using Float = double;
-  using Narrow = float;
-  enum { is_signed = 1 };
-  enum { is_float = 1 };
-};
-
-template <size_t N>
-struct TypeFromSize;
-template <>
-struct TypeFromSize<1> {
-  using Unsigned = uint8_t;
-  using Signed = int8_t;
-};
-template <>
-struct TypeFromSize<2> {
-  using Unsigned = uint16_t;
-  using Signed = int16_t;
-};
-template <>
-struct TypeFromSize<4> {
-  using Unsigned = uint32_t;
-  using Signed = int32_t;
-  using Float = float;
-};
-template <>
-struct TypeFromSize<8> {
-  using Unsigned = uint64_t;
-  using Signed = int64_t;
-  using Float = double;
-};
-template <>
-struct TypeFromSize<16> {
-  using Unsigned = uint128_t;
-};
-
-}  // namespace detail
-
-// Aliases for types of a different category, but the same size.
-template <typename T>
-using MakeUnsigned = typename detail::Relations<T>::Unsigned;
-template <typename T>
-using MakeSigned = typename detail::Relations<T>::Signed;
-template <typename T>
-using MakeFloat = typename detail::Relations<T>::Float;
-
-// Aliases for types of the same category, but different size.
-template <typename T>
-using MakeWide = typename detail::Relations<T>::Wide;
-template <typename T>
-using MakeNarrow = typename detail::Relations<T>::Narrow;
-
-// Obtain type from its size [bytes].
-template <size_t N>
-using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
-template <size_t N>
-using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
-template <size_t N>
-using FloatFromSize = typename detail::TypeFromSize<N>::Float;
-
-// Avoid confusion with SizeTag where the parameter is a lane size.
-using UnsignedTag = SizeTag<0>;
-using SignedTag = SizeTag<0x100>;  // integer
-using FloatTag = SizeTag<0x200>;
-
-template <typename T, class R = detail::Relations<T>>
-constexpr auto TypeTag() -> hwy::SizeTag<((R::is_signed + R::is_float) << 8)> {
-  return hwy::SizeTag<((R::is_signed + R::is_float) << 8)>();
-}
-
-// For when we only want to distinguish FloatTag from everything else.
-using NonFloatTag = SizeTag<0x400>;
-
-template <typename T, class R = detail::Relations<T>>
-constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
-  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
-}
-
-//------------------------------------------------------------------------------
-// Type traits
-
-template <typename T>
-HWY_API constexpr bool IsFloat() {
-  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
-  // from a float, not compared.
-  return IsSame<T, float>() || IsSame<T, double>();
-}
-
-template <typename T>
-HWY_API constexpr bool IsSigned() {
-  return T(0) > T(-1);
-}
-template <>
-constexpr bool IsSigned<float16_t>() {
-  return true;
-}
-template <>
-constexpr bool IsSigned<bfloat16_t>() {
-  return true;
-}
-
-// Largest/smallest representable integer values.
-template <typename T>
-HWY_API constexpr T LimitsMax() {
-  static_assert(!IsFloat<T>(), "Only for integer types");
-  using TU = MakeUnsigned<T>;
-  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
-                                      : static_cast<TU>(~0ull));
-}
-template <typename T>
-HWY_API constexpr T LimitsMin() {
-  static_assert(!IsFloat<T>(), "Only for integer types");
-  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
-}
-
-// Largest/smallest representable value (integer or float). This naming avoids
-// confusion with numeric_limits<float>::min() (the smallest positive value).
-template <typename T>
-HWY_API constexpr T LowestValue() {
-  return LimitsMin<T>();
-}
-template <>
-constexpr float LowestValue<float>() {
-  return -3.402823466e+38F;
-}
-template <>
-constexpr double LowestValue<double>() {
-  return -1.7976931348623158e+308;
-}
-
-template <typename T>
-HWY_API constexpr T HighestValue() {
-  return LimitsMax<T>();
-}
-template <>
-constexpr float HighestValue<float>() {
-  return 3.402823466e+38F;
-}
-template <>
-constexpr double HighestValue<double>() {
-  return 1.7976931348623158e+308;
-}
-
-// Returns width in bits of the mantissa field in IEEE binary32/64.
-template <typename T>
-constexpr int MantissaBits() {
-  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
-  return 0;
-}
-template <>
-constexpr int MantissaBits<float>() {
-  return 23;
-}
-template <>
-constexpr int MantissaBits<double>() {
-  return 52;
-}
-
-// Returns the (left-shifted by one bit) IEEE binary32/64 representation with
-// the largest possible (biased) exponent field. Used by IsInf.
-template <typename T>
-constexpr MakeSigned<T> MaxExponentTimes2() {
-  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
-}
-
-// Returns bitmask of the sign bit in IEEE binary32/64.
-template <typename T>
-constexpr MakeUnsigned<T> SignMask() {
-  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
-}
-
-// Returns bitmask of the exponent field in IEEE binary32/64.
-template <typename T>
-constexpr MakeUnsigned<T> ExponentMask() {
-  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
-}
-
-// Returns bitmask of the mantissa field in IEEE binary32/64.
-template <typename T>
-constexpr MakeUnsigned<T> MantissaMask() {
-  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
-}
-
-// Returns 1 << mantissa_bits as a floating-point number. All integers whose
-// absolute value are less than this can be represented exactly.
-template <typename T>
-constexpr T MantissaEnd() {
-  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
-  return 0;
-}
-template <>
-constexpr float MantissaEnd<float>() {
-  return 8388608.0f;  // 1 << 23
-}
-template <>
-constexpr double MantissaEnd<double>() {
-  // floating point literal with p52 requires C++17.
-  return 4503599627370496.0;  // 1 << 52
-}
-
-// Returns width in bits of the exponent field in IEEE binary32/64.
-template <typename T>
-constexpr int ExponentBits() {
-  // Exponent := remaining bits after deducting sign and mantissa.
-  return 8 * sizeof(T) - 1 - MantissaBits<T>();
-}
-
-// Returns largest value of the biased exponent field in IEEE binary32/64,
-// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
-// This is expressed as a signed integer for more efficient comparison.
-template <typename T>
-constexpr MakeSigned<T> MaxExponentField() {
-  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
-}
-
-//------------------------------------------------------------------------------
-// Helper functions
-
-template <typename T1, typename T2>
-constexpr inline T1 DivCeil(T1 a, T2 b) {
-  return (a + b - 1) / b;
-}
-
-// Works for any `align`; if a power of two, compiler emits ADD+AND.
-constexpr inline size_t RoundUpTo(size_t what, size_t align) {
-  return DivCeil(what, align) * align;
-}
-
-// Undefined results for x == 0.
-HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
-#if HWY_COMPILER_MSVC
-  unsigned long index;  // NOLINT
-  _BitScanForward(&index, x);
-  return index;
-#else   // HWY_COMPILER_MSVC
-  return static_cast<size_t>(__builtin_ctz(x));
-#endif  // HWY_COMPILER_MSVC
-}
-
-HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
-#if HWY_COMPILER_MSVC
-#if HWY_ARCH_X86_64
-  unsigned long index;  // NOLINT
-  _BitScanForward64(&index, x);
-  return index;
-#else   // HWY_ARCH_X86_64
-  // _BitScanForward64 not available
-  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
-  unsigned long index;  // NOLINT
-  if (lsb == 0) {
-    uint32_t msb = static_cast<uint32_t>(x >> 32u);
-    _BitScanForward(&index, msb);
-    return 32 + index;
-  } else {
-    _BitScanForward(&index, lsb);
-    return index;
-  }
-#endif  // HWY_ARCH_X86_64
-#else   // HWY_COMPILER_MSVC
-  return static_cast<size_t>(__builtin_ctzll(x));
-#endif  // HWY_COMPILER_MSVC
-}
-
-// Undefined results for x == 0.
-HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
-#if HWY_COMPILER_MSVC
-  unsigned long index;  // NOLINT
-  _BitScanReverse(&index, x);
-  return 31 - index;
-#else   // HWY_COMPILER_MSVC
-  return static_cast<size_t>(__builtin_clz(x));
-#endif  // HWY_COMPILER_MSVC
-}
-
-HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
-#if HWY_COMPILER_MSVC
-#if HWY_ARCH_X86_64
-  unsigned long index;  // NOLINT
-  _BitScanReverse64(&index, x);
-  return 63 - index;
-#else   // HWY_ARCH_X86_64
-  // _BitScanReverse64 not available
-  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
-  unsigned long index;  // NOLINT
-  if (msb == 0) {
-    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
-    _BitScanReverse(&index, lsb);
-    return 63 - index;
-  } else {
-    _BitScanReverse(&index, msb);
-    return 31 - index;
-  }
-#endif  // HWY_ARCH_X86_64
-#else   // HWY_COMPILER_MSVC
-  return static_cast<size_t>(__builtin_clzll(x));
-#endif  // HWY_COMPILER_MSVC
-}
-
-HWY_API size_t PopCount(uint64_t x) {
-#if HWY_COMPILER_GCC  // includes clang
-  return static_cast<size_t>(__builtin_popcountll(x));
-  // This instruction has a separate feature flag, but is often called from
-  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
-  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
-  // for AVX, so check for that.
-#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
-  return _mm_popcnt_u64(x);
-#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
-  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
-         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
-#else
-  x -= ((x >> 1) & 0x5555555555555555ULL);
-  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
-  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
-  x += (x >> 8);
-  x += (x >> 16);
-  x += (x >> 32);
-  return static_cast<size_t>(x & 0x7Fu);
-#endif
-}
-
-// Skip HWY_API due to GCC "function not considered for inlining". Previously
-// such errors were caused by underlying type mismatches, but it's not clear
-// what is still mismatched despite all the casts.
-template <typename TI>
-/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
-  return x == TI{1}
-             ? 0
-             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
-}
-
-template <typename TI>
-/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
-  return x == TI{1}
-             ? 0
-             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
-}
-
-#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
-#pragma intrinsic(_umul128)
-#endif
-
-// 64 x 64 = 128 bit multiplication
-HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
-#if defined(__SIZEOF_INT128__)
-  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
-  *upper = (uint64_t)(product >> 64);
-  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
-#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
-  return _umul128(a, b, upper);
-#else
-  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
-  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
-  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
-  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
-  const uint64_t hi_hi = (a >> 32) * (b >> 32);
-  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
-  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
-  return (t << 32) | (lo_lo & kLo32);
-#endif
-}
-
-#if HWY_COMPILER_MSVC
-#pragma intrinsic(memcpy)
-#pragma intrinsic(memset)
-#endif
-
-// The source/destination must not overlap/alias.
-template <size_t kBytes, typename From, typename To>
-HWY_API void CopyBytes(const From* from, To* to) {
-#if HWY_COMPILER_MSVC
-  memcpy(to, from, kBytes);
-#else
-  __builtin_memcpy(
-      static_cast<void*>(to), static_cast<const void*>(from), kBytes);
-#endif
-}
-
-// Same as CopyBytes, but for same-sized objects; avoids a size argument.
-template <typename From, typename To>
-HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
-  static_assert(sizeof(From) == sizeof(To), "");
-  CopyBytes<sizeof(From)>(from, to);
-}
-
-template <size_t kBytes, typename To>
-HWY_API void ZeroBytes(To* to) {
-#if HWY_COMPILER_MSVC
-  memset(to, 0, kBytes);
-#else
-  __builtin_memset(to, 0, kBytes);
-#endif
-}
-
-HWY_API float F32FromBF16(bfloat16_t bf) {
-  uint32_t bits = bf.bits;
-  bits <<= 16;
-  float f;
-  CopySameSize(&bits, &f);
-  return f;
-}
-
-HWY_API bfloat16_t BF16FromF32(float f) {
-  uint32_t bits;
-  CopySameSize(&f, &bits);
-  bfloat16_t bf;
-  bf.bits = static_cast<uint16_t>(bits >> 16);
-  return bf;
-}
-
-HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
-    Abort(const char* file, int line, const char* format, ...);
-
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_BASE_H_
diff --git a/third_party/highway/hwy/base_test.cc b/third_party/highway/hwy/base_test.cc
deleted file mode 100644 (file)
index baca70b..0000000
+++ /dev/null
@@ -1,178 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <limits>
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "base_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-HWY_NOINLINE void TestAllLimits() {
-  HWY_ASSERT_EQ(uint8_t{0}, LimitsMin<uint8_t>());
-  HWY_ASSERT_EQ(uint16_t{0}, LimitsMin<uint16_t>());
-  HWY_ASSERT_EQ(uint32_t{0}, LimitsMin<uint32_t>());
-  HWY_ASSERT_EQ(uint64_t{0}, LimitsMin<uint64_t>());
-
-  HWY_ASSERT_EQ(int8_t{-128}, LimitsMin<int8_t>());
-  HWY_ASSERT_EQ(int16_t{-32768}, LimitsMin<int16_t>());
-  HWY_ASSERT_EQ(static_cast<int32_t>(0x80000000u), LimitsMin<int32_t>());
-  HWY_ASSERT_EQ(static_cast<int64_t>(0x8000000000000000ull),
-                LimitsMin<int64_t>());
-
-  HWY_ASSERT_EQ(uint8_t{0xFF}, LimitsMax<uint8_t>());
-  HWY_ASSERT_EQ(uint16_t{0xFFFF}, LimitsMax<uint16_t>());
-  HWY_ASSERT_EQ(uint32_t{0xFFFFFFFFu}, LimitsMax<uint32_t>());
-  HWY_ASSERT_EQ(uint64_t{0xFFFFFFFFFFFFFFFFull}, LimitsMax<uint64_t>());
-
-  HWY_ASSERT_EQ(int8_t{0x7F}, LimitsMax<int8_t>());
-  HWY_ASSERT_EQ(int16_t{0x7FFF}, LimitsMax<int16_t>());
-  HWY_ASSERT_EQ(int32_t{0x7FFFFFFFu}, LimitsMax<int32_t>());
-  HWY_ASSERT_EQ(int64_t{0x7FFFFFFFFFFFFFFFull}, LimitsMax<int64_t>());
-}
-
-struct TestLowestHighest {
-  template <class T>
-  HWY_NOINLINE void operator()(T /*unused*/) const {
-    HWY_ASSERT_EQ(std::numeric_limits<T>::lowest(), LowestValue<T>());
-    HWY_ASSERT_EQ(std::numeric_limits<T>::max(), HighestValue<T>());
-  }
-};
-
-HWY_NOINLINE void TestAllLowestHighest() { ForAllTypes(TestLowestHighest()); }
-struct TestIsUnsigned {
-  template <class T>
-  HWY_NOINLINE void operator()(T /*unused*/) const {
-    static_assert(!IsFloat<T>(), "Expected !IsFloat");
-    static_assert(!IsSigned<T>(), "Expected !IsSigned");
-  }
-};
-
-struct TestIsSigned {
-  template <class T>
-  HWY_NOINLINE void operator()(T /*unused*/) const {
-    static_assert(!IsFloat<T>(), "Expected !IsFloat");
-    static_assert(IsSigned<T>(), "Expected IsSigned");
-  }
-};
-
-struct TestIsFloat {
-  template <class T>
-  HWY_NOINLINE void operator()(T /*unused*/) const {
-    static_assert(IsFloat<T>(), "Expected IsFloat");
-    static_assert(IsSigned<T>(), "Floats are also considered signed");
-  }
-};
-
-HWY_NOINLINE void TestAllType() {
-  ForUnsignedTypes(TestIsUnsigned());
-  ForSignedTypes(TestIsSigned());
-  ForFloatTypes(TestIsFloat());
-
-  static_assert(sizeof(MakeUnsigned<hwy::uint128_t>) == 16, "");
-  static_assert(sizeof(MakeWide<uint64_t>) == 16, "Expected uint128_t");
-  static_assert(sizeof(MakeNarrow<hwy::uint128_t>) == 8, "Expected uint64_t");
-}
-
-struct TestIsSame {
-  template <class T>
-  HWY_NOINLINE void operator()(T /*unused*/) const {
-    static_assert(IsSame<T, T>(), "T == T");
-    static_assert(!IsSame<MakeSigned<T>, MakeUnsigned<T>>(), "S != U");
-    static_assert(!IsSame<MakeUnsigned<T>, MakeSigned<T>>(), "U != S");
-  }
-};
-
-HWY_NOINLINE void TestAllIsSame() { ForAllTypes(TestIsSame()); }
-
-HWY_NOINLINE void TestAllBitScan() {
-  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0x80000000u));
-  HWY_ASSERT_EQ(size_t{0}, Num0BitsAboveMS1Bit_Nonzero32(0xFFFFFFFFu));
-  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40000000u));
-  HWY_ASSERT_EQ(size_t{1}, Num0BitsAboveMS1Bit_Nonzero32(0x40108210u));
-  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(2u));
-  HWY_ASSERT_EQ(size_t{30}, Num0BitsAboveMS1Bit_Nonzero32(3u));
-  HWY_ASSERT_EQ(size_t{31}, Num0BitsAboveMS1Bit_Nonzero32(1u));
-
-  HWY_ASSERT_EQ(size_t{0},
-                Num0BitsAboveMS1Bit_Nonzero64(0x8000000000000000ull));
-  HWY_ASSERT_EQ(size_t{0},
-                Num0BitsAboveMS1Bit_Nonzero64(0xFFFFFFFFFFFFFFFFull));
-  HWY_ASSERT_EQ(size_t{1},
-                Num0BitsAboveMS1Bit_Nonzero64(0x4000000000000000ull));
-  HWY_ASSERT_EQ(size_t{1},
-                Num0BitsAboveMS1Bit_Nonzero64(0x4010821004200011ull));
-  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(2ull));
-  HWY_ASSERT_EQ(size_t{62}, Num0BitsAboveMS1Bit_Nonzero64(3ull));
-  HWY_ASSERT_EQ(size_t{63}, Num0BitsAboveMS1Bit_Nonzero64(1ull));
-
-  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero32(1u));
-  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero32(2u));
-  HWY_ASSERT_EQ(size_t{30}, Num0BitsBelowLS1Bit_Nonzero32(0xC0000000u));
-  HWY_ASSERT_EQ(size_t{31}, Num0BitsBelowLS1Bit_Nonzero32(0x80000000u));
-
-  HWY_ASSERT_EQ(size_t{0}, Num0BitsBelowLS1Bit_Nonzero64(1ull));
-  HWY_ASSERT_EQ(size_t{1}, Num0BitsBelowLS1Bit_Nonzero64(2ull));
-  HWY_ASSERT_EQ(size_t{62},
-                Num0BitsBelowLS1Bit_Nonzero64(0xC000000000000000ull));
-  HWY_ASSERT_EQ(size_t{63},
-                Num0BitsBelowLS1Bit_Nonzero64(0x8000000000000000ull));
-}
-
-HWY_NOINLINE void TestAllPopCount() {
-  HWY_ASSERT_EQ(size_t{0}, PopCount(0u));
-  HWY_ASSERT_EQ(size_t{1}, PopCount(1u));
-  HWY_ASSERT_EQ(size_t{1}, PopCount(2u));
-  HWY_ASSERT_EQ(size_t{2}, PopCount(3u));
-  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000u));
-  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFu));
-  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFu));
-
-  HWY_ASSERT_EQ(size_t{1}, PopCount(0x80000000ull));
-  HWY_ASSERT_EQ(size_t{31}, PopCount(0x7FFFFFFFull));
-  HWY_ASSERT_EQ(size_t{32}, PopCount(0xFFFFFFFFull));
-  HWY_ASSERT_EQ(size_t{33}, PopCount(0x10FFFFFFFFull));
-  HWY_ASSERT_EQ(size_t{63}, PopCount(0xFFFEFFFFFFFFFFFFull));
-  HWY_ASSERT_EQ(size_t{64}, PopCount(0xFFFFFFFFFFFFFFFFull));
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(BaseTest);
-HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLimits);
-HWY_EXPORT_AND_TEST_P(BaseTest, TestAllLowestHighest);
-HWY_EXPORT_AND_TEST_P(BaseTest, TestAllType);
-HWY_EXPORT_AND_TEST_P(BaseTest, TestAllIsSame);
-HWY_EXPORT_AND_TEST_P(BaseTest, TestAllBitScan);
-HWY_EXPORT_AND_TEST_P(BaseTest, TestAllPopCount);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/cache_control.h b/third_party/highway/hwy/cache_control.h
deleted file mode 100644 (file)
index b124e57..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
-#define HIGHWAY_HWY_CACHE_CONTROL_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/base.h"
-
-// Requires SSE2; fails to compile on 32-bit Clang 7 (see
-// https://github.com/gperftools/gperftools/issues/946).
-#if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
-#undef HWY_DISABLE_CACHE_CONTROL
-#define HWY_DISABLE_CACHE_CONTROL
-#endif
-
-// intrin.h is sufficient on MSVC and already included by base.h.
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
-#include <emmintrin.h>  // SSE2
-#endif
-
-// Windows.h #defines these, which causes infinite recursion. Temporarily
-// undefine them in this header; these functions are anyway deprecated.
-// TODO(janwas): remove when these functions are removed.
-#pragma push_macro("LoadFence")
-#undef LoadFence
-
-namespace hwy {
-
-// Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
-#define HWY_STREAM_MULTIPLE 16
-
-// The following functions may also require an attribute.
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
-#define HWY_ATTR_CACHE __attribute__((target("sse2")))
-#else
-#define HWY_ATTR_CACHE
-#endif
-
-// Delays subsequent loads until prior loads are visible. Beware of potentially
-// differing behavior across architectures and vendors: on Intel but not
-// AMD CPUs, also serves as a full fence (waits for all prior instructions to
-// complete).
-HWY_INLINE HWY_ATTR_CACHE void LoadFence() {
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
-  _mm_lfence();
-#endif
-}
-
-// Ensures values written by previous `Stream` calls are visible on the current
-// core. This is NOT sufficient for synchronizing across cores; when `Stream`
-// outputs are to be consumed by other core(s), the producer must publish
-// availability (e.g. via mutex or atomic_flag) after `FlushStream`.
-HWY_INLINE HWY_ATTR_CACHE void FlushStream() {
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
-  _mm_sfence();
-#endif
-}
-
-// Optionally begins loading the cache line containing "p" to reduce latency of
-// subsequent actual loads.
-template <typename T>
-HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T* p) {
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
-  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
-#elif HWY_COMPILER_GCC  // includes clang
-  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
-  // desirable, so use the default 3 (keep in caches).
-  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
-#else
-  (void)p;
-#endif
-}
-
-// Invalidates and flushes the cache line containing "p", if possible.
-HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void* p) {
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
-  _mm_clflush(p);
-#else
-  (void)p;
-#endif
-}
-
-// When called inside a spin-loop, may reduce power consumption.
-HWY_INLINE HWY_ATTR_CACHE void Pause() {
-#if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
-  _mm_pause();
-#endif
-}
-
-}  // namespace hwy
-
-// TODO(janwas): remove when these functions are removed. (See above.)
-#pragma pop_macro("LoadFence")
-
-#endif  // HIGHWAY_HWY_CACHE_CONTROL_H_
diff --git a/third_party/highway/hwy/contrib/algo/copy-inl.h b/third_party/highway/hwy/contrib/algo/copy-inl.h
deleted file mode 100644 (file)
index 033cf8a..0000000
+++ /dev/null
@@ -1,136 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Per-target include guard
-#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
-#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
-#else
-#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
-#endif
-
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// These functions avoid having to write a loop plus remainder handling in the
-// (unfortunately still common) case where arrays are not aligned/padded. If the
-// inputs are known to be aligned/padded, it is more efficient to write a single
-// loop using Load(). We do not provide a CopyAlignedPadded because it
-// would be more verbose than such a loop.
-
-// Fills `to`[0, `count`) with `value`.
-template <class D, typename T = TFromD<D>>
-void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
-  const size_t N = Lanes(d);
-  const Vec<D> v = Set(d, value);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    StoreU(v, d, to + idx);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  SafeFillN(remaining, value, d, to + idx);
-}
-
-// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
-template <class D, typename T = TFromD<D>>
-void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
-  const size_t N = Lanes(d);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    const Vec<D> v = LoadU(d, from + idx);
-    StoreU(v, d, to + idx);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  SafeCopyN(remaining, d, from + idx, to + idx);
-}
-
-// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
-// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
-// of the newly written elements in `to`.
-//
-// `func` is either a functor with a templated operator()(d, v) returning a
-// mask, or a generic lambda if using C++14. Due to apparent limitations of
-// Clang on Windows, it is currently necessary to add HWY_ATTR before the
-// opening { of the lambda to avoid errors about "function .. requires target".
-//
-// NOTE: this is only supported for 16-, 32- or 64-bit types.
-// NOTE: Func may be called a second time for elements it has already seen, but
-// these elements will not be written to `to` again.
-template <class D, class Func, typename T = TFromD<D>>
-T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
-          const Func& func) {
-  const size_t N = Lanes(d);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    const Vec<D> v = LoadU(d, from + idx);
-    to += CompressBlendedStore(v, func(d, v), d, to);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return to;
-
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // Proceed one by one.
-  const CappedTag<T, 1> d1;
-  for (; idx < count; ++idx) {
-    using V1 = Vec<decltype(d1)>;
-    // Workaround for -Waggressive-loop-optimizations on GCC 8
-    // (iteration 2305843009213693951 invokes undefined behavior for T=i64)
-    const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
-    const T* HWY_RESTRICT from_idx =
-        reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
-    const V1 v = LoadU(d1, from_idx);
-    // Avoid storing to `to` unless we know it should be kept - otherwise, we
-    // might overrun the end if it was allocated for the exact count.
-    if (CountTrue(d1, func(d1, v)) == 0) continue;
-    StoreU(v, d1, to);
-    to += 1;
-  }
-#else
-  // Start index of the last unaligned whole vector, ending at the array end.
-  const size_t last = count - N;
-  // Number of elements before `from` or already written.
-  const size_t invalid = idx - last;
-  HWY_DASSERT(0 != invalid && invalid < N);
-  const Mask<D> mask = Not(FirstN(d, invalid));
-  const Vec<D> v = MaskedLoad(mask, d, from + last);
-  to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
-#endif
-  return to;
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
diff --git a/third_party/highway/hwy/contrib/algo/copy_test.cc b/third_party/highway/hwy/contrib/algo/copy_test.cc
deleted file mode 100644 (file)
index e2675a3..0000000
+++ /dev/null
@@ -1,199 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/aligned_allocator.h"
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/algo/copy_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/contrib/algo/copy-inl.h"
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-// If your project requires C++14 or later, you can ignore this and pass lambdas
-// directly to Transform, without requiring an lvalue as we do here for C++11.
-#if __cplusplus < 201402L
-#define HWY_GENERIC_LAMBDA 0
-#else
-#define HWY_GENERIC_LAMBDA 1
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Returns random integer in [0, 128), which fits in any lane type.
-template <typename T>
-T Random7Bit(RandomState& rng) {
-  return static_cast<T>(Random32(&rng) & 127);
-}
-
-// In C++14, we can instead define these as generic lambdas next to where they
-// are invoked.
-#if !HWY_GENERIC_LAMBDA
-
-struct IsOdd {
-  template <class D, class V>
-  Mask<D> operator()(D d, V v) const {
-    return TestBit(v, Set(d, TFromD<D>{1}));
-  }
-};
-
-#endif  // !HWY_GENERIC_LAMBDA
-
-// Invokes Test (e.g. TestCopyIf) with all arg combinations. T comes from
-// ForFloatTypes.
-template <class Test>
-struct ForeachCountAndMisalign {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
-    RandomState rng;
-    const size_t N = Lanes(d);
-    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
-
-    for (size_t count = 0; count < 2 * N; ++count) {
-      for (size_t ma : misalignments) {
-        for (size_t mb : misalignments) {
-          Test()(d, count, ma, mb, rng);
-        }
-      }
-    }
-  }
-};
-
-struct TestFill {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
-                  RandomState& rng) {
-    using T = TFromD<D>;
-    // HWY_MAX prevents error when misalign == count == 0.
-    AlignedFreeUniquePtr<T[]> pa =
-        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
-    T* expected = pa.get() + misalign_a;
-    const T value = Random7Bit<T>(rng);
-    for (size_t i = 0; i < count; ++i) {
-      expected[i] = value;
-    }
-    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + count + 1);
-    T* actual = pb.get() + misalign_b;
-
-    actual[count] = T{0};  // sentinel
-    Fill(d, value, count, actual);
-    HWY_ASSERT_EQ(T{0}, actual[count]);  // did not write past end
-
-    const auto info = hwy::detail::MakeTypeInfo<T>();
-    const char* target_name = hwy::TargetName(HWY_TARGET);
-    hwy::detail::AssertArrayEqual(info, expected, actual, count, target_name,
-                                  __FILE__, __LINE__);
-  }
-};
-
-void TestAllFill() {
-  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFill>>());
-}
-
-struct TestCopy {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
-                  RandomState& rng) {
-    using T = TFromD<D>;
-    // Prevents error if size to allocate is zero.
-    AlignedFreeUniquePtr<T[]> pa =
-        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
-    T* a = pa.get() + misalign_a;
-    for (size_t i = 0; i < count; ++i) {
-      a[i] = Random7Bit<T>(rng);
-    }
-    AlignedFreeUniquePtr<T[]> pb =
-        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
-    T* b = pb.get() + misalign_b;
-
-    Copy(d, a, count, b);
-
-    const auto info = hwy::detail::MakeTypeInfo<T>();
-    const char* target_name = hwy::TargetName(HWY_TARGET);
-    hwy::detail::AssertArrayEqual(info, a, b, count, target_name, __FILE__,
-                                  __LINE__);
-  }
-};
-
-void TestAllCopy() {
-  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestCopy>>());
-}
-
-struct TestCopyIf {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
-                  RandomState& rng) {
-    using T = TFromD<D>;
-    // Prevents error if size to allocate is zero.
-    AlignedFreeUniquePtr<T[]> pa =
-        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
-    T* a = pa.get() + misalign_a;
-    for (size_t i = 0; i < count; ++i) {
-      a[i] = Random7Bit<T>(rng);
-    }
-    const size_t padding = Lanes(ScalableTag<T>());
-    AlignedFreeUniquePtr<T[]> pb =
-        AllocateAligned<T>(HWY_MAX(1, misalign_b + count + padding));
-    T* b = pb.get() + misalign_b;
-
-    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
-    size_t num_odd = 0;
-    for (size_t i = 0; i < count; ++i) {
-      if (a[i] & 1) {
-        expected[num_odd++] = a[i];
-      }
-    }
-
-#if HWY_GENERIC_LAMBDA
-    const auto is_odd = [](const auto d, const auto v) HWY_ATTR {
-      return TestBit(v, Set(d, TFromD<decltype(d)>{1}));
-    };
-#else
-    const IsOdd is_odd;
-#endif
-    T* end = CopyIf(d, a, count, b, is_odd);
-    const size_t num_written = static_cast<size_t>(end - b);
-    HWY_ASSERT_EQ(num_odd, num_written);
-
-    const auto info = hwy::detail::MakeTypeInfo<T>();
-    const char* target_name = hwy::TargetName(HWY_TARGET);
-    hwy::detail::AssertArrayEqual(info, expected.get(), b, num_odd, target_name,
-                                  __FILE__, __LINE__);
-  }
-};
-
-void TestAllCopyIf() {
-  ForUI163264(ForPartialVectors<ForeachCountAndMisalign<TestCopyIf>>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(CopyTest);
-HWY_EXPORT_AND_TEST_P(CopyTest, TestAllFill);
-HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopy);
-HWY_EXPORT_AND_TEST_P(CopyTest, TestAllCopyIf);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/contrib/algo/find-inl.h b/third_party/highway/hwy/contrib/algo/find-inl.h
deleted file mode 100644 (file)
index 388842e..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Per-target include guard
-#if defined(HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
-#undef HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
-#else
-#define HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
-#endif
-
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Returns index of the first element equal to `value` in `in[0, count)`, or
-// `count` if not found.
-template <class D, typename T = TFromD<D>>
-size_t Find(D d, T value, const T* HWY_RESTRICT in, size_t count) {
-  const size_t N = Lanes(d);
-  const Vec<D> broadcasted = Set(d, value);
-
-  size_t i = 0;
-  for (; i + N <= count; i += N) {
-    const intptr_t pos = FindFirstTrue(d, Eq(broadcasted, LoadU(d, in + i)));
-    if (pos >= 0) return i + static_cast<size_t>(pos);
-  }
-
-  if (i != count) {
-#if HWY_MEM_OPS_MIGHT_FAULT
-    // Scan single elements.
-    const CappedTag<T, 1> d1;
-    using V1 = Vec<decltype(d1)>;
-    const V1 broadcasted1 = Set(d1, GetLane(broadcasted));
-    for (; i < count; ++i) {
-      if (AllTrue(d1, Eq(broadcasted1, LoadU(d1, in + i)))) {
-        return i;
-      }
-    }
-#else
-    const size_t remaining = count - i;
-    HWY_DASSERT(0 != remaining && remaining < N);
-    const Mask<D> mask = FirstN(d, remaining);
-    const Vec<D> v = MaskedLoad(mask, d, in + i);
-    // Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
-    const intptr_t pos = FindFirstTrue(d, And(Eq(broadcasted, v), mask));
-    if (pos >= 0) return i + static_cast<size_t>(pos);
-#endif  // HWY_MEM_OPS_MIGHT_FAULT
-  }
-
-  return count;  // not found
-}
-
-// Returns index of the first element in `in[0, count)` for which `func(d, vec)`
-// returns true, otherwise `count`.
-template <class D, class Func, typename T = TFromD<D>>
-size_t FindIf(D d, const T* HWY_RESTRICT in, size_t count, const Func& func) {
-  const size_t N = Lanes(d);
-
-  size_t i = 0;
-  for (; i + N <= count; i += N) {
-    const intptr_t pos = FindFirstTrue(d, func(d, LoadU(d, in + i)));
-    if (pos >= 0) return i + static_cast<size_t>(pos);
-  }
-
-  if (i != count) {
-#if HWY_MEM_OPS_MIGHT_FAULT
-    // Scan single elements.
-    const CappedTag<T, 1> d1;
-    for (; i < count; ++i) {
-      if (AllTrue(d1, func(d1, LoadU(d1, in + i)))) {
-        return i;
-      }
-    }
-#else
-    const size_t remaining = count - i;
-    HWY_DASSERT(0 != remaining && remaining < N);
-    const Mask<D> mask = FirstN(d, remaining);
-    const Vec<D> v = MaskedLoad(mask, d, in + i);
-    // Apply mask so that we don't 'find' the zero-padding from MaskedLoad.
-    const intptr_t pos = FindFirstTrue(d, And(func(d, v), mask));
-    if (pos >= 0) return i + static_cast<size_t>(pos);
-#endif  // HWY_MEM_OPS_MIGHT_FAULT
-  }
-
-  return count;  // not found
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_ALGO_FIND_INL_H_
diff --git a/third_party/highway/hwy/contrib/algo/find_test.cc b/third_party/highway/hwy/contrib/algo/find_test.cc
deleted file mode 100644 (file)
index da13c47..0000000
+++ /dev/null
@@ -1,219 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <algorithm>
-#include <vector>
-
-#include "hwy/aligned_allocator.h"
-#include "hwy/base.h"
-#include "hwy/print.h"
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/algo/find_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/contrib/algo/find-inl.h"
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-// If your project requires C++14 or later, you can ignore this and pass lambdas
-// directly to FindIf, without requiring an lvalue as we do here for C++11.
-#if __cplusplus < 201402L
-#define HWY_GENERIC_LAMBDA 0
-#else
-#define HWY_GENERIC_LAMBDA 1
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Returns random number in [-8, 8) - we use knowledge of the range to Find()
-// values we know are not present.
-template <typename T>
-T Random(RandomState& rng) {
-  const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
-  const double val = (bits - 512) / 64.0;
-  // Clamp negative to zero for unsigned types.
-  return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
-}
-
-// In C++14, we can instead define these as generic lambdas next to where they
-// are invoked.
-#if !HWY_GENERIC_LAMBDA
-
-class GreaterThan {
- public:
-  GreaterThan(int val) : val_(val) {}
-  template <class D, class V>
-  Mask<D> operator()(D d, V v) const {
-    return Gt(v, Set(d, static_cast<TFromD<D>>(val_)));
-  }
-
- private:
-  int val_;
-};
-
-#endif  // !HWY_GENERIC_LAMBDA
-
-// Invokes Test (e.g. TestFind) with all arg combinations.
-template <class Test>
-struct ForeachCountAndMisalign {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
-    RandomState rng;
-    const size_t N = Lanes(d);
-    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
-
-    // Find() checks 8 vectors at a time, so we want to cover a fairly large
-    // range without oversampling (checking every possible count).
-    std::vector<size_t> counts(AdjustedReps(512));
-    for (size_t& count : counts) {
-      count = static_cast<size_t>(rng()) % (16 * N + 1);
-    }
-    counts[0] = 0;  // ensure we test count=0.
-
-    for (size_t count : counts) {
-      for (size_t m : misalignments) {
-        Test()(d, count, m, rng);
-      }
-    }
-  }
-};
-
-struct TestFind {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
-    using T = TFromD<D>;
-    // Must allocate at least one even if count is zero.
-    AlignedFreeUniquePtr<T[]> storage =
-        AllocateAligned<T>(HWY_MAX(1, misalign + count));
-    T* in = storage.get() + misalign;
-    for (size_t i = 0; i < count; ++i) {
-      in[i] = Random<T>(rng);
-    }
-
-    // For each position, search for that element (which we know is there)
-    for (size_t pos = 0; pos < count; ++pos) {
-      const size_t actual = Find(d, in[pos], in, count);
-
-      // We may have found an earlier occurrence of the same value; ensure the
-      // value is the same, and that it is the first.
-      if (!IsEqual(in[pos], in[actual])) {
-        fprintf(stderr, "%s count %d, found %.15f at %d but wanted %.15f\n",
-                hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
-                static_cast<double>(in[actual]), static_cast<int>(actual),
-                static_cast<double>(in[pos]));
-        HWY_ASSERT(false);
-      }
-      for (size_t i = 0; i < actual; ++i) {
-        if (IsEqual(in[i], in[pos])) {
-          fprintf(stderr, "%s count %d, found %f at %d but Find returned %d\n",
-                  hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
-                  static_cast<double>(in[i]), static_cast<int>(i),
-                  static_cast<int>(actual));
-          HWY_ASSERT(false);
-        }
-      }
-    }
-
-    // Also search for values we know not to be present (out of range)
-    HWY_ASSERT_EQ(count, Find(d, T{9}, in, count));
-    HWY_ASSERT_EQ(count, Find(d, static_cast<T>(-9), in, count));
-  }
-};
-
-void TestAllFind() {
-  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFind>>());
-}
-
-struct TestFindIf {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign, RandomState& rng) {
-    using T = TFromD<D>;
-    using TI = MakeSigned<T>;
-    // Must allocate at least one even if count is zero.
-    AlignedFreeUniquePtr<T[]> storage =
-        AllocateAligned<T>(HWY_MAX(1, misalign + count));
-    T* in = storage.get() + misalign;
-    for (size_t i = 0; i < count; ++i) {
-      in[i] = Random<T>(rng);
-      HWY_ASSERT(in[i] < 8);
-      HWY_ASSERT(!hwy::IsSigned<T>() || static_cast<TI>(in[i]) >= -8);
-    }
-
-    bool found_any = false;
-    bool not_found_any = false;
-
-    // unsigned T would be promoted to signed and compare greater than any
-    // negative val, whereas Set() would just cast to an unsigned value and the
-    // comparison remains unsigned, so avoid negative numbers there.
-    const int min_val = IsSigned<T>() ? -9 : 0;
-    // Includes out-of-range value 9 to test the not-found path.
-    for (int val = min_val; val <= 9; ++val) {
-#if HWY_GENERIC_LAMBDA
-      const auto greater = [val](const auto d, const auto v) HWY_ATTR {
-        return Gt(v, Set(d, static_cast<T>(val)));
-      };
-#else
-      const GreaterThan greater(val);
-#endif
-      const size_t actual = FindIf(d, in, count, greater);
-      found_any |= actual < count;
-      not_found_any |= actual == count;
-
-      const auto pos = std::find_if(
-          in, in + count, [val](T x) { return x > static_cast<T>(val); });
-      // Convert returned iterator to index.
-      const size_t expected = static_cast<size_t>(pos - in);
-      if (expected != actual) {
-        fprintf(stderr, "%s count %d val %d, expected %d actual %d\n",
-                hwy::TypeName(T(), Lanes(d)).c_str(), static_cast<int>(count),
-                val, static_cast<int>(expected), static_cast<int>(actual));
-        hwy::detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "in", in, count,
-                                0, count);
-        HWY_ASSERT(false);
-      }
-    }
-
-    // We will always not-find something due to val=9.
-    HWY_ASSERT(not_found_any);
-    // We'll find something unless the input is empty or {0} - because 0 > i
-    // is false for all i=[0,9].
-    if (count != 0 && in[0] != 0) {
-      HWY_ASSERT(found_any);
-    }
-  }
-};
-
-void TestAllFindIf() {
-  ForAllTypes(ForPartialVectors<ForeachCountAndMisalign<TestFindIf>>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(FindTest);
-HWY_EXPORT_AND_TEST_P(FindTest, TestAllFind);
-HWY_EXPORT_AND_TEST_P(FindTest, TestAllFindIf);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/contrib/algo/transform-inl.h b/third_party/highway/hwy/contrib/algo/transform-inl.h
deleted file mode 100644 (file)
index 3e830ac..0000000
+++ /dev/null
@@ -1,262 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Per-target include guard
-#if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
-#undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
-#else
-#define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
-#endif
-
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// These functions avoid having to write a loop plus remainder handling in the
-// (unfortunately still common) case where arrays are not aligned/padded. If the
-// inputs are known to be aligned/padded, it is more efficient to write a single
-// loop using Load(). We do not provide a TransformAlignedPadded because it
-// would be more verbose than such a loop.
-//
-// Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
-// generic lambda if using C++14. Due to apparent limitations of Clang on
-// Windows, it is currently necessary to add HWY_ATTR before the opening { of
-// the lambda to avoid errors about "always_inline function .. requires target".
-//
-// If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
-// we used `MaskedLoad` and `BlendedStore` to read/write the final partial
-// vector.
-
-// Fills `out[0, count)` with the vectors returned by `func(d, index_vec)`,
-// where `index_vec` is `Vec<RebindToUnsigned<D>>`. On the first call to `func`,
-// the value of its lane i is i, and increases by `Lanes(d)` after every call.
-// Note that some of these indices may be `>= count`, but the elements that
-// `func` returns in those lanes will not be written to `out`.
-template <class D, class Func, typename T = TFromD<D>>
-void Generate(D d, T* HWY_RESTRICT out, size_t count, const Func& func) {
-  const RebindToUnsigned<D> du;
-  using TU = TFromD<decltype(du)>;
-  const size_t N = Lanes(d);
-
-  size_t idx = 0;
-  Vec<decltype(du)> vidx = Iota(du, 0);
-  for (; idx + N <= count; idx += N) {
-    StoreU(func(d, vidx), d, out + idx);
-    vidx = Add(vidx, Set(du, static_cast<TU>(N)));
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // Proceed one by one.
-  const CappedTag<T, 1> d1;
-  const RebindToUnsigned<decltype(d1)> du1;
-  for (; idx < count; ++idx) {
-    StoreU(func(d1, Set(du1, static_cast<TU>(idx))), d1, out + idx);
-  }
-#else
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  const Mask<D> mask = FirstN(d, remaining);
-  BlendedStore(func(d, vidx), mask, d, out + idx);
-#endif
-}
-
-// Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
-// array elements by a constant.
-template <class D, class Func, typename T = TFromD<D>>
-void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
-  const size_t N = Lanes(d);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    const Vec<D> v = LoadU(d, inout + idx);
-    StoreU(func(d, v), d, inout + idx);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // Proceed one by one.
-  const CappedTag<T, 1> d1;
-  for (; idx < count; ++idx) {
-    using V1 = Vec<decltype(d1)>;
-    const V1 v = LoadU(d1, inout + idx);
-    StoreU(func(d1, v), d1, inout + idx);
-  }
-#else
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  const Mask<D> mask = FirstN(d, remaining);
-  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
-  BlendedStore(func(d, v), mask, d, inout + idx);
-#endif
-}
-
-// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
-// multiplying array elements by those of another array.
-template <class D, class Func, typename T = TFromD<D>>
-void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
-                const T* HWY_RESTRICT in1, const Func& func) {
-  const size_t N = Lanes(d);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    const Vec<D> v = LoadU(d, inout + idx);
-    const Vec<D> v1 = LoadU(d, in1 + idx);
-    StoreU(func(d, v, v1), d, inout + idx);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // Proceed one by one.
-  const CappedTag<T, 1> d1;
-  for (; idx < count; ++idx) {
-    using V1 = Vec<decltype(d1)>;
-    const V1 v = LoadU(d1, inout + idx);
-    const V1 v1 = LoadU(d1, in1 + idx);
-    StoreU(func(d1, v, v1), d1, inout + idx);
-  }
-#else
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  const Mask<D> mask = FirstN(d, remaining);
-  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
-  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
-  BlendedStore(func(d, v, v1), mask, d, inout + idx);
-#endif
-}
-
-// Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
-// usage: FMA of elements from three arrays, stored into the first array.
-template <class D, class Func, typename T = TFromD<D>>
-void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
-                const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
-                const Func& func) {
-  const size_t N = Lanes(d);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    const Vec<D> v = LoadU(d, inout + idx);
-    const Vec<D> v1 = LoadU(d, in1 + idx);
-    const Vec<D> v2 = LoadU(d, in2 + idx);
-    StoreU(func(d, v, v1, v2), d, inout + idx);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // Proceed one by one.
-  const CappedTag<T, 1> d1;
-  for (; idx < count; ++idx) {
-    using V1 = Vec<decltype(d1)>;
-    const V1 v = LoadU(d1, inout + idx);
-    const V1 v1 = LoadU(d1, in1 + idx);
-    const V1 v2 = LoadU(d1, in2 + idx);
-    StoreU(func(d1, v, v1, v2), d1, inout + idx);
-  }
-#else
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  const Mask<D> mask = FirstN(d, remaining);
-  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
-  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
-  const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
-  BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
-#endif
-}
-
-template <class D, typename T = TFromD<D>>
-void Replace(D d, T* HWY_RESTRICT inout, size_t count, T new_t, T old_t) {
-  const size_t N = Lanes(d);
-  const Vec<D> old_v = Set(d, old_t);
-  const Vec<D> new_v = Set(d, new_t);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    Vec<D> v = LoadU(d, inout + idx);
-    StoreU(IfThenElse(Eq(v, old_v), new_v, v), d, inout + idx);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // Proceed one by one.
-  const CappedTag<T, 1> d1;
-  const Vec<decltype(d1)> old_v1 = Set(d1, old_t);
-  const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
-  for (; idx < count; ++idx) {
-    using V1 = Vec<decltype(d1)>;
-    const V1 v1 = LoadU(d1, inout + idx);
-    StoreU(IfThenElse(Eq(v1, old_v1), new_v1, v1), d1, inout + idx);
-  }
-#else
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  const Mask<D> mask = FirstN(d, remaining);
-  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
-  BlendedStore(IfThenElse(Eq(v, old_v), new_v, v), mask, d, inout + idx);
-#endif
-}
-
-template <class D, class Func, typename T = TFromD<D>>
-void ReplaceIf(D d, T* HWY_RESTRICT inout, size_t count, T new_t,
-               const Func& func) {
-  const size_t N = Lanes(d);
-  const Vec<D> new_v = Set(d, new_t);
-
-  size_t idx = 0;
-  for (; idx + N <= count; idx += N) {
-    Vec<D> v = LoadU(d, inout + idx);
-    StoreU(IfThenElse(func(d, v), new_v, v), d, inout + idx);
-  }
-
-  // `count` was a multiple of the vector length `N`: already done.
-  if (HWY_UNLIKELY(idx == count)) return;
-
-#if HWY_MEM_OPS_MIGHT_FAULT
-  // Proceed one by one.
-  const CappedTag<T, 1> d1;
-  const Vec<decltype(d1)> new_v1 = Set(d1, new_t);
-  for (; idx < count; ++idx) {
-    using V1 = Vec<decltype(d1)>;
-    const V1 v = LoadU(d1, inout + idx);
-    StoreU(IfThenElse(func(d1, v), new_v1, v), d1, inout + idx);
-  }
-#else
-  const size_t remaining = count - idx;
-  HWY_DASSERT(0 != remaining && remaining < N);
-  const Mask<D> mask = FirstN(d, remaining);
-  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
-  BlendedStore(IfThenElse(func(d, v), new_v, v), mask, d, inout + idx);
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
diff --git a/third_party/highway/hwy/contrib/algo/transform_test.cc b/third_party/highway/hwy/contrib/algo/transform_test.cc
deleted file mode 100644 (file)
index 335607c..0000000
+++ /dev/null
@@ -1,372 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <string.h>  // memcpy
-
-#include "hwy/aligned_allocator.h"
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/algo/transform_test.cc"  //NOLINT
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/contrib/algo/transform-inl.h"
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-// If your project requires C++14 or later, you can ignore this and pass lambdas
-// directly to Transform, without requiring an lvalue as we do here for C++11.
-#if __cplusplus < 201402L
-#define HWY_GENERIC_LAMBDA 0
-#else
-#define HWY_GENERIC_LAMBDA 1
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <typename T>
-T Alpha() {
-  return static_cast<T>(1.5);  // arbitrary scalar
-}
-
-// Returns random floating-point number in [-8, 8) to ensure computations do
-// not exceed float32 precision.
-template <typename T>
-T Random(RandomState& rng) {
-  const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
-  const double val = (bits - 512) / 64.0;
-  // Clamp negative to zero for unsigned types.
-  return static_cast<T>(HWY_MAX(hwy::LowestValue<T>(), val));
-}
-
-// SCAL, AXPY names are from BLAS.
-template <typename T>
-HWY_NOINLINE void SimpleSCAL(const T* x, T* out, size_t count) {
-  for (size_t i = 0; i < count; ++i) {
-    out[i] = Alpha<T>() * x[i];
-  }
-}
-
-template <typename T>
-HWY_NOINLINE void SimpleAXPY(const T* x, const T* y, T* out, size_t count) {
-  for (size_t i = 0; i < count; ++i) {
-    out[i] = Alpha<T>() * x[i] + y[i];
-  }
-}
-
-template <typename T>
-HWY_NOINLINE void SimpleFMA4(const T* x, const T* y, const T* z, T* out,
-                             size_t count) {
-  for (size_t i = 0; i < count; ++i) {
-    out[i] = x[i] * y[i] + z[i];
-  }
-}
-
-// In C++14, we can instead define these as generic lambdas next to where they
-// are invoked.
-#if !HWY_GENERIC_LAMBDA
-
-// Generator that returns even numbers by doubling the output indices.
-struct Gen2 {
-  template <class D, class VU>
-  Vec<D> operator()(D d, VU vidx) const {
-    return BitCast(d, Add(vidx, vidx));
-  }
-};
-
-struct SCAL {
-  template <class D, class V>
-  Vec<D> operator()(D d, V v) const {
-    using T = TFromD<D>;
-    return Mul(Set(d, Alpha<T>()), v);
-  }
-};
-
-struct AXPY {
-  template <class D, class V>
-  Vec<D> operator()(D d, V v, V v1) const {
-    using T = TFromD<D>;
-    return MulAdd(Set(d, Alpha<T>()), v, v1);
-  }
-};
-
-struct FMA4 {
-  template <class D, class V>
-  Vec<D> operator()(D /*d*/, V v, V v1, V v2) const {
-    return MulAdd(v, v1, v2);
-  }
-};
-
-#endif  // !HWY_GENERIC_LAMBDA
-
-// Invokes Test (e.g. TestTransform1) with all arg combinations. T comes from
-// ForFloatTypes.
-template <class Test>
-struct ForeachCountAndMisalign {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) const {
-    RandomState rng;
-    const size_t N = Lanes(d);
-    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
-
-    for (size_t count = 0; count < 2 * N; ++count) {
-      for (size_t ma : misalignments) {
-        for (size_t mb : misalignments) {
-          Test()(d, count, ma, mb, rng);
-        }
-      }
-    }
-  }
-};
-
-// Output-only, no loads
-struct TestGenerate {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t /*misalign_b*/,
-                  RandomState& /*rng*/) {
-    using T = TFromD<D>;
-    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count + 1);
-    T* actual = pa.get() + misalign_a;
-
-    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
-    for (size_t i = 0; i < count; ++i) {
-      expected[i] = static_cast<T>(2 * i);
-    }
-
-    // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
-    // the attribute also applies to lambdas? If so, remove HWY_ATTR.
-#if HWY_GENERIC_LAMBDA
-    const auto gen2 = [](const auto d, const auto vidx)
-                          HWY_ATTR { return BitCast(d, Add(vidx, vidx)); };
-#else
-    const Gen2 gen2;
-#endif
-    actual[count] = T{0};  // sentinel
-    Generate(d, actual, count, gen2);
-    HWY_ASSERT_EQ(T{0}, actual[count]);  // did not write past end
-
-    const auto info = hwy::detail::MakeTypeInfo<T>();
-    const char* target_name = hwy::TargetName(HWY_TARGET);
-    hwy::detail::AssertArrayEqual(info, expected.get(), actual, count,
-                                  target_name, __FILE__, __LINE__);
-  }
-};
-
-// Zero extra input arrays
-struct TestTransform {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
-                  RandomState& rng) {
-    if (misalign_b != 0) return;
-    using T = TFromD<D>;
-    // Prevents error if size to allocate is zero.
-    AlignedFreeUniquePtr<T[]> pa =
-        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
-    T* a = pa.get() + misalign_a;
-    for (size_t i = 0; i < count; ++i) {
-      a[i] = Random<T>(rng);
-    }
-
-    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
-    SimpleSCAL(a, expected.get(), count);
-
-    // TODO(janwas): can we update the apply_to in HWY_PUSH_ATTRIBUTES so that
-    // the attribute also applies to lambdas? If so, remove HWY_ATTR.
-#if HWY_GENERIC_LAMBDA
-    const auto scal = [](const auto d, const auto v)
-                          HWY_ATTR { return Mul(Set(d, Alpha<T>()), v); };
-#else
-    const SCAL scal;
-#endif
-    Transform(d, a, count, scal);
-
-    const auto info = hwy::detail::MakeTypeInfo<T>();
-    const char* target_name = hwy::TargetName(HWY_TARGET);
-    hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
-                                  __FILE__, __LINE__);
-  }
-};
-
-// One extra input array
-struct TestTransform1 {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
-                  RandomState& rng) {
-    using T = TFromD<D>;
-    // Prevents error if size to allocate is zero.
-    AlignedFreeUniquePtr<T[]> pa =
-        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
-    AlignedFreeUniquePtr<T[]> pb =
-        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
-    T* a = pa.get() + misalign_a;
-    T* b = pb.get() + misalign_b;
-    for (size_t i = 0; i < count; ++i) {
-      a[i] = Random<T>(rng);
-      b[i] = Random<T>(rng);
-    }
-
-    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
-    SimpleAXPY(a, b, expected.get(), count);
-
-#if HWY_GENERIC_LAMBDA
-    const auto axpy = [](const auto d, const auto v, const auto v1) HWY_ATTR {
-      return MulAdd(Set(d, Alpha<T>()), v, v1);
-    };
-#else
-    const AXPY axpy;
-#endif
-    Transform1(d, a, count, b, axpy);
-
-    const auto info = hwy::detail::MakeTypeInfo<T>();
-    const char* target_name = hwy::TargetName(HWY_TARGET);
-    hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
-                                  __FILE__, __LINE__);
-  }
-};
-
-// Two extra input arrays
-struct TestTransform2 {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
-                  RandomState& rng) {
-    using T = TFromD<D>;
-    // Prevents error if size to allocate is zero.
-    AlignedFreeUniquePtr<T[]> pa =
-        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
-    AlignedFreeUniquePtr<T[]> pb =
-        AllocateAligned<T>(HWY_MAX(1, misalign_b + count));
-    AlignedFreeUniquePtr<T[]> pc =
-        AllocateAligned<T>(HWY_MAX(1, misalign_a + count));
-    T* a = pa.get() + misalign_a;
-    T* b = pb.get() + misalign_b;
-    T* c = pc.get() + misalign_a;
-    for (size_t i = 0; i < count; ++i) {
-      a[i] = Random<T>(rng);
-      b[i] = Random<T>(rng);
-      c[i] = Random<T>(rng);
-    }
-
-    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(HWY_MAX(1, count));
-    SimpleFMA4(a, b, c, expected.get(), count);
-
-#if HWY_GENERIC_LAMBDA
-    const auto fma4 = [](auto /*d*/, auto v, auto v1, auto v2)
-                          HWY_ATTR { return MulAdd(v, v1, v2); };
-#else
-    const FMA4 fma4;
-#endif
-    Transform2(d, a, count, b, c, fma4);
-
-    const auto info = hwy::detail::MakeTypeInfo<T>();
-    const char* target_name = hwy::TargetName(HWY_TARGET);
-    hwy::detail::AssertArrayEqual(info, expected.get(), a, count, target_name,
-                                  __FILE__, __LINE__);
-  }
-};
-
-template <typename T>
-class IfEq {
- public:
-  IfEq(T val) : val_(val) {}
-
-  template <class D, class V>
-  Mask<D> operator()(D d, V v) const {
-    return Eq(v, Set(d, val_));
-  }
-
- private:
-  T val_;
-};
-
-struct TestReplace {
-  template <class D>
-  void operator()(D d, size_t count, size_t misalign_a, size_t misalign_b,
-                  RandomState& rng) {
-    if (misalign_b != 0) return;
-    if (count == 0) return;
-    using T = TFromD<D>;
-    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + count);
-    T* a = pa.get() + misalign_a;
-    for (size_t i = 0; i < count; ++i) {
-      a[i] = Random<T>(rng);
-    }
-    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(count);
-
-    AlignedFreeUniquePtr<T[]> expected = AllocateAligned<T>(count);
-
-    std::vector<size_t> positions(AdjustedReps(count));
-    for (size_t& pos : positions) {
-      pos = static_cast<size_t>(rng()) % count;
-    }
-
-    for (size_t pos = 0; pos < count; ++pos) {
-      const T old_t = a[pos];
-      const T new_t = Random<T>(rng);
-      for (size_t i = 0; i < count; ++i) {
-        expected[i] = IsEqual(a[i], old_t) ? new_t : a[i];
-      }
-
-      // Copy so ReplaceIf gets the same input (and thus also outputs expected)
-      memcpy(pb.get(), a, count * sizeof(T));
-
-      Replace(d, a, count, new_t, old_t);
-      HWY_ASSERT_ARRAY_EQ(expected.get(), a, count);
-
-      ReplaceIf(d, pb.get(), count, new_t, IfEq<T>(old_t));
-      HWY_ASSERT_ARRAY_EQ(expected.get(), pb.get(), count);
-    }
-  }
-};
-
-void TestAllGenerate() {
-  // The test BitCast-s the indices, which does not work for floats.
-  ForIntegerTypes(ForPartialVectors<ForeachCountAndMisalign<TestGenerate>>());
-}
-
-void TestAllTransform() {
-  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform>>());
-}
-
-void TestAllTransform1() {
-  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform1>>());
-}
-
-void TestAllTransform2() {
-  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestTransform2>>());
-}
-
-void TestAllReplace() {
-  ForFloatTypes(ForPartialVectors<ForeachCountAndMisalign<TestReplace>>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(TransformTest);
-HWY_EXPORT_AND_TEST_P(TransformTest, TestAllGenerate);
-HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform);
-HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform1);
-HWY_EXPORT_AND_TEST_P(TransformTest, TestAllTransform2);
-HWY_EXPORT_AND_TEST_P(TransformTest, TestAllReplace);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/contrib/dot/dot-inl.h b/third_party/highway/hwy/contrib/dot/dot-inl.h
deleted file mode 100644 (file)
index e04636f..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Include guard (still compiled once per target)
-#include <cmath>
-
-#if defined(HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
-#undef HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
-#else
-#define HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
-#endif
-
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct Dot {
-  // Specify zero or more of these, ORed together, as the kAssumptions template
-  // argument to Compute. Each one may improve performance or reduce code size,
-  // at the cost of additional requirements on the arguments.
-  enum Assumptions {
-    // num_elements is at least N, which may be up to HWY_MAX_BYTES / sizeof(T).
-    kAtLeastOneVector = 1,
-    // num_elements is divisible by N (a power of two, so this can be used if
-    // the problem size is known to be a power of two >= HWY_MAX_BYTES /
-    // sizeof(T)).
-    kMultipleOfVector = 2,
-    // RoundUpTo(num_elements, N) elements are accessible; their value does not
-    // matter (will be treated as if they were zero).
-    kPaddedToVector = 4,
-  };
-
-  // Returns sum{pa[i] * pb[i]} for float or double inputs. Aligning the
-  // pointers to a multiple of N elements is helpful but not required.
-  template <int kAssumptions, class D, typename T = TFromD<D>,
-            HWY_IF_NOT_LANE_SIZE_D(D, 2)>
-  static HWY_INLINE T Compute(const D d, const T* const HWY_RESTRICT pa,
-                              const T* const HWY_RESTRICT pb,
-                              const size_t num_elements) {
-    static_assert(IsFloat<T>(), "MulAdd requires float type");
-    using V = decltype(Zero(d));
-
-    const size_t N = Lanes(d);
-    size_t i = 0;
-
-    constexpr bool kIsAtLeastOneVector =
-        (kAssumptions & kAtLeastOneVector) != 0;
-    constexpr bool kIsMultipleOfVector =
-        (kAssumptions & kMultipleOfVector) != 0;
-    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
-
-    // Won't be able to do a full vector load without padding => scalar loop.
-    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
-        HWY_UNLIKELY(num_elements < N)) {
-      // Only 2x unroll to avoid excessive code size.
-      T sum0 = T(0);
-      T sum1 = T(0);
-      for (; i + 2 <= num_elements; i += 2) {
-        sum0 += pa[i + 0] * pb[i + 0];
-        sum1 += pa[i + 1] * pb[i + 1];
-      }
-      if (i < num_elements) {
-        sum1 += pa[i] * pb[i];
-      }
-      return sum0 + sum1;
-    }
-
-    // Compiler doesn't make independent sum* accumulators, so unroll manually.
-    // 2 FMA ports * 4 cycle latency = up to 8 in-flight, but that is excessive
-    // for unaligned inputs (each unaligned pointer halves the throughput
-    // because it occupies both L1 load ports for a cycle). We cannot have
-    // arrays of vectors on RVV/SVE, so always unroll 4x.
-    V sum0 = Zero(d);
-    V sum1 = Zero(d);
-    V sum2 = Zero(d);
-    V sum3 = Zero(d);
-
-    // Main loop: unrolled
-    for (; i + 4 * N <= num_elements; /* i += 4 * N */) {  // incr in loop
-      const auto a0 = LoadU(d, pa + i);
-      const auto b0 = LoadU(d, pb + i);
-      i += N;
-      sum0 = MulAdd(a0, b0, sum0);
-      const auto a1 = LoadU(d, pa + i);
-      const auto b1 = LoadU(d, pb + i);
-      i += N;
-      sum1 = MulAdd(a1, b1, sum1);
-      const auto a2 = LoadU(d, pa + i);
-      const auto b2 = LoadU(d, pb + i);
-      i += N;
-      sum2 = MulAdd(a2, b2, sum2);
-      const auto a3 = LoadU(d, pa + i);
-      const auto b3 = LoadU(d, pb + i);
-      i += N;
-      sum3 = MulAdd(a3, b3, sum3);
-    }
-
-    // Up to 3 iterations of whole vectors
-    for (; i + N <= num_elements; i += N) {
-      const auto a = LoadU(d, pa + i);
-      const auto b = LoadU(d, pb + i);
-      sum0 = MulAdd(a, b, sum0);
-    }
-
-    if (!kIsMultipleOfVector) {
-      const size_t remaining = num_elements - i;
-      if (remaining != 0) {
-        if (kIsPaddedToVector) {
-          const auto mask = FirstN(d, remaining);
-          const auto a = LoadU(d, pa + i);
-          const auto b = LoadU(d, pb + i);
-          sum1 = MulAdd(IfThenElseZero(mask, a), IfThenElseZero(mask, b), sum1);
-        } else {
-          // Unaligned load such that the last element is in the highest lane -
-          // ensures we do not touch any elements outside the valid range.
-          // If we get here, then num_elements >= N.
-          HWY_DASSERT(i >= N);
-          i += remaining - N;
-          const auto skip = FirstN(d, N - remaining);
-          const auto a = LoadU(d, pa + i);  // always unaligned
-          const auto b = LoadU(d, pb + i);
-          sum1 = MulAdd(IfThenZeroElse(skip, a), IfThenZeroElse(skip, b), sum1);
-        }
-      }
-    }  // kMultipleOfVector
-
-    // Reduction tree: sum of all accumulators by pairs, then across lanes.
-    sum0 = Add(sum0, sum1);
-    sum2 = Add(sum2, sum3);
-    sum0 = Add(sum0, sum2);
-    return GetLane(SumOfLanes(d, sum0));
-  }
-
-  // Returns sum{pa[i] * pb[i]} for bfloat16 inputs. Aligning the pointers to a
-  // multiple of N elements is helpful but not required.
-  template <int kAssumptions, class D>
-  static HWY_INLINE float Compute(const D d,
-                                  const bfloat16_t* const HWY_RESTRICT pa,
-                                  const bfloat16_t* const HWY_RESTRICT pb,
-                                  const size_t num_elements) {
-    const RebindToUnsigned<D> du16;
-    const Repartition<float, D> df32;
-
-    using V = decltype(Zero(df32));
-    const size_t N = Lanes(d);
-    size_t i = 0;
-
-    constexpr bool kIsAtLeastOneVector =
-        (kAssumptions & kAtLeastOneVector) != 0;
-    constexpr bool kIsMultipleOfVector =
-        (kAssumptions & kMultipleOfVector) != 0;
-    constexpr bool kIsPaddedToVector = (kAssumptions & kPaddedToVector) != 0;
-
-    // Won't be able to do a full vector load without padding => scalar loop.
-    if (!kIsAtLeastOneVector && !kIsMultipleOfVector && !kIsPaddedToVector &&
-        HWY_UNLIKELY(num_elements < N)) {
-      float sum0 = 0.0f;  // Only 2x unroll to avoid excessive code size for..
-      float sum1 = 0.0f;  // this unlikely(?) case.
-      for (; i + 2 <= num_elements; i += 2) {
-        sum0 += F32FromBF16(pa[i + 0]) * F32FromBF16(pb[i + 0]);
-        sum1 += F32FromBF16(pa[i + 1]) * F32FromBF16(pb[i + 1]);
-      }
-      if (i < num_elements) {
-        sum1 += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
-      }
-      return sum0 + sum1;
-    }
-
-    // See comment in the other Compute() overload. Unroll 2x, but we need
-    // twice as many sums for ReorderWidenMulAccumulate.
-    V sum0 = Zero(df32);
-    V sum1 = Zero(df32);
-    V sum2 = Zero(df32);
-    V sum3 = Zero(df32);
-
-    // Main loop: unrolled
-    for (; i + 2 * N <= num_elements; /* i += 2 * N */) {  // incr in loop
-      const auto a0 = LoadU(d, pa + i);
-      const auto b0 = LoadU(d, pb + i);
-      i += N;
-      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
-      const auto a1 = LoadU(d, pa + i);
-      const auto b1 = LoadU(d, pb + i);
-      i += N;
-      sum2 = ReorderWidenMulAccumulate(df32, a1, b1, sum2, sum3);
-    }
-
-    // Possibly one more iteration of whole vectors
-    if (i + N <= num_elements) {
-      const auto a0 = LoadU(d, pa + i);
-      const auto b0 = LoadU(d, pb + i);
-      i += N;
-      sum0 = ReorderWidenMulAccumulate(df32, a0, b0, sum0, sum1);
-    }
-
-    if (!kIsMultipleOfVector) {
-      const size_t remaining = num_elements - i;
-      if (remaining != 0) {
-        if (kIsPaddedToVector) {
-          const auto mask = FirstN(du16, remaining);
-          const auto va = LoadU(d, pa + i);
-          const auto vb = LoadU(d, pb + i);
-          const auto a16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, va)));
-          const auto b16 = BitCast(d, IfThenElseZero(mask, BitCast(du16, vb)));
-          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
-
-        } else {
-          // Unaligned load such that the last element is in the highest lane -
-          // ensures we do not touch any elements outside the valid range.
-          // If we get here, then num_elements >= N.
-          HWY_DASSERT(i >= N);
-          i += remaining - N;
-          const auto skip = FirstN(du16, N - remaining);
-          const auto va = LoadU(d, pa + i);  // always unaligned
-          const auto vb = LoadU(d, pb + i);
-          const auto a16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, va)));
-          const auto b16 = BitCast(d, IfThenZeroElse(skip, BitCast(du16, vb)));
-          sum2 = ReorderWidenMulAccumulate(df32, a16, b16, sum2, sum3);
-        }
-      }
-    }  // kMultipleOfVector
-
-    // Reduction tree: sum of all accumulators by pairs, then across lanes.
-    sum0 = Add(sum0, sum1);
-    sum2 = Add(sum2, sum3);
-    sum0 = Add(sum0, sum2);
-    return GetLane(SumOfLanes(df32, sum0));
-  }
-};
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_DOT_DOT_INL_H_
diff --git a/third_party/highway/hwy/contrib/dot/dot_test.cc b/third_party/highway/hwy/contrib/dot/dot_test.cc
deleted file mode 100644 (file)
index 12d7ab2..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "hwy/aligned_allocator.h"
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/dot/dot_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/contrib/dot/dot-inl.h"
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <typename T>
-HWY_NOINLINE T SimpleDot(const T* pa, const T* pb, size_t num) {
-  double sum = 0.0;
-  for (size_t i = 0; i < num; ++i) {
-    sum += pa[i] * pb[i];
-  }
-  return static_cast<T>(sum);
-}
-
-HWY_NOINLINE float SimpleDot(const bfloat16_t* pa, const bfloat16_t* pb,
-                             size_t num) {
-  float sum = 0.0f;
-  for (size_t i = 0; i < num; ++i) {
-    sum += F32FromBF16(pa[i]) * F32FromBF16(pb[i]);
-  }
-  return sum;
-}
-
-template <typename T>
-void SetValue(const float value, T* HWY_RESTRICT ptr) {
-  *ptr = static_cast<T>(value);
-}
-void SetValue(const float value, bfloat16_t* HWY_RESTRICT ptr) {
-  *ptr = BF16FromF32(value);
-}
-
-class TestDot {
-  // Computes/verifies one dot product.
-  template <int kAssumptions, class D>
-  void Test(D d, size_t num, size_t misalign_a, size_t misalign_b,
-            RandomState& rng) {
-    using T = TFromD<D>;
-    const size_t N = Lanes(d);
-    const auto random_t = [&rng]() {
-      const int32_t bits = static_cast<int32_t>(Random32(&rng)) & 1023;
-      return static_cast<float>(bits - 512) * (1.0f / 64);
-    };
-
-    const size_t padded =
-        (kAssumptions & Dot::kPaddedToVector) ? RoundUpTo(num, N) : num;
-    AlignedFreeUniquePtr<T[]> pa = AllocateAligned<T>(misalign_a + padded);
-    AlignedFreeUniquePtr<T[]> pb = AllocateAligned<T>(misalign_b + padded);
-    T* a = pa.get() + misalign_a;
-    T* b = pb.get() + misalign_b;
-    size_t i = 0;
-    for (; i < num; ++i) {
-      SetValue(random_t(), a + i);
-      SetValue(random_t(), b + i);
-    }
-    // Fill padding with NaN - the values are not used, but avoids MSAN errors.
-    for (; i < padded; ++i) {
-      ScalableTag<float> df1;
-      SetValue(GetLane(NaN(df1)), a + i);
-      SetValue(GetLane(NaN(df1)), b + i);
-    }
-
-    const auto expected = SimpleDot(a, b, num);
-    const auto actual = Dot::Compute<kAssumptions>(d, a, b, num);
-    const auto max = static_cast<decltype(actual)>(8 * 8 * num);
-    HWY_ASSERT(-max <= actual && actual <= max);
-    HWY_ASSERT(expected - 1E-4 <= actual && actual <= expected + 1E-4);
-  }
-
-  // Runs tests with various alignments.
-  template <int kAssumptions, class D>
-  void ForeachMisalign(D d, size_t num, RandomState& rng) {
-    const size_t N = Lanes(d);
-    const size_t misalignments[3] = {0, N / 4, 3 * N / 5};
-    for (size_t ma : misalignments) {
-      for (size_t mb : misalignments) {
-        Test<kAssumptions>(d, num, ma, mb, rng);
-      }
-    }
-  }
-
-  // Runs tests with various lengths compatible with the given assumptions.
-  template <int kAssumptions, class D>
-  void ForeachCount(D d, RandomState& rng) {
-    const size_t N = Lanes(d);
-    const size_t counts[] = {1,
-                             3,
-                             7,
-                             16,
-                             HWY_MAX(N / 2, 1),
-                             HWY_MAX(2 * N / 3, 1),
-                             N,
-                             N + 1,
-                             4 * N / 3,
-                             3 * N,
-                             8 * N,
-                             8 * N + 2};
-    for (size_t num : counts) {
-      if ((kAssumptions & Dot::kAtLeastOneVector) && num < N) continue;
-      if ((kAssumptions & Dot::kMultipleOfVector) && (num % N) != 0) continue;
-      ForeachMisalign<kAssumptions>(d, num, rng);
-    }
-  }
-
- public:
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    // All 8 combinations of the three length-related flags:
-    ForeachCount<0>(d, rng);
-    ForeachCount<Dot::kAtLeastOneVector>(d, rng);
-    ForeachCount<Dot::kMultipleOfVector>(d, rng);
-    ForeachCount<Dot::kMultipleOfVector | Dot::kAtLeastOneVector>(d, rng);
-    ForeachCount<Dot::kPaddedToVector>(d, rng);
-    ForeachCount<Dot::kPaddedToVector | Dot::kAtLeastOneVector>(d, rng);
-    ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector>(d, rng);
-    ForeachCount<Dot::kPaddedToVector | Dot::kMultipleOfVector |
-                 Dot::kAtLeastOneVector>(d, rng);
-  }
-};
-
-void TestAllDot() { ForFloatTypes(ForPartialVectors<TestDot>()); }
-void TestAllDotBF16() { ForShrinkableVectors<TestDot>()(bfloat16_t()); }
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(DotTest);
-HWY_EXPORT_AND_TEST_P(DotTest, TestAllDot);
-HWY_EXPORT_AND_TEST_P(DotTest, TestAllDotBF16);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/contrib/image/image.cc b/third_party/highway/hwy/contrib/image/image.cc
deleted file mode 100644 (file)
index 2bcdcd6..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/image/image.h"
-
-#include <algorithm>  // swap
-#include <cstddef>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-size_t GetVectorSize() { return Lanes(ScalableTag<uint8_t>()); }
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(GetVectorSize);  // Local function.
-}  // namespace
-
-size_t ImageBase::VectorSize() {
-  // Do not cache result - must return the current value, which may be greater
-  // than the first call if it was subject to DisableTargets!
-  return HWY_DYNAMIC_DISPATCH(GetVectorSize)();
-}
-
-size_t ImageBase::BytesPerRow(const size_t xsize, const size_t sizeof_t) {
-  const size_t vec_size = VectorSize();
-  size_t valid_bytes = xsize * sizeof_t;
-
-  // Allow unaligned accesses starting at the last valid value - this may raise
-  // msan errors unless the user calls InitializePaddingForUnalignedAccesses.
-  // Skip for the scalar case because no extra lanes will be loaded.
-  if (vec_size != 1) {
-    HWY_DASSERT(vec_size >= sizeof_t);
-    valid_bytes += vec_size - sizeof_t;
-  }
-
-  // Round up to vector and cache line size.
-  const size_t align = HWY_MAX(vec_size, HWY_ALIGNMENT);
-  size_t bytes_per_row = RoundUpTo(valid_bytes, align);
-
-  // During the lengthy window before writes are committed to memory, CPUs
-  // guard against read after write hazards by checking the address, but
-  // only the lower 11 bits. We avoid a false dependency between writes to
-  // consecutive rows by ensuring their sizes are not multiples of 2 KiB.
-  // Avoid2K prevents the same problem for the planes of an Image3.
-  if (bytes_per_row % HWY_ALIGNMENT == 0) {
-    bytes_per_row += align;
-  }
-
-  HWY_DASSERT(bytes_per_row % align == 0);
-  return bytes_per_row;
-}
-
-ImageBase::ImageBase(const size_t xsize, const size_t ysize,
-                     const size_t sizeof_t)
-    : xsize_(static_cast<uint32_t>(xsize)),
-      ysize_(static_cast<uint32_t>(ysize)),
-      bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
-  HWY_ASSERT(sizeof_t == 1 || sizeof_t == 2 || sizeof_t == 4 || sizeof_t == 8);
-
-  bytes_per_row_ = 0;
-  // Dimensions can be zero, e.g. for lazily-allocated images. Only allocate
-  // if nonzero, because "zero" bytes still have padding/bookkeeping overhead.
-  if (xsize != 0 && ysize != 0) {
-    bytes_per_row_ = BytesPerRow(xsize, sizeof_t);
-    bytes_ = AllocateAligned<uint8_t>(bytes_per_row_ * ysize);
-    HWY_ASSERT(bytes_.get() != nullptr);
-    InitializePadding(sizeof_t, Padding::kRoundUp);
-  }
-}
-
-ImageBase::ImageBase(const size_t xsize, const size_t ysize,
-                     const size_t bytes_per_row, void* const aligned)
-    : xsize_(static_cast<uint32_t>(xsize)),
-      ysize_(static_cast<uint32_t>(ysize)),
-      bytes_per_row_(bytes_per_row),
-      bytes_(static_cast<uint8_t*>(aligned),
-             AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {
-  const size_t vec_size = VectorSize();
-  HWY_ASSERT(bytes_per_row % vec_size == 0);
-  HWY_ASSERT(reinterpret_cast<uintptr_t>(aligned) % vec_size == 0);
-}
-
-void ImageBase::InitializePadding(const size_t sizeof_t, Padding padding) {
-#if HWY_IS_MSAN || HWY_IDE
-  if (xsize_ == 0 || ysize_ == 0) return;
-
-  const size_t vec_size = VectorSize();  // Bytes, independent of sizeof_t!
-  if (vec_size == 1) return;             // Scalar mode: no padding needed
-
-  const size_t valid_size = xsize_ * sizeof_t;
-  const size_t initialize_size = padding == Padding::kRoundUp
-                                     ? RoundUpTo(valid_size, vec_size)
-                                     : valid_size + vec_size - sizeof_t;
-  if (valid_size == initialize_size) return;
-
-  for (size_t y = 0; y < ysize_; ++y) {
-    uint8_t* HWY_RESTRICT row = static_cast<uint8_t*>(VoidRow(y));
-#if defined(__clang__) && (__clang_major__ <= 6)
-    // There's a bug in msan in clang-6 when handling AVX2 operations. This
-    // workaround allows tests to pass on msan, although it is slower and
-    // prevents msan warnings from uninitialized images.
-    memset(row, 0, initialize_size);
-#else
-    memset(row + valid_size, 0, initialize_size - valid_size);
-#endif  // clang6
-  }
-#else
-  (void)sizeof_t;
-  (void)padding;
-#endif  // HWY_IS_MSAN
-}
-
-void ImageBase::Swap(ImageBase& other) {
-  std::swap(xsize_, other.xsize_);
-  std::swap(ysize_, other.ysize_);
-  std::swap(bytes_per_row_, other.bytes_per_row_);
-  std::swap(bytes_, other.bytes_);
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/image/image.h b/third_party/highway/hwy/contrib/image/image.h
deleted file mode 100644 (file)
index 231f3c5..0000000
+++ /dev/null
@@ -1,471 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
-#define HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
-
-// SIMD/multicore-friendly planar image representation with row accessors.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <cstddef>
-#include <utility>  // std::move
-
-#include "hwy/aligned_allocator.h"
-#include "hwy/base.h"
-#include "hwy/highway_export.h"
-
-namespace hwy {
-
-// Type-independent parts of Image<> - reduces code duplication and facilitates
-// moving member function implementations to cc file.
-struct HWY_CONTRIB_DLLEXPORT ImageBase {
-  // Returns required alignment in bytes for externally allocated memory.
-  static size_t VectorSize();
-
-  // Returns distance [bytes] between the start of two consecutive rows, a
-  // multiple of VectorSize but NOT kAlias (see implementation).
-  static size_t BytesPerRow(const size_t xsize, const size_t sizeof_t);
-
-  // No allocation (for output params or unused images)
-  ImageBase()
-      : xsize_(0),
-        ysize_(0),
-        bytes_per_row_(0),
-        bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {}
-
-  // Allocates memory (this is the common case)
-  ImageBase(size_t xsize, size_t ysize, size_t sizeof_t);
-
-  // References but does not take ownership of external memory. Useful for
-  // interoperability with other libraries. `aligned` must be aligned to a
-  // multiple of VectorSize() and `bytes_per_row` must also be a multiple of
-  // VectorSize() or preferably equal to BytesPerRow().
-  ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned);
-
-  // Copy construction/assignment is forbidden to avoid inadvertent copies,
-  // which can be very expensive. Use CopyImageTo() instead.
-  ImageBase(const ImageBase& other) = delete;
-  ImageBase& operator=(const ImageBase& other) = delete;
-
-  // Move constructor (required for returning Image from function)
-  ImageBase(ImageBase&& other) noexcept = default;
-
-  // Move assignment (required for std::vector)
-  ImageBase& operator=(ImageBase&& other) noexcept = default;
-
-  void Swap(ImageBase& other);
-
-  // Useful for pre-allocating image with some padding for alignment purposes
-  // and later reporting the actual valid dimensions. Caller is responsible
-  // for ensuring xsize/ysize are <= the original dimensions.
-  void ShrinkTo(const size_t xsize, const size_t ysize) {
-    xsize_ = static_cast<uint32_t>(xsize);
-    ysize_ = static_cast<uint32_t>(ysize);
-    // NOTE: we can't recompute bytes_per_row for more compact storage and
-    // better locality because that would invalidate the image contents.
-  }
-
-  // How many pixels.
-  HWY_INLINE size_t xsize() const { return xsize_; }
-  HWY_INLINE size_t ysize() const { return ysize_; }
-
-  // NOTE: do not use this for copying rows - the valid xsize may be much less.
-  HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; }
-
-  // Raw access to byte contents, for interfacing with other libraries.
-  // Unsigned char instead of char to avoid surprises (sign extension).
-  HWY_INLINE uint8_t* bytes() {
-    void* p = bytes_.get();
-    return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
-  }
-  HWY_INLINE const uint8_t* bytes() const {
-    const void* p = bytes_.get();
-    return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64));
-  }
-
- protected:
-  // Returns pointer to the start of a row.
-  HWY_INLINE void* VoidRow(const size_t y) const {
-#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
-    if (y >= ysize_) {
-      HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_);
-    }
-#endif
-
-    void* row = bytes_.get() + y * bytes_per_row_;
-    return HWY_ASSUME_ALIGNED(row, 64);
-  }
-
-  enum class Padding {
-    // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default.
-    kRoundUp,
-    // Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra
-    // vector to be initialized. If done by default, this would suppress
-    // legitimate msan warnings. We therefore require users to explicitly call
-    // InitializePadding before using unaligned loads (e.g. convolution).
-    kUnaligned
-  };
-
-  // Initializes the minimum bytes required to suppress msan warnings from
-  // legitimate (according to Padding mode) vector loads/stores on the right
-  // border, where some lanes are uninitialized and assumed to be unused.
-  void InitializePadding(size_t sizeof_t, Padding padding);
-
-  // (Members are non-const to enable assignment during move-assignment.)
-  uint32_t xsize_;  // In valid pixels, not including any padding.
-  uint32_t ysize_;
-  size_t bytes_per_row_;  // Includes padding.
-  AlignedFreeUniquePtr<uint8_t[]> bytes_;
-};
-
-// Single channel, aligned rows separated by padding. T must be POD.
-//
-// 'Single channel' (one 2D array per channel) simplifies vectorization
-// (repeating the same operation on multiple adjacent components) without the
-// complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients
-// can easily iterate over all components in a row and Image requires no
-// knowledge of the pixel format beyond the component type "T".
-//
-// 'Aligned' means each row is aligned to the L1 cache line size. This prevents
-// false sharing between two threads operating on adjacent rows.
-//
-// 'Padding' is still relevant because vectors could potentially be larger than
-// a cache line. By rounding up row sizes to the vector size, we allow
-// reading/writing ALIGNED vectors whose first lane is a valid sample. This
-// avoids needing a separate loop to handle remaining unaligned lanes.
-//
-// This image layout could also be achieved with a vector and a row accessor
-// function, but a class wrapper with support for "deleter" allows wrapping
-// existing memory allocated by clients without copying the pixels. It also
-// provides convenient accessors for xsize/ysize, which shortens function
-// argument lists. Supports move-construction so it can be stored in containers.
-template <typename ComponentType>
-class Image : public ImageBase {
- public:
-  using T = ComponentType;
-
-  Image() = default;
-  Image(const size_t xsize, const size_t ysize)
-      : ImageBase(xsize, ysize, sizeof(T)) {}
-  Image(const size_t xsize, const size_t ysize, size_t bytes_per_row,
-        void* aligned)
-      : ImageBase(xsize, ysize, bytes_per_row, aligned) {}
-
-  void InitializePaddingForUnalignedAccesses() {
-    InitializePadding(sizeof(T), Padding::kUnaligned);
-  }
-
-  HWY_INLINE const T* ConstRow(const size_t y) const {
-    return static_cast<const T*>(VoidRow(y));
-  }
-  HWY_INLINE const T* ConstRow(const size_t y) {
-    return static_cast<const T*>(VoidRow(y));
-  }
-
-  // Returns pointer to non-const. This allows passing const Image* parameters
-  // when the callee is only supposed to fill the pixels, as opposed to
-  // allocating or resizing the image.
-  HWY_INLINE T* MutableRow(const size_t y) const {
-    return static_cast<T*>(VoidRow(y));
-  }
-  HWY_INLINE T* MutableRow(const size_t y) {
-    return static_cast<T*>(VoidRow(y));
-  }
-
-  // Returns number of pixels (some of which are padding) per row. Useful for
-  // computing other rows via pointer arithmetic. WARNING: this must
-  // NOT be used to determine xsize.
-  HWY_INLINE intptr_t PixelsPerRow() const {
-    return static_cast<intptr_t>(bytes_per_row_ / sizeof(T));
-  }
-};
-
-using ImageF = Image<float>;
-
-// A bundle of 3 same-sized images. To fill an existing Image3 using
-// single-channel producers, we also need access to each const Image*. Const
-// prevents breaking the same-size invariant, while still allowing pixels to be
-// changed via MutableRow.
-template <typename ComponentType>
-class Image3 {
- public:
-  using T = ComponentType;
-  using ImageT = Image<T>;
-  static constexpr size_t kNumPlanes = 3;
-
-  Image3() : planes_{ImageT(), ImageT(), ImageT()} {}
-
-  Image3(const size_t xsize, const size_t ysize)
-      : planes_{ImageT(xsize, ysize), ImageT(xsize, ysize),
-                ImageT(xsize, ysize)} {}
-
-  Image3(Image3&& other) noexcept {
-    for (size_t i = 0; i < kNumPlanes; i++) {
-      planes_[i] = std::move(other.planes_[i]);
-    }
-  }
-
-  Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) {
-    if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) {
-      HWY_ABORT(
-          "Not same size: %d x %d, %d x %d, %d x %d\n",
-          static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()),
-          static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()),
-          static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize()));
-    }
-    planes_[0] = std::move(plane0);
-    planes_[1] = std::move(plane1);
-    planes_[2] = std::move(plane2);
-  }
-
-  // Copy construction/assignment is forbidden to avoid inadvertent copies,
-  // which can be very expensive. Use CopyImageTo instead.
-  Image3(const Image3& other) = delete;
-  Image3& operator=(const Image3& other) = delete;
-
-  Image3& operator=(Image3&& other) noexcept {
-    for (size_t i = 0; i < kNumPlanes; i++) {
-      planes_[i] = std::move(other.planes_[i]);
-    }
-    return *this;
-  }
-
-  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const {
-    return static_cast<const T*>(VoidPlaneRow(c, y));
-  }
-  HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) {
-    return static_cast<const T*>(VoidPlaneRow(c, y));
-  }
-
-  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const {
-    return static_cast<T*>(VoidPlaneRow(c, y));
-  }
-  HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) {
-    return static_cast<T*>(VoidPlaneRow(c, y));
-  }
-
-  HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; }
-
-  void Swap(Image3& other) {
-    for (size_t c = 0; c < 3; ++c) {
-      other.planes_[c].Swap(planes_[c]);
-    }
-  }
-
-  void ShrinkTo(const size_t xsize, const size_t ysize) {
-    for (ImageT& plane : planes_) {
-      plane.ShrinkTo(xsize, ysize);
-    }
-  }
-
-  // Sizes of all three images are guaranteed to be equal.
-  HWY_INLINE size_t xsize() const { return planes_[0].xsize(); }
-  HWY_INLINE size_t ysize() const { return planes_[0].ysize(); }
-  // Returns offset [bytes] from one row to the next row of the same plane.
-  // WARNING: this must NOT be used to determine xsize, nor for copying rows -
-  // the valid xsize may be much less.
-  HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); }
-  // Returns number of pixels (some of which are padding) per row. Useful for
-  // computing other rows via pointer arithmetic. WARNING: this must NOT be used
-  // to determine xsize.
-  HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); }
-
- private:
-  // Returns pointer to the start of a row.
-  HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const {
-#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
-    if (c >= kNumPlanes || y >= ysize()) {
-      HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c),
-                static_cast<int>(y), static_cast<int>(ysize()));
-    }
-#endif
-    // Use the first plane's stride because the compiler might not realize they
-    // are all equal. Thus we only need a single multiplication for all planes.
-    const size_t row_offset = y * planes_[0].bytes_per_row();
-    const void* row = planes_[c].bytes() + row_offset;
-    return static_cast<const T * HWY_RESTRICT>(
-        HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT));
-  }
-
- private:
-  ImageT planes_[kNumPlanes];
-};
-
-using Image3F = Image3<float>;
-
-// Rectangular region in image(s). Factoring this out of Image instead of
-// shifting the pointer by x0/y0 allows this to apply to multiple images with
-// different resolutions. Can compare size via SameSize(rect1, rect2).
-class Rect {
- public:
-  // Most windows are xsize_max * ysize_max, except those on the borders where
-  // begin + size_max > end.
-  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max,
-                 size_t ysize_max, size_t xend, size_t yend)
-      : x0_(xbegin),
-        y0_(ybegin),
-        xsize_(ClampedSize(xbegin, xsize_max, xend)),
-        ysize_(ClampedSize(ybegin, ysize_max, yend)) {}
-
-  // Construct with origin and known size (typically from another Rect).
-  constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize)
-      : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {}
-
-  // Construct a rect that covers a whole image.
-  template <typename Image>
-  explicit Rect(const Image& image)
-      : Rect(0, 0, image.xsize(), image.ysize()) {}
-
-  Rect() : Rect(0, 0, 0, 0) {}
-
-  Rect(const Rect&) = default;
-  Rect& operator=(const Rect&) = default;
-
-  Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max,
-               size_t ysize_max) {
-    return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_,
-                y0_ + ysize_);
-  }
-
-  template <typename T>
-  const T* ConstRow(const Image<T>* image, size_t y) const {
-    return image->ConstRow(y + y0_) + x0_;
-  }
-
-  template <typename T>
-  T* MutableRow(const Image<T>* image, size_t y) const {
-    return image->MutableRow(y + y0_) + x0_;
-  }
-
-  template <typename T>
-  const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const {
-    return image.ConstPlaneRow(c, y + y0_) + x0_;
-  }
-
-  template <typename T>
-  T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const {
-    return image->MutablePlaneRow(c, y + y0_) + x0_;
-  }
-
-  // Returns true if this Rect fully resides in the given image. ImageT could be
-  // Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical.
-  template <class ImageT>
-  bool IsInside(const ImageT& image) const {
-    return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize());
-  }
-
-  size_t x0() const { return x0_; }
-  size_t y0() const { return y0_; }
-  size_t xsize() const { return xsize_; }
-  size_t ysize() const { return ysize_; }
-
- private:
-  // Returns size_max, or whatever is left in [begin, end).
-  static constexpr size_t ClampedSize(size_t begin, size_t size_max,
-                                      size_t end) {
-    return (begin + size_max <= end) ? size_max
-                                     : (end > begin ? end - begin : 0);
-  }
-
-  size_t x0_;
-  size_t y0_;
-
-  size_t xsize_;
-  size_t ysize_;
-};
-
-// Works for any image-like input type(s).
-template <class Image1, class Image2>
-HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) {
-  return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize();
-}
-
-// Mirrors out of bounds coordinates and returns valid coordinates unchanged.
-// We assume the radius (distance outside the image) is small compared to the
-// image size, otherwise this might not terminate.
-// The mirror is outside the last column (border pixel is also replicated).
-static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x,
-                                                 const int64_t xsize) {
-  HWY_DASSERT(xsize != 0);
-
-  // TODO(janwas): replace with branchless version
-  while (x < 0 || x >= xsize) {
-    if (x < 0) {
-      x = -x - 1;
-    } else {
-      x = 2 * xsize - 1 - x;
-    }
-  }
-  return static_cast<size_t>(x);
-}
-
-// Wrap modes for ensuring X/Y coordinates are in the valid range [0, size):
-
-// Mirrors (repeating the edge pixel once). Useful for convolutions.
-struct WrapMirror {
-  HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const {
-    return Mirror(coord, static_cast<int64_t>(size));
-  }
-};
-
-// Returns the same coordinate, for when we know "coord" is already valid (e.g.
-// interior of an image).
-struct WrapUnchanged {
-  HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const {
-    return static_cast<size_t>(coord);
-  }
-};
-
-// Similar to Wrap* but for row pointers (reduces Row() multiplications).
-
-class WrapRowMirror {
- public:
-  template <class View>
-  WrapRowMirror(const View& image, size_t ysize)
-      : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {}
-
-  const float* operator()(const float* const HWY_RESTRICT row,
-                          const int64_t stride) const {
-    if (row < first_row_) {
-      const int64_t num_before = first_row_ - row;
-      // Mirrored; one row before => row 0, two before = row 1, ...
-      return first_row_ + num_before - stride;
-    }
-    if (row > last_row_) {
-      const int64_t num_after = row - last_row_;
-      // Mirrored; one row after => last row, two after = last - 1, ...
-      return last_row_ - num_after + stride;
-    }
-    return row;
-  }
-
- private:
-  const float* const HWY_RESTRICT first_row_;
-  const float* const HWY_RESTRICT last_row_;
-};
-
-struct WrapRowUnchanged {
-  HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row,
-                                     int64_t /*stride*/) const {
-    return row;
-  }
-};
-
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_
diff --git a/third_party/highway/hwy/contrib/image/image_test.cc b/third_party/highway/hwy/contrib/image/image_test.cc
deleted file mode 100644 (file)
index 6886577..0000000
+++ /dev/null
@@ -1,152 +0,0 @@
-// Copyright (c) the JPEG XL Project
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/image/image.h"
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <random>
-#include <utility>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/image/image_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target:
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Ensure we can always write full aligned vectors.
-struct TestAlignedT {
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    std::mt19937 rng(129);
-    std::uniform_int_distribution<int> dist(0, 16);
-    const ScalableTag<T> d;
-
-    for (size_t ysize = 1; ysize < 4; ++ysize) {
-      for (size_t xsize = 1; xsize < 64; ++xsize) {
-        Image<T> img(xsize, ysize);
-
-        for (size_t y = 0; y < ysize; ++y) {
-          T* HWY_RESTRICT row = img.MutableRow(y);
-          for (size_t x = 0; x < xsize; x += Lanes(d)) {
-            const auto values = Iota(d, static_cast<T>(dist(rng)));
-            Store(values, d, row + x);
-          }
-        }
-
-        // Sanity check to prevent optimizing out the writes
-        const auto x = std::uniform_int_distribution<size_t>(0, xsize - 1)(rng);
-        const auto y = std::uniform_int_distribution<size_t>(0, ysize - 1)(rng);
-        HWY_ASSERT(img.ConstRow(y)[x] < 16 + Lanes(d));
-      }
-    }
-  }
-};
-
-void TestAligned() { ForUnsignedTypes(TestAlignedT()); }
-
-// Ensure we can write an unaligned vector starting at the last valid value.
-struct TestUnalignedT {
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    std::mt19937 rng(129);
-    std::uniform_int_distribution<int> dist(0, 3);
-    const ScalableTag<T> d;
-
-    for (size_t ysize = 1; ysize < 4; ++ysize) {
-      for (size_t xsize = 1; xsize < 128; ++xsize) {
-        Image<T> img(xsize, ysize);
-        img.InitializePaddingForUnalignedAccesses();
-
-// This test reads padding, which only works if it was initialized,
-// which only happens in MSAN builds.
-#if HWY_IS_MSAN || HWY_IDE
-        // Initialize only the valid samples
-        for (size_t y = 0; y < ysize; ++y) {
-          T* HWY_RESTRICT row = img.MutableRow(y);
-          for (size_t x = 0; x < xsize; ++x) {
-            row[x] = static_cast<T>(1u << dist(rng));
-          }
-        }
-
-        // Read padding bits
-        auto accum = Zero(d);
-        for (size_t y = 0; y < ysize; ++y) {
-          T* HWY_RESTRICT row = img.MutableRow(y);
-          for (size_t x = 0; x < xsize; ++x) {
-            accum = Or(accum, LoadU(d, row + x));
-          }
-        }
-
-        // Ensure padding was zero
-        const size_t N = Lanes(d);
-        auto lanes = AllocateAligned<T>(N);
-        Store(accum, d, lanes.get());
-        for (size_t i = 0; i < N; ++i) {
-          HWY_ASSERT(lanes[i] < 16);
-        }
-#else  // Check that writing padding does not overwrite valid samples
-       // Initialize only the valid samples
-        for (size_t y = 0; y < ysize; ++y) {
-          T* HWY_RESTRICT row = img.MutableRow(y);
-          for (size_t x = 0; x < xsize; ++x) {
-            row[x] = static_cast<T>(x);
-          }
-        }
-
-        // Zero padding and rightmost sample
-        for (size_t y = 0; y < ysize; ++y) {
-          T* HWY_RESTRICT row = img.MutableRow(y);
-          StoreU(Zero(d), d, row + xsize - 1);
-        }
-
-        // Ensure no samples except the rightmost were overwritten
-        for (size_t y = 0; y < ysize; ++y) {
-          T* HWY_RESTRICT row = img.MutableRow(y);
-          for (size_t x = 0; x < xsize - 1; ++x) {
-            HWY_ASSERT_EQ(static_cast<T>(x), row[x]);
-          }
-        }
-#endif
-      }
-    }
-  }
-};
-
-void TestUnaligned() { ForUnsignedTypes(TestUnalignedT()); }
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(ImageTest);
-HWY_EXPORT_AND_TEST_P(ImageTest, TestAligned);
-HWY_EXPORT_AND_TEST_P(ImageTest, TestUnaligned);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/contrib/math/math-inl.h b/third_party/highway/hwy/contrib/math/math-inl.h
deleted file mode 100644 (file)
index b4cbb5d..0000000
+++ /dev/null
@@ -1,1242 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Include guard (still compiled once per target)
-#if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
-#undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
-#else
-#define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
-#endif
-
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-/**
- * Highway SIMD version of std::acos(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 2
- *      Valid Range: [-1, +1]
- * @return arc cosine of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Acos(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
-  return Acos(d, x);
-}
-
-/**
- * Highway SIMD version of std::acosh(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 3
- *      Valid Range: float32[1, +FLT_MAX], float64[1, +DBL_MAX]
- * @return hyperbolic arc cosine of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Acosh(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
-  return Acosh(d, x);
-}
-
-/**
- * Highway SIMD version of std::asin(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 2
- *      Valid Range: [-1, +1]
- * @return arc sine of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Asin(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
-  return Asin(d, x);
-}
-
-/**
- * Highway SIMD version of std::asinh(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 3
- *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
- * @return hyperbolic arc sine of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Asinh(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
-  return Asinh(d, x);
-}
-
-/**
- * Highway SIMD version of std::atan(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 3
- *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
- * @return arc tangent of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Atan(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
-  return Atan(d, x);
-}
-
-/**
- * Highway SIMD version of std::atanh(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 3
- *      Valid Range: (-1, +1)
- * @return hyperbolic arc tangent of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Atanh(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) {
-  return Atanh(d, x);
-}
-
-/**
- * Highway SIMD version of std::cos(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 3
- *      Valid Range: [-39000, +39000]
- * @return cosine of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Cos(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
-  return Cos(d, x);
-}
-
-/**
- * Highway SIMD version of std::exp(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 1
- *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
- * @return e^x
- */
-template <class D, class V>
-HWY_INLINE V Exp(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
-  return Exp(d, x);
-}
-
-/**
- * Highway SIMD version of std::expm1(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 4
- *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
- * @return e^x - 1
- */
-template <class D, class V>
-HWY_INLINE V Expm1(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) {
-  return Expm1(d, x);
-}
-
-/**
- * Highway SIMD version of std::log(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 4
- *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
- * @return natural logarithm of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Log(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
-  return Log(d, x);
-}
-
-/**
- * Highway SIMD version of std::log10(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 2
- *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
- * @return base 10 logarithm of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Log10(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) {
-  return Log10(d, x);
-}
-
-/**
- * Highway SIMD version of std::log1p(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 2
- *      Valid Range: float32[0, +FLT_MAX], float64[0, +DBL_MAX]
- * @return log(1 + x)
- */
-template <class D, class V>
-HWY_INLINE V Log1p(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) {
-  return Log1p(d, x);
-}
-
-/**
- * Highway SIMD version of std::log2(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 2
- *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
- * @return base 2 logarithm of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Log2(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
-  return Log2(d, x);
-}
-
-/**
- * Highway SIMD version of std::sin(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 3
- *      Valid Range: [-39000, +39000]
- * @return sine of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Sin(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
-  return Sin(d, x);
-}
-
-/**
- * Highway SIMD version of std::sinh(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 4
- *      Valid Range: float32[-88.7228, +88.7228], float64[-709, +709]
- * @return hyperbolic sine of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Sinh(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
-  return Sinh(d, x);
-}
-
-/**
- * Highway SIMD version of std::tanh(x).
- *
- * Valid Lane Types: float32, float64
- *        Max Error: ULP = 4
- *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
- * @return hyperbolic tangent of 'x'
- */
-template <class D, class V>
-HWY_INLINE V Tanh(const D d, V x);
-template <class D, class V>
-HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
-  return Tanh(d, x);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Implementation
-////////////////////////////////////////////////////////////////////////////////
-namespace impl {
-
-// Estrin's Scheme is a faster method for evaluating large polynomials on
-// super scalar architectures. It works by factoring the Horner's Method
-// polynomial into power of two sub-trees that can be evaluated in parallel.
-// Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
-  return MulAdd(c1, x, c0);
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
-  T x2 = Mul(x, x);
-  return MulAdd(x2, c2, MulAdd(c1, x, c0));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
-  T x2 = Mul(x, x);
-  return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  return MulAdd(x4, MulAdd(c5, x, c4),
-                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
-                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(x8, c8,
-                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(x8, MulAdd(c9, x, c8),
-                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
-                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
-                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11,
-                                     T c12) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(
-      x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
-      MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-             MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11,
-                                     T c12, T c13) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(x8,
-                MulAdd(x4, MulAdd(c13, x, c12),
-                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
-                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11,
-                                     T c12, T c13, T c14) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(x8,
-                MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
-                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
-                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11,
-                                     T c12, T c13, T c14, T c15) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  return MulAdd(x8,
-                MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
-                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
-                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11,
-                                     T c12, T c13, T c14, T c15, T c16) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  T x16 = Mul(x8, x8);
-  return MulAdd(
-      x16, c16,
-      MulAdd(x8,
-             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
-                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
-             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11,
-                                     T c12, T c13, T c14, T c15, T c16, T c17) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  T x16 = Mul(x8, x8);
-  return MulAdd(
-      x16, MulAdd(c17, x, c16),
-      MulAdd(x8,
-             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
-                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
-             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
-}
-template <class T>
-HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
-                                     T c6, T c7, T c8, T c9, T c10, T c11,
-                                     T c12, T c13, T c14, T c15, T c16, T c17,
-                                     T c18) {
-  T x2 = Mul(x, x);
-  T x4 = Mul(x2, x2);
-  T x8 = Mul(x4, x4);
-  T x16 = Mul(x8, x8);
-  return MulAdd(
-      x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
-      MulAdd(x8,
-             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
-                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
-             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
-                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
-}
-
-template <class FloatOrDouble>
-struct AsinImpl {};
-template <class FloatOrDouble>
-struct AtanImpl {};
-template <class FloatOrDouble>
-struct CosSinImpl {};
-template <class FloatOrDouble>
-struct ExpImpl {};
-template <class FloatOrDouble>
-struct LogImpl {};
-
-template <>
-struct AsinImpl<float> {
-  // Polynomial approximation for asin(x) over the range [0, 0.5).
-  template <class D, class V>
-  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
-    const auto k0 = Set(d, +0.1666677296f);
-    const auto k1 = Set(d, +0.07495029271f);
-    const auto k2 = Set(d, +0.04547423869f);
-    const auto k3 = Set(d, +0.02424046025f);
-    const auto k4 = Set(d, +0.04197454825f);
-
-    return Estrin(x2, k0, k1, k2, k3, k4);
-  }
-};
-
-#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
-
-template <>
-struct AsinImpl<double> {
-  // Polynomial approximation for asin(x) over the range [0, 0.5).
-  template <class D, class V>
-  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
-    const auto k0 = Set(d, +0.1666666666666497543);
-    const auto k1 = Set(d, +0.07500000000378581611);
-    const auto k2 = Set(d, +0.04464285681377102438);
-    const auto k3 = Set(d, +0.03038195928038132237);
-    const auto k4 = Set(d, +0.02237176181932048341);
-    const auto k5 = Set(d, +0.01735956991223614604);
-    const auto k6 = Set(d, +0.01388715184501609218);
-    const auto k7 = Set(d, +0.01215360525577377331);
-    const auto k8 = Set(d, +0.006606077476277170610);
-    const auto k9 = Set(d, +0.01929045477267910674);
-    const auto k10 = Set(d, -0.01581918243329996643);
-    const auto k11 = Set(d, +0.03161587650653934628);
-
-    return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
-  }
-};
-
-#endif
-
-template <>
-struct AtanImpl<float> {
-  // Polynomial approximation for atan(x) over the range [0, 1.0).
-  template <class D, class V>
-  HWY_INLINE V AtanPoly(D d, V x) {
-    const auto k0 = Set(d, -0.333331018686294555664062f);
-    const auto k1 = Set(d, +0.199926957488059997558594f);
-    const auto k2 = Set(d, -0.142027363181114196777344f);
-    const auto k3 = Set(d, +0.106347933411598205566406f);
-    const auto k4 = Set(d, -0.0748900920152664184570312f);
-    const auto k5 = Set(d, +0.0425049886107444763183594f);
-    const auto k6 = Set(d, -0.0159569028764963150024414f);
-    const auto k7 = Set(d, +0.00282363896258175373077393f);
-
-    const auto y = Mul(x, x);
-    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
-  }
-};
-
-#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
-
-template <>
-struct AtanImpl<double> {
-  // Polynomial approximation for atan(x) over the range [0, 1.0).
-  template <class D, class V>
-  HWY_INLINE V AtanPoly(D d, V x) {
-    const auto k0 = Set(d, -0.333333333333311110369124);
-    const auto k1 = Set(d, +0.199999999996591265594148);
-    const auto k2 = Set(d, -0.14285714266771329383765);
-    const auto k3 = Set(d, +0.111111105648261418443745);
-    const auto k4 = Set(d, -0.090908995008245008229153);
-    const auto k5 = Set(d, +0.0769219538311769618355029);
-    const auto k6 = Set(d, -0.0666573579361080525984562);
-    const auto k7 = Set(d, +0.0587666392926673580854313);
-    const auto k8 = Set(d, -0.0523674852303482457616113);
-    const auto k9 = Set(d, +0.0466667150077840625632675);
-    const auto k10 = Set(d, -0.0407629191276836500001934);
-    const auto k11 = Set(d, +0.0337852580001353069993897);
-    const auto k12 = Set(d, -0.0254517624932312641616861);
-    const auto k13 = Set(d, +0.016599329773529201970117);
-    const auto k14 = Set(d, -0.00889896195887655491740809);
-    const auto k15 = Set(d, +0.00370026744188713119232403);
-    const auto k16 = Set(d, -0.00110611831486672482563471);
-    const auto k17 = Set(d, +0.000209850076645816976906797);
-    const auto k18 = Set(d, -1.88796008463073496563746e-5);
-
-    const auto y = Mul(x, x);
-    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
-                         k12, k13, k14, k15, k16, k17, k18),
-                  Mul(y, x), x);
-  }
-};
-
-#endif
-
-template <>
-struct CosSinImpl<float> {
-  // Rounds float toward zero and returns as int32_t.
-  template <class D, class V>
-  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
-    return ConvertTo(Rebind<int32_t, D>(), x);
-  }
-
-  template <class D, class V>
-  HWY_INLINE V Poly(D d, V x) {
-    const auto k0 = Set(d, -1.66666597127914428710938e-1f);
-    const auto k1 = Set(d, +8.33307858556509017944336e-3f);
-    const auto k2 = Set(d, -1.981069071916863322258e-4f);
-    const auto k3 = Set(d, +2.6083159809786593541503e-6f);
-
-    const auto y = Mul(x, x);
-    return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
-  }
-
-  template <class D, class V, class VI32>
-  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
-    // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
-    const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
-    const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
-    const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
-    const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);
-
-    // Extended precision modular arithmetic.
-    const V qf = ConvertTo(d, q);
-    x = MulAdd(qf, kHalfPiPart0f, x);
-    x = MulAdd(qf, kHalfPiPart1f, x);
-    x = MulAdd(qf, kHalfPiPart2f, x);
-    x = MulAdd(qf, kHalfPiPart3f, x);
-    return x;
-  }
-
-  template <class D, class V, class VI32>
-  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
-    // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
-    const V kPiPart0f = Set(d, -3.140625f);
-    const V kPiPart1f = Set(d, -0.0009670257568359375f);
-    const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
-    const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);
-
-    // Extended precision modular arithmetic.
-    const V qf = ConvertTo(d, q);
-    x = MulAdd(qf, kPiPart0f, x);
-    x = MulAdd(qf, kPiPart1f, x);
-    x = MulAdd(qf, kPiPart2f, x);
-    x = MulAdd(qf, kPiPart3f, x);
-    return x;
-  }
-
-  // (q & 2) == 0 ? -0.0 : +0.0
-  template <class D, class VI32>
-  HWY_INLINE Vec<Rebind<float, D>> CosSignFromQuadrant(D d, VI32 q) {
-    const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
-    return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
-  }
-
-  // ((q & 1) ? -0.0 : +0.0)
-  template <class D, class VI32>
-  HWY_INLINE Vec<Rebind<float, D>> SinSignFromQuadrant(D d, VI32 q) {
-    const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
-    return BitCast(d, ShiftLeft<31>(And(q, kOne)));
-  }
-};
-
-#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
-
-template <>
-struct CosSinImpl<double> {
-  // Rounds double toward zero and returns as int32_t.
-  template <class D, class V>
-  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
-    return DemoteTo(Rebind<int32_t, D>(), x);
-  }
-
-  template <class D, class V>
-  HWY_INLINE V Poly(D d, V x) {
-    const auto k0 = Set(d, -0.166666666666666657414808);
-    const auto k1 = Set(d, +0.00833333333333332974823815);
-    const auto k2 = Set(d, -0.000198412698412696162806809);
-    const auto k3 = Set(d, +2.75573192239198747630416e-6);
-    const auto k4 = Set(d, -2.50521083763502045810755e-8);
-    const auto k5 = Set(d, +1.60590430605664501629054e-10);
-    const auto k6 = Set(d, -7.64712219118158833288484e-13);
-    const auto k7 = Set(d, +2.81009972710863200091251e-15);
-    const auto k8 = Set(d, -7.97255955009037868891952e-18);
-
-    const auto y = Mul(x, x);
-    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
-  }
-
-  template <class D, class V, class VI32>
-  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
-    // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
-    const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
-    const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
-    const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
-    const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);
-
-    // Extended precision modular arithmetic.
-    const V qf = PromoteTo(d, q);
-    x = MulAdd(qf, kHalfPiPart0d, x);
-    x = MulAdd(qf, kHalfPiPart1d, x);
-    x = MulAdd(qf, kHalfPiPart2d, x);
-    x = MulAdd(qf, kHalfPiPart3d, x);
-    return x;
-  }
-
-  template <class D, class V, class VI32>
-  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
-    // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
-    const V kPiPart0d = Set(d, -3.1415926218032836914);
-    const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
-    const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
-    const V kPiPart3d = Set(d, -1.2736634327021899816e-24);
-
-    // Extended precision modular arithmetic.
-    const V qf = PromoteTo(d, q);
-    x = MulAdd(qf, kPiPart0d, x);
-    x = MulAdd(qf, kPiPart1d, x);
-    x = MulAdd(qf, kPiPart2d, x);
-    x = MulAdd(qf, kPiPart3d, x);
-    return x;
-  }
-
-  // (q & 2) == 0 ? -0.0 : +0.0
-  template <class D, class VI32>
-  HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
-    const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
-    return BitCast(
-        d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
-  }
-
-  // ((q & 1) ? -0.0 : +0.0)
-  template <class D, class VI32>
-  HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
-    const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
-    return BitCast(
-        d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
-  }
-};
-
-#endif
-
-template <>
-struct ExpImpl<float> {
-  // Rounds float toward zero and returns as int32_t.
-  template <class D, class V>
-  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
-    return ConvertTo(Rebind<int32_t, D>(), x);
-  }
-
-  template <class D, class V>
-  HWY_INLINE V ExpPoly(D d, V x) {
-    const auto k0 = Set(d, +0.5f);
-    const auto k1 = Set(d, +0.166666671633720397949219f);
-    const auto k2 = Set(d, +0.0416664853692054748535156f);
-    const auto k3 = Set(d, +0.00833336077630519866943359f);
-    const auto k4 = Set(d, +0.00139304355252534151077271f);
-    const auto k5 = Set(d, +0.000198527617612853646278381f);
-
-    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
-  }
-
-  // Computes 2^x, where x is an integer.
-  template <class D, class VI32>
-  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
-    const Rebind<int32_t, D> di32;
-    const VI32 kOffset = Set(di32, 0x7F);
-    return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
-  }
-
-  // Sets the exponent of 'x' to 2^e.
-  template <class D, class V, class VI32>
-  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
-    const VI32 y = ShiftRight<1>(e);
-    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
-  }
-
-  template <class D, class V, class VI32>
-  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
-    // kLn2Part0f + kLn2Part1f ~= -ln(2)
-    const V kLn2Part0f = Set(d, -0.693145751953125f);
-    const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);
-
-    // Extended precision modular arithmetic.
-    const V qf = ConvertTo(d, q);
-    x = MulAdd(qf, kLn2Part0f, x);
-    x = MulAdd(qf, kLn2Part1f, x);
-    return x;
-  }
-};
-
-template <>
-struct LogImpl<float> {
-  template <class D, class V>
-  HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
-    const Rebind<int32_t, D> di32;
-    const Rebind<uint32_t, D> du32;
-    const auto kBias = Set(di32, 0x7F);
-    return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
-  }
-
-  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
-  template <class D, class V>
-  HWY_INLINE V LogPoly(D d, V x) {
-    const V k0 = Set(d, 0.66666662693f);
-    const V k1 = Set(d, 0.40000972152f);
-    const V k2 = Set(d, 0.28498786688f);
-    const V k3 = Set(d, 0.24279078841f);
-
-    const V x2 = Mul(x, x);
-    const V x4 = Mul(x2, x2);
-    return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
-  }
-};
-
-#if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
-template <>
-struct ExpImpl<double> {
-  // Rounds double toward zero and returns as int32_t.
-  template <class D, class V>
-  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
-    return DemoteTo(Rebind<int32_t, D>(), x);
-  }
-
-  template <class D, class V>
-  HWY_INLINE V ExpPoly(D d, V x) {
-    const auto k0 = Set(d, +0.5);
-    const auto k1 = Set(d, +0.166666666666666851703837);
-    const auto k2 = Set(d, +0.0416666666666665047591422);
-    const auto k3 = Set(d, +0.00833333333331652721664984);
-    const auto k4 = Set(d, +0.00138888888889774492207962);
-    const auto k5 = Set(d, +0.000198412698960509205564975);
-    const auto k6 = Set(d, +2.4801587159235472998791e-5);
-    const auto k7 = Set(d, +2.75572362911928827629423e-6);
-    const auto k8 = Set(d, +2.75573911234900471893338e-7);
-    const auto k9 = Set(d, +2.51112930892876518610661e-8);
-    const auto k10 = Set(d, +2.08860621107283687536341e-9);
-
-    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
-                  Mul(x, x), x);
-  }
-
-  // Computes 2^x, where x is an integer.
-  template <class D, class VI32>
-  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
-    const Rebind<int32_t, D> di32;
-    const Rebind<int64_t, D> di64;
-    const VI32 kOffset = Set(di32, 0x3FF);
-    return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
-  }
-
-  // Sets the exponent of 'x' to 2^e.
-  template <class D, class V, class VI32>
-  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
-    const VI32 y = ShiftRight<1>(e);
-    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
-  }
-
-  template <class D, class V, class VI32>
-  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
-    // kLn2Part0d + kLn2Part1d ~= -ln(2)
-    const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
-    const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);
-
-    // Extended precision modular arithmetic.
-    const V qf = PromoteTo(d, q);
-    x = MulAdd(qf, kLn2Part0d, x);
-    x = MulAdd(qf, kLn2Part1d, x);
-    return x;
-  }
-};
-
-template <>
-struct LogImpl<double> {
-  template <class D, class V>
-  HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
-    const Rebind<int64_t, D> di64;
-    const Rebind<uint64_t, D> du64;
-    return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
-               Set(di64, 0x3FF));
-  }
-
-  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
-  template <class D, class V>
-  HWY_INLINE V LogPoly(D d, V x) {
-    const V k0 = Set(d, 0.6666666666666735130);
-    const V k1 = Set(d, 0.3999999999940941908);
-    const V k2 = Set(d, 0.2857142874366239149);
-    const V k3 = Set(d, 0.2222219843214978396);
-    const V k4 = Set(d, 0.1818357216161805012);
-    const V k5 = Set(d, 0.1531383769920937332);
-    const V k6 = Set(d, 0.1479819860511658591);
-
-    const V x2 = Mul(x, x);
-    const V x4 = Mul(x2, x2);
-    return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
-                  (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
-  }
-};
-
-#endif
-
-template <class D, class V, bool kAllowSubnormals = true>
-HWY_INLINE V Log(const D d, V x) {
-  // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
-  using T = TFromD<D>;
-  impl::LogImpl<T> impl;
-
-  constexpr bool kIsF32 = (sizeof(T) == 4);
-
-  // Float Constants
-  const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
-                                 : static_cast<T>(0.693147180369123816490));
-  const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
-                                 : static_cast<T>(1.90821492927058770002e-10));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
-                                     : static_cast<T>(2.2250738585072014e-308));
-  const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
-                                 : static_cast<T>(1.8014398509481984e+16));
-
-  // Integer Constants
-  using TI = MakeSigned<T>;
-  const Rebind<TI, D> di;
-  using VI = decltype(Zero(di));
-  const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
-                                       : static_cast<TI>(0xFFFFFFFFLL));
-  const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
-                                   : static_cast<TI>(0x3FE6A09E00000000LL));
-  const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
-                                     : static_cast<TI>(0x3FF0000000000000LL));
-  const VI kExpScale =
-      Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
-  const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
-                                     : static_cast<TI>(0xFFFFF00000000LL));
-
-  // Scale up 'x' so that it is no longer denormalized.
-  VI exp_bits;
-  V exp;
-  if (kAllowSubnormals == true) {
-    const auto is_denormal = Lt(x, kMinNormal);
-    x = IfThenElse(is_denormal, Mul(x, kScale), x);
-
-    // Compute the new exponent.
-    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
-    const VI exp_scale =
-        BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
-    exp = ConvertTo(
-        d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
-  } else {
-    // Compute the new exponent.
-    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
-    exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
-  }
-
-  // Renormalize.
-  const V y = Or(And(x, BitCast(d, kLowerBits)),
-                 BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
-
-  // Approximate and reconstruct.
-  const V ym1 = Sub(y, kOne);
-  const V z = Div(ym1, Add(y, kOne));
-
-  return MulSub(
-      exp, kLn2Hi,
-      Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
-}
-
-}  // namespace impl
-
-template <class D, class V>
-HWY_INLINE V Acos(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kZero = Zero(d);
-  const V kHalf = Set(d, static_cast<T>(+0.5));
-  const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
-  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
-
-  const V sign_x = And(SignBit(d), x);
-  const V abs_x = Xor(x, sign_x);
-  const auto mask = Lt(abs_x, kHalf);
-  const V yy =
-      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
-  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
-
-  impl::AsinImpl<T> impl;
-  const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
-
-  const V t_plus_y = Add(t, y);
-  const V z =
-      IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
-                 Add(t_plus_y, t_plus_y));
-  return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
-}
-
-template <class D, class V>
-HWY_INLINE V Acosh(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kLarge = Set(d, static_cast<T>(268435456.0));
-  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kTwo = Set(d, static_cast<T>(+2.0));
-
-  const auto is_x_large = Gt(x, kLarge);
-  const auto is_x_gt_2 = Gt(x, kTwo);
-
-  const V x_minus_1 = Sub(x, kOne);
-  const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
-  const V y1 =
-      Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
-  const V y2 =
-      IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
-  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
-
-  const auto is_pole = Eq(y2, kOne);
-  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
-  return Add(IfThenElse(is_x_gt_2, z,
-                        IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
-             IfThenElseZero(is_x_large, kLog2));
-}
-
-template <class D, class V>
-HWY_INLINE V Asin(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kHalf = Set(d, static_cast<T>(+0.5));
-  const V kTwo = Set(d, static_cast<T>(+2.0));
-  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
-
-  const V sign_x = And(SignBit(d), x);
-  const V abs_x = Xor(x, sign_x);
-  const auto mask = Lt(abs_x, kHalf);
-  const V yy =
-      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
-  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
-
-  impl::AsinImpl<T> impl;
-  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
-  const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
-  return Or(IfThenElse(mask, z0, z1), sign_x);
-}
-
-template <class D, class V>
-HWY_INLINE V Asinh(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
-  const V kLarge = Set(d, static_cast<T>(268435456.0));
-  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kTwo = Set(d, static_cast<T>(+2.0));
-
-  const V sign_x = And(SignBit(d), x);  // Extract the sign bit
-  const V abs_x = Xor(x, sign_x);
-
-  const auto is_x_large = Gt(abs_x, kLarge);
-  const auto is_x_lt_2 = Lt(abs_x, kTwo);
-
-  const V x2 = Mul(x, x);
-  const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
-
-  const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
-  const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
-  const V y2 =
-      IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
-  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
-
-  const auto is_pole = Eq(y2, kOne);
-  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
-  const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
-  const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
-  return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
-            sign_x);
-}
-
-template <class D, class V>
-HWY_INLINE V Atan(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
-
-  const V sign = And(SignBit(d), x);
-  const V abs_x = Xor(x, sign);
-  const auto mask = Gt(abs_x, kOne);
-
-  impl::AtanImpl<T> impl;
-  const auto divisor = IfThenElse(mask, abs_x, kOne);
-  const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
-  return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
-}
-
-template <class D, class V>
-HWY_INLINE V Atanh(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kHalf = Set(d, static_cast<T>(+0.5));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-
-  const V sign = And(SignBit(d), x);  // Extract the sign bit
-  const V abs_x = Xor(x, sign);
-  return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
-             Xor(kHalf, sign));
-}
-
-template <class D, class V>
-HWY_INLINE V Cos(const D d, V x) {
-  using T = TFromD<D>;
-  impl::CosSinImpl<T> impl;
-
-  // Float Constants
-  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
-
-  // Integer Constants
-  const Rebind<int32_t, D> di32;
-  using VI32 = decltype(Zero(di32));
-  const VI32 kOne = Set(di32, 1);
-
-  const V y = Abs(x);  // cos(x) == cos(|x|)
-
-  // Compute the quadrant, q = int(|x| / pi) * 2 + 1
-  const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
-
-  // Reduce range, apply sign, and approximate.
-  return impl.Poly(
-      d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
-}
-
-template <class D, class V>
-HWY_INLINE V Exp(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kHalf = Set(d, static_cast<T>(+0.5));
-  const V kLowerBound =
-      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
-  const V kNegZero = Set(d, static_cast<T>(-0.0));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
-
-  impl::ExpImpl<T> impl;
-
-  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
-  const auto q =
-      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
-
-  // Reduce, approximate, and then reconstruct.
-  const V y = impl.LoadExpShortRange(
-      d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
-  return IfThenElseZero(Ge(x, kLowerBound), y);
-}
-
-template <class D, class V>
-HWY_INLINE V Expm1(const D d, V x) {
-  using T = TFromD<D>;
-
-  const V kHalf = Set(d, static_cast<T>(+0.5));
-  const V kLowerBound =
-      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
-  const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
-  const V kNegOne = Set(d, static_cast<T>(-1.0));
-  const V kNegZero = Set(d, static_cast<T>(-0.0));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
-
-  impl::ExpImpl<T> impl;
-
-  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
-  const auto q =
-      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
-
-  // Reduce, approximate, and then reconstruct.
-  const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
-  const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
-                         Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
-  return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
-}
-
-template <class D, class V>
-HWY_INLINE V Log(const D d, V x) {
-  return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
-}
-
-template <class D, class V>
-HWY_INLINE V Log10(const D d, V x) {
-  using T = TFromD<D>;
-  return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
-}
-
-template <class D, class V>
-HWY_INLINE V Log1p(const D d, V x) {
-  using T = TFromD<D>;
-  const V kOne = Set(d, static_cast<T>(+1.0));
-
-  const V y = Add(x, kOne);
-  const auto is_pole = Eq(y, kOne);
-  const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
-  const auto non_pole =
-      Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
-  return IfThenElse(is_pole, x, non_pole);
-}
-
-template <class D, class V>
-HWY_INLINE V Log2(const D d, V x) {
-  using T = TFromD<D>;
-  return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
-}
-
-template <class D, class V>
-HWY_INLINE V Sin(const D d, V x) {
-  using T = TFromD<D>;
-  impl::CosSinImpl<T> impl;
-
-  // Float Constants
-  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
-  const V kHalf = Set(d, static_cast<T>(0.5));
-
-  // Integer Constants
-  const Rebind<int32_t, D> di32;
-  using VI32 = decltype(Zero(di32));
-
-  const V abs_x = Abs(x);
-  const V sign_x = Xor(abs_x, x);
-
-  // Compute the quadrant, q = int((|x| / pi) + 0.5)
-  const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));
-
-  // Reduce range, apply sign, and approximate.
-  return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
-                          Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
-}
-
-template <class D, class V>
-HWY_INLINE V Sinh(const D d, V x) {
-  using T = TFromD<D>;
-  const V kHalf = Set(d, static_cast<T>(+0.5));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kTwo = Set(d, static_cast<T>(+2.0));
-
-  const V sign = And(SignBit(d), x);  // Extract the sign bit
-  const V abs_x = Xor(x, sign);
-  const V y = Expm1(d, abs_x);
-  const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
-  return Xor(z, sign);  // Reapply the sign bit
-}
-
-template <class D, class V>
-HWY_INLINE V Tanh(const D d, V x) {
-  using T = TFromD<D>;
-  const V kLimit = Set(d, static_cast<T>(18.714973875));
-  const V kOne = Set(d, static_cast<T>(+1.0));
-  const V kTwo = Set(d, static_cast<T>(+2.0));
-
-  const V sign = And(SignBit(d), x);  // Extract the sign bit
-  const V abs_x = Xor(x, sign);
-  const V y = Expm1(d, Mul(abs_x, kTwo));
-  const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
-  return Xor(z, sign);  // Reapply the sign bit
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
diff --git a/third_party/highway/hwy/contrib/math/math_test.cc b/third_party/highway/hwy/contrib/math/math_test.cc
deleted file mode 100644 (file)
index 246a081..0000000
+++ /dev/null
@@ -1,227 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>
-#include <stdio.h>
-
-#include <cfloat>  // FLT_MAX
-#include <type_traits>
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/contrib/math/math-inl.h"
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <class Out, class In>
-inline Out BitCast(const In& in) {
-  static_assert(sizeof(Out) == sizeof(In), "");
-  Out out;
-  CopyBytes<sizeof(out)>(&in, &out);
-  return out;
-}
-
-template <class T, class D>
-HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T),
-                           Vec<D> (*fxN)(D, VecArg<Vec<D>>), D d, T min, T max,
-                           uint64_t max_error_ulp) {
-  using UintT = MakeUnsigned<T>;
-
-  const UintT min_bits = BitCast<UintT>(min);
-  const UintT max_bits = BitCast<UintT>(max);
-
-  // If min is negative and max is positive, the range needs to be broken into
-  // two pieces, [+0, max] and [-0, min], otherwise [min, max].
-  int range_count = 1;
-  UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}};
-  if ((min < 0.0) && (max > 0.0)) {
-    ranges[0][0] = BitCast<UintT>(static_cast<T>(+0.0));
-    ranges[0][1] = max_bits;
-    ranges[1][0] = BitCast<UintT>(static_cast<T>(-0.0));
-    ranges[1][1] = min_bits;
-    range_count = 2;
-  }
-
-  uint64_t max_ulp = 0;
-  // Emulation is slower, so cannot afford as many.
-  constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000));
-  for (int range_index = 0; range_index < range_count; ++range_index) {
-    const UintT start = ranges[range_index][0];
-    const UintT stop = ranges[range_index][1];
-    const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange));
-    for (UintT value_bits = start; value_bits <= stop; value_bits += step) {
-      // For reasons unknown, the HWY_MAX is necessary on RVV, otherwise
-      // value_bits can be less than start, and thus possibly NaN.
-      const T value = BitCast<T>(HWY_MIN(HWY_MAX(start, value_bits), stop));
-      const T actual = GetLane(fxN(d, Set(d, value)));
-      const T expected = fx1(value);
-
-      // Skip small inputs and outputs on armv7, it flushes subnormals to zero.
-#if HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
-      if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) {
-        continue;
-      }
-#endif
-
-      const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected);
-      max_ulp = HWY_MAX(max_ulp, ulp);
-      if (ulp > max_error_ulp) {
-        fprintf(stderr,
-                "%s: %s(%f) expected %f actual %f ulp %" PRIu64 " max ulp %u\n",
-                hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), value,
-                expected, actual, static_cast<uint64_t>(ulp),
-                static_cast<uint32_t>(max_error_ulp));
-      }
-    }
-  }
-  fprintf(stderr, "%s: %s max_ulp %" PRIu64 "\n",
-          hwy::TypeName(T(), Lanes(d)).c_str(), name.c_str(), max_ulp);
-  HWY_ASSERT(max_ulp <= max_error_ulp);
-}
-
-#define DEFINE_MATH_TEST_FUNC(NAME)                 \
-  HWY_NOINLINE void TestAll##NAME() {               \
-    ForFloatTypes(ForPartialVectors<Test##NAME>()); \
-  }
-
-#undef DEFINE_MATH_TEST
-#define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \
-                         F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \
-  struct Test##NAME {                                                     \
-    template <class T, class D>                                           \
-    HWY_NOINLINE void operator()(T, D d) {                                \
-      if (sizeof(T) == 4) {                                               \
-        TestMath<T, D>(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX,  \
-                       F32_ERROR);                                        \
-      } else {                                                            \
-        TestMath<T, D>(HWY_STR(NAME), F64x1, F64xN, d,                    \
-                       static_cast<T>(F64_MIN), static_cast<T>(F64_MAX),  \
-                       F64_ERROR);                                        \
-      }                                                                   \
-    }                                                                     \
-  };                                                                      \
-  DEFINE_MATH_TEST_FUNC(NAME)
-
-// Floating point values closest to but less than 1.0
-const float kNearOneF = BitCast<float>(0x3F7FFFFF);
-const double kNearOneD = BitCast<double>(0x3FEFFFFFFFFFFFFFULL);
-
-// The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so
-// only increase the error tolerance there.
-constexpr uint64_t Cos64ULP() {
-#if defined(__MINGW32__)
-  return 23;
-#else
-  return 3;
-#endif
-}
-
-constexpr uint64_t ACosh32ULP() {
-#if defined(__MINGW32__)
-  return 8;
-#else
-  return 3;
-#endif
-}
-
-// clang-format off
-DEFINE_MATH_TEST(Acos,
-  std::acos,  CallAcos,  -1.0f,      +1.0f,       3,  // NEON is 3 instead of 2
-  std::acos,  CallAcos,  -1.0,       +1.0,        2)
-DEFINE_MATH_TEST(Acosh,
-  std::acosh, CallAcosh, +1.0f,      +FLT_MAX,    ACosh32ULP(),
-  std::acosh, CallAcosh, +1.0,       +DBL_MAX,    3)
-DEFINE_MATH_TEST(Asin,
-  std::asin,  CallAsin,  -1.0f,      +1.0f,       4,  // ARMv7 is 4 instead of 2
-  std::asin,  CallAsin,  -1.0,       +1.0,        2)
-DEFINE_MATH_TEST(Asinh,
-  std::asinh, CallAsinh, -FLT_MAX,   +FLT_MAX,    3,
-  std::asinh, CallAsinh, -DBL_MAX,   +DBL_MAX,    3)
-DEFINE_MATH_TEST(Atan,
-  std::atan,  CallAtan,  -FLT_MAX,   +FLT_MAX,    3,
-  std::atan,  CallAtan,  -DBL_MAX,   +DBL_MAX,    3)
-DEFINE_MATH_TEST(Atanh,
-  std::atanh, CallAtanh, -kNearOneF, +kNearOneF,  4,  // NEON is 4 instead of 3
-  std::atanh, CallAtanh, -kNearOneD, +kNearOneD,  3)
-DEFINE_MATH_TEST(Cos,
-  std::cos,   CallCos,   -39000.0f,  +39000.0f,   3,
-  std::cos,   CallCos,   -39000.0,   +39000.0,    Cos64ULP())
-DEFINE_MATH_TEST(Exp,
-  std::exp,   CallExp,   -FLT_MAX,   +104.0f,     1,
-  std::exp,   CallExp,   -DBL_MAX,   +104.0,      1)
-DEFINE_MATH_TEST(Expm1,
-  std::expm1, CallExpm1, -FLT_MAX,   +104.0f,     4,
-  std::expm1, CallExpm1, -DBL_MAX,   +104.0,      4)
-DEFINE_MATH_TEST(Log,
-  std::log,   CallLog,   +FLT_MIN,   +FLT_MAX,    1,
-  std::log,   CallLog,   +DBL_MIN,   +DBL_MAX,    1)
-DEFINE_MATH_TEST(Log10,
-  std::log10, CallLog10, +FLT_MIN,   +FLT_MAX,    2,
-  std::log10, CallLog10, +DBL_MIN,   +DBL_MAX,    2)
-DEFINE_MATH_TEST(Log1p,
-  std::log1p, CallLog1p, +0.0f,      +1e37f,      3,  // NEON is 3 instead of 2
-  std::log1p, CallLog1p, +0.0,       +DBL_MAX,    2)
-DEFINE_MATH_TEST(Log2,
-  std::log2,  CallLog2,  +FLT_MIN,   +FLT_MAX,    2,
-  std::log2,  CallLog2,  +DBL_MIN,   +DBL_MAX,    2)
-DEFINE_MATH_TEST(Sin,
-  std::sin,   CallSin,   -39000.0f,  +39000.0f,   3,
-  std::sin,   CallSin,   -39000.0,   +39000.0,    4)  // MSYS is 4 instead of 3
-DEFINE_MATH_TEST(Sinh,
-  std::sinh,  CallSinh,  -80.0f,     +80.0f,      4,
-  std::sinh,  CallSinh,  -709.0,     +709.0,      4)
-DEFINE_MATH_TEST(Tanh,
-  std::tanh,  CallTanh,  -FLT_MAX,   +FLT_MAX,    4,
-  std::tanh,  CallTanh,  -DBL_MAX,   +DBL_MAX,    4)
-// clang-format on
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyMathTest);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh);
-HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/contrib/sort/BUILD b/third_party/highway/hwy/contrib/sort/BUILD
deleted file mode 100644 (file)
index f936e18..0000000
+++ /dev/null
@@ -1,188 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-licenses(["notice"])
-
-# Unused on Bazel builds, where this is not defined/known; Copybara replaces
-# usages with an empty list.
-COMPAT = [
-    "//buildenv/target:non_prod",  # includes mobile/vendor.
-]
-
-# cc_library(
-#     name = "vxsort",
-#     srcs = [
-#         "vxsort/isa_detection.cpp",
-#         "vxsort/isa_detection_msvc.cpp",
-#         "vxsort/isa_detection_sane.cpp",
-#         "vxsort/machine_traits.avx2.cpp",
-#         "vxsort/smallsort/avx2_load_mask_tables.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.cpp",
-#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.cpp",
-#         "vxsort/vxsort_stats.cpp",
-#     ],
-#     hdrs = [
-#         "vxsort/alignment.h",
-#         "vxsort/defs.h",
-#         "vxsort/isa_detection.h",
-#         "vxsort/machine_traits.avx2.h",
-#         "vxsort/machine_traits.avx512.h",
-#         "vxsort/machine_traits.h",
-#         "vxsort/packer.h",
-#         "vxsort/smallsort/bitonic_sort.AVX2.double.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX2.float.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX2.int32_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX2.int64_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX2.uint32_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX2.uint64_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX512.double.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX512.float.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX512.int32_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX512.int64_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX512.uint32_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.AVX512.uint64_t.generated.h",
-#         "vxsort/smallsort/bitonic_sort.h",
-#         "vxsort/vxsort.h",
-#         "vxsort/vxsort_stats.h",
-#     ],
-#     compatible_with = [],
-#     textual_hdrs = [
-#         "vxsort/vxsort_targets_disable.h",
-#         "vxsort/vxsort_targets_enable_avx2.h",
-#         "vxsort/vxsort_targets_enable_avx512.h",
-#     ],
-# )
-
-cc_library(
-    name = "vqsort",
-    srcs = [
-        # Split into separate files to reduce MSVC build time.
-        "vqsort.cc",
-        "vqsort_128a.cc",
-        "vqsort_128d.cc",
-        "vqsort_f32a.cc",
-        "vqsort_f32d.cc",
-        "vqsort_f64a.cc",
-        "vqsort_f64d.cc",
-        "vqsort_i16a.cc",
-        "vqsort_i16d.cc",
-        "vqsort_i32a.cc",
-        "vqsort_i32d.cc",
-        "vqsort_i64a.cc",
-        "vqsort_i64d.cc",
-        "vqsort_kv128a.cc",
-        "vqsort_kv128d.cc",
-        "vqsort_u16a.cc",
-        "vqsort_u16d.cc",
-        "vqsort_u32a.cc",
-        "vqsort_u32d.cc",
-        "vqsort_u64a.cc",
-        "vqsort_u64d.cc",
-    ],
-    hdrs = [
-        "vqsort.h",  # public interface
-    ],
-    compatible_with = [],
-    local_defines = ["hwy_contrib_EXPORTS"],
-    textual_hdrs = [
-        "shared-inl.h",
-        "sorting_networks-inl.h",
-        "traits-inl.h",
-        "traits128-inl.h",
-        "vqsort-inl.h",
-        # Placeholder for internal instrumentation. Do not remove.
-    ],
-    deps = [
-        # Only if VQSORT_SECURE_RNG is set.
-        # "//third_party/absl/random",
-        "//:hwy",
-        # ":vxsort",  # required if HAVE_VXSORT
-    ],
-)
-
-# -----------------------------------------------------------------------------
-# Internal-only targets
-
-cc_library(
-    name = "helpers",
-    testonly = 1,
-    textual_hdrs = [
-        "algo-inl.h",
-        "result-inl.h",
-    ],
-    deps = [
-        ":vqsort",
-        "//:nanobenchmark",
-        # Required for HAVE_PDQSORT, but that is unused and this is
-        # unavailable to Bazel builds, hence commented out.
-        # "//third_party/boost/allowed",
-        # Avoid ips4o and thus TBB to work around hwloc build failure.
-    ],
-)
-
-cc_binary(
-    name = "print_network",
-    testonly = 1,
-    srcs = ["print_network.cc"],
-    deps = [
-        ":helpers",
-        ":vqsort",
-        "//:hwy",
-    ],
-)
-
-cc_test(
-    name = "sort_test",
-    size = "medium",
-    srcs = ["sort_test.cc"],
-    # Do not enable fully_static_link (pthread crash on bazel)
-    local_defines = ["HWY_IS_TEST"],
-    # for test_suite.
-    tags = ["hwy_ops_test"],
-    deps = [
-        ":helpers",
-        ":vqsort",
-        "@com_google_googletest//:gtest_main",
-        "//:hwy",
-        "//:hwy_test_util",
-    ],
-)
-
-cc_binary(
-    name = "bench_sort",
-    testonly = 1,
-    srcs = ["bench_sort.cc"],
-    # Do not enable fully_static_link (pthread crash on bazel)
-    local_defines = ["HWY_IS_TEST"],
-    deps = [
-        ":helpers",
-        ":vqsort",
-        "@com_google_googletest//:gtest_main",
-        "//:hwy",
-        "//:hwy_test_util",
-    ],
-)
-
-cc_binary(
-    name = "bench_parallel",
-    testonly = 1,
-    srcs = ["bench_parallel.cc"],
-    # Do not enable fully_static_link (pthread crash on bazel)
-    local_defines = ["HWY_IS_TEST"],
-    deps = [
-        ":helpers",
-        ":vqsort",
-        "@com_google_googletest//:gtest_main",
-        "//:hwy",
-        "//:hwy_test_util",
-    ],
-)
diff --git a/third_party/highway/hwy/contrib/sort/README.md b/third_party/highway/hwy/contrib/sort/README.md
deleted file mode 100644 (file)
index e35e710..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-# Vectorized and performance-portable Quicksort
-
-## Introduction
-
-As of 2022-06-07 this sorts large arrays of built-in types about ten times as
-fast as `std::sort`. See also our
-[blog post](https://opensource.googleblog.com/2022/06/Vectorized%20and%20performance%20portable%20Quicksort.html)
-and [paper](https://arxiv.org/abs/2205.05982).
-
-## Instructions
-
-Here are instructions for reproducing our results on x86 Linux (AVX2, AVX-512)
-and Arm V1 (NEON, SVE).
-
-### x86 (Linux)
-
-Please first ensure golang, and Clang (tested with 13.0.1) are installed via
-your system's package manager.
-
-```
-go install github.com/bazelbuild/bazelisk@latest
-git clone https://github.com/google/highway
-cd highway
-CC=clang CXX=clang++ ~/go/bin/bazelisk build -c opt hwy/contrib/sort:all
-bazel-bin/hwy/contrib/sort/sort_test
-bazel-bin/hwy/contrib/sort/bench_sort
-```
-
-### AWS Graviton3
-
-Instance config: amazon linux 5.10 arm64, c7g.8xlarge (largest allowed config is
-32 vCPU). Initial launch will fail. Wait a few minutes for an email saying the
-config is verified, then re-launch. See IPv4 hostname in list of instances.
-
-`ssh -i /path/key.pem ec2-user@hostname`
-
-Note that the AWS CMake package is too old for llvm, so we build it first:
-```
-wget https://cmake.org/files/v3.23/cmake-3.23.2.tar.gz
-tar -xvzf cmake-3.23.2.tar.gz && cd cmake-3.23.2/
-./bootstrap -- -DCMAKE_USE_OPENSSL=OFF
-make -j8 && sudo make install
-cd ..
-```
-
-AWS clang is at version 11.1, which generates unnecessary AND instructions which
-slow down the sort by 1.15x. We tested with clang trunk as of June 13
-(which reports Git hash 8f6512fea000c3a0d394864bb94e524bee375069). To build:
-```
-git clone --depth 1 https://github.com/llvm/llvm-project.git
-cd llvm-project
-mkdir -p build && cd build
-/usr/local/bin/cmake ../llvm -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi" -DCMAKE_BUILD_TYPE=Release
-make -j32 && sudo make install
-```
-
-```
-sudo yum install go
-go install github.com/bazelbuild/bazelisk@latest
-git clone https://github.com/google/highway
-cd highway
-CC=/usr/local/bin/clang CXX=/usr/local/bin/clang++ ~/go/bin/bazelisk build -c opt --copt=-march=armv8.2-a+sve hwy/contrib/sort:all
-bazel-bin/hwy/contrib/sort/sort_test
-bazel-bin/hwy/contrib/sort/bench_sort
-```
-
-## Results
-
-`bench_sort` outputs the instruction set (AVX3 refers to AVX-512), the sort
-algorithm (std for `std::sort`, vq for our vqsort), the type of keys being
-sorted (f32 is float), the distribution of keys (uniform32 for uniform random
-with range 0-2^32), the number of keys, then the throughput of sorted keys (i.e.
-number of key bytes output per second).
-
-Example excerpt from Xeon 6154 (Skylake-X) CPU clocked at 3 GHz:
-
-```
-[ RUN      ] BenchSortGroup/BenchSort.BenchAllSort/AVX3
-      AVX3:          std:     f32: uniform32: 1.00E+06   54 MB/s ( 1 threads)
-      AVX3:           vq:     f32: uniform32: 1.00E+06 1143 MB/s ( 1 threads)
-```
diff --git a/third_party/highway/hwy/contrib/sort/algo-inl.h b/third_party/highway/hwy/contrib/sort/algo-inl.h
deleted file mode 100644 (file)
index 4b01e2d..0000000
+++ /dev/null
@@ -1,512 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Normal include guard for target-independent parts
-#ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
-#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
-
-#include <stdint.h>
-#include <string.h>  // memcpy
-
-#include <algorithm>
-#include <cmath>  // std::abs
-#include <vector>
-
-#include "hwy/base.h"
-#include "hwy/contrib/sort/vqsort.h"
-
-// Third-party algorithms
-#define HAVE_AVX2SORT 0
-#define HAVE_IPS4O 0
-// When enabling, consider changing max_threads (required for Table 1a)
-#define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1)
-#define HAVE_PDQSORT 0
-#define HAVE_SORT512 0
-#define HAVE_VXSORT 0
-
-#if HAVE_AVX2SORT
-HWY_PUSH_ATTRIBUTES("avx2,avx")
-#include "avx2sort.h"  //NOLINT
-HWY_POP_ATTRIBUTES
-#endif
-#if HAVE_IPS4O || HAVE_PARALLEL_IPS4O
-#include "third_party/ips4o/include/ips4o.hpp"
-#include "third_party/ips4o/include/ips4o/thread_pool.hpp"
-#endif
-#if HAVE_PDQSORT
-#include "third_party/boost/allowed/sort/sort.hpp"
-#endif
-#if HAVE_SORT512
-#include "sort512.h"  //NOLINT
-#endif
-
-// vxsort is difficult to compile for multiple targets because it also uses
-// .cpp files, and we'd also have to #undef its include guards. Instead, compile
-// only for AVX2 or AVX3 depending on this macro.
-#define VXSORT_AVX3 1
-#if HAVE_VXSORT
-// inlined from vxsort_targets_enable_avx512 (must close before end of header)
-#ifdef __GNUC__
-#ifdef __clang__
-#if VXSORT_AVX3
-#pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \
-                             apply_to = any(function))
-#else
-#pragma clang attribute push(__attribute__((target("avx2"))), \
-                             apply_to = any(function))
-#endif  // VXSORT_AVX3
-
-#else
-#pragma GCC push_options
-#if VXSORT_AVX3
-#pragma GCC target("avx512f,avx512dq")
-#else
-#pragma GCC target("avx2")
-#endif  // VXSORT_AVX3
-#endif
-#endif
-
-#if VXSORT_AVX3
-#include "vxsort/machine_traits.avx512.h"
-#else
-#include "vxsort/machine_traits.avx2.h"
-#endif  // VXSORT_AVX3
-#include "vxsort/vxsort.h"
-#ifdef __GNUC__
-#ifdef __clang__
-#pragma clang attribute pop
-#else
-#pragma GCC pop_options
-#endif
-#endif
-#endif  // HAVE_VXSORT
-
-namespace hwy {
-
-enum class Dist { kUniform8, kUniform16, kUniform32 };
-
-static inline std::vector<Dist> AllDist() {
-  return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32};
-}
-
-static inline const char* DistName(Dist dist) {
-  switch (dist) {
-    case Dist::kUniform8:
-      return "uniform8";
-    case Dist::kUniform16:
-      return "uniform16";
-    case Dist::kUniform32:
-      return "uniform32";
-  }
-  return "unreachable";
-}
-
-template <typename T>
-class InputStats {
- public:
-  void Notify(T value) {
-    min_ = std::min(min_, value);
-    max_ = std::max(max_, value);
-    // Converting to integer would truncate floats, multiplying to save digits
-    // risks overflow especially when casting, so instead take the sum of the
-    // bit representations as the checksum.
-    uint64_t bits = 0;
-    static_assert(sizeof(T) <= 8, "Expected a built-in type");
-    CopyBytes<sizeof(T)>(&value, &bits);  // not same size
-    sum_ += bits;
-    count_ += 1;
-  }
-
-  bool operator==(const InputStats& other) const {
-    if (count_ != other.count_) {
-      HWY_ABORT("count %d vs %d\n", static_cast<int>(count_),
-                static_cast<int>(other.count_));
-    }
-
-    if (min_ != other.min_ || max_ != other.max_) {
-      HWY_ABORT("minmax %f/%f vs %f/%f\n", static_cast<double>(min_),
-                static_cast<double>(max_), static_cast<double>(other.min_),
-                static_cast<double>(other.max_));
-    }
-
-    // Sum helps detect duplicated/lost values
-    if (sum_ != other.sum_) {
-      HWY_ABORT("Sum mismatch %g %g; min %g max %g\n",
-                static_cast<double>(sum_), static_cast<double>(other.sum_),
-                static_cast<double>(min_), static_cast<double>(max_));
-    }
-
-    return true;
-  }
-
- private:
-  T min_ = hwy::HighestValue<T>();
-  T max_ = hwy::LowestValue<T>();
-  uint64_t sum_ = 0;
-  size_t count_ = 0;
-};
-
-enum class Algo {
-#if HAVE_AVX2SORT
-  kSEA,
-#endif
-#if HAVE_IPS4O
-  kIPS4O,
-#endif
-#if HAVE_PARALLEL_IPS4O
-  kParallelIPS4O,
-#endif
-#if HAVE_PDQSORT
-  kPDQ,
-#endif
-#if HAVE_SORT512
-  kSort512,
-#endif
-#if HAVE_VXSORT
-  kVXSort,
-#endif
-  kStd,
-  kVQSort,
-  kHeap,
-};
-
-static inline const char* AlgoName(Algo algo) {
-  switch (algo) {
-#if HAVE_AVX2SORT
-    case Algo::kSEA:
-      return "sea";
-#endif
-#if HAVE_IPS4O
-    case Algo::kIPS4O:
-      return "ips4o";
-#endif
-#if HAVE_PARALLEL_IPS4O
-    case Algo::kParallelIPS4O:
-      return "par_ips4o";
-#endif
-#if HAVE_PDQSORT
-    case Algo::kPDQ:
-      return "pdq";
-#endif
-#if HAVE_SORT512
-    case Algo::kSort512:
-      return "sort512";
-#endif
-#if HAVE_VXSORT
-    case Algo::kVXSort:
-      return "vxsort";
-#endif
-    case Algo::kStd:
-      return "std";
-    case Algo::kVQSort:
-      return "vq";
-    case Algo::kHeap:
-      return "heap";
-  }
-  return "unreachable";
-}
-
-}  // namespace hwy
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
-
-// Per-target
-#if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
-#undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
-#else
-#define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
-#endif
-
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/traits128-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"  // HeapSort
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-class Xorshift128Plus {
-  static HWY_INLINE uint64_t SplitMix64(uint64_t z) {
-    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
-    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
-    return z ^ (z >> 31);
-  }
-
- public:
-  // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
-  // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
-  template <class DU64>
-  static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) {
-    seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull);
-    for (size_t i = 1; i < 2 * Lanes(du64); ++i) {
-      seeds[i] = SplitMix64(seeds[i - 1]);
-    }
-  }
-
-  // Need to pass in the state because vector cannot be class members.
-  template <class VU64>
-  static VU64 RandomBits(VU64& state0, VU64& state1) {
-    VU64 s1 = state0;
-    VU64 s0 = state1;
-    const VU64 bits = Add(s1, s0);
-    state0 = s0;
-    s1 = Xor(s1, ShiftLeft<23>(s1));
-    state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0))));
-    return bits;
-  }
-};
-
-template <class D, class VU64, HWY_IF_NOT_FLOAT_D(D)>
-Vec<D> RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) {
-  const VU64 bits = Xorshift128Plus::RandomBits(s0, s1);
-  return BitCast(d, And(bits, mask));
-}
-
-// It is important to avoid denormals, which are flushed to zero by SIMD but not
-// scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
-template <class DF, class VU64, HWY_IF_FLOAT_D(DF)>
-Vec<DF> RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) {
-  using TF = TFromD<DF>;
-  const RebindToUnsigned<decltype(df)> du;
-  using VU = Vec<decltype(du)>;
-
-  const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask);
-
-#if HWY_TARGET == HWY_SCALAR  // Cannot repartition u64 to smaller types
-  using TU = MakeUnsigned<TF>;
-  const VU bits = Set(du, static_cast<TU>(GetLane(bits64) & LimitsMax<TU>()));
-#else
-  const VU bits = BitCast(du, bits64);
-#endif
-  // Avoid NaN/denormal by only generating values in [1, 2), i.e. random
-  // mantissas with the exponent taken from the representation of 1.0.
-  const VU k1 = BitCast(du, Set(df, TF{1.0}));
-  const VU mantissa_mask = Set(du, MantissaMask<TF>());
-  const VU representation = OrAnd(k1, bits, mantissa_mask);
-  return BitCast(df, representation);
-}
-
-template <class DU64>
-Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) {
-  switch (sizeof_t) {
-    case 2:
-      return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull
-                                                 : 0xFFFFFFFFFFFFFFFFull);
-    case 4:
-      return Set(du64, (dist == Dist::kUniform8)    ? 0x000000FF000000FFull
-                       : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull
-                                                    : 0xFFFFFFFFFFFFFFFFull);
-    case 8:
-      return Set(du64, (dist == Dist::kUniform8)    ? 0x00000000000000FFull
-                       : (dist == Dist::kUniform16) ? 0x000000000000FFFFull
-                                                    : 0x00000000FFFFFFFFull);
-    default:
-      HWY_ABORT("Logic error");
-      return Zero(du64);
-  }
-}
-
-template <typename T>
-InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) {
-  SortTag<uint64_t> du64;
-  using VU64 = Vec<decltype(du64)>;
-  const size_t N64 = Lanes(du64);
-  auto seeds = hwy::AllocateAligned<uint64_t>(2 * N64);
-  Xorshift128Plus::GenerateSeeds(du64, seeds.get());
-  VU64 s0 = Load(du64, seeds.get());
-  VU64 s1 = Load(du64, seeds.get() + N64);
-
-#if HWY_TARGET == HWY_SCALAR
-  const Sisd<T> d;
-#else
-  const Repartition<T, decltype(du64)> d;
-#endif
-  using V = Vec<decltype(d)>;
-  const size_t N = Lanes(d);
-  const VU64 mask = MaskForDist(du64, dist, sizeof(T));
-  auto buf = hwy::AllocateAligned<T>(N);
-
-  size_t i = 0;
-  for (; i + N <= num; i += N) {
-    const V values = RandomValues(d, s0, s1, mask);
-    StoreU(values, d, v + i);
-  }
-  if (i < num) {
-    const V values = RandomValues(d, s0, s1, mask);
-    StoreU(values, d, buf.get());
-    memcpy(v + i, buf.get(), (num - i) * sizeof(T));
-  }
-
-  InputStats<T> input_stats;
-  for (size_t i = 0; i < num; ++i) {
-    input_stats.Notify(v[i]);
-  }
-  return input_stats;
-}
-
-struct ThreadLocal {
-  Sorter sorter;
-};
-
-struct SharedState {
-#if HAVE_PARALLEL_IPS4O
-  const unsigned max_threads = hwy::LimitsMax<unsigned>();  // 16 for Table 1a
-  ips4o::StdThreadPool pool{static_cast<int>(
-      HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))};
-#endif
-  std::vector<ThreadLocal> tls{1};
-};
-
-// Bridge from keys (passed to Run) to lanes as expected by HeapSort. For
-// non-128-bit keys they are the same:
-template <class Order, typename KeyType, HWY_IF_NOT_LANE_SIZE(KeyType, 16)>
-void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) {
-  using detail::TraitsLane;
-  using detail::SharedTraits;
-  if (Order().IsAscending()) {
-    const SharedTraits<TraitsLane<detail::OrderAscending<KeyType>>> st;
-    return detail::HeapSort(st, keys, num_keys);
-  } else {
-    const SharedTraits<TraitsLane<detail::OrderDescending<KeyType>>> st;
-    return detail::HeapSort(st, keys, num_keys);
-  }
-}
-
-#if VQSORT_ENABLED
-template <class Order>
-void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) {
-  using detail::SharedTraits;
-  using detail::Traits128;
-  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
-  const size_t num_lanes = num_keys * 2;
-  if (Order().IsAscending()) {
-    const SharedTraits<Traits128<detail::OrderAscending128>> st;
-    return detail::HeapSort(st, lanes, num_lanes);
-  } else {
-    const SharedTraits<Traits128<detail::OrderDescending128>> st;
-    return detail::HeapSort(st, lanes, num_lanes);
-  }
-}
-
-template <class Order>
-void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) {
-  using detail::SharedTraits;
-  using detail::Traits128;
-  uint64_t* lanes = reinterpret_cast<uint64_t*>(keys);
-  const size_t num_lanes = num_keys * 2;
-  if (Order().IsAscending()) {
-    const SharedTraits<Traits128<detail::OrderAscendingKV128>> st;
-    return detail::HeapSort(st, lanes, num_lanes);
-  } else {
-    const SharedTraits<Traits128<detail::OrderDescendingKV128>> st;
-    return detail::HeapSort(st, lanes, num_lanes);
-  }
-}
-#endif  // VQSORT_ENABLED
-
-template <class Order, typename KeyType>
-void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num,
-         SharedState& shared, size_t thread) {
-  const std::less<KeyType> less;
-  const std::greater<KeyType> greater;
-
-  switch (algo) {
-#if HAVE_AVX2SORT
-    case Algo::kSEA:
-      return avx2::quicksort(inout, static_cast<int>(num));
-#endif
-
-#if HAVE_IPS4O
-    case Algo::kIPS4O:
-      if (Order().IsAscending()) {
-        return ips4o::sort(inout, inout + num, less);
-      } else {
-        return ips4o::sort(inout, inout + num, greater);
-      }
-#endif
-
-#if HAVE_PARALLEL_IPS4O
-    case Algo::kParallelIPS4O:
-      if (Order().IsAscending()) {
-        return ips4o::parallel::sort(inout, inout + num, less, shared.pool);
-      } else {
-        return ips4o::parallel::sort(inout, inout + num, greater, shared.pool);
-      }
-#endif
-
-#if HAVE_SORT512
-    case Algo::kSort512:
-      HWY_ABORT("not supported");
-      //    return Sort512::Sort(inout, num);
-#endif
-
-#if HAVE_PDQSORT
-    case Algo::kPDQ:
-      if (Order().IsAscending()) {
-        return boost::sort::pdqsort_branchless(inout, inout + num, less);
-      } else {
-        return boost::sort::pdqsort_branchless(inout, inout + num, greater);
-      }
-#endif
-
-#if HAVE_VXSORT
-    case Algo::kVXSort: {
-#if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \
-    (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2)
-      fprintf(stderr, "Do not call for target %s\n",
-              hwy::TargetName(HWY_TARGET));
-      return;
-#else
-#if VXSORT_AVX3
-      vxsort::vxsort<KeyType, vxsort::AVX512> vx;
-#else
-      vxsort::vxsort<KeyType, vxsort::AVX2> vx;
-#endif
-      if (Order().IsAscending()) {
-        return vx.sort(inout, inout + num - 1);
-      } else {
-        fprintf(stderr, "Skipping VX - does not support descending order\n");
-        return;
-      }
-#endif  // enabled for this target
-    }
-#endif  // HAVE_VXSORT
-
-    case Algo::kStd:
-      if (Order().IsAscending()) {
-        return std::sort(inout, inout + num, less);
-      } else {
-        return std::sort(inout, inout + num, greater);
-      }
-
-    case Algo::kVQSort:
-      return shared.tls[thread].sorter(inout, num, Order());
-
-    case Algo::kHeap:
-      return CallHeapSort<Order>(inout, num);
-
-    default:
-      HWY_ABORT("Not implemented");
-  }
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/bench_parallel.cc b/third_party/highway/hwy/contrib/sort/bench_parallel.cc
deleted file mode 100644 (file)
index 1c8c928..0000000
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Concurrent, independent sorts for generating more memory traffic and testing
-// scalability.
-
-#include <stdint.h>
-#include <stdio.h>
-
-#include <condition_variable>  //NOLINT
-#include <functional>
-#include <memory>
-#include <mutex>   //NOLINT
-#include <thread>  //NOLINT
-#include <utility>
-#include <vector>
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc"  //NOLINT
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/algo-inl.h"
-#include "hwy/contrib/sort/result-inl.h"
-#include "hwy/aligned_allocator.h"
-// Last
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-namespace {
-
-class ThreadPool {
- public:
-  // Starts the given number of worker threads and blocks until they are ready.
-  explicit ThreadPool(
-      const size_t num_threads = std::thread::hardware_concurrency())
-      : num_threads_(num_threads) {
-    HWY_ASSERT(num_threads_ > 0);
-    threads_.reserve(num_threads_);
-    for (size_t i = 0; i < num_threads_; ++i) {
-      threads_.emplace_back(ThreadFunc, this, i);
-    }
-
-    WorkersReadyBarrier();
-  }
-
-  ThreadPool(const ThreadPool&) = delete;
-  ThreadPool& operator&(const ThreadPool&) = delete;
-
-  // Waits for all threads to exit.
-  ~ThreadPool() {
-    StartWorkers(kWorkerExit);
-
-    for (std::thread& thread : threads_) {
-      thread.join();
-    }
-  }
-
-  size_t NumThreads() const { return threads_.size(); }
-
-  template <class Func>
-  void RunOnThreads(size_t max_threads, const Func& func) {
-    task_ = &CallClosure<Func>;
-    data_ = &func;
-    StartWorkers(max_threads);
-    WorkersReadyBarrier();
-  }
-
- private:
-  // After construction and between calls to Run, workers are "ready", i.e.
-  // waiting on worker_start_cv_. They are "started" by sending a "command"
-  // and notifying all worker_start_cv_ waiters. (That is why all workers
-  // must be ready/waiting - otherwise, the notification will not reach all of
-  // them and the main thread waits in vain for them to report readiness.)
-  using WorkerCommand = uint64_t;
-
-  static constexpr WorkerCommand kWorkerWait = ~1ULL;
-  static constexpr WorkerCommand kWorkerExit = ~2ULL;
-
-  // Calls a closure (lambda with captures).
-  template <class Closure>
-  static void CallClosure(const void* f, size_t thread) {
-    (*reinterpret_cast<const Closure*>(f))(thread);
-  }
-
-  void WorkersReadyBarrier() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    // Typically only a single iteration.
-    while (workers_ready_ != threads_.size()) {
-      workers_ready_cv_.wait(lock);
-    }
-    workers_ready_ = 0;
-
-    // Safely handle spurious worker wakeups.
-    worker_start_command_ = kWorkerWait;
-  }
-
-  // Precondition: all workers are ready.
-  void StartWorkers(const WorkerCommand worker_command) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    worker_start_command_ = worker_command;
-    // Workers will need this lock, so release it before they wake up.
-    lock.unlock();
-    worker_start_cv_.notify_all();
-  }
-
-  static void ThreadFunc(ThreadPool* self, size_t thread) {
-    // Until kWorkerExit command received:
-    for (;;) {
-      std::unique_lock<std::mutex> lock(self->mutex_);
-      // Notify main thread that this thread is ready.
-      if (++self->workers_ready_ == self->num_threads_) {
-        self->workers_ready_cv_.notify_one();
-      }
-    RESUME_WAIT:
-      // Wait for a command.
-      self->worker_start_cv_.wait(lock);
-      const WorkerCommand command = self->worker_start_command_;
-      switch (command) {
-        case kWorkerWait:    // spurious wakeup:
-          goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
-        case kWorkerExit:
-          return;  // exits thread
-        default:
-          break;
-      }
-
-      lock.unlock();
-      // Command is the maximum number of threads that should run the task.
-      HWY_ASSERT(command < self->NumThreads());
-      if (thread < command) {
-        self->task_(self->data_, thread);
-      }
-    }
-  }
-
-  const size_t num_threads_;
-
-  // Unmodified after ctor, but cannot be const because we call thread::join().
-  std::vector<std::thread> threads_;
-
-  std::mutex mutex_;  // guards both cv and their variables.
-  std::condition_variable workers_ready_cv_;
-  size_t workers_ready_ = 0;
-  std::condition_variable worker_start_cv_;
-  WorkerCommand worker_start_command_;
-
-  // Written by main thread, read by workers (after mutex lock/unlock).
-  std::function<void(const void*, size_t)> task_;  // points to CallClosure
-  const void* data_;                               // points to caller's Func
-};
-
-template <class Traits>
-void RunWithoutVerify(Traits st, const Dist dist, const size_t num_keys,
-                      const Algo algo, SharedState& shared, size_t thread) {
-  using LaneType = typename Traits::LaneType;
-  using KeyType = typename Traits::KeyType;
-  using Order = typename Traits::Order;
-  const size_t num_lanes = num_keys * st.LanesPerKey();
-  auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
-
-  (void)GenerateInput(dist, aligned.get(), num_lanes);
-
-  const Timestamp t0;
-  Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys, shared,
-             thread);
-  HWY_ASSERT(aligned[0] < aligned[num_lanes - 1]);
-}
-
-void BenchParallel() {
-  // Not interested in benchmark results for other targets on x86
-  if (HWY_ARCH_X86 && (HWY_TARGET != HWY_AVX2 && HWY_TARGET != HWY_AVX3)) {
-    return;
-  }
-
-  ThreadPool pool;
-  const size_t NT = pool.NumThreads();
-
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
-  using KeyType = typename decltype(st)::KeyType;
-  const size_t num_keys = size_t{100} * 1000 * 1000;
-
-#if HAVE_IPS4O
-  const Algo algo = Algo::kIPS4O;
-#else
-  const Algo algo = Algo::kVQSort;
-#endif
-  const Dist dist = Dist::kUniform32;
-
-  SharedState shared;
-  shared.tls.resize(NT);
-
-  std::vector<Result> results;
-  for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) {
-    Timestamp t0;
-    // Default capture because MSVC wants algo/dist but clang does not.
-    pool.RunOnThreads(nt, [=, &shared](size_t thread) {
-      RunWithoutVerify(st, dist, num_keys, algo, shared, thread);
-    });
-    const double sec = SecondsSince(t0);
-    results.emplace_back(algo, dist, num_keys, nt, sec, sizeof(KeyType),
-                         st.KeyString());
-    results.back().Print();
-  }
-}
-
-}  // namespace
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-namespace {
-HWY_BEFORE_TEST(BenchParallel);
-HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel);
-}  // namespace
-}  // namespace hwy
-
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/bench_sort.cc b/third_party/highway/hwy/contrib/sort/bench_sort.cc
deleted file mode 100644 (file)
index 0d10e87..0000000
+++ /dev/null
@@ -1,311 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdint.h>
-#include <stdio.h>
-
-#include <vector>
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/algo-inl.h"
-#include "hwy/contrib/sort/result-inl.h"
-#include "hwy/contrib/sort/sorting_networks-inl.h"  // SharedTraits
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/traits128-inl.h"
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-// Mode for larger sorts because M1 is able to access more than the per-core
-// share of L2, so 1M elements might still be in cache.
-#define SORT_100M 0
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-// Defined within HWY_ONCE, used by BenchAllSort.
-extern int64_t first_sort_target;
-
-namespace HWY_NAMESPACE {
-namespace {
-using detail::TraitsLane;
-using detail::OrderAscending;
-using detail::OrderDescending;
-using detail::SharedTraits;
-
-#if VQSORT_ENABLED || HWY_IDE
-using detail::OrderAscending128;
-using detail::OrderAscendingKV128;
-using detail::Traits128;
-
-template <class Traits>
-HWY_NOINLINE void BenchPartition() {
-  using LaneType = typename Traits::LaneType;
-  using KeyType = typename Traits::KeyType;
-  const SortTag<LaneType> d;
-  detail::SharedTraits<Traits> st;
-  const Dist dist = Dist::kUniform8;
-  double sum = 0.0;
-
-  detail::Generator rng(&sum, 123);  // for ChoosePivot
-
-  const size_t max_log2 = AdjustedLog2Reps(20);
-  for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) {
-    const size_t num_lanes = 1ull << log2;
-    const size_t num_keys = num_lanes / st.LanesPerKey();
-    auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
-    auto buf = hwy::AllocateAligned<LaneType>(
-        HWY_MAX(hwy::SortConstants::PartitionBufNum(Lanes(d)),
-                hwy::SortConstants::PivotBufNum(sizeof(LaneType), Lanes(d))));
-
-    std::vector<double> seconds;
-    const size_t num_reps = (1ull << (14 - log2 / 2)) * 30;
-    for (size_t rep = 0; rep < num_reps; ++rep) {
-      (void)GenerateInput(dist, aligned.get(), num_lanes);
-
-      // The pivot value can influence performance. Do exactly what vqsort will
-      // do so that the performance (influenced by prefetching and branch
-      // prediction) is likely to predict the actual performance inside vqsort.
-      detail::PivotResult result;
-      const auto pivot = detail::ChoosePivot(d, st, aligned.get(), num_lanes,
-                                             buf.get(), rng, result);
-
-      const Timestamp t0;
-      detail::Partition(d, st, aligned.get(), 0, num_lanes - 1, pivot,
-                        buf.get());
-      seconds.push_back(SecondsSince(t0));
-      // 'Use' the result to prevent optimizing out the partition.
-      sum += static_cast<double>(aligned.get()[num_lanes / 2]);
-    }
-
-    Result(Algo::kVQSort, dist, num_keys, 1, SummarizeMeasurements(seconds),
-           sizeof(KeyType), st.KeyString())
-        .Print();
-  }
-  HWY_ASSERT(sum != 999999);  // Prevent optimizing out
-}
-
-HWY_NOINLINE void BenchAllPartition() {
-  // Not interested in benchmark results for these targets
-  if (HWY_TARGET == HWY_SSSE3) {
-    return;
-  }
-
-  BenchPartition<TraitsLane<OrderDescending<float>>>();
-  BenchPartition<TraitsLane<OrderDescending<int32_t>>>();
-  BenchPartition<TraitsLane<OrderDescending<int64_t>>>();
-  BenchPartition<Traits128<OrderAscending128>>();
-  // BenchPartition<Traits128<OrderDescending128>>();
-  BenchPartition<Traits128<OrderAscendingKV128>>();
-}
-
-template <class Traits>
-HWY_NOINLINE void BenchBase(std::vector<Result>& results) {
-  // Not interested in benchmark results for these targets
-  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
-    return;
-  }
-
-  using LaneType = typename Traits::LaneType;
-  using KeyType = typename Traits::KeyType;
-  const SortTag<LaneType> d;
-  detail::SharedTraits<Traits> st;
-  const Dist dist = Dist::kUniform32;
-
-  const size_t N = Lanes(d);
-  const size_t num_lanes = SortConstants::BaseCaseNum(N);
-  const size_t num_keys = num_lanes / st.LanesPerKey();
-  auto keys = hwy::AllocateAligned<LaneType>(num_lanes);
-  auto buf = hwy::AllocateAligned<LaneType>(num_lanes + N);
-
-  std::vector<double> seconds;
-  double sum = 0;                             // prevents elision
-  constexpr size_t kMul = AdjustedReps(600);  // ensures long enough to measure
-
-  for (size_t rep = 0; rep < 30; ++rep) {
-    InputStats<LaneType> input_stats =
-        GenerateInput(dist, keys.get(), num_lanes);
-
-    const Timestamp t0;
-    for (size_t i = 0; i < kMul; ++i) {
-      detail::BaseCase(d, st, keys.get(), keys.get() + num_lanes, num_lanes,
-                       buf.get());
-      sum += static_cast<double>(keys[0]);
-    }
-    seconds.push_back(SecondsSince(t0));
-    // printf("%f\n", seconds.back());
-
-    HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num_lanes, "BenchBase"));
-  }
-  HWY_ASSERT(sum < 1E99);
-  results.emplace_back(Algo::kVQSort, dist, num_keys * kMul, 1,
-                       SummarizeMeasurements(seconds), sizeof(KeyType),
-                       st.KeyString());
-}
-
-HWY_NOINLINE void BenchAllBase() {
-  // Not interested in benchmark results for these targets
-  if (HWY_TARGET == HWY_SSSE3) {
-    return;
-  }
-
-  std::vector<Result> results;
-  BenchBase<TraitsLane<OrderAscending<float>>>(results);
-  BenchBase<TraitsLane<OrderDescending<int64_t>>>(results);
-  BenchBase<Traits128<OrderAscending128>>(results);
-  for (const Result& r : results) {
-    r.Print();
-  }
-}
-
-#else
-void BenchAllPartition() {}
-void BenchAllBase() {}
-#endif  // VQSORT_ENABLED
-
-std::vector<Algo> AlgoForBench() {
-  return {
-#if HAVE_AVX2SORT
-    Algo::kSEA,
-#endif
-#if HAVE_PARALLEL_IPS4O
-        Algo::kParallelIPS4O,
-#elif HAVE_IPS4O
-        Algo::kIPS4O,
-#endif
-#if HAVE_PDQSORT
-        Algo::kPDQ,
-#endif
-#if HAVE_SORT512
-        Algo::kSort512,
-#endif
-// Only include if we're compiling for the target it supports.
-#if HAVE_VXSORT && ((VXSORT_AVX3 && HWY_TARGET == HWY_AVX3) || \
-                    (!VXSORT_AVX3 && HWY_TARGET == HWY_AVX2))
-        Algo::kVXSort,
-#endif
-
-#if !HAVE_PARALLEL_IPS4O
-#if !SORT_100M
-        // These are 10-20x slower, but that's OK for the default size when we
-        // are not testing the parallel nor 100M modes.
-        Algo::kStd, Algo::kHeap,
-#endif
-
-        Algo::kVQSort,  // only ~4x slower, but not required for Table 1a
-#endif
-  };
-}
-
-template <class Traits>
-HWY_NOINLINE void BenchSort(size_t num_keys) {
-  if (first_sort_target == 0) first_sort_target = HWY_TARGET;
-
-  SharedState shared;
-  detail::SharedTraits<Traits> st;
-  using Order = typename Traits::Order;
-  using LaneType = typename Traits::LaneType;
-  using KeyType = typename Traits::KeyType;
-  const size_t num_lanes = num_keys * st.LanesPerKey();
-  auto aligned = hwy::AllocateAligned<LaneType>(num_lanes);
-
-  const size_t reps = num_keys > 1000 * 1000 ? 10 : 30;
-
-  for (Algo algo : AlgoForBench()) {
-    // Other algorithms don't depend on the vector instructions, so only run
-    // them for the first target.
-#if !HAVE_VXSORT
-    if (algo != Algo::kVQSort && HWY_TARGET != first_sort_target) {
-      continue;
-    }
-#endif
-
-    for (Dist dist : AllDist()) {
-      std::vector<double> seconds;
-      for (size_t rep = 0; rep < reps; ++rep) {
-        InputStats<LaneType> input_stats =
-            GenerateInput(dist, aligned.get(), num_lanes);
-
-        const Timestamp t0;
-        Run<Order>(algo, reinterpret_cast<KeyType*>(aligned.get()), num_keys,
-                   shared, /*thread=*/0);
-        seconds.push_back(SecondsSince(t0));
-        // printf("%f\n", seconds.back());
-
-        HWY_ASSERT(
-            VerifySort(st, input_stats, aligned.get(), num_lanes, "BenchSort"));
-      }
-      Result(algo, dist, num_keys, 1, SummarizeMeasurements(seconds),
-             sizeof(KeyType), st.KeyString())
-          .Print();
-    }  // dist
-  }    // algo
-}
-
-HWY_NOINLINE void BenchAllSort() {
-  // Not interested in benchmark results for these targets
-  if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) {
-    return;
-  }
-
-  constexpr size_t K = 1000;
-  constexpr size_t M = K * K;
-  (void)K;
-  (void)M;
-  for (size_t num_keys : {
-#if HAVE_PARALLEL_IPS4O || SORT_100M
-         100 * M,
-#else
-        1 * M,
-#endif
-       }) {
-    BenchSort<TraitsLane<OrderAscending<float>>>(num_keys);
-    // BenchSort<TraitsLane<OrderDescending<double>>>(num_keys);
-    // BenchSort<TraitsLane<OrderAscending<int16_t>>>(num_keys);
-    BenchSort<TraitsLane<OrderDescending<int32_t>>>(num_keys);
-    BenchSort<TraitsLane<OrderAscending<int64_t>>>(num_keys);
-    // BenchSort<TraitsLane<OrderDescending<uint16_t>>>(num_keys);
-    // BenchSort<TraitsLane<OrderDescending<uint32_t>>>(num_keys);
-    // BenchSort<TraitsLane<OrderAscending<uint64_t>>>(num_keys);
-
-#if !HAVE_VXSORT && VQSORT_ENABLED
-    BenchSort<Traits128<OrderAscending128>>(num_keys);
-    BenchSort<Traits128<OrderAscendingKV128>>(num_keys);
-#endif
-  }
-}
-
-}  // namespace
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-int64_t first_sort_target = 0;  // none run yet
-namespace {
-HWY_BEFORE_TEST(BenchSort);
-HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition);
-HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase);
-HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort);
-}  // namespace
-}  // namespace hwy
-
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/print_network.cc b/third_party/highway/hwy/contrib/sort/print_network.cc
deleted file mode 100644 (file)
index 59cfebc..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stdio.h>
-
-#include <algorithm>
-
-#include "hwy/base.h"
-
-// Based on A.7 in "Entwurf und Implementierung vektorisierter
-// Sortieralgorithmen" and code by Mark Blacher.
-void PrintMergeNetwork16x2() {
-  for (int i = 8; i < 16; ++i) {
-    printf("v%x = st.SwapAdjacent(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 8; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 4, i + 4);
-    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 12, i + 12);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 2, i + 2);
-    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 3, i + 3);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("v%x = st.SwapAdjacent(d, v%x);\n", i + 1, i + 1);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
-  }
-  printf("\n");
-}
-
-void PrintMergeNetwork16x4() {
-  printf("\n");
-
-  for (int i = 8; i < 16; ++i) {
-    printf("v%x = st.Reverse4(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 8; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("v%x = st.Reverse4(d, v%x);\n", i + 4, i + 4);
-    printf("v%x = st.Reverse4(d, v%x);\n", i + 12, i + 12);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("v%x = st.Reverse4(d, v%x);\n", i + 2, i + 2);
-    printf("v%x = st.Reverse4(d, v%x);\n", i + 3, i + 3);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("v%x = st.Reverse4(d, v%x);\n", i + 1, i + 1);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsReverse4(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
-  }
-}
-
-void PrintMergeNetwork16x8() {
-  printf("\n");
-
-  for (int i = 8; i < 16; ++i) {
-    printf("v%x = st.ReverseKeys8(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 8; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 4, i + 4);
-    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 12, i + 12);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 2, i + 2);
-    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 3, i + 3);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("v%x = st.ReverseKeys8(d, v%x);\n", i + 1, i + 1);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsReverse8(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
-  }
-}
-
-void PrintMergeNetwork16x16() {
-  printf("\n");
-
-  for (int i = 8; i < 16; ++i) {
-    printf("v%x = st.ReverseKeys16(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 8; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 15 - i);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 4, i + 4);
-    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 12, i + 12);
-  }
-  for (int i = 0; i < 4; ++i) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, 7 - i);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 8, 15 - i);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 2, i + 2);
-    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 3, i + 3);
-  }
-  for (int i = 0; i < 16; i += 4) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 3);
-    printf("st.Sort2(d, v%x, v%x);\n", i + 1, i + 2);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("v%x = st.ReverseKeys16(d, v%x);\n", i + 1, i + 1);
-  }
-  for (int i = 0; i < 16; i += 2) {
-    printf("st.Sort2(d, v%x, v%x);\n", i, i + 1);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsReverse16<kOrder>(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsDistance4<kOrder>(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsDistance2<kOrder>(d, v%x);\n", i, i);
-  }
-  for (int i = 0; i < 16; ++i) {
-    printf("v%x = st.SortPairsDistance1<kOrder>(d, v%x);\n", i, i);
-  }
-}
-
-int main(int argc, char** argv) {
-  PrintMergeNetwork16x2();
-  PrintMergeNetwork16x4();
-  PrintMergeNetwork16x8();
-  PrintMergeNetwork16x16();
-  return 0;
-}
diff --git a/third_party/highway/hwy/contrib/sort/result-inl.h b/third_party/highway/hwy/contrib/sort/result-inl.h
deleted file mode 100644 (file)
index f3d842d..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/algo-inl.h"
-
-// Normal include guard for non-SIMD parts
-#ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
-#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
-
-#include <time.h>
-
-#include <algorithm>  // std::sort
-#include <string>
-
-#include "hwy/base.h"
-#include "hwy/nanobenchmark.h"
-
-namespace hwy {
-
-struct Timestamp {
-  Timestamp() { t = platform::Now(); }
-  double t;
-};
-
-static inline double SecondsSince(const Timestamp& t0) {
-  const Timestamp t1;
-  return t1.t - t0.t;
-}
-
-// Returns trimmed mean (we don't want to run an out-of-L3-cache sort often
-// enough for the mode to be reliable).
-static inline double SummarizeMeasurements(std::vector<double>& seconds) {
-  std::sort(seconds.begin(), seconds.end());
-  double sum = 0;
-  int count = 0;
-  const size_t num = seconds.size();
-  for (size_t i = num / 4; i < num / 2; ++i) {
-    sum += seconds[i];
-    count += 1;
-  }
-  return sum / count;
-}
-
-}  // namespace hwy
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
-
-// Per-target
-#if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
-#undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
-#else
-#define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct Result {
-  Result() {}
-  Result(const Algo algo, Dist dist, size_t num_keys, size_t num_threads,
-         double sec, size_t sizeof_key, const std::string& key_name)
-      : target(HWY_TARGET),
-        algo(algo),
-        dist(dist),
-        num_keys(num_keys),
-        num_threads(num_threads),
-        sec(sec),
-        sizeof_key(sizeof_key),
-        key_name(key_name) {}
-
-  void Print() const {
-    const double bytes = static_cast<double>(num_keys) *
-                         static_cast<double>(num_threads) *
-                         static_cast<double>(sizeof_key);
-    printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n",
-           hwy::TargetName(target), AlgoName(algo), key_name.c_str(),
-           DistName(dist), static_cast<double>(num_keys), bytes * 1E-6 / sec,
-           num_threads);
-  }
-
-  int64_t target;
-  Algo algo;
-  Dist dist;
-  size_t num_keys = 0;
-  size_t num_threads = 0;
-  double sec = 0.0;
-  size_t sizeof_key = 0;
-  std::string key_name;
-};
-
-template <class Traits, typename LaneType>
-bool VerifySort(Traits st, const InputStats<LaneType>& input_stats,
-                const LaneType* out, size_t num_lanes, const char* caller) {
-  constexpr size_t N1 = st.LanesPerKey();
-  HWY_ASSERT(num_lanes >= N1);
-
-  InputStats<LaneType> output_stats;
-  // Ensure it matches the sort order
-  for (size_t i = 0; i < num_lanes - N1; i += N1) {
-    output_stats.Notify(out[i]);
-    if (N1 == 2) output_stats.Notify(out[i + 1]);
-    // Reverse order instead of checking !Compare1 so we accept equal keys.
-    if (st.Compare1(out + i + N1, out + i)) {
-      printf("%s: i=%d of %d lanes: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n",
-             caller, static_cast<int>(i), static_cast<int>(num_lanes),
-             static_cast<int>(N1), static_cast<double>(out[i + 1]),
-             static_cast<double>(out[i + 0]),
-             static_cast<double>(out[i + N1 + 1]),
-             static_cast<double>(out[i + N1]));
-      HWY_ABORT("%d-bit sort is incorrect\n",
-                static_cast<int>(sizeof(LaneType) * 8 * N1));
-    }
-  }
-  output_stats.Notify(out[num_lanes - N1]);
-  if (N1 == 2) output_stats.Notify(out[num_lanes - N1 + 1]);
-
-  return input_stats == output_stats;
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/shared-inl.h b/third_party/highway/hwy/contrib/sort/shared-inl.h
deleted file mode 100644 (file)
index ea604ed..0000000
+++ /dev/null
@@ -1,133 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Definitions shared between vqsort-inl and sorting_networks-inl.
-
-// Normal include guard for target-independent parts
-#ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
-#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
-
-#include "hwy/base.h"
-
-namespace hwy {
-
-// Internal constants - these are to avoid magic numbers/literals and cannot be
-// changed without also changing the associated code.
-struct SortConstants {
-// SortingNetwork reshapes its input into a matrix. This is the maximum number
-// of *keys* per vector.
-#if HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
-  static constexpr size_t kMaxCols = 8;  // avoid build timeout/stack overflow
-#else
-  static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
-#endif
-
-  // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
-  // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
-  // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
-  // extra logN factor for larger networks (for which only loose upper bounds
-  // on size are known).
-  static constexpr size_t kMaxRowsLog2 = 4;
-  static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2;
-
-  static constexpr HWY_INLINE size_t BaseCaseNum(size_t N) {
-    return kMaxRows * HWY_MIN(N, kMaxCols);
-  }
-
-  // Unrolling is important (pipelining and amortizing branch mispredictions);
-  // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
-  // somewhat slower for sorting than 4x.
-  //
-  // To change, must also update left + 3 * N etc. in the loop.
-  static constexpr size_t kPartitionUnroll = 4;
-
-  static constexpr HWY_INLINE size_t PartitionBufNum(size_t N) {
-    // The main loop reads kPartitionUnroll vectors, and first loads from
-    // both left and right beforehand, so it requires min = 2 *
-    // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
-    // >= BaseCaseNum), we partition the right side into a buffer. We need
-    // another vector at the end so CompressStore does not overwrite anything.
-    return (2 * kPartitionUnroll + 1) * N;
-  }
-
-  // Chunk := group of keys loaded for sampling a pivot. Matches the typical
-  // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
-  // are larger, use entire vectors to ensure we do not overrun the array.
-  static constexpr HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) {
-    return HWY_MAX(64 / sizeof_t, N);
-  }
-
-  static constexpr HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) {
-    // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
-    return (3 + 1) * LanesPerChunk(sizeof_t, N) + 2 * N;
-  }
-
-  template <typename T>
-  static constexpr HWY_INLINE size_t BufNum(size_t N) {
-    // One extra for padding plus another for full-vector loads.
-    return HWY_MAX(BaseCaseNum(N) + 2 * N,
-                   HWY_MAX(PartitionBufNum(N), PivotBufNum(sizeof(T), N)));
-  }
-
-  template <typename T>
-  static constexpr HWY_INLINE size_t BufBytes(size_t vector_size) {
-    return sizeof(T) * BufNum<T>(vector_size / sizeof(T));
-  }
-};
-
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
-
-// Per-target
-#if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
-#undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
-#else
-#define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
-#endif
-
-#include "hwy/highway.h"
-
-// vqsort isn't available on HWY_SCALAR, and builds time out on MSVC opt and
-// Arm v7 debug.
-#undef VQSORT_ENABLED
-#if (HWY_TARGET == HWY_SCALAR) ||                 \
-    (HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD) || \
-    (HWY_ARCH_ARM_V7 && HWY_IS_DEBUG_BUILD)
-#define VQSORT_ENABLED 0
-#else
-#define VQSORT_ENABLED 1
-#endif
-
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Default tag / vector width selector.
-#if HWY_TARGET == HWY_RVV
-// Use LMUL = 1/2; for SEW=64 this ends up emulated via vsetvl.
-template <typename T>
-using SortTag = ScalableTag<T, -1>;
-#else
-template <typename T>
-using SortTag = ScalableTag<T>;
-#endif
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/sort_test.cc b/third_party/highway/hwy/contrib/sort/sort_test.cc
deleted file mode 100644 (file)
index 68f173a..0000000
+++ /dev/null
@@ -1,600 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>  // memcpy
-
-#include <vector>
-
-// clang-format off
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-#include "hwy/contrib/sort/vqsort.h"
-// After foreach_target
-#include "hwy/contrib/sort/algo-inl.h"
-#include "hwy/contrib/sort/traits128-inl.h"
-#include "hwy/contrib/sort/result-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"  // BaseCase
-#include "hwy/tests/test_util-inl.h"
-// clang-format on
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-namespace {
-
-using detail::OrderAscending;
-using detail::OrderDescending;
-using detail::SharedTraits;
-using detail::TraitsLane;
-#if VQSORT_ENABLED || HWY_IDE
-using detail::OrderAscending128;
-using detail::OrderAscendingKV128;
-using detail::OrderDescending128;
-using detail::OrderDescendingKV128;
-using detail::Traits128;
-
-template <class Traits>
-static HWY_NOINLINE void TestMedian3() {
-  using LaneType = typename Traits::LaneType;
-  using D = CappedTag<LaneType, 1>;
-  SharedTraits<Traits> st;
-  const D d;
-  using V = Vec<D>;
-  for (uint32_t bits = 0; bits < 8; ++bits) {
-    const V v0 = Set(d, LaneType{(bits & (1u << 0)) ? 1u : 0u});
-    const V v1 = Set(d, LaneType{(bits & (1u << 1)) ? 1u : 0u});
-    const V v2 = Set(d, LaneType{(bits & (1u << 2)) ? 1u : 0u});
-    const LaneType m = GetLane(detail::MedianOf3(st, v0, v1, v2));
-    // If at least half(rounded up) of bits are 1, so is the median.
-    const size_t count = PopCount(bits);
-    HWY_ASSERT_EQ((count >= 2) ? static_cast<LaneType>(1) : 0, m);
-  }
-}
-
-HWY_NOINLINE void TestAllMedian() {
-  TestMedian3<TraitsLane<OrderAscending<uint64_t> > >();
-}
-
-template <class Traits>
-static HWY_NOINLINE void TestBaseCaseAscDesc() {
-  using LaneType = typename Traits::LaneType;
-  SharedTraits<Traits> st;
-  const SortTag<LaneType> d;
-  const size_t N = Lanes(d);
-  const size_t base_case_num = SortConstants::BaseCaseNum(N);
-  const size_t N1 = st.LanesPerKey();
-
-  constexpr int kDebug = 0;
-  auto aligned_lanes = hwy::AllocateAligned<LaneType>(N + base_case_num + N);
-  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
-
-  std::vector<size_t> lengths;
-  lengths.push_back(HWY_MAX(1, N1));
-  lengths.push_back(3 * N1);
-  lengths.push_back(base_case_num / 2);
-  lengths.push_back(base_case_num / 2 + N1);
-  lengths.push_back(base_case_num - N1);
-  lengths.push_back(base_case_num);
-
-  std::vector<size_t> misalignments;
-  misalignments.push_back(0);
-  misalignments.push_back(1);
-  if (N >= 6) misalignments.push_back(N / 2 - 1);
-  misalignments.push_back(N / 2);
-  misalignments.push_back(N / 2 + 1);
-  misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1}));
-
-  for (bool asc : {false, true}) {
-    for (size_t len : lengths) {
-      for (size_t misalign : misalignments) {
-        LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
-        if (kDebug) {
-          printf("============%s asc %d N1 %d len %d misalign %d\n",
-                 st.KeyString().c_str(), asc, static_cast<int>(N1),
-                 static_cast<int>(len), static_cast<int>(misalign));
-        }
-
-        for (size_t i = 0; i < misalign; ++i) {
-          aligned_lanes[i] = hwy::LowestValue<LaneType>();
-        }
-        InputStats<LaneType> input_stats;
-        for (size_t i = 0; i < len; ++i) {
-          lanes[i] = asc ? static_cast<LaneType>(LaneType(i) + 1)
-                         : static_cast<LaneType>(LaneType(len) - LaneType(i));
-          input_stats.Notify(lanes[i]);
-          if (kDebug >= 2) {
-            printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
-          }
-        }
-        for (size_t i = len; i < base_case_num + N; ++i) {
-          lanes[i] = hwy::LowestValue<LaneType>();
-        }
-
-        detail::BaseCase(d, st, lanes, lanes + len, len, buf.get());
-
-        if (kDebug >= 2) {
-          printf("out>>>>>>\n");
-          for (size_t i = 0; i < len; ++i) {
-            printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
-          }
-        }
-
-        HWY_ASSERT(VerifySort(st, input_stats, lanes, len, "BaseAscDesc"));
-        for (size_t i = 0; i < misalign; ++i) {
-          if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
-            HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
-        }
-        for (size_t i = len; i < base_case_num + N; ++i) {
-          if (lanes[i] != hwy::LowestValue<LaneType>())
-            HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
-        }
-      }  // misalign
-    }    // len
-  }      // asc
-}
-
-template <class Traits>
-static HWY_NOINLINE void TestBaseCase01() {
-  using LaneType = typename Traits::LaneType;
-  SharedTraits<Traits> st;
-  const SortTag<LaneType> d;
-  const size_t N = Lanes(d);
-  const size_t base_case_num = SortConstants::BaseCaseNum(N);
-  const size_t N1 = st.LanesPerKey();
-
-  constexpr int kDebug = 0;
-  auto lanes = hwy::AllocateAligned<LaneType>(base_case_num + N);
-  auto buf = hwy::AllocateAligned<LaneType>(base_case_num + 2 * N);
-
-  std::vector<size_t> lengths;
-  lengths.push_back(HWY_MAX(1, N1));
-  lengths.push_back(3 * N1);
-  lengths.push_back(base_case_num / 2);
-  lengths.push_back(base_case_num / 2 + N1);
-  lengths.push_back(base_case_num - N1);
-  lengths.push_back(base_case_num);
-
-  for (size_t len : lengths) {
-    if (kDebug) {
-      printf("============%s 01 N1 %d len %d\n", st.KeyString().c_str(),
-             static_cast<int>(N1), static_cast<int>(len));
-    }
-    const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14}));
-    for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) {
-      InputStats<LaneType> input_stats;
-      for (size_t i = 0; i < len; ++i) {
-        lanes[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0;
-        input_stats.Notify(lanes[i]);
-        if (kDebug >= 2) {
-          printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
-        }
-      }
-      for (size_t i = len; i < base_case_num + N; ++i) {
-        lanes[i] = hwy::LowestValue<LaneType>();
-      }
-
-      detail::BaseCase(d, st, lanes.get(), lanes.get() + len, len, buf.get());
-
-      if (kDebug >= 2) {
-        printf("out>>>>>>\n");
-        for (size_t i = 0; i < len; ++i) {
-          printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
-        }
-      }
-
-      HWY_ASSERT(VerifySort(st, input_stats, lanes.get(), len, "Base01"));
-      for (size_t i = len; i < base_case_num + N; ++i) {
-        if (lanes[i] != hwy::LowestValue<LaneType>())
-          HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
-      }
-    }  // bits
-  }    // len
-}
-
-template <class Traits>
-static HWY_NOINLINE void TestBaseCase() {
-  TestBaseCaseAscDesc<Traits>();
-  TestBaseCase01<Traits>();
-}
-
-HWY_NOINLINE void TestAllBaseCase() {
-  // Workaround for stack overflow on MSVC debug.
-#if defined(_MSC_VER)
-  return;
-#endif
-  TestBaseCase<TraitsLane<OrderAscending<int32_t> > >();
-  TestBaseCase<TraitsLane<OrderDescending<int64_t> > >();
-  TestBaseCase<Traits128<OrderAscending128> >();
-  TestBaseCase<Traits128<OrderDescending128> >();
-}
-
-template <class Traits>
-static HWY_NOINLINE void VerifyPartition(
-    Traits st, typename Traits::LaneType* HWY_RESTRICT lanes, size_t left,
-    size_t border, size_t right, const size_t N1,
-    const typename Traits::LaneType* pivot) {
-  /* for (size_t i = left; i < right; ++i) {
-     if (i == border) printf("--\n");
-     printf("%4zu: %3d\n", i, lanes[i]);
-   }*/
-
-  HWY_ASSERT(left % N1 == 0);
-  HWY_ASSERT(border % N1 == 0);
-  HWY_ASSERT(right % N1 == 0);
-  const bool asc = typename Traits::Order().IsAscending();
-  for (size_t i = left; i < border; i += N1) {
-    if (st.Compare1(pivot, lanes + i)) {
-      HWY_ABORT(
-          "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f "
-          "border %d",
-          st.KeyString().c_str(), asc, static_cast<int>(i),
-          static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
-          static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i + 0]),
-          static_cast<int>(border));
-    }
-  }
-  for (size_t i = border; i < right; i += N1) {
-    if (!st.Compare1(pivot, lanes + i)) {
-      HWY_ABORT(
-          "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f "
-          "border %d",
-          st.KeyString().c_str(), asc, static_cast<int>(i),
-          static_cast<double>(pivot[1]), static_cast<double>(pivot[0]),
-          static_cast<double>(lanes[i + 1]), static_cast<double>(lanes[i]),
-          static_cast<int>(border));
-    }
-  }
-}
-
-template <class Traits>
-static HWY_NOINLINE void TestPartition() {
-  using LaneType = typename Traits::LaneType;
-  const SortTag<LaneType> d;
-  SharedTraits<Traits> st;
-  const bool asc = typename Traits::Order().IsAscending();
-  const size_t N = Lanes(d);
-  constexpr int kDebug = 0;
-  const size_t base_case_num = SortConstants::BaseCaseNum(N);
-  // left + len + align
-  const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N;
-  auto aligned_lanes = hwy::AllocateAligned<LaneType>(total);
-  auto buf = hwy::AllocateAligned<LaneType>(SortConstants::PartitionBufNum(N));
-
-  const size_t N1 = st.LanesPerKey();
-  for (bool in_asc : {false, true}) {
-    for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) {
-      const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1);
-      for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2,
-                         2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) {
-        const size_t len = (base_case_num + ofs) & ~(N1 - 1);
-        for (LaneType pivot1 :
-             {LaneType(0), LaneType(len / 3), LaneType(len / 2),
-              LaneType(2 * len / 3), LaneType(len)}) {
-          const LaneType pivot2[2] = {pivot1, 0};
-          const auto pivot = st.SetKey(d, pivot2);
-          for (size_t misalign = 0; misalign < N;
-               misalign += st.LanesPerKey()) {
-            LaneType* HWY_RESTRICT lanes = aligned_lanes.get() + misalign;
-            const size_t right = left + len;
-            if (kDebug) {
-              printf(
-                  "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n",
-                  st.KeyString().c_str(), asc, static_cast<int>(left),
-                  static_cast<int>(len), static_cast<int>(right),
-                  static_cast<double>(pivot2[1]),
-                  static_cast<double>(pivot2[0]));
-            }
-
-            for (size_t i = 0; i < misalign; ++i) {
-              aligned_lanes[i] = hwy::LowestValue<LaneType>();
-            }
-            for (size_t i = 0; i < left; ++i) {
-              lanes[i] = hwy::LowestValue<LaneType>();
-            }
-            for (size_t i = left; i < right; ++i) {
-              lanes[i] = static_cast<LaneType>(
-                  in_asc ? LaneType(i + 1) - static_cast<LaneType>(left)
-                         : static_cast<LaneType>(right) - LaneType(i));
-              if (kDebug >= 2) {
-                printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
-              }
-            }
-            for (size_t i = right; i < total - misalign; ++i) {
-              lanes[i] = hwy::LowestValue<LaneType>();
-            }
-
-            size_t border =
-                detail::Partition(d, st, lanes, left, right, pivot, buf.get());
-
-            if (kDebug >= 2) {
-              printf("out>>>>>>\n");
-              for (size_t i = left; i < right; ++i) {
-                printf("%3zu: %f\n", i, static_cast<double>(lanes[i]));
-              }
-              for (size_t i = right; i < total - misalign; ++i) {
-                printf("%3zu: sentinel %f\n", i, static_cast<double>(lanes[i]));
-              }
-            }
-
-            VerifyPartition(st, lanes, left, border, right, N1, pivot2);
-            for (size_t i = 0; i < misalign; ++i) {
-              if (aligned_lanes[i] != hwy::LowestValue<LaneType>())
-                HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i));
-            }
-            for (size_t i = 0; i < left; ++i) {
-              if (lanes[i] != hwy::LowestValue<LaneType>())
-                HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
-            }
-            for (size_t i = right; i < total - misalign; ++i) {
-              if (lanes[i] != hwy::LowestValue<LaneType>())
-                HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
-            }
-          }  // misalign
-        }    // pivot
-      }      // len
-    }        // left
-  }          // asc
-}
-
-HWY_NOINLINE void TestAllPartition() {
-  TestPartition<TraitsLane<OrderAscending<int16_t> > >();
-  TestPartition<TraitsLane<OrderDescending<int32_t> > >();
-  TestPartition<TraitsLane<OrderAscending<int64_t> > >();
-  TestPartition<TraitsLane<OrderDescending<float> > >();
-#if HWY_HAVE_FLOAT64
-  TestPartition<TraitsLane<OrderDescending<double> > >();
-#endif
-  TestPartition<Traits128<OrderAscending128> >();
-  TestPartition<Traits128<OrderDescending128> >();
-}
-
-// (used for sample selection for choosing a pivot)
-template <typename TU>
-static HWY_NOINLINE void TestRandomGenerator() {
-  static_assert(!hwy::IsSigned<TU>(), "");
-  SortTag<TU> du;
-  const size_t N = Lanes(du);
-
-  detail::Generator rng(&N, N);
-
-  const size_t lanes_per_block = HWY_MAX(64 / sizeof(TU), N);  // power of two
-
-  for (uint32_t num_blocks = 2; num_blocks < 100000;
-       num_blocks = 3 * num_blocks / 2) {
-    // Generate some numbers and ensure all are in range
-    uint64_t sum = 0;
-    constexpr size_t kReps = 10000;
-    for (size_t rep = 0; rep < kReps; ++rep) {
-      const uint32_t bits = rng() & 0xFFFFFFFF;
-      const size_t index = detail::RandomChunkIndex(num_blocks, bits);
-      HWY_ASSERT(((index + 1) * lanes_per_block) <=
-                 num_blocks * lanes_per_block);
-
-      sum += index;
-    }
-
-    // Also ensure the mean is near the middle of the range
-    const double expected = (num_blocks - 1) / 2.0;
-    const double actual = static_cast<double>(sum) / kReps;
-    HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected);
-  }
-}
-
-HWY_NOINLINE void TestAllGenerator() {
-  TestRandomGenerator<uint32_t>();
-  TestRandomGenerator<uint64_t>();
-}
-
-#else
-static void TestAllMedian() {}
-static void TestAllBaseCase() {}
-static void TestAllPartition() {}
-static void TestAllGenerator() {}
-#endif  // VQSORT_ENABLED
-
-// Remembers input, and compares results to that of a reference algorithm.
-template <class Traits>
-class CompareResults {
-  using LaneType = typename Traits::LaneType;
-  using KeyType = typename Traits::KeyType;
-
- public:
-  CompareResults(const LaneType* in, size_t num_lanes) {
-    copy_.resize(num_lanes);
-    memcpy(copy_.data(), in, num_lanes * sizeof(LaneType));
-  }
-
-  bool Verify(const LaneType* output) {
-#if HAVE_PDQSORT
-    const Algo reference = Algo::kPDQ;
-#else
-    const Algo reference = Algo::kStd;
-#endif
-    SharedState shared;
-    using Order = typename Traits::Order;
-    const Traits st;
-    const size_t num_keys = copy_.size() / st.LanesPerKey();
-    Run<Order>(reference, reinterpret_cast<KeyType*>(copy_.data()), num_keys,
-               shared, /*thread=*/0);
-
-    for (size_t i = 0; i < copy_.size(); ++i) {
-      if (copy_[i] != output[i]) {
-        if (sizeof(KeyType) == 16) {
-          fprintf(stderr,
-                  "%s Asc %d mismatch at %d of %d: %" PRIu64 " %" PRIu64 "\n",
-                  st.KeyString().c_str(), Order().IsAscending(),
-                  static_cast<int>(i), static_cast<int>(copy_.size()),
-                  static_cast<uint64_t>(copy_[i]),
-                  static_cast<uint64_t>(output[i]));
-        } else {
-          fprintf(stderr, "Type %s Asc %d mismatch at %d of %d: ",
-                  st.KeyString().c_str(), Order().IsAscending(),
-                  static_cast<int>(i), static_cast<int>(copy_.size()));
-          PrintValue(copy_[i]);
-          PrintValue(output[i]);
-          fprintf(stderr, "\n");
-        }
-        return false;
-      }
-    }
-    return true;
-  }
-
- private:
-  std::vector<LaneType> copy_;
-};
-
-std::vector<Algo> AlgoForTest() {
-  return {
-#if HAVE_AVX2SORT
-    Algo::kSEA,
-#endif
-#if HAVE_IPS4O
-        Algo::kIPS4O,
-#endif
-#if HAVE_PDQSORT
-        Algo::kPDQ,
-#endif
-#if HAVE_SORT512
-        Algo::kSort512,
-#endif
-        Algo::kHeap, Algo::kVQSort,
-  };
-}
-
-template <class Traits>
-void TestSort(size_t num_lanes) {
-// Workaround for stack overflow on clang-cl (/F 8388608 does not help).
-#if defined(_MSC_VER)
-  return;
-#endif
-  using Order = typename Traits::Order;
-  using LaneType = typename Traits::LaneType;
-  using KeyType = typename Traits::KeyType;
-  SharedState shared;
-  SharedTraits<Traits> st;
-
-  // Round up to a whole number of keys.
-  num_lanes += (st.Is128() && (num_lanes & 1));
-  const size_t num_keys = num_lanes / st.LanesPerKey();
-
-  constexpr size_t kMaxMisalign = 16;
-  auto aligned =
-      hwy::AllocateAligned<LaneType>(kMaxMisalign + num_lanes + kMaxMisalign);
-  for (Algo algo : AlgoForTest()) {
-    for (Dist dist : AllDist()) {
-      for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()},
-                              size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) {
-        LaneType* lanes = aligned.get() + misalign;
-
-        // Set up red zones before/after the keys to sort
-        for (size_t i = 0; i < misalign; ++i) {
-          aligned[i] = hwy::LowestValue<LaneType>();
-        }
-        for (size_t i = 0; i < kMaxMisalign; ++i) {
-          lanes[num_lanes + i] = hwy::HighestValue<LaneType>();
-        }
-#if HWY_IS_MSAN
-        __msan_poison(aligned.get(), misalign * sizeof(LaneType));
-        __msan_poison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
-#endif
-        InputStats<LaneType> input_stats =
-            GenerateInput(dist, lanes, num_lanes);
-
-        CompareResults<Traits> compare(lanes, num_lanes);
-        Run<Order>(algo, reinterpret_cast<KeyType*>(lanes), num_keys, shared,
-                   /*thread=*/0);
-        HWY_ASSERT(compare.Verify(lanes));
-        HWY_ASSERT(VerifySort(st, input_stats, lanes, num_lanes, "TestSort"));
-
-        // Check red zones
-#if HWY_IS_MSAN
-        __msan_unpoison(aligned.get(), misalign * sizeof(LaneType));
-        __msan_unpoison(lanes + num_lanes, kMaxMisalign * sizeof(LaneType));
-#endif
-        for (size_t i = 0; i < misalign; ++i) {
-          if (aligned[i] != hwy::LowestValue<LaneType>())
-            HWY_ABORT("Overrun left at %d\n", static_cast<int>(i));
-        }
-        for (size_t i = num_lanes; i < num_lanes + kMaxMisalign; ++i) {
-          if (lanes[i] != hwy::HighestValue<LaneType>())
-            HWY_ABORT("Overrun right at %d\n", static_cast<int>(i));
-        }
-      }  // misalign
-    }    // dist
-  }      // algo
-}
-
-void TestAllSort() {
-  for (int num : {129, 504, 20 * 1000, 34567}) {
-    const size_t num_lanes = AdjustedReps(static_cast<size_t>(num));
-    TestSort<TraitsLane<OrderAscending<int16_t> > >(num_lanes);
-    TestSort<TraitsLane<OrderDescending<uint16_t> > >(num_lanes);
-
-    TestSort<TraitsLane<OrderDescending<int32_t> > >(num_lanes);
-    TestSort<TraitsLane<OrderDescending<uint32_t> > >(num_lanes);
-
-    TestSort<TraitsLane<OrderAscending<int64_t> > >(num_lanes);
-    TestSort<TraitsLane<OrderAscending<uint64_t> > >(num_lanes);
-
-    // WARNING: for float types, SIMD comparisons will flush denormals to
-    // zero, causing mismatches with scalar sorts. In this test, we avoid
-    // generating denormal inputs.
-    TestSort<TraitsLane<OrderAscending<float> > >(num_lanes);
-#if HWY_HAVE_FLOAT64  // protects algo-inl's GenerateRandom
-    if (Sorter::HaveFloat64()) {
-      TestSort<TraitsLane<OrderDescending<double> > >(num_lanes);
-    }
-#endif
-
-// Our HeapSort does not support 128-bit keys.
-#if VQSORT_ENABLED
-    TestSort<Traits128<OrderAscending128> >(num_lanes);
-    TestSort<Traits128<OrderDescending128> >(num_lanes);
-
-    TestSort<Traits128<OrderAscendingKV128> >(num_lanes);
-    TestSort<Traits128<OrderDescendingKV128> >(num_lanes);
-#endif
-  }
-}
-
-}  // namespace
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-namespace {
-HWY_BEFORE_TEST(SortTest);
-HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian);
-HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase);
-HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition);
-HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator);
-HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort);
-}  // namespace
-}  // namespace hwy
-
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h b/third_party/highway/hwy/contrib/sort/sorting_networks-inl.h
deleted file mode 100644 (file)
index 3cc545b..0000000
+++ /dev/null
@@ -1,695 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Per-target
-#if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
-#undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
-#else
-#define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
-#endif
-
-#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-namespace detail {
-
-#if VQSORT_ENABLED
-
-using Constants = hwy::SortConstants;
-
-// ------------------------------ SharedTraits
-
-// Code shared between all traits. It's unclear whether these can profitably be
-// specialized for Lane vs Block, or optimized like SortPairsDistance1 using
-// Compare/DupOdd.
-template <class Base>
-struct SharedTraits : public Base {
-  // Conditionally swaps lane 0 with 2, 1 with 3 etc.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->SwapAdjacentPairs(d, v);
-    base->Sort2(d, v, swapped);
-    return base->OddEvenPairs(d, swapped, v);
-  }
-
-  // Swaps with the vector formed by reversing contiguous groups of 8 keys.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->ReverseKeys8(d, v);
-    base->Sort2(d, v, swapped);
-    return base->OddEvenQuads(d, swapped, v);
-  }
-
-  // Swaps with the vector formed by reversing contiguous groups of 8 keys.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16");
-    Vec<D> swapped = base->ReverseKeys(d, v);
-    base->Sort2(d, v, swapped);
-    return ConcatUpperLower(d, swapped, v);  // 8 = half of the vector
-  }
-};
-
-// ------------------------------ Sorting network
-
-// (Green's irregular) sorting network for independent columns in 16 vectors.
-template <class D, class Traits, class V = Vec<D>>
-HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
-                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
-                       V& ve, V& vf) {
-  st.Sort2(d, v0, v1);
-  st.Sort2(d, v2, v3);
-  st.Sort2(d, v4, v5);
-  st.Sort2(d, v6, v7);
-  st.Sort2(d, v8, v9);
-  st.Sort2(d, va, vb);
-  st.Sort2(d, vc, vd);
-  st.Sort2(d, ve, vf);
-  st.Sort2(d, v0, v2);
-  st.Sort2(d, v1, v3);
-  st.Sort2(d, v4, v6);
-  st.Sort2(d, v5, v7);
-  st.Sort2(d, v8, va);
-  st.Sort2(d, v9, vb);
-  st.Sort2(d, vc, ve);
-  st.Sort2(d, vd, vf);
-  st.Sort2(d, v0, v4);
-  st.Sort2(d, v1, v5);
-  st.Sort2(d, v2, v6);
-  st.Sort2(d, v3, v7);
-  st.Sort2(d, v8, vc);
-  st.Sort2(d, v9, vd);
-  st.Sort2(d, va, ve);
-  st.Sort2(d, vb, vf);
-  st.Sort2(d, v0, v8);
-  st.Sort2(d, v1, v9);
-  st.Sort2(d, v2, va);
-  st.Sort2(d, v3, vb);
-  st.Sort2(d, v4, vc);
-  st.Sort2(d, v5, vd);
-  st.Sort2(d, v6, ve);
-  st.Sort2(d, v7, vf);
-  st.Sort2(d, v5, va);
-  st.Sort2(d, v6, v9);
-  st.Sort2(d, v3, vc);
-  st.Sort2(d, v7, vb);
-  st.Sort2(d, vd, ve);
-  st.Sort2(d, v4, v8);
-  st.Sort2(d, v1, v2);
-  st.Sort2(d, v1, v4);
-  st.Sort2(d, v7, vd);
-  st.Sort2(d, v2, v8);
-  st.Sort2(d, vb, ve);
-  st.Sort2(d, v2, v4);
-  st.Sort2(d, v5, v6);
-  st.Sort2(d, v9, va);
-  st.Sort2(d, vb, vd);
-  st.Sort2(d, v3, v8);
-  st.Sort2(d, v7, vc);
-  st.Sort2(d, v3, v5);
-  st.Sort2(d, v6, v8);
-  st.Sort2(d, v7, v9);
-  st.Sort2(d, va, vc);
-  st.Sort2(d, v3, v4);
-  st.Sort2(d, v5, v6);
-  st.Sort2(d, v7, v8);
-  st.Sort2(d, v9, va);
-  st.Sort2(d, vb, vc);
-  st.Sort2(d, v6, v7);
-  st.Sort2(d, v8, v9);
-}
-
-// ------------------------------ Merging networks
-
-// Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
-
-template <class D, class Traits, class V = Vec<D>>
-HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
-                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
-                       V& ve, V& vf) {
-  v8 = st.ReverseKeys2(d, v8);
-  v9 = st.ReverseKeys2(d, v9);
-  va = st.ReverseKeys2(d, va);
-  vb = st.ReverseKeys2(d, vb);
-  vc = st.ReverseKeys2(d, vc);
-  vd = st.ReverseKeys2(d, vd);
-  ve = st.ReverseKeys2(d, ve);
-  vf = st.ReverseKeys2(d, vf);
-  st.Sort2(d, v0, vf);
-  st.Sort2(d, v1, ve);
-  st.Sort2(d, v2, vd);
-  st.Sort2(d, v3, vc);
-  st.Sort2(d, v4, vb);
-  st.Sort2(d, v5, va);
-  st.Sort2(d, v6, v9);
-  st.Sort2(d, v7, v8);
-  v4 = st.ReverseKeys2(d, v4);
-  vc = st.ReverseKeys2(d, vc);
-  v5 = st.ReverseKeys2(d, v5);
-  vd = st.ReverseKeys2(d, vd);
-  v6 = st.ReverseKeys2(d, v6);
-  ve = st.ReverseKeys2(d, ve);
-  v7 = st.ReverseKeys2(d, v7);
-  vf = st.ReverseKeys2(d, vf);
-  st.Sort2(d, v0, v7);
-  st.Sort2(d, v8, vf);
-  st.Sort2(d, v1, v6);
-  st.Sort2(d, v9, ve);
-  st.Sort2(d, v2, v5);
-  st.Sort2(d, va, vd);
-  st.Sort2(d, v3, v4);
-  st.Sort2(d, vb, vc);
-  v2 = st.ReverseKeys2(d, v2);
-  v3 = st.ReverseKeys2(d, v3);
-  v6 = st.ReverseKeys2(d, v6);
-  v7 = st.ReverseKeys2(d, v7);
-  va = st.ReverseKeys2(d, va);
-  vb = st.ReverseKeys2(d, vb);
-  ve = st.ReverseKeys2(d, ve);
-  vf = st.ReverseKeys2(d, vf);
-  st.Sort2(d, v0, v3);
-  st.Sort2(d, v1, v2);
-  st.Sort2(d, v4, v7);
-  st.Sort2(d, v5, v6);
-  st.Sort2(d, v8, vb);
-  st.Sort2(d, v9, va);
-  st.Sort2(d, vc, vf);
-  st.Sort2(d, vd, ve);
-  v1 = st.ReverseKeys2(d, v1);
-  v3 = st.ReverseKeys2(d, v3);
-  v5 = st.ReverseKeys2(d, v5);
-  v7 = st.ReverseKeys2(d, v7);
-  v9 = st.ReverseKeys2(d, v9);
-  vb = st.ReverseKeys2(d, vb);
-  vd = st.ReverseKeys2(d, vd);
-  vf = st.ReverseKeys2(d, vf);
-  st.Sort2(d, v0, v1);
-  st.Sort2(d, v2, v3);
-  st.Sort2(d, v4, v5);
-  st.Sort2(d, v6, v7);
-  st.Sort2(d, v8, v9);
-  st.Sort2(d, va, vb);
-  st.Sort2(d, vc, vd);
-  st.Sort2(d, ve, vf);
-  v0 = st.SortPairsDistance1(d, v0);
-  v1 = st.SortPairsDistance1(d, v1);
-  v2 = st.SortPairsDistance1(d, v2);
-  v3 = st.SortPairsDistance1(d, v3);
-  v4 = st.SortPairsDistance1(d, v4);
-  v5 = st.SortPairsDistance1(d, v5);
-  v6 = st.SortPairsDistance1(d, v6);
-  v7 = st.SortPairsDistance1(d, v7);
-  v8 = st.SortPairsDistance1(d, v8);
-  v9 = st.SortPairsDistance1(d, v9);
-  va = st.SortPairsDistance1(d, va);
-  vb = st.SortPairsDistance1(d, vb);
-  vc = st.SortPairsDistance1(d, vc);
-  vd = st.SortPairsDistance1(d, vd);
-  ve = st.SortPairsDistance1(d, ve);
-  vf = st.SortPairsDistance1(d, vf);
-}
-
-template <class D, class Traits, class V = Vec<D>>
-HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
-                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
-                       V& ve, V& vf) {
-  v8 = st.ReverseKeys4(d, v8);
-  v9 = st.ReverseKeys4(d, v9);
-  va = st.ReverseKeys4(d, va);
-  vb = st.ReverseKeys4(d, vb);
-  vc = st.ReverseKeys4(d, vc);
-  vd = st.ReverseKeys4(d, vd);
-  ve = st.ReverseKeys4(d, ve);
-  vf = st.ReverseKeys4(d, vf);
-  st.Sort2(d, v0, vf);
-  st.Sort2(d, v1, ve);
-  st.Sort2(d, v2, vd);
-  st.Sort2(d, v3, vc);
-  st.Sort2(d, v4, vb);
-  st.Sort2(d, v5, va);
-  st.Sort2(d, v6, v9);
-  st.Sort2(d, v7, v8);
-  v4 = st.ReverseKeys4(d, v4);
-  vc = st.ReverseKeys4(d, vc);
-  v5 = st.ReverseKeys4(d, v5);
-  vd = st.ReverseKeys4(d, vd);
-  v6 = st.ReverseKeys4(d, v6);
-  ve = st.ReverseKeys4(d, ve);
-  v7 = st.ReverseKeys4(d, v7);
-  vf = st.ReverseKeys4(d, vf);
-  st.Sort2(d, v0, v7);
-  st.Sort2(d, v8, vf);
-  st.Sort2(d, v1, v6);
-  st.Sort2(d, v9, ve);
-  st.Sort2(d, v2, v5);
-  st.Sort2(d, va, vd);
-  st.Sort2(d, v3, v4);
-  st.Sort2(d, vb, vc);
-  v2 = st.ReverseKeys4(d, v2);
-  v3 = st.ReverseKeys4(d, v3);
-  v6 = st.ReverseKeys4(d, v6);
-  v7 = st.ReverseKeys4(d, v7);
-  va = st.ReverseKeys4(d, va);
-  vb = st.ReverseKeys4(d, vb);
-  ve = st.ReverseKeys4(d, ve);
-  vf = st.ReverseKeys4(d, vf);
-  st.Sort2(d, v0, v3);
-  st.Sort2(d, v1, v2);
-  st.Sort2(d, v4, v7);
-  st.Sort2(d, v5, v6);
-  st.Sort2(d, v8, vb);
-  st.Sort2(d, v9, va);
-  st.Sort2(d, vc, vf);
-  st.Sort2(d, vd, ve);
-  v1 = st.ReverseKeys4(d, v1);
-  v3 = st.ReverseKeys4(d, v3);
-  v5 = st.ReverseKeys4(d, v5);
-  v7 = st.ReverseKeys4(d, v7);
-  v9 = st.ReverseKeys4(d, v9);
-  vb = st.ReverseKeys4(d, vb);
-  vd = st.ReverseKeys4(d, vd);
-  vf = st.ReverseKeys4(d, vf);
-  st.Sort2(d, v0, v1);
-  st.Sort2(d, v2, v3);
-  st.Sort2(d, v4, v5);
-  st.Sort2(d, v6, v7);
-  st.Sort2(d, v8, v9);
-  st.Sort2(d, va, vb);
-  st.Sort2(d, vc, vd);
-  st.Sort2(d, ve, vf);
-  v0 = st.SortPairsReverse4(d, v0);
-  v1 = st.SortPairsReverse4(d, v1);
-  v2 = st.SortPairsReverse4(d, v2);
-  v3 = st.SortPairsReverse4(d, v3);
-  v4 = st.SortPairsReverse4(d, v4);
-  v5 = st.SortPairsReverse4(d, v5);
-  v6 = st.SortPairsReverse4(d, v6);
-  v7 = st.SortPairsReverse4(d, v7);
-  v8 = st.SortPairsReverse4(d, v8);
-  v9 = st.SortPairsReverse4(d, v9);
-  va = st.SortPairsReverse4(d, va);
-  vb = st.SortPairsReverse4(d, vb);
-  vc = st.SortPairsReverse4(d, vc);
-  vd = st.SortPairsReverse4(d, vd);
-  ve = st.SortPairsReverse4(d, ve);
-  vf = st.SortPairsReverse4(d, vf);
-  v0 = st.SortPairsDistance1(d, v0);
-  v1 = st.SortPairsDistance1(d, v1);
-  v2 = st.SortPairsDistance1(d, v2);
-  v3 = st.SortPairsDistance1(d, v3);
-  v4 = st.SortPairsDistance1(d, v4);
-  v5 = st.SortPairsDistance1(d, v5);
-  v6 = st.SortPairsDistance1(d, v6);
-  v7 = st.SortPairsDistance1(d, v7);
-  v8 = st.SortPairsDistance1(d, v8);
-  v9 = st.SortPairsDistance1(d, v9);
-  va = st.SortPairsDistance1(d, va);
-  vb = st.SortPairsDistance1(d, vb);
-  vc = st.SortPairsDistance1(d, vc);
-  vd = st.SortPairsDistance1(d, vd);
-  ve = st.SortPairsDistance1(d, ve);
-  vf = st.SortPairsDistance1(d, vf);
-}
-
-template <class D, class Traits, class V = Vec<D>>
-HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5,
-                       V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd,
-                       V& ve, V& vf) {
-  v8 = st.ReverseKeys8(d, v8);
-  v9 = st.ReverseKeys8(d, v9);
-  va = st.ReverseKeys8(d, va);
-  vb = st.ReverseKeys8(d, vb);
-  vc = st.ReverseKeys8(d, vc);
-  vd = st.ReverseKeys8(d, vd);
-  ve = st.ReverseKeys8(d, ve);
-  vf = st.ReverseKeys8(d, vf);
-  st.Sort2(d, v0, vf);
-  st.Sort2(d, v1, ve);
-  st.Sort2(d, v2, vd);
-  st.Sort2(d, v3, vc);
-  st.Sort2(d, v4, vb);
-  st.Sort2(d, v5, va);
-  st.Sort2(d, v6, v9);
-  st.Sort2(d, v7, v8);
-  v4 = st.ReverseKeys8(d, v4);
-  vc = st.ReverseKeys8(d, vc);
-  v5 = st.ReverseKeys8(d, v5);
-  vd = st.ReverseKeys8(d, vd);
-  v6 = st.ReverseKeys8(d, v6);
-  ve = st.ReverseKeys8(d, ve);
-  v7 = st.ReverseKeys8(d, v7);
-  vf = st.ReverseKeys8(d, vf);
-  st.Sort2(d, v0, v7);
-  st.Sort2(d, v8, vf);
-  st.Sort2(d, v1, v6);
-  st.Sort2(d, v9, ve);
-  st.Sort2(d, v2, v5);
-  st.Sort2(d, va, vd);
-  st.Sort2(d, v3, v4);
-  st.Sort2(d, vb, vc);
-  v2 = st.ReverseKeys8(d, v2);
-  v3 = st.ReverseKeys8(d, v3);
-  v6 = st.ReverseKeys8(d, v6);
-  v7 = st.ReverseKeys8(d, v7);
-  va = st.ReverseKeys8(d, va);
-  vb = st.ReverseKeys8(d, vb);
-  ve = st.ReverseKeys8(d, ve);
-  vf = st.ReverseKeys8(d, vf);
-  st.Sort2(d, v0, v3);
-  st.Sort2(d, v1, v2);
-  st.Sort2(d, v4, v7);
-  st.Sort2(d, v5, v6);
-  st.Sort2(d, v8, vb);
-  st.Sort2(d, v9, va);
-  st.Sort2(d, vc, vf);
-  st.Sort2(d, vd, ve);
-  v1 = st.ReverseKeys8(d, v1);
-  v3 = st.ReverseKeys8(d, v3);
-  v5 = st.ReverseKeys8(d, v5);
-  v7 = st.ReverseKeys8(d, v7);
-  v9 = st.ReverseKeys8(d, v9);
-  vb = st.ReverseKeys8(d, vb);
-  vd = st.ReverseKeys8(d, vd);
-  vf = st.ReverseKeys8(d, vf);
-  st.Sort2(d, v0, v1);
-  st.Sort2(d, v2, v3);
-  st.Sort2(d, v4, v5);
-  st.Sort2(d, v6, v7);
-  st.Sort2(d, v8, v9);
-  st.Sort2(d, va, vb);
-  st.Sort2(d, vc, vd);
-  st.Sort2(d, ve, vf);
-  v0 = st.SortPairsReverse8(d, v0);
-  v1 = st.SortPairsReverse8(d, v1);
-  v2 = st.SortPairsReverse8(d, v2);
-  v3 = st.SortPairsReverse8(d, v3);
-  v4 = st.SortPairsReverse8(d, v4);
-  v5 = st.SortPairsReverse8(d, v5);
-  v6 = st.SortPairsReverse8(d, v6);
-  v7 = st.SortPairsReverse8(d, v7);
-  v8 = st.SortPairsReverse8(d, v8);
-  v9 = st.SortPairsReverse8(d, v9);
-  va = st.SortPairsReverse8(d, va);
-  vb = st.SortPairsReverse8(d, vb);
-  vc = st.SortPairsReverse8(d, vc);
-  vd = st.SortPairsReverse8(d, vd);
-  ve = st.SortPairsReverse8(d, ve);
-  vf = st.SortPairsReverse8(d, vf);
-  v0 = st.SortPairsDistance2(d, v0);
-  v1 = st.SortPairsDistance2(d, v1);
-  v2 = st.SortPairsDistance2(d, v2);
-  v3 = st.SortPairsDistance2(d, v3);
-  v4 = st.SortPairsDistance2(d, v4);
-  v5 = st.SortPairsDistance2(d, v5);
-  v6 = st.SortPairsDistance2(d, v6);
-  v7 = st.SortPairsDistance2(d, v7);
-  v8 = st.SortPairsDistance2(d, v8);
-  v9 = st.SortPairsDistance2(d, v9);
-  va = st.SortPairsDistance2(d, va);
-  vb = st.SortPairsDistance2(d, vb);
-  vc = st.SortPairsDistance2(d, vc);
-  vd = st.SortPairsDistance2(d, vd);
-  ve = st.SortPairsDistance2(d, ve);
-  vf = st.SortPairsDistance2(d, vf);
-  v0 = st.SortPairsDistance1(d, v0);
-  v1 = st.SortPairsDistance1(d, v1);
-  v2 = st.SortPairsDistance1(d, v2);
-  v3 = st.SortPairsDistance1(d, v3);
-  v4 = st.SortPairsDistance1(d, v4);
-  v5 = st.SortPairsDistance1(d, v5);
-  v6 = st.SortPairsDistance1(d, v6);
-  v7 = st.SortPairsDistance1(d, v7);
-  v8 = st.SortPairsDistance1(d, v8);
-  v9 = st.SortPairsDistance1(d, v9);
-  va = st.SortPairsDistance1(d, va);
-  vb = st.SortPairsDistance1(d, vb);
-  vc = st.SortPairsDistance1(d, vc);
-  vd = st.SortPairsDistance1(d, vd);
-  ve = st.SortPairsDistance1(d, ve);
-  vf = st.SortPairsDistance1(d, vf);
-}
-
-// Unused on MSVC, see below
-#if !HWY_COMPILER_MSVC
-
-template <class D, class Traits, class V = Vec<D>>
-HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4,
-                        V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc,
-                        V& vd, V& ve, V& vf) {
-  v8 = st.ReverseKeys16(d, v8);
-  v9 = st.ReverseKeys16(d, v9);
-  va = st.ReverseKeys16(d, va);
-  vb = st.ReverseKeys16(d, vb);
-  vc = st.ReverseKeys16(d, vc);
-  vd = st.ReverseKeys16(d, vd);
-  ve = st.ReverseKeys16(d, ve);
-  vf = st.ReverseKeys16(d, vf);
-  st.Sort2(d, v0, vf);
-  st.Sort2(d, v1, ve);
-  st.Sort2(d, v2, vd);
-  st.Sort2(d, v3, vc);
-  st.Sort2(d, v4, vb);
-  st.Sort2(d, v5, va);
-  st.Sort2(d, v6, v9);
-  st.Sort2(d, v7, v8);
-  v4 = st.ReverseKeys16(d, v4);
-  vc = st.ReverseKeys16(d, vc);
-  v5 = st.ReverseKeys16(d, v5);
-  vd = st.ReverseKeys16(d, vd);
-  v6 = st.ReverseKeys16(d, v6);
-  ve = st.ReverseKeys16(d, ve);
-  v7 = st.ReverseKeys16(d, v7);
-  vf = st.ReverseKeys16(d, vf);
-  st.Sort2(d, v0, v7);
-  st.Sort2(d, v8, vf);
-  st.Sort2(d, v1, v6);
-  st.Sort2(d, v9, ve);
-  st.Sort2(d, v2, v5);
-  st.Sort2(d, va, vd);
-  st.Sort2(d, v3, v4);
-  st.Sort2(d, vb, vc);
-  v2 = st.ReverseKeys16(d, v2);
-  v3 = st.ReverseKeys16(d, v3);
-  v6 = st.ReverseKeys16(d, v6);
-  v7 = st.ReverseKeys16(d, v7);
-  va = st.ReverseKeys16(d, va);
-  vb = st.ReverseKeys16(d, vb);
-  ve = st.ReverseKeys16(d, ve);
-  vf = st.ReverseKeys16(d, vf);
-  st.Sort2(d, v0, v3);
-  st.Sort2(d, v1, v2);
-  st.Sort2(d, v4, v7);
-  st.Sort2(d, v5, v6);
-  st.Sort2(d, v8, vb);
-  st.Sort2(d, v9, va);
-  st.Sort2(d, vc, vf);
-  st.Sort2(d, vd, ve);
-  v1 = st.ReverseKeys16(d, v1);
-  v3 = st.ReverseKeys16(d, v3);
-  v5 = st.ReverseKeys16(d, v5);
-  v7 = st.ReverseKeys16(d, v7);
-  v9 = st.ReverseKeys16(d, v9);
-  vb = st.ReverseKeys16(d, vb);
-  vd = st.ReverseKeys16(d, vd);
-  vf = st.ReverseKeys16(d, vf);
-  st.Sort2(d, v0, v1);
-  st.Sort2(d, v2, v3);
-  st.Sort2(d, v4, v5);
-  st.Sort2(d, v6, v7);
-  st.Sort2(d, v8, v9);
-  st.Sort2(d, va, vb);
-  st.Sort2(d, vc, vd);
-  st.Sort2(d, ve, vf);
-  v0 = st.SortPairsReverse16(d, v0);
-  v1 = st.SortPairsReverse16(d, v1);
-  v2 = st.SortPairsReverse16(d, v2);
-  v3 = st.SortPairsReverse16(d, v3);
-  v4 = st.SortPairsReverse16(d, v4);
-  v5 = st.SortPairsReverse16(d, v5);
-  v6 = st.SortPairsReverse16(d, v6);
-  v7 = st.SortPairsReverse16(d, v7);
-  v8 = st.SortPairsReverse16(d, v8);
-  v9 = st.SortPairsReverse16(d, v9);
-  va = st.SortPairsReverse16(d, va);
-  vb = st.SortPairsReverse16(d, vb);
-  vc = st.SortPairsReverse16(d, vc);
-  vd = st.SortPairsReverse16(d, vd);
-  ve = st.SortPairsReverse16(d, ve);
-  vf = st.SortPairsReverse16(d, vf);
-  v0 = st.SortPairsDistance4(d, v0);
-  v1 = st.SortPairsDistance4(d, v1);
-  v2 = st.SortPairsDistance4(d, v2);
-  v3 = st.SortPairsDistance4(d, v3);
-  v4 = st.SortPairsDistance4(d, v4);
-  v5 = st.SortPairsDistance4(d, v5);
-  v6 = st.SortPairsDistance4(d, v6);
-  v7 = st.SortPairsDistance4(d, v7);
-  v8 = st.SortPairsDistance4(d, v8);
-  v9 = st.SortPairsDistance4(d, v9);
-  va = st.SortPairsDistance4(d, va);
-  vb = st.SortPairsDistance4(d, vb);
-  vc = st.SortPairsDistance4(d, vc);
-  vd = st.SortPairsDistance4(d, vd);
-  ve = st.SortPairsDistance4(d, ve);
-  vf = st.SortPairsDistance4(d, vf);
-  v0 = st.SortPairsDistance2(d, v0);
-  v1 = st.SortPairsDistance2(d, v1);
-  v2 = st.SortPairsDistance2(d, v2);
-  v3 = st.SortPairsDistance2(d, v3);
-  v4 = st.SortPairsDistance2(d, v4);
-  v5 = st.SortPairsDistance2(d, v5);
-  v6 = st.SortPairsDistance2(d, v6);
-  v7 = st.SortPairsDistance2(d, v7);
-  v8 = st.SortPairsDistance2(d, v8);
-  v9 = st.SortPairsDistance2(d, v9);
-  va = st.SortPairsDistance2(d, va);
-  vb = st.SortPairsDistance2(d, vb);
-  vc = st.SortPairsDistance2(d, vc);
-  vd = st.SortPairsDistance2(d, vd);
-  ve = st.SortPairsDistance2(d, ve);
-  vf = st.SortPairsDistance2(d, vf);
-  v0 = st.SortPairsDistance1(d, v0);
-  v1 = st.SortPairsDistance1(d, v1);
-  v2 = st.SortPairsDistance1(d, v2);
-  v3 = st.SortPairsDistance1(d, v3);
-  v4 = st.SortPairsDistance1(d, v4);
-  v5 = st.SortPairsDistance1(d, v5);
-  v6 = st.SortPairsDistance1(d, v6);
-  v7 = st.SortPairsDistance1(d, v7);
-  v8 = st.SortPairsDistance1(d, v8);
-  v9 = st.SortPairsDistance1(d, v9);
-  va = st.SortPairsDistance1(d, va);
-  vb = st.SortPairsDistance1(d, vb);
-  vc = st.SortPairsDistance1(d, vc);
-  vd = st.SortPairsDistance1(d, vd);
-  ve = st.SortPairsDistance1(d, ve);
-  vf = st.SortPairsDistance1(d, vf);
-}
-
-#endif  // !HWY_COMPILER_MSVC
-
-// Reshapes `buf` into a matrix, sorts columns independently, and then merges
-// into a sorted 1D array without transposing.
-//
-// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
-//   differences in sort order and single-lane vs 128-bit keys.
-// `buf` ensures full vectors are aligned, and enables loads/stores without
-//   bounds checks.
-//
-// NOINLINE because this is large and called twice from vqsort-inl.h.
-//
-// References:
-// https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
-// https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
-// "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
-template <class Traits, typename T>
-HWY_NOINLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) {
-  const CappedTag<T, Constants::kMaxCols> d;
-  using V = decltype(Zero(d));
-
-  HWY_DASSERT(cols <= Constants::kMaxCols);
-
-  // The network width depends on the number of keys, not lanes.
-  constexpr size_t kLanesPerKey = st.LanesPerKey();
-  const size_t keys = cols / kLanesPerKey;
-  constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey;
-
-  // These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
-  // offsets to duplicating this code for every value of cols.
-  static_assert(Constants::kMaxRows == 16, "Update loads/stores/args");
-  V v0 = LoadU(d, buf + 0x0 * cols);
-  V v1 = LoadU(d, buf + 0x1 * cols);
-  V v2 = LoadU(d, buf + 0x2 * cols);
-  V v3 = LoadU(d, buf + 0x3 * cols);
-  V v4 = LoadU(d, buf + 0x4 * cols);
-  V v5 = LoadU(d, buf + 0x5 * cols);
-  V v6 = LoadU(d, buf + 0x6 * cols);
-  V v7 = LoadU(d, buf + 0x7 * cols);
-  V v8 = LoadU(d, buf + 0x8 * cols);
-  V v9 = LoadU(d, buf + 0x9 * cols);
-  V va = LoadU(d, buf + 0xa * cols);
-  V vb = LoadU(d, buf + 0xb * cols);
-  V vc = LoadU(d, buf + 0xc * cols);
-  V vd = LoadU(d, buf + 0xd * cols);
-  V ve = LoadU(d, buf + 0xe * cols);
-  V vf = LoadU(d, buf + 0xf * cols);
-
-  Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf);
-
-  // Checking MaxLanes avoids generating HWY_ASSERT code for the unreachable
-  // code paths: if MaxLanes < 2, then keys <= cols < 2.
-  if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) {
-    Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
-           vf);
-
-    if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) {
-      Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve,
-             vf);
-
-      if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) {
-        Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
-               ve, vf);
-
-        // Avoids build timeout. Must match #if condition in kMaxCols.
-#if !HWY_COMPILER_MSVC && !HWY_IS_DEBUG_BUILD
-        if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) {
-          Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd,
-                  ve, vf);
-
-          static_assert(Constants::kMaxCols <= 16, "Add more branches");
-        }
-#endif
-      }
-    }
-  }
-
-  StoreU(v0, d, buf + 0x0 * cols);
-  StoreU(v1, d, buf + 0x1 * cols);
-  StoreU(v2, d, buf + 0x2 * cols);
-  StoreU(v3, d, buf + 0x3 * cols);
-  StoreU(v4, d, buf + 0x4 * cols);
-  StoreU(v5, d, buf + 0x5 * cols);
-  StoreU(v6, d, buf + 0x6 * cols);
-  StoreU(v7, d, buf + 0x7 * cols);
-  StoreU(v8, d, buf + 0x8 * cols);
-  StoreU(v9, d, buf + 0x9 * cols);
-  StoreU(va, d, buf + 0xa * cols);
-  StoreU(vb, d, buf + 0xb * cols);
-  StoreU(vc, d, buf + 0xc * cols);
-  StoreU(vd, d, buf + 0xd * cols);
-  StoreU(ve, d, buf + 0xe * cols);
-  StoreU(vf, d, buf + 0xf * cols);
-}
-
-#else
-template <class Base>
-struct SharedTraits : public Base {};
-#endif  // VQSORT_ENABLED
-
-}  // namespace detail
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/traits-inl.h b/third_party/highway/hwy/contrib/sort/traits-inl.h
deleted file mode 100644 (file)
index 2d22095..0000000
+++ /dev/null
@@ -1,417 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Per-target
-#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
-#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
-#else
-#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
-#endif
-
-#include <string>
-
-#include "hwy/contrib/sort/shared-inl.h"  // SortConstants
-#include "hwy/contrib/sort/vqsort.h"      // SortDescending
-#include "hwy/highway.h"
-#include "hwy/print.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-namespace detail {
-
-#if VQSORT_ENABLED || HWY_IDE
-
-// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
-// along with an abstraction layer for single-lane vs. lane-pair, which is
-// independent of the order.
-template <typename T>
-struct KeyLane {
-  static constexpr bool Is128() { return false; }
-  constexpr size_t LanesPerKey() const { return 1; }
-
-  // What type bench_sort should allocate for generating inputs.
-  using LaneType = T;
-  // What type to pass to Sorter::operator().
-  using KeyType = T;
-
-  std::string KeyString() const {
-    char string100[100];
-    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
-    return string100;
-  }
-
-  // For HeapSort
-  HWY_INLINE void Swap(T* a, T* b) const {
-    const T temp = *a;
-    *a = *b;
-    *b = temp;
-  }
-
-  template <class V, class M>
-  HWY_INLINE V CompressKeys(V keys, M mask) const {
-    return CompressNot(keys, mask);
-  }
-
-  // Broadcasts one key into a vector
-  template <class D>
-  HWY_INLINE Vec<D> SetKey(D d, const T* key) const {
-    return Set(d, *key);
-  }
-
-  template <class D>
-  HWY_INLINE Mask<D> EqualKeys(D /*tag*/, Vec<D> a, Vec<D> b) const {
-    return Eq(a, b);
-  }
-
-  HWY_INLINE bool Equal1(const T* a, const T* b) { return *a == *b; }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
-    return Reverse(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const {
-    return Reverse2(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const {
-    return Reverse4(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const {
-    return Reverse8(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const {
-    static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit");
-    return ReverseKeys(d, v);
-  }
-
-  template <class V>
-  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
-    return OddEven(odd, even);
-  }
-
-  template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
-  HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const {
-    const Repartition<uint32_t, D> du32;
-    return BitCast(d, Shuffle2301(BitCast(du32, v)));
-  }
-  template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
-  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
-    return Shuffle1032(v);
-  }
-  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const {
-    return SwapAdjacentBlocks(v);
-  }
-
-  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
-#if HWY_HAVE_FLOAT64  // in case D is float32
-    const RepartitionToWide<D> dw;
-#else
-    const RepartitionToWide<RebindToUnsigned<D> > dw;
-#endif
-    return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v)));
-  }
-  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const {
-    // Assumes max vector size = 512
-    return ConcatLowerUpper(d, v, v);
-  }
-
-  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
-                                 const Vec<D> even) const {
-#if HWY_HAVE_FLOAT64  // in case D is float32
-    const RepartitionToWide<D> dw;
-#else
-    const RepartitionToWide<RebindToUnsigned<D> > dw;
-#endif
-    return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even)));
-  }
-  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const {
-    return OddEvenBlocks(odd, even);
-  }
-
-  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
-#if HWY_HAVE_FLOAT64  // in case D is float32
-    const RepartitionToWide<D> dw;
-#else
-    const RepartitionToWide<RebindToUnsigned<D> > dw;
-#endif
-    return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even)));
-  }
-  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const {
-    return ConcatUpperLower(d, odd, even);
-  }
-};
-
-// Anything order-related depends on the key traits *and* the order (see
-// FirstOfLanes). We cannot implement just one Compare function because Lt128
-// only compiles if the lane type is u64. Thus we need either overloaded
-// functions with a tag type, class specializations, or separate classes.
-// We avoid overloaded functions because we want all functions to be callable
-// from a SortTraits without per-function wrappers. Specializing would work, but
-// we are anyway going to specialize at a higher level.
-template <typename T>
-struct OrderAscending : public KeyLane<T> {
-  using Order = SortAscending;
-
-  HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
-    return Lt(a, b);
-  }
-
-  // Two halves of Sort2, used in ScanMinMax.
-  template <class D>
-  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
-    return Min(a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
-    return Max(a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
-                                 T* HWY_RESTRICT /* buf */) const {
-    return MinOfLanes(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
-                                T* HWY_RESTRICT /* buf */) const {
-    return MaxOfLanes(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::LowestValue<T>());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::HighestValue<T>());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
-    return Sub(v, Set(d, 1));
-  }
-};
-
-template <typename T>
-struct OrderDescending : public KeyLane<T> {
-  using Order = SortDescending;
-
-  HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const {
-    return Lt(b, a);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const {
-    return Max(a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const {
-    return Min(a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
-                                 T* HWY_RESTRICT /* buf */) const {
-    return MaxOfLanes(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
-                                T* HWY_RESTRICT /* buf */) const {
-    return MinOfLanes(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::HighestValue<T>());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::LowestValue<T>());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
-    return Add(v, Set(d, 1));
-  }
-};
-
-// Shared code that depends on Order.
-template <class Base>
-struct TraitsLane : public Base {
-  // For each lane i: replaces a[i] with the first and b[i] with the second
-  // according to Base.
-  // Corresponds to a conditional swap, which is one "node" of a sorting
-  // network. Min/Max are cheaper than compare + blend at least for integers.
-  template <class D>
-  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
-    const Base* base = static_cast<const Base*>(this);
-
-    const Vec<D> a_copy = a;
-    // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
-    // instructions. We can reduce it to a compare + 2 IfThenElse.
-#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
-    if (sizeof(TFromD<D>) == 8) {
-      const Mask<D> cmp = base->Compare(d, a, b);
-      a = IfThenElse(cmp, a, b);
-      b = IfThenElse(cmp, b, a_copy);
-      return;
-    }
-#endif
-    a = base->First(d, a, b);
-    b = base->Last(d, a_copy, b);
-  }
-
-  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
-  template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->ReverseKeys2(d, v);
-    // Further to the above optimization, Sort2+OddEvenKeys compile to four
-    // instructions; we can save one by combining two blends.
-#if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3
-    const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped));
-    return IfVecThenElse(DupOdd(cmp), swapped, v);
-#else
-    Sort2(d, v, swapped);
-    return base->OddEvenKeys(swapped, v);
-#endif
-  }
-
-  // (See above - we use Sort2 for non-64-bit types.)
-  template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)>
-  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->ReverseKeys2(d, v);
-    Sort2(d, v, swapped);
-    return base->OddEvenKeys(swapped, v);
-  }
-
-  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->ReverseKeys4(d, v);
-    Sort2(d, v, swapped);
-    return base->OddEvenPairs(d, swapped, v);
-  }
-
-  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->SwapAdjacentQuads(d, v);
-    // Only used in Merge16, so this will not be used on AVX2 (which only has 4
-    // u64 lanes), so skip the above optimization for 64-bit AVX2.
-    Sort2(d, v, swapped);
-    return base->OddEvenQuads(d, swapped, v);
-  }
-};
-
-#else
-
-// Base class shared between OrderAscending, OrderDescending.
-template <typename T>
-struct KeyLane {
-  constexpr bool Is128() const { return false; }
-  constexpr size_t LanesPerKey() const { return 1; }
-
-  using LaneType = T;
-  using KeyType = T;
-
-  std::string KeyString() const {
-    char string100[100];
-    hwy::detail::TypeName(hwy::detail::MakeTypeInfo<KeyType>(), 1, string100);
-    return string100;
-  }
-};
-
-template <typename T>
-struct OrderAscending : public KeyLane<T> {
-  using Order = SortAscending;
-
-  HWY_INLINE bool Compare1(const T* a, const T* b) { return *a < *b; }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
-    return Lt(a, b);
-  }
-};
-
-template <typename T>
-struct OrderDescending : public KeyLane<T> {
-  using Order = SortDescending;
-
-  HWY_INLINE bool Compare1(const T* a, const T* b) { return *b < *a; }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) {
-    return Lt(b, a);
-  }
-};
-
-template <class Order>
-struct TraitsLane : public Order {
-  // For HeapSort
-  template <typename T>  // MSVC doesn't find typename Order::LaneType.
-  HWY_INLINE void Swap(T* a, T* b) const {
-    const T temp = *a;
-    *a = *b;
-    *b = temp;
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
-    return Set(d, *key);
-  }
-};
-
-#endif  // VQSORT_ENABLED
-
-}  // namespace detail
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/traits128-inl.h b/third_party/highway/hwy/contrib/sort/traits128-inl.h
deleted file mode 100644 (file)
index debed7f..0000000
+++ /dev/null
@@ -1,474 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Per-target
-#if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
-#undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
-#else
-#define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
-#endif
-
-#include <string>
-
-#include "hwy/contrib/sort/shared-inl.h"
-#include "hwy/contrib/sort/vqsort.h"  // SortDescending
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-namespace detail {
-
-#if VQSORT_ENABLED || HWY_IDE
-
-// Highway does not provide a lane type for 128-bit keys, so we use uint64_t
-// along with an abstraction layer for single-lane vs. lane-pair, which is
-// independent of the order.
-struct KeyAny128 {
-  static constexpr bool Is128() { return true; }
-  constexpr size_t LanesPerKey() const { return 2; }
-
-  // What type bench_sort should allocate for generating inputs.
-  using LaneType = uint64_t;
-  // KeyType and KeyString are defined by derived classes.
-
-  HWY_INLINE void Swap(LaneType* a, LaneType* b) const {
-    const FixedTag<LaneType, 2> d;
-    const auto temp = LoadU(d, a);
-    StoreU(LoadU(d, b), d, a);
-    StoreU(temp, d, b);
-  }
-
-  template <class V, class M>
-  HWY_INLINE V CompressKeys(V keys, M mask) const {
-    return CompressBlocksNot(keys, mask);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const {
-    return LoadDup128(d, key);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const {
-    return ReverseBlocks(d, v);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const {
-    return SwapAdjacentBlocks(v);
-  }
-
-  // Only called for 4 keys because we do not support >512-bit vectors.
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const {
-    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
-    return ReverseKeys(d, v);
-  }
-
-  // Only called for 4 keys because we do not support >512-bit vectors.
-  template <class D>
-  HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd,
-                                 const Vec<D> even) const {
-    HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>));
-    return ConcatUpperLower(d, odd, even);
-  }
-
-  template <class V>
-  HWY_INLINE V OddEvenKeys(const V odd, const V even) const {
-    return OddEvenBlocks(odd, even);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const {
-    HWY_ASSERT(0);  // not supported: would require 1024-bit vectors
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const {
-    HWY_ASSERT(0);  // not supported: would require 2048-bit vectors
-  }
-
-  // This is only called for 8/16 col networks (not supported).
-  template <class D>
-  HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const {
-    HWY_ASSERT(0);
-  }
-
-  // This is only called for 16 col networks (not supported).
-  template <class D>
-  HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const {
-    HWY_ASSERT(0);
-  }
-
-  // This is only called for 8 col networks (not supported).
-  template <class D>
-  HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const {
-    HWY_ASSERT(0);
-  }
-};
-
-// Base class shared between OrderAscending128, OrderDescending128.
-struct Key128 : public KeyAny128 {
-  // What type to pass to Sorter::operator().
-  using KeyType = hwy::uint128_t;
-
-  std::string KeyString() const { return "U128"; }
-
-  template <class D>
-  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
-    return Eq128(d, a, b);
-  }
-
-  HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
-    return a[0] == b[0] && a[1] == b[1];
-  }
-};
-
-// Anything order-related depends on the key traits *and* the order (see
-// FirstOfLanes). We cannot implement just one Compare function because Lt128
-// only compiles if the lane type is u64. Thus we need either overloaded
-// functions with a tag type, class specializations, or separate classes.
-// We avoid overloaded functions because we want all functions to be callable
-// from a SortTraits without per-function wrappers. Specializing would work, but
-// we are anyway going to specialize at a higher level.
-struct OrderAscending128 : public Key128 {
-  using Order = SortAscending;
-
-  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
-    return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1];
-  }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
-    return Lt128(d, a, b);
-  }
-
-  // Used by CompareTop
-  template <class V>
-  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
-    return Lt(a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
-    return Min128(d, a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
-    return Max128(d, a, b);
-  }
-
-  // Same as for regular lanes because 128-bit lanes are u64.
-  template <class D>
-  HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::LowestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::HighestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
-    const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
-    return Sub(v, k1);
-  }
-};
-
-struct OrderDescending128 : public Key128 {
-  using Order = SortDescending;
-
-  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
-    return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1];
-  }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
-    return Lt128(d, b, a);
-  }
-
-  // Used by CompareTop
-  template <class V>
-  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
-    return Lt(b, a);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
-    return Max128(d, a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
-    return Min128(d, a, b);
-  }
-
-  // Same as for regular lanes because 128-bit lanes are u64.
-  template <class D>
-  HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::HighestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::LowestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
-    const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
-    return Add(v, k1);
-  }
-};
-
-// Base class shared between OrderAscendingKV128, OrderDescendingKV128.
-struct KeyValue128 : public KeyAny128 {
-  // What type to pass to Sorter::operator().
-  using KeyType = K64V64;
-
-  std::string KeyString() const { return "KV128"; }
-
-  template <class D>
-  HWY_INLINE Mask<D> EqualKeys(D d, Vec<D> a, Vec<D> b) const {
-    return Eq128Upper(d, a, b);
-  }
-
-  HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) {
-    return a[1] == b[1];
-  }
-};
-
-struct OrderAscendingKV128 : public KeyValue128 {
-  using Order = SortAscending;
-
-  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
-    return a[1] < b[1];
-  }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
-    return Lt128Upper(d, a, b);
-  }
-
-  // Used by CompareTop
-  template <class V>
-  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
-    return Lt(a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
-    return Min128Upper(d, a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
-    return Max128Upper(d, a, b);
-  }
-
-  // Same as for regular lanes because 128-bit lanes are u64.
-  template <class D>
-  HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::LowestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::HighestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
-    const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
-    return Sub(v, k1);
-  }
-};
-
-struct OrderDescendingKV128 : public KeyValue128 {
-  using Order = SortDescending;
-
-  HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) {
-    return b[1] < a[1];
-  }
-
-  template <class D>
-  HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const {
-    return Lt128Upper(d, b, a);
-  }
-
-  // Used by CompareTop
-  template <class V>
-  HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const {
-    return Lt(b, a);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const {
-    return Max128Upper(d, a, b);
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const {
-    return Min128Upper(d, a, b);
-  }
-
-  // Same as for regular lanes because 128-bit lanes are u64.
-  template <class D>
-  HWY_INLINE Vec<D> FirstValue(D d) const {
-    return Set(d, hwy::HighestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastValue(D d) const {
-    return Set(d, hwy::LowestValue<TFromD<D> >());
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> PrevValue(D d, Vec<D> v) const {
-    const Vec<D> k1 = OddEven(Zero(d), Set(d, 1));
-    return Add(v, k1);
-  }
-};
-
-// Shared code that depends on Order.
-template <class Base>
-class Traits128 : public Base {
-  // Special case for >= 256 bit vectors
-#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
-  // Returns vector with only the top u64 lane valid. Useful when the next step
-  // is to replicate the mask anyway.
-  template <class D>
-  HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const {
-    const Base* base = static_cast<const Base*>(this);
-    const Mask<D> eqHL = Eq(a, b);
-    const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b));
-#if HWY_TARGET == HWY_SVE_256
-    return IfThenElse(eqHL, DupEven(ltHL), ltHL);
-#else
-    const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL);
-    return OrAnd(ltHL, VecFromMask(d, eqHL), ltLX);
-#endif
-  }
-
-  // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
-  // the most-significant of those lanes (the result of CompareTop), so
-  // replicate it 4x. Only called for >= 256-bit vectors.
-  template <class V>
-  HWY_INLINE V ReplicateTop4x(V v) const {
-#if HWY_TARGET == HWY_SVE_256
-    return svdup_lane_u64(v, 3);
-#elif HWY_TARGET <= HWY_AVX3
-    return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
-#else  // AVX2
-    return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))};
-#endif
-  }
-#endif  // HWY_TARGET
-
- public:
-  template <class D>
-  HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v,
-                                 TFromD<D>* HWY_RESTRICT buf) const {
-    const Base* base = static_cast<const Base*>(this);
-    const size_t N = Lanes(d);
-    Store(v, d, buf);
-    v = base->SetKey(d, buf + 0);  // result must be broadcasted
-    for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
-      v = base->First(d, v, base->SetKey(d, buf + i));
-    }
-    return v;
-  }
-
-  template <class D>
-  HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v,
-                                TFromD<D>* HWY_RESTRICT buf) const {
-    const Base* base = static_cast<const Base*>(this);
-    const size_t N = Lanes(d);
-    Store(v, d, buf);
-    v = base->SetKey(d, buf + 0);  // result must be broadcasted
-    for (size_t i = base->LanesPerKey(); i < N; i += base->LanesPerKey()) {
-      v = base->Last(d, v, base->SetKey(d, buf + i));
-    }
-    return v;
-  }
-
-  template <class D>
-  HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const {
-    const Base* base = static_cast<const Base*>(this);
-
-    const Vec<D> a_copy = a;
-    const auto lt = base->Compare(d, a, b);
-    a = IfThenElse(lt, a, b);
-    b = IfThenElse(lt, b, a_copy);
-  }
-
-  // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->ReverseKeys2(d, v);
-
-#if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SVE_256
-    const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped));
-    return IfVecThenElse(select, swapped, v);
-#else
-    Sort2(d, v, swapped);
-    return base->OddEvenKeys(swapped, v);
-#endif
-  }
-
-  // Swaps with the vector formed by reversing contiguous groups of 4 keys.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const {
-    const Base* base = static_cast<const Base*>(this);
-    Vec<D> swapped = base->ReverseKeys4(d, v);
-
-    // Only specialize for AVX3 because this requires 512-bit vectors.
-#if HWY_TARGET <= HWY_AVX3
-    const Vec512<uint64_t> outHx = CompareTop(d, v, swapped);
-    // Similar to ReplicateTop4x, we want to gang together 2 comparison results
-    // (4 lanes). They are not contiguous, so use permute to replicate 4x.
-    alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7};
-    const Vec512<uint64_t> select =
-        TableLookupLanes(outHx, SetTableIndices(d, kIndices));
-    return IfVecThenElse(select, swapped, v);
-#else
-    Sort2(d, v, swapped);
-    return base->OddEvenPairs(d, swapped, v);
-#endif
-  }
-
-  // Conditionally swaps lane 0 with 4, 1 with 5 etc.
-  template <class D>
-  HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const {
-    // Only used by Merge16, which would require 2048 bit vectors (unsupported).
-    HWY_ASSERT(0);
-  }
-};
-
-#endif  // VQSORT_ENABLED
-
-}  // namespace detail
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort-inl.h b/third_party/highway/hwy/contrib/sort/vqsort-inl.h
deleted file mode 100644 (file)
index 2b133ba..0000000
+++ /dev/null
@@ -1,1001 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Normal include guard for target-independent parts
-#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
-#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
-
-#ifndef VQSORT_PRINT
-#define VQSORT_PRINT 0
-#endif
-
-// Makes it harder for adversaries to predict our sampling locations, at the
-// cost of 1-2% increased runtime.
-#ifndef VQSORT_SECURE_RNG
-#define VQSORT_SECURE_RNG 0
-#endif
-
-#if VQSORT_SECURE_RNG
-#include "third_party/absl/random/random.h"
-#endif
-
-#if VQSORT_PRINT
-#include <stdio.h>
-#endif
-
-#include <string.h>  // memcpy
-
-#include "hwy/cache_control.h"        // Prefetch
-#include "hwy/contrib/sort/vqsort.h"  // Fill24Bytes
-
-#if HWY_IS_MSAN
-#include <sanitizer/msan_interface.h>
-#endif
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
-
-// Per-target
-#if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
-#undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
-#else
-#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
-#endif
-
-#if VQSORT_PRINT
-#include "hwy/print-inl.h"
-#endif
-
-#include "hwy/contrib/sort/shared-inl.h"
-#include "hwy/contrib/sort/sorting_networks-inl.h"
-// Placeholder for internal instrumentation. Do not remove.
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-namespace detail {
-
-using Constants = hwy::SortConstants;
-
-// ------------------------------ HeapSort
-
-template <class Traits, typename T>
-void SiftDown(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes,
-              size_t start) {
-  constexpr size_t N1 = st.LanesPerKey();
-  const FixedTag<T, N1> d;
-
-  while (start < num_lanes) {
-    const size_t left = 2 * start + N1;
-    const size_t right = 2 * start + 2 * N1;
-    if (left >= num_lanes) break;
-    size_t idx_larger = start;
-    const auto key_j = st.SetKey(d, lanes + start);
-    if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, lanes + left)))) {
-      idx_larger = left;
-    }
-    if (right < num_lanes &&
-        AllTrue(d, st.Compare(d, st.SetKey(d, lanes + idx_larger),
-                              st.SetKey(d, lanes + right)))) {
-      idx_larger = right;
-    }
-    if (idx_larger == start) break;
-    st.Swap(lanes + start, lanes + idx_larger);
-    start = idx_larger;
-  }
-}
-
-// Heapsort: O(1) space, O(N*logN) worst-case comparisons.
-// Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
-template <class Traits, typename T>
-void HeapSort(Traits st, T* HWY_RESTRICT lanes, const size_t num_lanes) {
-  constexpr size_t N1 = st.LanesPerKey();
-
-  if (num_lanes < 2 * N1) return;
-
-  // Build heap.
-  for (size_t i = ((num_lanes - N1) / N1 / 2) * N1; i != (~N1 + 1); i -= N1) {
-    SiftDown(st, lanes, num_lanes, i);
-  }
-
-  for (size_t i = num_lanes - N1; i != 0; i -= N1) {
-    // Swap root with last
-    st.Swap(lanes + 0, lanes + i);
-
-    // Sift down the new root.
-    SiftDown(st, lanes, i, 0);
-  }
-}
-
-#if VQSORT_ENABLED || HWY_IDE
-
-// ------------------------------ BaseCase
-
-// Sorts `keys` within the range [0, num) via sorting network.
-template <class D, class Traits, typename T>
-HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys,
-                           T* HWY_RESTRICT keys_end, size_t num,
-                           T* HWY_RESTRICT buf) {
-  const size_t N = Lanes(d);
-  using V = decltype(Zero(d));
-
-  // _Nonzero32 requires num - 1 != 0.
-  if (HWY_UNLIKELY(num <= 1)) return;
-
-  // Reshape into a matrix with kMaxRows rows, and columns limited by the
-  // 1D `num`, which is upper-bounded by the vector width (see BaseCaseNum).
-  const size_t num_pow2 = size_t{1}
-                          << (32 - Num0BitsAboveMS1Bit_Nonzero32(
-                                       static_cast<uint32_t>(num - 1)));
-  HWY_DASSERT(num <= num_pow2 && num_pow2 <= Constants::BaseCaseNum(N));
-  const size_t cols =
-      HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2);
-  HWY_DASSERT(cols <= N);
-
-  // We can avoid padding and load/store directly to `keys` after checking the
-  // original input array has enough space. Except at the right border, it's OK
-  // to sort more than the current sub-array. Even if we sort across a previous
-  // partition point, we know that keys will not migrate across it. However, we
-  // must use the maximum size of the sorting network, because the StoreU of its
-  // last vector would otherwise write invalid data starting at kMaxRows * cols.
-  const size_t N_sn = Lanes(CappedTag<T, Constants::kMaxCols>());
-  if (HWY_LIKELY(keys + N_sn * Constants::kMaxRows <= keys_end)) {
-    SortingNetwork(st, keys, N_sn);
-    return;
-  }
-
-  // Copy `keys` to `buf`.
-  size_t i;
-  for (i = 0; i + N <= num; i += N) {
-    Store(LoadU(d, keys + i), d, buf + i);
-  }
-  SafeCopyN(num - i, d, keys + i, buf + i);
-  i = num;
-
-  // Fill with padding - last in sort order, not copied to keys.
-  const V kPadding = st.LastValue(d);
-  // Initialize an extra vector because SortingNetwork loads full vectors,
-  // which may exceed cols*kMaxRows.
-  for (; i < (cols * Constants::kMaxRows + N); i += N) {
-    StoreU(kPadding, d, buf + i);
-  }
-
-  SortingNetwork(st, buf, cols);
-
-  for (i = 0; i + N <= num; i += N) {
-    StoreU(Load(d, buf + i), d, keys + i);
-  }
-  SafeCopyN(num - i, d, buf + i, keys + i);
-}
-
-// ------------------------------ Partition
-
-// Consumes from `left` until a multiple of kUnroll*N remains.
-// Temporarily stores the right side into `buf`, then moves behind `right`.
-template <class D, class Traits, class T>
-HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st,
-                                              T* HWY_RESTRICT keys,
-                                              size_t& left, size_t& right,
-                                              const Vec<D> pivot,
-                                              T* HWY_RESTRICT buf) {
-  constexpr size_t kUnroll = Constants::kPartitionUnroll;
-  const size_t N = Lanes(d);
-  size_t readL = left;
-  size_t bufR = 0;
-  const size_t num = right - left;
-  // Partition requires both a multiple of kUnroll*N and at least
-  // 2*kUnroll*N for the initial loads. If less, consume all here.
-  const size_t num_rem =
-      (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1));
-  size_t i = 0;
-  for (; i + N <= num_rem; i += N) {
-    const Vec<D> vL = LoadU(d, keys + readL);
-    readL += N;
-
-    const auto comp = st.Compare(d, pivot, vL);
-    left += CompressBlendedStore(vL, Not(comp), d, keys + left);
-    bufR += CompressStore(vL, comp, d, buf + bufR);
-  }
-  // Last iteration: only use valid lanes.
-  if (HWY_LIKELY(i != num_rem)) {
-    const auto mask = FirstN(d, num_rem - i);
-    const Vec<D> vL = LoadU(d, keys + readL);
-
-    const auto comp = st.Compare(d, pivot, vL);
-    left += CompressBlendedStore(vL, AndNot(comp, mask), d, keys + left);
-    bufR += CompressStore(vL, And(comp, mask), d, buf + bufR);
-  }
-
-  // MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
-#if HWY_IS_MSAN
-  __msan_unpoison(buf, bufR * sizeof(T));
-#endif
-
-  // Everything we loaded was put into buf, or behind the new `left`, after
-  // which there is space for bufR items. First move items from `right` to
-  // `left` to free up space, then copy `buf` into the vacated `right`.
-  // A loop with masked loads from `buf` is insufficient - we would also need to
-  // mask from `right`. Combining a loop with memcpy for the remainders is
-  // slower than just memcpy, so we use that for simplicity.
-  right -= bufR;
-  memcpy(keys + left, keys + right, bufR * sizeof(T));
-  memcpy(keys + right, buf, bufR * sizeof(T));
-}
-
-template <class D, class Traits, typename T>
-HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v,
-                               const Vec<D> pivot, T* HWY_RESTRICT keys,
-                               size_t& writeL, size_t& remaining) {
-  const size_t N = Lanes(d);
-
-  const auto comp = st.Compare(d, pivot, v);
-
-  remaining -= N;
-  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value ||
-      (HWY_MAX_BYTES == 16 && st.Is128())) {
-    // Non-native Compress (e.g. AVX2): we are able to partition a vector using
-    // a single Compress+two StoreU instead of two Compress[Blended]Store. The
-    // latter are more expensive. Because we store entire vectors, the contents
-    // between the updated writeL and writeR are ignored and will be overwritten
-    // by subsequent calls. This works because writeL and writeR are at least
-    // two vectors apart.
-    const auto lr = st.CompressKeys(v, comp);
-    const size_t num_left = N - CountTrue(d, comp);
-    StoreU(lr, d, keys + writeL);
-    // Now write the right-side elements (if any), such that the previous writeR
-    // is one past the end of the newly written right elements, then advance.
-    StoreU(lr, d, keys + remaining + writeL);
-    writeL += num_left;
-  } else {
-    // Native Compress[Store] (e.g. AVX3), which only keep the left or right
-    // side, not both, hence we require two calls.
-    const size_t num_left = CompressStore(v, Not(comp), d, keys + writeL);
-    writeL += num_left;
-
-    (void)CompressBlendedStore(v, comp, d, keys + remaining + writeL);
-  }
-}
-
-template <class D, class Traits, typename T>
-HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0,
-                                const Vec<D> v1, const Vec<D> v2,
-                                const Vec<D> v3, const Vec<D> pivot,
-                                T* HWY_RESTRICT keys, size_t& writeL,
-                                size_t& remaining) {
-  StoreLeftRight(d, st, v0, pivot, keys, writeL, remaining);
-  StoreLeftRight(d, st, v1, pivot, keys, writeL, remaining);
-  StoreLeftRight(d, st, v2, pivot, keys, writeL, remaining);
-  StoreLeftRight(d, st, v3, pivot, keys, writeL, remaining);
-}
-
-// Moves "<= pivot" keys to the front, and others to the back. pivot is
-// broadcasted. Time-critical!
-//
-// Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
-template <class D, class Traits, typename T>
-HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left,
-                              size_t right, const Vec<D> pivot,
-                              T* HWY_RESTRICT buf) {
-  using V = decltype(Zero(d));
-  const size_t N = Lanes(d);
-
-  // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
-  // lanes happen to be in the right-side partition, this will overrun `keys`,
-  // which triggers asan errors. Avoid by special-casing the last vector.
-  HWY_DASSERT(right - left > 2 * N);  // ensured by HandleSpecialCases
-  right -= N;
-  const size_t last = right;
-  const V vlast = LoadU(d, keys + last);
-
-  PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf);
-  constexpr size_t kUnroll = Constants::kPartitionUnroll;
-
-  // Partition splits the vector into 3 sections, left to right: Elements
-  // smaller or equal to the pivot, unpartitioned elements and elements larger
-  // than the pivot. To write elements unconditionally on the loop body without
-  // overwriting existing data, we maintain two regions of the loop where all
-  // elements have been copied elsewhere (e.g. vector registers.). I call these
-  // bufferL and bufferR, for left and right respectively.
-  //
-  // These regions are tracked by the indices (writeL, writeR, left, right) as
-  // presented in the diagram below.
-  //
-  //              writeL                                  writeR
-  //               \/                                       \/
-  //  |  <= pivot   | bufferL |   unpartitioned   | bufferR |   > pivot   |
-  //                          \/                  \/
-  //                         left                 right
-  //
-  // In the main loop body below we choose a side, load some elements out of the
-  // vector and move either `left` or `right`. Next we call into StoreLeftRight
-  // to partition the data, and the partitioned elements will be written either
-  // to writeR or writeL and the corresponding index will be moved accordingly.
-  //
-  // Note that writeR is not explicitly tracked as an optimization for platforms
-  // with conditional operations. Instead we track writeL and the number of
-  // elements left to process (`remaining`). From the diagram above we can see
-  // that:
-  //    writeR - writeL = remaining => writeR = remaining + writeL
-  //
-  // Tracking `remaining` is advantageous because each iteration reduces the
-  // number of unpartitioned elements by a fixed amount, so we can compute
-  // `remaining` without data dependencies.
-  //
-  size_t writeL = left;
-  size_t remaining = right - left;
-
-  const size_t num = right - left;
-  // Cannot load if there were fewer than 2 * kUnroll * N.
-  if (HWY_LIKELY(num != 0)) {
-    HWY_DASSERT(num >= 2 * kUnroll * N);
-    HWY_DASSERT((num & (kUnroll * N - 1)) == 0);
-
-    // Make space for writing in-place by reading from left and right.
-    const V vL0 = LoadU(d, keys + left + 0 * N);
-    const V vL1 = LoadU(d, keys + left + 1 * N);
-    const V vL2 = LoadU(d, keys + left + 2 * N);
-    const V vL3 = LoadU(d, keys + left + 3 * N);
-    left += kUnroll * N;
-    right -= kUnroll * N;
-    const V vR0 = LoadU(d, keys + right + 0 * N);
-    const V vR1 = LoadU(d, keys + right + 1 * N);
-    const V vR2 = LoadU(d, keys + right + 2 * N);
-    const V vR3 = LoadU(d, keys + right + 3 * N);
-
-    // The left/right updates may consume all inputs, so check before the loop.
-    while (left != right) {
-      V v0, v1, v2, v3;
-
-      // Data-dependent but branching is faster than forcing branch-free.
-      const size_t capacityL = left - writeL;
-      HWY_DASSERT(capacityL <= num);  // >= 0
-      // Load data from the end of the vector with less data (front or back).
-      // The next paragraphs explain how this works.
-      //
-      // let block_size = (kUnroll * N)
-      // On the loop prelude we load block_size elements from the front of the
-      // vector and an additional block_size elements from the back. On each
-      // iteration k elements are written to the front of the vector and
-      // (block_size - k) to the back.
-      //
-      // This creates a loop invariant where the capacity on the front
-      // (capacityL) and on the back (capacityR) always add to 2 * block_size.
-      // In other words:
-      //    capacityL + capacityR = 2 * block_size
-      //    capacityR = 2 * block_size - capacityL
-      //
-      // This means that:
-      //    capacityL < capacityR <=>
-      //    capacityL < 2 * block_size - capacityL <=>
-      //    2 * capacityL < 2 * block_size <=>
-      //    capacityL < block_size
-      //
-      // Thus the check on the next line is equivalent to capacityL > capacityR.
-      //
-      if (kUnroll * N < capacityL) {
-        right -= kUnroll * N;
-        v0 = LoadU(d, keys + right + 0 * N);
-        v1 = LoadU(d, keys + right + 1 * N);
-        v2 = LoadU(d, keys + right + 2 * N);
-        v3 = LoadU(d, keys + right + 3 * N);
-        hwy::Prefetch(keys + right - 3 * kUnroll * N);
-      } else {
-        v0 = LoadU(d, keys + left + 0 * N);
-        v1 = LoadU(d, keys + left + 1 * N);
-        v2 = LoadU(d, keys + left + 2 * N);
-        v3 = LoadU(d, keys + left + 3 * N);
-        left += kUnroll * N;
-        hwy::Prefetch(keys + left + 3 * kUnroll * N);
-      }
-
-      StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, remaining);
-    }
-
-    // Now finish writing the initial left/right to the middle.
-    StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, remaining);
-    StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, remaining);
-  }
-
-  // We have partitioned [left, right) such that writeL is the boundary.
-  HWY_DASSERT(remaining == 0);
-  // Make space for inserting vlast: move up to N of the first right-side keys
-  // into the unused space starting at last. If we have fewer, ensure they are
-  // the last items in that vector by subtracting from the *load* address,
-  // which is safe because we have at least two vectors (checked above).
-  const size_t totalR = last - writeL;
-  const size_t startR = totalR < N ? writeL + totalR - N : writeL;
-  StoreU(LoadU(d, keys + startR), d, keys + last);
-
-  // Partition vlast: write L, then R, into the single-vector gap at writeL.
-  const auto comp = st.Compare(d, pivot, vlast);
-  writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL);
-  (void)CompressBlendedStore(vlast, comp, d, keys + writeL);
-
-  return writeL;
-}
-
-// ------------------------------ Pivot
-
-template <class Traits, class V>
-HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) {
-  const DFromV<V> d;
-  // Slightly faster for 128-bit, apparently because not serially dependent.
-  if (st.Is128()) {
-    // Median = XOR-sum 'minus' the first and last. Calling First twice is
-    // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR.
-    const auto sum = Xor(Xor(v0, v1), v2);
-    const auto first = st.First(d, st.First(d, v0, v1), v2);
-    const auto last = st.Last(d, st.Last(d, v0, v1), v2);
-    return Xor(Xor(sum, first), last);
-  }
-  st.Sort2(d, v0, v2);
-  v1 = st.Last(d, v0, v1);
-  v1 = st.First(d, v1, v2);
-  return v1;
-}
-
-#if VQSORT_SECURE_RNG
-using Generator = absl::BitGen;
-#else
-// Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
-#pragma pack(push, 1)
-class Generator {
- public:
-  Generator(const void* heap, size_t num) {
-    Sorter::Fill24Bytes(heap, num, &a_);
-    k_ = 1;  // stream index: must be odd
-  }
-
-  explicit Generator(uint64_t seed) {
-    a_ = b_ = w_ = seed;
-    k_ = 1;
-  }
-
-  uint64_t operator()() {
-    const uint64_t b = b_;
-    w_ += k_;
-    const uint64_t next = a_ ^ w_;
-    a_ = (b + (b << 3)) ^ (b >> 11);
-    const uint64_t rot = (b << 24) | (b >> 40);
-    b_ = rot + next;
-    return next;
-  }
-
- private:
-  uint64_t a_;
-  uint64_t b_;
-  uint64_t w_;
-  uint64_t k_;  // increment
-};
-#pragma pack(pop)
-
-#endif  // !VQSORT_SECURE_RNG
-
-// Returns slightly biased random index of a chunk in [0, num_chunks).
-// See https://www.pcg-random.org/posts/bounded-rands.html.
-HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) {
-  const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32;
-  HWY_DASSERT(chunk_index < num_chunks);
-  return static_cast<size_t>(chunk_index);
-}
-
-template <class Traits, typename T>
-HWY_INLINE void SortSamples(Traits st, T* HWY_RESTRICT buf) {
-  // buf contains 192 bytes, so 16 128-bit vectors are necessary and sufficient.
-  constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
-  const CappedTag<T, 16 / sizeof(T)> d128;
-  const size_t N128 = Lanes(d128);
-  constexpr size_t kCols = HWY_MIN(16 / sizeof(T), Constants::kMaxCols);
-  constexpr size_t kBytes = kCols * Constants::kMaxRows * sizeof(T);
-  static_assert(192 <= kBytes, "");
-  // Fill with padding - last in sort order.
-  const auto kPadding = st.LastValue(d128);
-  // Initialize an extra vector because SortingNetwork loads full vectors,
-  // which may exceed cols*kMaxRows.
-  for (size_t i = kSampleLanes; i <= kBytes / sizeof(T); i += N128) {
-    StoreU(kPadding, d128, buf + i);
-  }
-
-  SortingNetwork(st, buf, kCols);
-}
-
-template <class Traits, typename T>
-HWY_INLINE size_t PivotRank(Traits st, T* HWY_RESTRICT buf) {
-  constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
-  constexpr size_t N1 = st.LanesPerKey();
-
-  constexpr size_t kRankMid = kSampleLanes / 2;
-  static_assert(kRankMid % N1 == 0, "Mid is not an aligned key");
-
-  // Find the previous value not equal to the median.
-  size_t rank_prev = kRankMid - N1;
-  for (; st.Equal1(buf + rank_prev, buf + kRankMid); rank_prev -= N1) {
-    // All previous samples are equal to the median.
-    if (rank_prev == 0) return 0;
-  }
-
-  size_t rank_next = rank_prev + N1;
-  for (; st.Equal1(buf + rank_next, buf + kRankMid); rank_next += N1) {
-    // The median is also the largest sample. If it is also the largest key,
-    // we'd end up with an empty right partition, so choose the previous key.
-    if (rank_next == kSampleLanes - N1) return rank_prev;
-  }
-
-  // If we choose the median as pivot, the ratio of keys ending in the left
-  // partition will likely be rank_next/kSampleLanes (if the sample is
-  // representative). This is because equal-to-pivot values also land in the
-  // left - it's infeasible to do an in-place vectorized 3-way partition.
-  // Check whether prev would lead to a more balanced partition.
-  const size_t excess_if_median = rank_next - kRankMid;
-  const size_t excess_if_prev = kRankMid - rank_prev;
-  return excess_if_median < excess_if_prev ? kRankMid : rank_prev;
-}
-
-#if VQSORT_PRINT
-// Compute exact min/max.
-template <class D, class Traits, typename T>
-HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys,
-                             size_t num, T* HWY_RESTRICT buf, Vec<D>& first,
-                             Vec<D>& last) {
-  const size_t N = Lanes(d);
-
-  first = st.LastValue(d);
-  last = st.FirstValue(d);
-
-  size_t i = 0;
-  for (; i + N <= num; i += N) {
-    const Vec<D> v = LoadU(d, keys + i);
-    first = st.First(d, v, first);
-    last = st.Last(d, v, last);
-  }
-  if (HWY_LIKELY(i != num)) {
-    HWY_DASSERT(num >= N);  // See HandleSpecialCases
-    const Vec<D> v = LoadU(d, keys + num - N);
-    first = st.First(d, v, first);
-    last = st.Last(d, v, last);
-  }
-
-  first = st.FirstOfLanes(d, first, buf);
-  last = st.LastOfLanes(d, last, buf);
-}
-#endif  // VQSORT_PRINT
-
-template <class V>
-V OrXor(const V o, const V x1, const V x2) {
-  // TODO(janwas): ternlog?
-  return Or(o, Xor(x1, x2));
-}
-
-// Returns a lower bound on the index of the first mismatch, or `num` if all
-// are equal. `num` is const to ensure we don't change it, which would lead to
-// bugs because the caller will check whether we return the original value.
-template <class D, class Traits, typename T>
-HWY_NOINLINE size_t LowerBoundOfMismatch(D d, Traits st,
-                                         const T* HWY_RESTRICT keys,
-                                         const size_t num) {
-  using V = Vec<decltype(d)>;
-  const size_t N = Lanes(d);
-  HWY_DASSERT(num >= N);  // See HandleSpecialCases
-  const V reference = st.SetKey(d, keys);
-  const V zero = Zero(d);
-
-  size_t i = 0;
-
-  // Vector-align keys + i.
-  const size_t misalign =
-      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (N - 1);
-  if (HWY_LIKELY(misalign != 0)) {
-    HWY_DASSERT(misalign % st.LanesPerKey() == 0);
-    const size_t consume = N - misalign;
-    const auto mask = FirstN(d, consume);
-    const V v0 = LoadU(d, keys);
-    // Only check masked lanes; consider others to be equal to the reference.
-    if (!AllTrue(d, Or(Not(mask), Eq(v0, reference)))) {
-      return 0;  // not equal
-    }
-    i = consume;
-  }
-  HWY_DASSERT(((reinterpret_cast<uintptr_t>(keys + i) / sizeof(T)) & (N - 1)) ==
-              0);
-
-  // Sticky bits registering any difference between `keys` and the first key.
-  // We use vector XOR because it may be cheaper than comparisons, especially
-  // for 128-bit. 2x unrolled for more ILP.
-  V diff0 = zero;
-  V diff1 = zero;
-
-  // We want to stop once a difference has been found, but without slowing down
-  // the loop by comparing during each iteration. The compromise is to compare
-  // after a 'group', which consists of kLoops times two vectors.
-  constexpr size_t kLoops = 4;
-  const size_t lanes_per_group = kLoops * 2 * N;
-
-  for (; i + lanes_per_group <= num; i += lanes_per_group) {
-    HWY_DEFAULT_UNROLL
-    for (size_t loop = 0; loop < kLoops; ++loop) {
-      const V v0 = Load(d, keys + i + loop * 2 * N);
-      const V v1 = Load(d, keys + i + loop * 2 * N + N);
-      diff0 = OrXor(diff0, v0, reference);
-      diff1 = OrXor(diff1, v1, reference);
-    }
-    diff0 = Or(diff0, diff1);
-    if (!AllTrue(d, Eq(diff0, zero))) {
-      return i;  // not equal
-    }
-  }
-  // Whole vectors, no unrolling, compare directly
-  for (; i + N <= num; i += N) {
-    const V v0 = Load(d, keys + i);
-    if (!AllTrue(d, Eq(v0, reference))) {
-      return i;  // not equal
-    }
-  }
-  // If there are remainders, re-check the last whole vector.
-  if (HWY_LIKELY(i != num)) {
-    const V v0 = LoadU(d, keys + num - N);
-    if (!AllTrue(d, Eq(v0, reference))) {
-      return i;  // not equal
-    }
-  }
-
-  return num;  // all equal
-}
-
-enum class PivotResult {
-  kAllEqual,  // stop without partitioning
-  kNormal,    // partition and recurse left and right
-  kIsFirst,   // partition but skip left recursion
-  kWasLast,   // partition but skip right recursion
-};
-
-// Classifies (and possibly modifies) `pivot` by scanning for the first/last
-// key from index `idx_diff`, which is less than `num`.
-template <class D, class Traits, typename T>
-HWY_NOINLINE PivotResult CheckFirstLast(D d, Traits st,
-                                        const T* HWY_RESTRICT keys, size_t num,
-                                        size_t idx_diff,
-                                        Vec<D>* HWY_RESTRICT pivot,
-                                        T* HWY_RESTRICT buf) {
-  const size_t N = Lanes(d);
-  HWY_DASSERT(num >= N);  // See HandleSpecialCases
-  HWY_DASSERT(idx_diff < num);
-
-  Vec<D> first = st.LastValue(d);
-  Vec<D> last = st.FirstValue(d);
-  // Early out for mostly-0 arrays, where pivot is often FirstValue.
-  if (AllTrue(d, st.EqualKeys(d, *pivot, last))) {
-    return PivotResult::kIsFirst;
-  }
-
-  // We know keys[0, idx_diff) are equal, but they might be the first/last, so
-  // start scanning one vector before.
-  size_t i = static_cast<size_t>(
-      HWY_MAX(static_cast<intptr_t>(idx_diff) - static_cast<intptr_t>(N), 0));
-
-  constexpr size_t kLoops = 4;
-  const size_t lanes_per_group = kLoops * N;
-
-  // Whole group, unrolled
-  for (; i + lanes_per_group <= num; i += lanes_per_group) {
-    HWY_DEFAULT_UNROLL
-    for (size_t loop = 0; loop < kLoops; ++loop) {
-      const Vec<D> curr = LoadU(d, keys + i + loop * N);
-      first = st.First(d, first, curr);
-      last = st.Last(d, last, curr);
-    }
-  }
-  // Whole vectors, no unrolling
-  for (; i + N <= num; i += N) {
-    const Vec<D> curr = LoadU(d, keys + i);
-    first = st.First(d, first, curr);
-    last = st.Last(d, last, curr);
-  }
-  // If there are remainders, re-check the last whole vector.
-  if (HWY_LIKELY(i != num)) {
-    const Vec<D> curr = LoadU(d, keys + num - N);
-    first = st.First(d, first, curr);
-    last = st.Last(d, last, curr);
-  }
-
-  first = st.FirstOfLanes(d, first, buf);
-  last = st.LastOfLanes(d, last, buf);
-
-  if (AllTrue(d, st.EqualKeys(d, first, *pivot))) {
-    return PivotResult::kIsFirst;
-  }
-  // Fixup required because keys equal to the pivot go to the left partition,
-  // and the pivot is the last, so Partition would not change anything.
-  // Instead use the previous value in sort order, which is not necessarily an
-  // actual key.
-  if (AllTrue(d, st.EqualKeys(d, last, *pivot))) {
-    *pivot = st.PrevValue(d, *pivot);
-    return PivotResult::kWasLast;
-  }
-  return PivotResult::kNormal;
-}
-
-// Writes samples from `keys[0, num)` into `buf`.
-template <class D, class Traits, typename T>
-HWY_INLINE void DrawSamples(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
-                            T* HWY_RESTRICT buf, Generator& rng) {
-  using V = decltype(Zero(d));
-  const size_t N = Lanes(d);
-
-  // Power of two
-  const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N);
-
-  // Align start of keys to chunks. We always have at least 2 chunks because the
-  // base case would have handled anything up to 16 vectors, i.e. >= 4 chunks.
-  HWY_DASSERT(num >= 2 * lanes_per_chunk);
-  const size_t misalign =
-      (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (lanes_per_chunk - 1);
-  if (misalign != 0) {
-    const size_t consume = lanes_per_chunk - misalign;
-    keys += consume;
-    num -= consume;
-  }
-
-  // Generate enough random bits for 9 uint32
-  uint64_t* bits64 = reinterpret_cast<uint64_t*>(buf);
-  for (size_t i = 0; i < 5; ++i) {
-    bits64[i] = rng();
-  }
-  const uint32_t* bits = reinterpret_cast<const uint32_t*>(buf);
-
-  const uint32_t lpc32 = static_cast<uint32_t>(lanes_per_chunk);
-  // Avoid division
-  const size_t log2_lpc = Num0BitsBelowLS1Bit_Nonzero32(lpc32);
-  const size_t num_chunks64 = num >> log2_lpc;
-  // Clamp to uint32 for RandomChunkIndex
-  const uint32_t num_chunks =
-      static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull));
-
-  const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) << log2_lpc;
-  const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) << log2_lpc;
-  const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) << log2_lpc;
-  const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) << log2_lpc;
-  const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) << log2_lpc;
-  const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) << log2_lpc;
-  const size_t offset6 = RandomChunkIndex(num_chunks, bits[6]) << log2_lpc;
-  const size_t offset7 = RandomChunkIndex(num_chunks, bits[7]) << log2_lpc;
-  const size_t offset8 = RandomChunkIndex(num_chunks, bits[8]) << log2_lpc;
-  for (size_t i = 0; i < lanes_per_chunk; i += N) {
-    const V v0 = Load(d, keys + offset0 + i);
-    const V v1 = Load(d, keys + offset1 + i);
-    const V v2 = Load(d, keys + offset2 + i);
-    const V medians0 = MedianOf3(st, v0, v1, v2);
-    Store(medians0, d, buf + i);
-
-    const V v3 = Load(d, keys + offset3 + i);
-    const V v4 = Load(d, keys + offset4 + i);
-    const V v5 = Load(d, keys + offset5 + i);
-    const V medians1 = MedianOf3(st, v3, v4, v5);
-    Store(medians1, d, buf + i + lanes_per_chunk);
-
-    const V v6 = Load(d, keys + offset6 + i);
-    const V v7 = Load(d, keys + offset7 + i);
-    const V v8 = Load(d, keys + offset8 + i);
-    const V medians2 = MedianOf3(st, v6, v7, v8);
-    Store(medians2, d, buf + i + lanes_per_chunk * 2);
-  }
-}
-
-// Returns pivot chosen from `keys[0, num)`. It will never be the largest key
-// (thus the right partition will never be empty).
-template <class D, class Traits, typename T>
-HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys,
-                                const size_t num, T* HWY_RESTRICT buf,
-                                Generator& rng, PivotResult& result) {
-  using V = decltype(Zero(d));
-
-  constexpr size_t kSampleLanes = 3 * 64 / sizeof(T);
-  constexpr size_t N1 = st.LanesPerKey();
-
-#if VQSORT_PRINT
-  fprintf(stderr, "\nChoosePivot num %zu:\n", num);
-#endif
-  DrawSamples(d, st, keys, num, buf, rng);
-
-  SortSamples(st, buf);
-#if VQSORT_PRINT
-  const size_t N = Lanes(d);
-  for (size_t i = 0; i < kSampleLanes; i += N) {
-    Print(d, "", Load(d, buf + i), 0, N);
-  }
-#endif
-
-  // All samples are equal.
-  if (st.Equal1(buf, buf + kSampleLanes - N1)) {
-    const size_t idx_diff = LowerBoundOfMismatch(d, st, keys, num);
-    const bool all_eq = idx_diff == num;
-#if VQSORT_PRINT
-    fprintf(stderr, "Pivot num=%zu samplesEq, idxDiff %zu keysEq: %d\n", num,
-            idx_diff, all_eq);
-#endif
-    if (all_eq) {
-      result = PivotResult::kAllEqual;
-      return Zero(d);
-    }
-
-    V pivot = st.SetKey(d, buf);  // the single unique sample
-    result = CheckFirstLast(d, st, keys, num, idx_diff, &pivot, buf);
-#if VQSORT_PRINT
-    fprintf(stderr, "PivotResult %d\n", static_cast<int>(result));
-    Print(d, "Adjusted pivot", pivot, 0, st.LanesPerKey());
-#endif
-    return pivot;
-  }
-
-  const size_t pivot_rank = PivotRank(st, buf);
-  const Vec<D> pivot = st.SetKey(d, buf + pivot_rank);
-#if VQSORT_PRINT
-  fprintf(stderr, "  Pivot rank %zu = %.0f\n", pivot_rank,
-          static_cast<double>(GetLane(pivot)));
-#endif
-  result = PivotResult::kNormal;
-  return pivot;
-}
-
-template <class D, class Traits, typename T>
-HWY_NOINLINE void Recurse(D d, Traits st, T* HWY_RESTRICT keys,
-                          T* HWY_RESTRICT keys_end, const size_t begin,
-                          const size_t end, T* HWY_RESTRICT buf, Generator& rng,
-                          size_t remaining_levels) {
-  const size_t num = end - begin;  // >= 1
-#if VQSORT_PRINT
-  fprintf(stderr, "- Recurse remaining %zu [%zu %zu) len %zu\n",
-          remaining_levels, begin, end, num);
-  Vec<D> first, last;
-  if (num >= Lanes(d)) {
-    ScanMinMax(d, st, keys + begin, num, buf, first, last);
-  }
-  Print(d, "first", first, 0, st.LanesPerKey());
-  Print(d, "last", last, 0, st.LanesPerKey());
-#endif
-  HWY_DASSERT(begin < end);
-
-  if (HWY_UNLIKELY(num <= Constants::BaseCaseNum(Lanes(d)))) {
-    BaseCase(d, st, keys + begin, keys_end, num, buf);
-    return;
-  }
-  PivotResult result;
-  Vec<D> pivot = ChoosePivot(d, st, keys + begin, num, buf, rng, result);
-  if (HWY_UNLIKELY(result == PivotResult::kAllEqual)) {
-    return;
-  }
-
-  // Too many recursions. This is unlikely to happen because we select pivots
-  // from large (though still O(1)) samples.
-  if (HWY_UNLIKELY(remaining_levels == 0)) {
-#if VQSORT_PRINT
-    fprintf(stderr, "HeapSort reached, size=%zu\n", num);
-#endif
-    HeapSort(st, keys + begin, num);  // Slow but N*logN.
-    return;
-  }
-
-  const size_t bound = Partition(d, st, keys, begin, end, pivot, buf);
-  // ChoosePivot ensures pivot != last key, so the right partition is never
-  // empty. Nor is the left, because the pivot is either one of the keys, or
-  // the value prior to the last (which is not the only value).
-  HWY_ASSERT(begin != bound && bound != end);
-  if (HWY_LIKELY(result != PivotResult::kIsFirst)) {
-    Recurse(d, st, keys, keys_end, begin, bound, buf, rng,
-            remaining_levels - 1);
-  }
-  if (HWY_LIKELY(result != PivotResult::kWasLast)) {
-    Recurse(d, st, keys, keys_end, bound, end, buf, rng, remaining_levels - 1);
-  }
-}
-
-// Returns true if sorting is finished.
-template <class D, class Traits, typename T>
-HWY_INLINE bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys,
-                                   size_t num) {
-  const size_t N = Lanes(d);
-  const size_t base_case_num = Constants::BaseCaseNum(N);
-
-  // 128-bit keys require vectors with at least two u64 lanes, which is always
-  // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
-  // hardware vector width is less than 128bit / fraction.
-  const bool partial_128 = !IsFull(d) && N < 2 && st.Is128();
-  // Partition assumes its input is at least two vectors. If vectors are huge,
-  // base_case_num may actually be smaller. If so, which is only possible on
-  // RVV, pass a capped or partial d (LMUL < 1). Use HWY_MAX_BYTES instead of
-  // HWY_LANES to account for the largest possible LMUL.
-  constexpr bool kPotentiallyHuge =
-      HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols;
-  const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num);
-  if (partial_128 || huge_vec) {
-#if VQSORT_PRINT
-    fprintf(stderr, "WARNING: using slow HeapSort: partial %d huge %d\n",
-            partial_128, huge_vec);
-#endif
-    HeapSort(st, keys, num);
-    return true;
-  }
-
-  // Small arrays are already handled by Recurse.
-
-  // We could also check for already sorted/reverse/equal, but that's probably
-  // counterproductive if vqsort is used as a base case.
-
-  return false;  // not finished sorting
-}
-
-#endif  // VQSORT_ENABLED
-}  // namespace detail
-
-// Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
-// In-place i.e. O(1) additional storage. Worst-case N*logN comparisons.
-// Non-stable (order of equal keys may change), except for the common case where
-// the upper bits of T are the key, and the lower bits are a sequential or at
-// least unique ID.
-// There is no upper limit on `num`, but note that pivots may be chosen by
-// sampling only from the first 256 GiB.
-//
-// `d` is typically SortTag<T> (chooses between full and partial vectors).
-// `st` is SharedTraits<Traits*<Order*>>. This abstraction layer bridges
-//   differences in sort order and single-lane vs 128-bit keys.
-template <class D, class Traits, typename T>
-void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num,
-          T* HWY_RESTRICT buf) {
-#if VQSORT_PRINT
-  fprintf(stderr, "=============== Sort num %zu\n", num);
-#endif
-
-#if VQSORT_ENABLED || HWY_IDE
-#if !HWY_HAVE_SCALABLE
-  // On targets with fixed-size vectors, avoid _using_ the allocated memory.
-  // We avoid (potentially expensive for small input sizes) allocations on
-  // platforms where no targets are scalable. For 512-bit vectors, this fits on
-  // the stack (several KiB).
-  HWY_ALIGN T storage[SortConstants::BufNum<T>(HWY_LANES(T))] = {};
-  static_assert(sizeof(storage) <= 8192, "Unexpectedly large, check size");
-  buf = storage;
-#endif  // !HWY_HAVE_SCALABLE
-
-  if (detail::HandleSpecialCases(d, st, keys, num)) return;
-
-#if HWY_MAX_BYTES > 64
-  // sorting_networks-inl and traits assume no more than 512 bit vectors.
-  if (HWY_UNLIKELY(Lanes(d) > 64 / sizeof(T))) {
-    return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf);
-  }
-#endif  // HWY_MAX_BYTES > 64
-
-  detail::Generator rng(keys, num);
-
-  // Introspection: switch to worst-case N*logN heapsort after this many.
-  const size_t max_levels = 2 * hwy::CeilLog2(num) + 4;
-  detail::Recurse(d, st, keys, keys + num, 0, num, buf, rng, max_levels);
-#else
-  (void)d;
-  (void)buf;
-#if VQSORT_PRINT
-  fprintf(stderr, "WARNING: using slow HeapSort because vqsort disabled\n");
-#endif
-  return detail::HeapSort(st, keys, num);
-#endif  // VQSORT_ENABLED
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort.cc b/third_party/highway/hwy/contrib/sort/vqsort.cc
deleted file mode 100644 (file)
index b3bac07..0000000
+++ /dev/null
@@ -1,184 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#include <string.h>  // memset
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/shared-inl.h"
-
-// Architectures for which we know HWY_HAVE_SCALABLE == 0. This opts into an
-// optimization that replaces dynamic allocation with stack storage.
-#ifndef VQSORT_STACK
-#if HWY_ARCH_X86 || HWY_ARCH_WASM
-#define VQSORT_STACK 1
-#else
-#define VQSORT_STACK 0
-#endif
-#endif  // VQSORT_STACK
-
-#if !VQSORT_STACK
-#include "hwy/aligned_allocator.h"
-#endif
-
-// Check if we have sys/random.h. First skip some systems on which the check
-// itself (features.h) might be problematic.
-#if defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV
-#define VQSORT_GETRANDOM 0
-#endif
-
-#if !defined(VQSORT_GETRANDOM) && HWY_OS_LINUX
-#include <features.h>
-
-// ---- which libc
-#if defined(__UCLIBC__)
-#define VQSORT_GETRANDOM 1  // added Mar 2015, before uclibc-ng 1.0
-
-#elif defined(__GLIBC__) && defined(__GLIBC_PREREQ)
-#if __GLIBC_PREREQ(2, 25)
-#define VQSORT_GETRANDOM 1
-#else
-#define VQSORT_GETRANDOM 0
-#endif
-
-#else
-// Assume MUSL, which has getrandom since 2018. There is no macro to test, see
-// https://www.openwall.com/lists/musl/2013/03/29/13.
-#define VQSORT_GETRANDOM 1
-
-#endif  // ---- which libc
-#endif  // linux
-
-#if !defined(VQSORT_GETRANDOM)
-#define VQSORT_GETRANDOM 0
-#endif
-
-// Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
-// (not all Android support the getrandom wrapper)
-#ifndef VQSORT_SECURE_SEED
-
-#if VQSORT_GETRANDOM
-#define VQSORT_SECURE_SEED 1
-#elif defined(_WIN32) || defined(_WIN64)
-#define VQSORT_SECURE_SEED 2
-#else
-#define VQSORT_SECURE_SEED 0
-#endif
-
-#endif  // VQSORT_SECURE_SEED
-
-#if !VQSORT_SECURE_RNG
-
-#include <time.h>
-#if VQSORT_SECURE_SEED == 1
-#include <sys/random.h>
-#elif VQSORT_SECURE_SEED == 2
-#include <windows.h>
-#pragma comment(lib, "advapi32.lib")
-// Must come after windows.h.
-#include <wincrypt.h>
-#endif  // VQSORT_SECURE_SEED
-
-#endif  // !VQSORT_SECURE_RNG
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); }
-bool HaveFloat64() { return HWY_HAVE_FLOAT64; }
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(VectorSize);
-HWY_EXPORT(HaveFloat64);
-
-}  // namespace
-
-Sorter::Sorter() {
-#if VQSORT_STACK
-  ptr_ = nullptr;  // Sort will use stack storage instead
-#else
-  // Determine the largest buffer size required for any type by trying them all.
-  // (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t
-  // may require a larger buffer.)
-  const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)();
-  const size_t max_bytes =
-      HWY_MAX(HWY_MAX(SortConstants::BufBytes<uint16_t>(vector_size),
-                      SortConstants::BufBytes<uint32_t>(vector_size)),
-              SortConstants::BufBytes<uint64_t>(vector_size));
-  ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr);
-
-  // Prevent msan errors by initializing.
-  memset(ptr_, 0, max_bytes);
-#endif
-}
-
-void Sorter::Delete() {
-#if !VQSORT_STACK
-  FreeAlignedBytes(ptr_, nullptr, nullptr);
-  ptr_ = nullptr;
-#endif
-}
-
-#if !VQSORT_SECURE_RNG
-
-void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) {
-#if VQSORT_SECURE_SEED == 1
-  // May block if urandom is not yet initialized.
-  const ssize_t ret = getrandom(bytes, 24, /*flags=*/0);
-  if (ret == 24) return;
-#elif VQSORT_SECURE_SEED == 2
-  HCRYPTPROV hProvider{};
-  if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL,
-                           CRYPT_VERIFYCONTEXT)) {
-    const BOOL ok =
-        CryptGenRandom(hProvider, 24, reinterpret_cast<BYTE*>(bytes));
-    CryptReleaseContext(hProvider, 0);
-    if (ok) return;
-  }
-#endif
-
-  // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from
-  // stack/heap/code addresses and the clock() timer.
-  uint64_t* words = reinterpret_cast<uint64_t*>(bytes);
-  uint64_t** seed_stack = &words;
-  void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes;
-  const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack);
-  const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap);
-  const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code);
-  const uint64_t bits_time = static_cast<uint64_t>(clock());
-  words[0] = bits_stack ^ bits_time ^ seed_num;
-  words[1] = bits_heap ^ bits_time ^ seed_num;
-  words[2] = bits_code ^ bits_time ^ seed_num;
-}
-
-#endif  // !VQSORT_SECURE_RNG
-
-bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); }
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort.h b/third_party/highway/hwy/contrib/sort/vqsort.h
deleted file mode 100644 (file)
index ee7e9cb..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Interface to vectorized quicksort with dynamic dispatch.
-// Blog post: https://tinyurl.com/vqsort-blog
-// Paper with measurements: https://arxiv.org/abs/2205.05982
-//
-// To ensure the overhead of using wide vectors (e.g. AVX2 or AVX-512) is
-// worthwhile, we recommend using this code for sorting arrays whose size is at
-// least 512 KiB.
-
-#ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
-#define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
-
-#include "hwy/base.h"
-
-namespace hwy {
-
-// Tag arguments that determine the sort order.
-struct SortAscending {
-  constexpr bool IsAscending() const { return true; }
-};
-struct SortDescending {
-  constexpr bool IsAscending() const { return false; }
-};
-
-// Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
-// This allows amortizing the allocation over multiple sorts.
-class HWY_CONTRIB_DLLEXPORT Sorter {
- public:
-  Sorter();
-  ~Sorter() { Delete(); }
-
-  // Move-only
-  Sorter(const Sorter&) = delete;
-  Sorter& operator=(const Sorter&) = delete;
-  Sorter(Sorter&& other) {
-    Delete();
-    ptr_ = other.ptr_;
-    other.ptr_ = nullptr;
-  }
-  Sorter& operator=(Sorter&& other) {
-    Delete();
-    ptr_ = other.ptr_;
-    other.ptr_ = nullptr;
-    return *this;
-  }
-
-  // Sorts keys[0, n). Dispatches to the best available instruction set,
-  // and does not allocate memory.
-  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
-  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
-  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
-
-  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
-  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
-  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
-
-  void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const;
-  void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const;
-
-  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const;
-
-  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortAscending) const;
-  void operator()(K64V64* HWY_RESTRICT keys, size_t n, SortDescending) const;
-
-  // For internal use only
-  static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes);
-  static bool HaveFloat64();
-
- private:
-  void Delete();
-
-  template <typename T>
-  T* Get() const {
-    return static_cast<T*>(ptr_);
-  }
-
-  void* ptr_ = nullptr;
-};
-
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_128a.cc b/third_party/highway/hwy/contrib/sort/vqsort_128a.cc
deleted file mode 100644 (file)
index 40daea8..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits128-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
-                uint64_t* HWY_RESTRICT buf) {
-#if VQSORT_ENABLED
-  SortTag<uint64_t> d;
-  detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st;
-  Sort(d, st, keys, num, buf);
-#else
-  (void) keys;
-  (void) num;
-  (void) buf;
-  HWY_ASSERT(0);
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(Sort128Asc);
-}  // namespace
-
-void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(Sort128Asc)
-  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_128d.cc b/third_party/highway/hwy/contrib/sort/vqsort_128d.cc
deleted file mode 100644 (file)
index 357da84..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits128-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
-                 uint64_t* HWY_RESTRICT buf) {
-#if VQSORT_ENABLED
-  SortTag<uint64_t> d;
-  detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st;
-  Sort(d, st, keys, num, buf);
-#else
-  (void) keys;
-  (void) num;
-  (void) buf;
-  HWY_ASSERT(0);
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(Sort128Desc);
-}  // namespace
-
-void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(Sort128Desc)
-  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_f32a.cc
deleted file mode 100644 (file)
index 3856eea..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) {
-  SortTag<float> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<float>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortF32Asc);
-}  // namespace
-
-void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get<float>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_f32d.cc
deleted file mode 100644 (file)
index 7f5f97c..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortF32Desc(float* HWY_RESTRICT keys, size_t num,
-                 float* HWY_RESTRICT buf) {
-  SortTag<float> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<float>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortF32Desc);
-}  // namespace
-
-void Sorter::operator()(float* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get<float>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_f64a.cc
deleted file mode 100644 (file)
index 287d521..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortF64Asc(double* HWY_RESTRICT keys, size_t num,
-                double* HWY_RESTRICT buf) {
-#if HWY_HAVE_FLOAT64
-  SortTag<double> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<double>>> st;
-  Sort(d, st, keys, num, buf);
-#else
-  (void)keys;
-  (void)num;
-  (void)buf;
-  HWY_ASSERT(0);
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortF64Asc);
-}  // namespace
-
-void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get<double>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_f64d.cc
deleted file mode 100644 (file)
index 74d40c1..0000000
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortF64Desc(double* HWY_RESTRICT keys, size_t num,
-                 double* HWY_RESTRICT buf) {
-#if HWY_HAVE_FLOAT64
-  SortTag<double> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<double>>> st;
-  Sort(d, st, keys, num, buf);
-#else
-  (void)keys;
-  (void)num;
-  (void)buf;
-  HWY_ASSERT(0);
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortF64Desc);
-}  // namespace
-
-void Sorter::operator()(double* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get<double>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i16a.cc
deleted file mode 100644 (file)
index ef4bb75..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num,
-                int16_t* HWY_RESTRICT buf) {
-  SortTag<int16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int16_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortI16Asc);
-}  // namespace
-
-void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get<int16_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i16d.cc
deleted file mode 100644 (file)
index 6507ed6..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num,
-                 int16_t* HWY_RESTRICT buf) {
-  SortTag<int16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int16_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortI16Desc);
-}  // namespace
-
-void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get<int16_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i32a.cc
deleted file mode 100644 (file)
index ae65be9..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num,
-                int32_t* HWY_RESTRICT buf) {
-  SortTag<int32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int32_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortI32Asc);
-}  // namespace
-
-void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i32d.cc
deleted file mode 100644 (file)
index 3ce276e..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num,
-                 int32_t* HWY_RESTRICT buf) {
-  SortTag<int32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int32_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortI32Desc);
-}  // namespace
-
-void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_i64a.cc
deleted file mode 100644 (file)
index 901b8ea..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num,
-                int64_t* HWY_RESTRICT buf) {
-  SortTag<int64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<int64_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortI64Asc);
-}  // namespace
-
-void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_i64d.cc
deleted file mode 100644 (file)
index 7713f2e..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num,
-                 int64_t* HWY_RESTRICT buf) {
-  SortTag<int64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<int64_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortI64Desc);
-}  // namespace
-
-void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv128a.cc
deleted file mode 100644 (file)
index 1e02742..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-// clang-format off
-// (avoid line break, which would prevent Copybara rules from matching)
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128a.cc"  //NOLINT
-// clang-format on
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits128-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortKV128Asc(uint64_t* HWY_RESTRICT keys, size_t num,
-                  uint64_t* HWY_RESTRICT buf) {
-#if VQSORT_ENABLED
-  SortTag<uint64_t> d;
-  detail::SharedTraits<detail::Traits128<detail::OrderAscendingKV128>> st;
-  Sort(d, st, keys, num, buf);
-#else
-  (void) keys;
-  (void) num;
-  (void) buf;
-  HWY_ASSERT(0);
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortKV128Asc);
-}  // namespace
-
-void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortKV128Asc)
-  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc b/third_party/highway/hwy/contrib/sort/vqsort_kv128d.cc
deleted file mode 100644 (file)
index 3dd53b5..0000000
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-// clang-format off
-// (avoid line break, which would prevent Copybara rules from matching)
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_kv128d.cc"  //NOLINT
-// clang-format on
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits128-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortKV128Desc(uint64_t* HWY_RESTRICT keys, size_t num,
-                   uint64_t* HWY_RESTRICT buf) {
-#if VQSORT_ENABLED
-  SortTag<uint64_t> d;
-  detail::SharedTraits<detail::Traits128<detail::OrderDescendingKV128>> st;
-  Sort(d, st, keys, num, buf);
-#else
-  (void) keys;
-  (void) num;
-  (void) buf;
-  HWY_ASSERT(0);
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortKV128Desc);
-}  // namespace
-
-void Sorter::operator()(K64V64* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortKV128Desc)
-  (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u16a.cc
deleted file mode 100644 (file)
index 0a97ffa..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num,
-                uint16_t* HWY_RESTRICT buf) {
-  SortTag<uint16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint16_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortU16Asc);
-}  // namespace
-
-void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u16d.cc
deleted file mode 100644 (file)
index 286ebbb..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num,
-                 uint16_t* HWY_RESTRICT buf) {
-  SortTag<uint16_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint16_t>>>
-      st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortU16Desc);
-}  // namespace
-
-void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u32a.cc
deleted file mode 100644 (file)
index b6a69e6..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num,
-                uint32_t* HWY_RESTRICT buf) {
-  SortTag<uint32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint32_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortU32Asc);
-}  // namespace
-
-void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u32d.cc
deleted file mode 100644 (file)
index 38fc1e1..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num,
-                 uint32_t* HWY_RESTRICT buf) {
-  SortTag<uint32_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint32_t>>>
-      st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortU32Desc);
-}  // namespace
-
-void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc b/third_party/highway/hwy/contrib/sort/vqsort_u64a.cc
deleted file mode 100644 (file)
index a29824a..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num,
-                uint64_t* HWY_RESTRICT buf) {
-  SortTag<uint64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderAscending<uint64_t>>> st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortU64Asc);
-}  // namespace
-
-void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
-                        SortAscending) const {
-  HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc b/third_party/highway/hwy/contrib/sort/vqsort_u64d.cc
deleted file mode 100644 (file)
index d692458..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/contrib/sort/vqsort.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// After foreach_target
-#include "hwy/contrib/sort/traits-inl.h"
-#include "hwy/contrib/sort/vqsort-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num,
-                 uint64_t* HWY_RESTRICT buf) {
-  SortTag<uint64_t> d;
-  detail::SharedTraits<detail::TraitsLane<detail::OrderDescending<uint64_t>>>
-      st;
-  Sort(d, st, keys, num, buf);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(SortU64Desc);
-}  // namespace
-
-void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n,
-                        SortDescending) const {
-  HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>());
-}
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/detect_compiler_arch.h b/third_party/highway/hwy/detect_compiler_arch.h
deleted file mode 100644 (file)
index 12c1486..0000000
+++ /dev/null
@@ -1,229 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
-#define HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
-
-// Detects compiler and arch from predefined macros. Zero dependencies for
-// inclusion by foreach_target.h.
-
-// Add to #if conditions to prevent IDE from graying out code.
-#if (defined __CDT_PARSER__) || (defined __INTELLISENSE__) || \
-    (defined Q_CREATOR_RUN) || (defined(__CLANGD__))
-#define HWY_IDE 1
-#else
-#define HWY_IDE 0
-#endif
-
-//------------------------------------------------------------------------------
-// Compiler
-
-// Actual MSVC, not clang-cl, which defines _MSC_VER but doesn't behave like
-// MSVC in other aspects (e.g. HWY_DIAGNOSTICS).
-#if defined(_MSC_VER) && !defined(__clang__)
-#define HWY_COMPILER_MSVC _MSC_VER
-#else
-#define HWY_COMPILER_MSVC 0
-#endif
-
-#if defined(_MSC_VER) && defined(__clang__)
-#define HWY_COMPILER_CLANGCL _MSC_VER
-#else
-#define HWY_COMPILER_CLANGCL 0
-#endif
-
-#ifdef __INTEL_COMPILER
-#define HWY_COMPILER_ICC __INTEL_COMPILER
-#else
-#define HWY_COMPILER_ICC 0
-#endif
-
-#ifdef __INTEL_LLVM_COMPILER
-#define HWY_COMPILER_ICX __INTEL_LLVM_COMPILER
-#else
-#define HWY_COMPILER_ICX 0
-#endif
-
-// HWY_COMPILER_GCC is a generic macro for all compilers implementing the GNU
-// compiler extensions (eg. Clang, Intel...)
-#ifdef __GNUC__
-#define HWY_COMPILER_GCC (__GNUC__ * 100 + __GNUC_MINOR__)
-#else
-#define HWY_COMPILER_GCC 0
-#endif
-
-// Clang or clang-cl, not GCC.
-#ifdef __clang__
-// In case of Apple LLVM (whose version number is unrelated to that of LLVM) or
-// an invalid version number, deduce it from the presence of warnings.
-// Adapted from https://github.com/simd-everywhere/simde/ simde-detect-clang.h.
-#if defined(__APPLE__) || __clang_major__ >= 999
-#if __has_warning("-Wbitwise-instead-of-logical")
-#define HWY_COMPILER_CLANG 1400
-#elif __has_warning("-Wreserved-identifier")
-#define HWY_COMPILER_CLANG 1300
-#elif __has_warning("-Wformat-insufficient-args")
-#define HWY_COMPILER_CLANG 1200
-#elif __has_warning("-Wimplicit-const-int-float-conversion")
-#define HWY_COMPILER_CLANG 1100
-#elif __has_warning("-Wmisleading-indentation")
-#define HWY_COMPILER_CLANG 1000
-#elif defined(__FILE_NAME__)
-#define HWY_COMPILER_CLANG 900
-#elif __has_warning("-Wextra-semi-stmt") || \
-    __has_builtin(__builtin_rotateleft32)
-#define HWY_COMPILER_CLANG 800
-#elif __has_warning("-Wc++98-compat-extra-semi")
-#define HWY_COMPILER_CLANG 700
-#else  // Anything older than 7.0 is not recommended for Highway.
-#define HWY_COMPILER_CLANG 600
-#endif  // __has_warning chain
-#else   // use normal version
-#define HWY_COMPILER_CLANG (__clang_major__ * 100 + __clang_minor__)
-#endif
-#else  // Not clang
-#define HWY_COMPILER_CLANG 0
-#endif
-
-#if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG
-#define HWY_COMPILER_GCC_ACTUAL HWY_COMPILER_GCC
-#else
-#define HWY_COMPILER_GCC_ACTUAL 0
-#endif
-
-// More than one may be nonzero, but we want at least one.
-#if 0 == (HWY_COMPILER_MSVC + HWY_COMPILER_CLANGCL + HWY_COMPILER_ICC + \
-          HWY_COMPILER_GCC + HWY_COMPILER_CLANG)
-#error "Unsupported compiler"
-#endif
-
-// We should only detect one of these (only clang/clangcl overlap)
-#if 1 <                                                                     \
-    (!!HWY_COMPILER_MSVC + !!HWY_COMPILER_ICC + !!HWY_COMPILER_GCC_ACTUAL + \
-     !!(HWY_COMPILER_CLANGCL | HWY_COMPILER_CLANG))
-#error "Detected multiple compilers"
-#endif
-
-#ifdef __has_builtin
-#define HWY_HAS_BUILTIN(name) __has_builtin(name)
-#else
-#define HWY_HAS_BUILTIN(name) 0
-#endif
-
-#ifdef __has_attribute
-#define HWY_HAS_ATTRIBUTE(name) __has_attribute(name)
-#else
-#define HWY_HAS_ATTRIBUTE(name) 0
-#endif
-
-#ifdef __has_feature
-#define HWY_HAS_FEATURE(name) __has_feature(name)
-#else
-#define HWY_HAS_FEATURE(name) 0
-#endif
-
-//------------------------------------------------------------------------------
-// Architecture
-
-#if defined(__i386__) || defined(_M_IX86)
-#define HWY_ARCH_X86_32 1
-#else
-#define HWY_ARCH_X86_32 0
-#endif
-
-#if defined(__x86_64__) || defined(_M_X64)
-#define HWY_ARCH_X86_64 1
-#else
-#define HWY_ARCH_X86_64 0
-#endif
-
-#if HWY_ARCH_X86_32 && HWY_ARCH_X86_64
-#error "Cannot have both x86-32 and x86-64"
-#endif
-
-#if HWY_ARCH_X86_32 || HWY_ARCH_X86_64
-#define HWY_ARCH_X86 1
-#else
-#define HWY_ARCH_X86 0
-#endif
-
-#if defined(__powerpc64__) || defined(_M_PPC)
-#define HWY_ARCH_PPC 1
-#else
-#define HWY_ARCH_PPC 0
-#endif
-
-#if defined(__ARM_ARCH_ISA_A64) || defined(__aarch64__) || defined(_M_ARM64)
-#define HWY_ARCH_ARM_A64 1
-#else
-#define HWY_ARCH_ARM_A64 0
-#endif
-
-#if (defined(__ARM_ARCH) && __ARM_ARCH == 7) || (defined(_M_ARM) && _M_ARM == 7)
-#define HWY_ARCH_ARM_V7 1
-#else
-#define HWY_ARCH_ARM_V7 0
-#endif
-
-#if HWY_ARCH_ARM_A64 && HWY_ARCH_ARM_V7
-#error "Cannot have both A64 and V7"
-#endif
-
-// Any *supported* version of Arm, i.e. 7 or later
-#if HWY_ARCH_ARM_A64 || HWY_ARCH_ARM_V7
-#define HWY_ARCH_ARM 1
-#else
-#define HWY_ARCH_ARM 0
-#endif
-
-// Older than v7 (e.g. armel aka Arm v5), in which case we do not support SIMD.
-#if (defined(__arm__) || defined(_M_ARM)) && !HWY_ARCH_ARM
-#define HWY_ARCH_ARM_OLD 1
-#else
-#define HWY_ARCH_ARM_OLD 0
-#endif
-
-#if defined(__EMSCRIPTEN__) || defined(__wasm__) || defined(__WASM__)
-#define HWY_ARCH_WASM 1
-#else
-#define HWY_ARCH_WASM 0
-#endif
-
-#ifdef __riscv
-#define HWY_ARCH_RVV 1
-#else
-#define HWY_ARCH_RVV 0
-#endif
-
-// It is an error to detect multiple architectures at the same time, but OK to
-// detect none of the above.
-#if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_ARM_OLD + \
-     HWY_ARCH_WASM + HWY_ARCH_RVV) > 1
-#error "Must not detect more than one architecture"
-#endif
-
-#if defined(_WIN32) || defined(_WIN64)
-#define HWY_OS_WIN 1
-#else
-#define HWY_OS_WIN 0
-#endif
-
-#if defined(linux) || defined(__linux__)
-#define HWY_OS_LINUX 1
-#else
-#define HWY_OS_LINUX 0
-#endif
-
-#endif  // HIGHWAY_HWY_DETECT_COMPILER_ARCH_H_
diff --git a/third_party/highway/hwy/detect_targets.h b/third_party/highway/hwy/detect_targets.h
deleted file mode 100644 (file)
index 0071c7b..0000000
+++ /dev/null
@@ -1,460 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
-#define HIGHWAY_HWY_DETECT_TARGETS_H_
-
-// Defines targets and chooses which to enable.
-
-#include "hwy/detect_compiler_arch.h"
-
-//------------------------------------------------------------------------------
-// Optional configuration
-
-// See ../quick_reference.md for documentation of these macros.
-
-// Uncomment to override the default baseline determined from predefined macros:
-// #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
-
-// Uncomment to override the default blocklist:
-// #define HWY_BROKEN_TARGETS HWY_AVX3
-
-// Uncomment to definitely avoid generating those target(s):
-// #define HWY_DISABLED_TARGETS HWY_SSE4
-
-// Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
-// AVX2 target for VMs which support AVX2 but not the other instruction sets)
-// #define HWY_DISABLE_BMI2_FMA
-
-// Uncomment to enable SSSE3/SSE4 on MSVC even if AVX is not enabled
-// #define HWY_WANT_SSSE3
-// #define HWY_WANT_SSE4
-
-//------------------------------------------------------------------------------
-// Targets
-
-// Unique bit value for each target. A lower value is "better" (e.g. more lanes)
-// than a higher value within the same group/platform - see HWY_STATIC_TARGET.
-//
-// All values are unconditionally defined so we can test HWY_TARGETS without
-// first checking the HWY_ARCH_*.
-//
-// The C99 preprocessor evaluates #if expressions using intmax_t types. This
-// holds at least 64 bits in practice (verified 2022-07-18 via Godbolt on
-// 32-bit clang/GCC/MSVC compilers for x86/Arm7/AArch32/RISC-V/WASM). We now
-// avoid overflow when computing HWY_TARGETS (subtracting one instead of
-// left-shifting 2^62), but still do not use bit 63 because it is the sign bit.
-
-// --------------------------- x86: 15 targets (+ one fallback)
-// Bits 0..6 reserved (7 targets)
-// Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VPOPCNTDQ, VBMI, VBMI2,
-// VAES, BITALG). Later to be added: BF16 (Cooper Lake). VP2INTERSECT is only in
-// Tiger Lake? We do not yet have uses for GFNI.
-#define HWY_AVX3_DL (1LL << 7)  // see HWY_WANT_AVX3_DL below
-#define HWY_AVX3 (1LL << 8)
-#define HWY_AVX2 (1LL << 9)
-// Bit 10: reserved for AVX
-#define HWY_SSE4 (1LL << 11)
-#define HWY_SSSE3 (1LL << 12)
-// Bits 13..14 reserved for SSE3 or SSE2 (2 targets)
-// The highest bit in the HWY_TARGETS mask that a x86 target can have. Used for
-// dynamic dispatch. All x86 target bits must be lower or equal to
-// (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
-// HWY_MAX_DYNAMIC_TARGETS in total.
-#define HWY_HIGHEST_TARGET_BIT_X86 14
-
-// --------------------------- Arm: 15 targets (+ one fallback)
-// Bits 15..23 reserved (9 targets)
-#define HWY_SVE2_128 (1LL << 24)  // specialized target (e.g. Arm N2)
-#define HWY_SVE_256 (1LL << 25)   // specialized target (e.g. Arm V1)
-#define HWY_SVE2 (1LL << 26)
-#define HWY_SVE (1LL << 27)
-#define HWY_NEON (1LL << 28)  // On A64, includes/requires AES
-// Bit 29 reserved (Helium?)
-#define HWY_HIGHEST_TARGET_BIT_ARM 29
-
-// --------------------------- RISC-V: 9 targets (+ one fallback)
-// Bits 30..36 reserved (7 targets)
-#define HWY_RVV (1LL << 37)
-// Bit 38 reserved
-#define HWY_HIGHEST_TARGET_BIT_RVV 38
-
-// --------------------------- Future expansion: 4 targets
-// Bits 39..42 reserved
-
-
-// --------------------------- IBM Power: 9 targets (+ one fallback)
-// Bits 43..48 reserved (6 targets)
-#define HWY_PPC8 (1LL << 49)  // v2.07 or 3
-// Bits 50..51 reserved for prior VSX/AltiVec (2 targets)
-#define HWY_HIGHEST_TARGET_BIT_PPC 51
-
-// --------------------------- WebAssembly: 9 targets (+ one fallback)
-// Bits 52..57 reserved (6 targets)
-#define HWY_WASM_EMU256 (1LL << 58)  // Experimental
-#define HWY_WASM (1LL << 59)
-// Bits 60 reserved
-#define HWY_HIGHEST_TARGET_BIT_WASM 60
-
-// --------------------------- Emulation: 2 targets
-
-#define HWY_EMU128 (1LL << 61)
-// We do not add/left-shift, so this will not overflow to a negative number.
-#define HWY_SCALAR (1LL << 62)
-#define HWY_HIGHEST_TARGET_BIT_SCALAR 62
-
-// Do not use bit 63 - would be confusing to have negative numbers.
-
-//------------------------------------------------------------------------------
-// Set default blocklists
-
-// Disabled means excluded from enabled at user's request. A separate config
-// macro allows disabling without deactivating the blocklist below.
-#ifndef HWY_DISABLED_TARGETS
-#define HWY_DISABLED_TARGETS 0
-#endif
-
-// Broken means excluded from enabled due to known compiler issues. Allow the
-// user to override this blocklist without any guarantee of success.
-#ifndef HWY_BROKEN_TARGETS
-
-// x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
-// SSE4 codegen (possibly only for msan), so disable all those targets.
-#if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
-#define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
-// This entails a major speed reduction, so warn unless the user explicitly
-// opts in to scalar-only.
-#if !defined(HWY_COMPILE_ONLY_SCALAR)
-#pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
-#endif
-
-// 32-bit may fail to compile AVX2/3.
-#elif HWY_ARCH_X86_32
-#define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
-
-// MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
-#elif HWY_COMPILER_MSVC != 0
-#define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
-
-// armv7be has not been tested and is not yet supported.
-#elif HWY_ARCH_ARM_V7 &&          \
-    (defined(__ARM_BIG_ENDIAN) || \
-     (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
-#define HWY_BROKEN_TARGETS (HWY_NEON)
-
-// SVE[2] require recent clang or gcc versions.
-#elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \
-    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000)
-#define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2 | HWY_SVE_256 | HWY_SVE2_128)
-
-#else
-#define HWY_BROKEN_TARGETS 0
-#endif
-
-#endif  // HWY_BROKEN_TARGETS
-
-// Enabled means not disabled nor blocklisted.
-#define HWY_ENABLED(targets) \
-  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
-
-// Opt-out for EMU128 (affected by a GCC <12 bug on ARMv7: see
-// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106187). This is separate from
-// HWY_BROKEN_TARGETS because it affects the fallback target, which must always
-// be enabled. If 1, we instead choose HWY_SCALAR even without
-// HWY_COMPILE_ONLY_SCALAR being set.
-#if !defined(HWY_BROKEN_EMU128)  // allow overriding
-#if HWY_ARCH_ARM_V7 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1140
-#define HWY_BROKEN_EMU128 1
-#else
-#define HWY_BROKEN_EMU128 0
-#endif
-#endif  // HWY_BROKEN_EMU128
-
-//------------------------------------------------------------------------------
-// Detect baseline targets using predefined macros
-
-// Baseline means the targets for which the compiler is allowed to generate
-// instructions, implying the target CPU would have to support them. This does
-// not take the blocklist into account.
-
-#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
-#define HWY_BASELINE_SCALAR HWY_SCALAR
-#else
-#define HWY_BASELINE_SCALAR HWY_EMU128
-#endif
-
-// Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
-// HWY_TARGET == HWY_BASELINE_SCALAR.
-
-#if HWY_ARCH_WASM && defined(__wasm_simd128__)
-#if defined(HWY_WANT_WASM2)
-#define HWY_BASELINE_WASM HWY_WASM_EMU256
-#else
-#define HWY_BASELINE_WASM HWY_WASM
-#endif  // HWY_WANT_WASM2
-#else
-#define HWY_BASELINE_WASM 0
-#endif
-
-// Avoid choosing the PPC target until we have an implementation.
-#if HWY_ARCH_PPC && defined(__VSX__) && 0
-#define HWY_BASELINE_PPC8 HWY_PPC8
-#else
-#define HWY_BASELINE_PPC8 0
-#endif
-
-#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
-#define HWY_BASELINE_SVE2 HWY_SVE2
-#else
-#define HWY_BASELINE_SVE2 0
-#endif
-
-#if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
-// Baseline targets can be used unconditionally, which does not apply to
-// HWY_SVE_256 because it requires a vector size of 256 bits. Including SVE_256
-// in the baseline would also disable all 'worse' targets (including SVE and
-// SVE2) in non-test builds. Therefore we instead add HWY_SVE_256 to
-// HWY_ATTAINABLE_TARGETS below.
-#define HWY_BASELINE_SVE HWY_SVE
-#else
-#define HWY_BASELINE_SVE 0
-#endif
-
-// GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
-#if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
-#define HWY_BASELINE_NEON HWY_NEON
-#else
-#define HWY_BASELINE_NEON 0
-#endif
-
-// Special handling for MSVC because it has fewer predefined macros:
-#if HWY_COMPILER_MSVC
-
-// 1) We can only be sure SSSE3/SSE4 are enabled if AVX is:
-//    https://stackoverflow.com/questions/18563978/.
-#if defined(__AVX__)
-#define HWY_CHECK_SSSE3 1
-#define HWY_CHECK_SSE4 1
-#else
-#define HWY_CHECK_SSSE3 0
-#define HWY_CHECK_SSE4 0
-#endif
-
-// 2) Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
-//    PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
-#define HWY_CHECK_PCLMUL_AES 1
-#define HWY_CHECK_BMI2_FMA 1
-#define HWY_CHECK_F16C 1
-
-#else  // non-MSVC
-
-#if defined(__SSSE3__)
-#define HWY_CHECK_SSSE3 1
-#else
-#define HWY_CHECK_SSSE3 0
-#endif
-
-#if defined(__SSE4_1__) && defined(__SSE4_2__)
-#define HWY_CHECK_SSE4 1
-#else
-#define HWY_CHECK_SSE4 0
-#endif
-
-// If these are disabled, they should not gate the availability of SSE4/AVX2.
-#if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
-#define HWY_CHECK_PCLMUL_AES 1
-#else
-#define HWY_CHECK_PCLMUL_AES 0
-#endif
-
-#if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
-#define HWY_CHECK_BMI2_FMA 1
-#else
-#define HWY_CHECK_BMI2_FMA 0
-#endif
-
-#if defined(HWY_DISABLE_F16C) || defined(__F16C__)
-#define HWY_CHECK_F16C 1
-#else
-#define HWY_CHECK_F16C 0
-#endif
-
-#endif  // non-MSVC
-
-#if HWY_ARCH_X86 && (HWY_WANT_SSSE3 || HWY_CHECK_SSSE3)
-#define HWY_BASELINE_SSSE3 HWY_SSSE3
-#else
-#define HWY_BASELINE_SSSE3 0
-#endif
-
-#if HWY_ARCH_X86 && (HWY_WANT_SSE4 || (HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES))
-#define HWY_BASELINE_SSE4 HWY_SSE4
-#else
-#define HWY_BASELINE_SSE4 0
-#endif
-
-#if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
-    defined(__AVX2__)
-#define HWY_BASELINE_AVX2 HWY_AVX2
-#else
-#define HWY_BASELINE_AVX2 0
-#endif
-
-// Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
-#if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
-    defined(__AVX512DQ__) && defined(__AVX512VL__)
-#define HWY_BASELINE_AVX3 HWY_AVX3
-#else
-#define HWY_BASELINE_AVX3 0
-#endif
-
-// TODO(janwas): not yet known whether these will be set by MSVC
-#if HWY_BASELINE_AVX3 != 0 && defined(__AVXVNNI__) && defined(__VAES__) && \
-    defined(__VPCLMULQDQ__) && defined(__AVX512VBMI__) &&                  \
-    defined(__AVX512VBMI2__) && defined(__AVX512VPOPCNTDQ__) &&            \
-    defined(__AVX512BITALG__)
-#define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
-#else
-#define HWY_BASELINE_AVX3_DL 0
-#endif
-
-#if HWY_ARCH_RVV && defined(__riscv_vector)
-#define HWY_BASELINE_RVV HWY_RVV
-#else
-#define HWY_BASELINE_RVV 0
-#endif
-
-// Allow the user to override this without any guarantee of success.
-#ifndef HWY_BASELINE_TARGETS
-#define HWY_BASELINE_TARGETS                                     \
-  (HWY_BASELINE_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | \
-   HWY_BASELINE_SVE2 | HWY_BASELINE_SVE | HWY_BASELINE_NEON |    \
-   HWY_BASELINE_SSSE3 | HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 |  \
-   HWY_BASELINE_AVX3 | HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
-#endif  // HWY_BASELINE_TARGETS
-
-//------------------------------------------------------------------------------
-// Choose target for static dispatch
-
-#define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
-#if HWY_ENABLED_BASELINE == 0
-#error "At least one baseline target must be defined and enabled"
-#endif
-
-// Best baseline, used for static dispatch. This is the least-significant 1-bit
-// within HWY_ENABLED_BASELINE and lower bit values imply "better".
-#define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
-
-// Start by assuming static dispatch. If we later use dynamic dispatch, this
-// will be defined to other targets during the multiple-inclusion, and finally
-// return to the initial value. Defining this outside begin/end_target ensures
-// inl headers successfully compile by themselves (required by Bazel).
-#define HWY_TARGET HWY_STATIC_TARGET
-
-//------------------------------------------------------------------------------
-// Choose targets for dynamic dispatch according to one of four policies
-
-#if 1 < (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_EMU128) + \
-         defined(HWY_COMPILE_ONLY_STATIC))
-#error "Can only define one of HWY_COMPILE_ONLY_{SCALAR|EMU128|STATIC} - bug?"
-#endif
-// Defining one of HWY_COMPILE_ONLY_* will trump HWY_COMPILE_ALL_ATTAINABLE.
-
-// x86 compilers generally allow runtime dispatch. On Arm, currently only GCC
-// does, and we require Linux to detect CPU capabilities.
-#if HWY_ARCH_X86 || (HWY_ARCH_ARM && HWY_COMPILER_GCC_ACTUAL && HWY_OS_LINUX)
-#define HWY_HAVE_RUNTIME_DISPATCH 1
-#else
-#define HWY_HAVE_RUNTIME_DISPATCH 0
-#endif
-
-// AVX3_DL is not widely available yet. To reduce code size and compile time,
-// only include it in the set of attainable targets (for dynamic dispatch) if
-// the user opts in, OR it is in the baseline (we check whether enabled below).
-#if defined(HWY_WANT_AVX3_DL) || (HWY_BASELINE & HWY_AVX3_DL)
-#define HWY_ATTAINABLE_AVX3_DL HWY_AVX3_DL
-#else
-#define HWY_ATTAINABLE_AVX3_DL 0
-#endif
-
-#if HWY_ARCH_ARM_A64 && \
-    ((HWY_ENABLED_BASELINE & HWY_SVE) || HWY_HAVE_RUNTIME_DISPATCH)
-#define HWY_ATTAINABLE_SVE HWY_ENABLED(HWY_SVE | HWY_SVE_256)
-#else
-#define HWY_ATTAINABLE_SVE 0
-#endif
-
-#if HWY_ARCH_ARM_A64 && \
-    ((HWY_ENABLED_BASELINE & HWY_SVE2) || HWY_HAVE_RUNTIME_DISPATCH)
-#define HWY_ATTAINABLE_SVE2 HWY_ENABLED(HWY_SVE2 | HWY_SVE2_128)
-#else
-#define HWY_ATTAINABLE_SVE2 0
-#endif
-
-// Attainable means enabled and the compiler allows intrinsics (even when not
-// allowed to autovectorize). Used in 3 and 4.
-#if HWY_ARCH_X86
-#define HWY_ATTAINABLE_TARGETS                                        \
-  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | \
-              HWY_AVX3 | HWY_ATTAINABLE_AVX3_DL)
-#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
-#define HWY_ATTAINABLE_TARGETS                                      \
-  HWY_ENABLED(HWY_BASELINE_SCALAR | HWY_NEON | HWY_ATTAINABLE_SVE | \
-              HWY_ATTAINABLE_SVE2)
-#else
-#define HWY_ATTAINABLE_TARGETS \
-  (HWY_ENABLED_BASELINE | HWY_ATTAINABLE_SVE | HWY_ATTAINABLE_SVE2)
-#endif
-
-// 1) For older compilers: avoid SIMD intrinsics, but still support all ops.
-#if defined(HWY_COMPILE_ONLY_EMU128) && !HWY_BROKEN_EMU128
-#undef HWY_STATIC_TARGET
-#define HWY_STATIC_TARGET HWY_EMU128  // override baseline
-#define HWY_TARGETS HWY_EMU128
-
-// 1b) HWY_SCALAR is less capable than HWY_EMU128 (which supports all ops), but
-// we currently still support it for backwards compatibility.
-#elif defined(HWY_COMPILE_ONLY_SCALAR) || \
-    (defined(HWY_COMPILE_ONLY_EMU128) && HWY_BROKEN_EMU128)
-#undef HWY_STATIC_TARGET
-#define HWY_STATIC_TARGET HWY_SCALAR  // override baseline
-#define HWY_TARGETS HWY_SCALAR
-
-// 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
-#elif defined(HWY_COMPILE_ONLY_STATIC)
-#define HWY_TARGETS HWY_STATIC_TARGET
-
-// 3) For tests: include all attainable targets (in particular: scalar)
-#elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
-#define HWY_TARGETS HWY_ATTAINABLE_TARGETS
-
-// 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
-// excluding superseded targets, in particular scalar. Note: HWY_STATIC_TARGET
-// may be 2^62 (HWY_SCALAR), so we must not left-shift/add it. Subtracting one
-// sets all lower bits (better targets), then we also include the static target.
-#else
-#define HWY_TARGETS \
-  (HWY_ATTAINABLE_TARGETS & ((HWY_STATIC_TARGET - 1LL) | HWY_STATIC_TARGET))
-
-#endif  // target policy
-
-// HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
-// one of the dynamic targets. This also implies HWY_TARGETS != 0 and
-// (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
-#if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
-#error "Logic error: best baseline should be included in dynamic targets"
-#endif
-
-#endif  // HIGHWAY_HWY_DETECT_TARGETS_H_
diff --git a/third_party/highway/hwy/examples/benchmark.cc b/third_party/highway/hwy/examples/benchmark.cc
deleted file mode 100644 (file)
index 8ab8108..0000000
+++ /dev/null
@@ -1,254 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include <memory>
-#include <numeric>  // iota
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// Must come after foreach_target.h to avoid redefinition errors.
-#include "hwy/aligned_allocator.h"
-#include "hwy/highway.h"
-#include "hwy/nanobenchmark.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// These templates are not found via ADL.
-#if HWY_TARGET != HWY_SCALAR
-using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
-#endif
-
-class TwoArray {
- public:
-  // Must be a multiple of the vector lane count * 8.
-  static size_t NumItems() { return 3456; }
-
-  TwoArray()
-      : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
-    // = 1, but compiler doesn't know
-    const float init = static_cast<float>(Unpredictable1());
-    std::iota(a_.get(), a_.get() + NumItems(), init);
-    std::iota(b_, b_ + NumItems(), init);
-  }
-
- protected:
-  AlignedFreeUniquePtr<float[]> a_;
-  float* b_;
-};
-
-// Measures durations, verifies results, prints timings.
-template <class Benchmark>
-void RunBenchmark(const char* caption) {
-  printf("%10s: ", caption);
-  const size_t kNumInputs = 1;
-  const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
-  const FuncInput inputs[kNumInputs] = {num_items};
-  Result results[kNumInputs];
-
-  Benchmark benchmark;
-
-  Params p;
-  p.verbose = false;
-  p.max_evals = 7;
-  p.target_rel_mad = 0.002;
-  const size_t num_results = MeasureClosure(
-      [&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
-      kNumInputs, results, p);
-  if (num_results != kNumInputs) {
-    fprintf(stderr, "MeasureClosure failed.\n");
-  }
-
-  benchmark.Verify(num_items);
-
-  for (size_t i = 0; i < num_results; ++i) {
-    const double cycles_per_item =
-        results[i].ticks / static_cast<double>(results[i].input);
-    const double mad = results[i].variability * cycles_per_item;
-    printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
-           static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
-  }
-}
-
-void Intro() {
-  const float in[16] = {1, 2, 3, 4, 5, 6};
-  float out[16];
-  const ScalableTag<float> d;  // largest possible vector
-  for (size_t i = 0; i < 16; i += Lanes(d)) {
-    const auto vec = LoadU(d, in + i);  // no alignment requirement
-    auto result = Mul(vec, vec);
-    result = Add(result, result);  // can update if not const
-    StoreU(result, d, out + i);
-  }
-  printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
-}
-
-// BEGINNER: dot product
-// 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
-class BenchmarkDot : public TwoArray {
- public:
-  BenchmarkDot() : dot_{-1.0f} {}
-
-  FuncOutput operator()(const size_t num_items) {
-    const ScalableTag<float> d;
-    const size_t N = Lanes(d);
-    using V = decltype(Zero(d));
-    // Compiler doesn't make independent sum* accumulators, so unroll manually.
-    // We cannot use an array because V might be a sizeless type. For reasonable
-    // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
-    V sum0 = Zero(d);
-    V sum1 = Zero(d);
-    V sum2 = Zero(d);
-    V sum3 = Zero(d);
-    const float* const HWY_RESTRICT pa = &a_[0];
-    const float* const HWY_RESTRICT pb = b_;
-    for (size_t i = 0; i < num_items; i += 4 * N) {
-      const auto a0 = Load(d, pa + i + 0 * N);
-      const auto b0 = Load(d, pb + i + 0 * N);
-      sum0 = MulAdd(a0, b0, sum0);
-      const auto a1 = Load(d, pa + i + 1 * N);
-      const auto b1 = Load(d, pb + i + 1 * N);
-      sum1 = MulAdd(a1, b1, sum1);
-      const auto a2 = Load(d, pa + i + 2 * N);
-      const auto b2 = Load(d, pb + i + 2 * N);
-      sum2 = MulAdd(a2, b2, sum2);
-      const auto a3 = Load(d, pa + i + 3 * N);
-      const auto b3 = Load(d, pb + i + 3 * N);
-      sum3 = MulAdd(a3, b3, sum3);
-    }
-    // Reduction tree: sum of all accumulators by pairs into sum0.
-    sum0 = Add(sum0, sum1);
-    sum2 = Add(sum2, sum3);
-    sum0 = Add(sum0, sum2);
-    dot_ = GetLane(SumOfLanes(d, sum0));
-    return static_cast<FuncOutput>(dot_);
-  }
-  void Verify(size_t num_items) {
-    if (dot_ == -1.0f) {
-      fprintf(stderr, "Dot: must call Verify after benchmark");
-      abort();
-    }
-
-    const float expected =
-        std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
-    const float rel_err = std::abs(expected - dot_) / expected;
-    if (rel_err > 1.1E-6f) {
-      fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
-              rel_err);
-      abort();
-    }
-  }
-
- private:
-  float dot_;  // for Verify
-};
-
-// INTERMEDIATE: delta coding
-// 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
-struct BenchmarkDelta : public TwoArray {
-  FuncOutput operator()(const size_t num_items) const {
-#if HWY_TARGET == HWY_SCALAR
-    b_[0] = a_[0];
-    for (size_t i = 1; i < num_items; ++i) {
-      b_[i] = a_[i] - a_[i - 1];
-    }
-#elif HWY_CAP_GE256
-    // Larger vectors are split into 128-bit blocks, easiest to use the
-    // unaligned load support to shift between them.
-    const ScalableTag<float> df;
-    const size_t N = Lanes(df);
-    size_t i;
-    b_[0] = a_[0];
-    for (i = 1; i < N; ++i) {
-      b_[i] = a_[i] - a_[i - 1];
-    }
-    for (; i < num_items; i += N) {
-      const auto a = Load(df, &a_[i]);
-      const auto shifted = LoadU(df, &a_[i - 1]);
-      Store(a - shifted, df, &b_[i]);
-    }
-#else  // 128-bit
-    // Slightly better than unaligned loads
-    const HWY_CAPPED(float, 4) df;
-    const size_t N = Lanes(df);
-    size_t i;
-    b_[0] = a_[0];
-    for (i = 1; i < N; ++i) {
-      b_[i] = a_[i] - a_[i - 1];
-    }
-    auto prev = Load(df, &a_[0]);
-    for (; i < num_items; i += Lanes(df)) {
-      const auto a = Load(df, &a_[i]);
-      const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
-      prev = a;
-      Store(Sub(a, shifted), df, &b_[i]);
-    }
-#endif
-    return static_cast<FuncOutput>(b_[num_items - 1]);
-  }
-
-  void Verify(size_t num_items) {
-    for (size_t i = 0; i < num_items; ++i) {
-      const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
-      const float err = std::abs(expected - b_[i]);
-      if (err > 1E-6f) {
-        fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
-      }
-    }
-  }
-};
-
-void RunBenchmarks() {
-  Intro();
-  printf("------------------------ %s\n", TargetName(HWY_TARGET));
-  RunBenchmark<BenchmarkDot>("dot");
-  RunBenchmark<BenchmarkDelta>("delta");
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-HWY_EXPORT(RunBenchmarks);
-
-void Run() {
-  for (int64_t target : SupportedAndGeneratedTargets()) {
-    SetSupportedTargetsForTest(target);
-    HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
-  }
-  SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
-}
-
-}  // namespace hwy
-
-int main(int /*argc*/, char** /*argv*/) {
-  hwy::Run();
-  return 0;
-}
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/examples/skeleton-inl.h b/third_party/highway/hwy/examples/skeleton-inl.h
deleted file mode 100644 (file)
index 8aec33e..0000000
+++ /dev/null
@@ -1,66 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Demo of functions that might be called from multiple SIMD modules (either
-// other -inl.h files, or a .cc file between begin/end_target-inl). This is
-// optional - all SIMD code can reside in .cc files. However, this allows
-// splitting code into different files while still inlining instead of requiring
-// calling through function pointers.
-
-// Per-target include guard. This is only required when using dynamic dispatch,
-// i.e. including foreach_target.h. For static dispatch, a normal include
-// guard would be fine because the header is only compiled once.
-#if defined(HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_) == defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
-#undef HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
-#else
-#define HIGHWAY_HWY_EXAMPLES_SKELETON_INL_H_
-#endif
-
-// It is fine to #include normal or *-inl headers.
-#include <stddef.h>
-
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace skeleton {
-namespace HWY_NAMESPACE {
-
-// Highway ops reside here; ADL does not find templates nor builtins.
-namespace hn = hwy::HWY_NAMESPACE;
-
-// Example of a type-agnostic (caller-specified lane type) and width-agnostic
-// (uses best available instruction set) function in a header.
-//
-// Computes x[i] = mul_array[i] * x_array[i] + add_array[i] for i < size.
-template <class D, typename T>
-HWY_MAYBE_UNUSED void MulAddLoop(const D d, const T* HWY_RESTRICT mul_array,
-                                 const T* HWY_RESTRICT add_array,
-                                 const size_t size, T* HWY_RESTRICT x_array) {
-  for (size_t i = 0; i < size; i += hn::Lanes(d)) {
-    const auto mul = hn::Load(d, mul_array + i);
-    const auto add = hn::Load(d, add_array + i);
-    auto x = hn::Load(d, x_array + i);
-    x = hn::MulAdd(mul, x, add);
-    hn::Store(x, d, x_array + i);
-  }
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace skeleton
-HWY_AFTER_NAMESPACE();
-
-#endif  // include guard
diff --git a/third_party/highway/hwy/examples/skeleton.cc b/third_party/highway/hwy/examples/skeleton.cc
deleted file mode 100644 (file)
index 2e820b6..0000000
+++ /dev/null
@@ -1,121 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/examples/skeleton.h"
-
-#include <stdio.h>
-
-// >>>> for dynamic dispatch only, skip if you want static dispatch
-
-// First undef to prevent error when re-included.
-#undef HWY_TARGET_INCLUDE
-// For dynamic dispatch, specify the name of the current file (unfortunately
-// __FILE__ is not reliable) so that foreach_target.h can re-include it.
-#define HWY_TARGET_INCLUDE "hwy/examples/skeleton.cc"
-// Generates code for each enabled target by re-including this source file.
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// <<<< end of dynamic dispatch
-
-// Must come after foreach_target.h to avoid redefinition errors.
-#include "hwy/highway.h"
-
-// Optional, can instead add HWY_ATTR to all functions.
-HWY_BEFORE_NAMESPACE();
-
-namespace skeleton {
-// This namespace name is unique per target, which allows code for multiple
-// targets to co-exist in the same translation unit. Required when using dynamic
-// dispatch, otherwise optional.
-namespace HWY_NAMESPACE {
-
-// Highway ops reside here; ADL does not find templates nor builtins.
-namespace hn = hwy::HWY_NAMESPACE;
-
-// Computes log2 by converting to a vector of floats. Compiled once per target.
-template <class DF>
-HWY_ATTR_NO_MSAN void OneFloorLog2(const DF df,
-                                   const uint8_t* HWY_RESTRICT values,
-                                   uint8_t* HWY_RESTRICT log2) {
-  // Type tags for converting to other element types (Rebind = same count).
-  const hn::RebindToSigned<DF> d32;
-  const hn::Rebind<uint8_t, DF> d8;
-
-  const auto u8 = hn::Load(d8, values);
-  const auto bits = hn::BitCast(d32, hn::ConvertTo(df, hn::PromoteTo(d32, u8)));
-  const auto exponent = hn::Sub(hn::ShiftRight<23>(bits), hn::Set(d32, 127));
-  hn::Store(hn::DemoteTo(d8, exponent), d8, log2);
-}
-
-void CodepathDemo() {
-  // Highway defaults to portability, but per-target codepaths may be selected
-  // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
-#if HWY_HAVE_INTEGER64
-  const char* gather = "Has int64";
-#else
-  const char* gather = "No int64";
-#endif
-  printf("Target %s: %s\n", hwy::TargetName(HWY_TARGET), gather);
-}
-
-void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count,
-               uint8_t* HWY_RESTRICT log2) {
-  CodepathDemo();
-
-  const hn::ScalableTag<float> df;
-  const size_t N = hn::Lanes(df);
-  size_t i = 0;
-  for (; i + N <= count; i += N) {
-    OneFloorLog2(df, values + i, log2 + i);
-  }
-  for (; i < count; ++i) {
-    hn::CappedTag<float, 1> d1;
-    OneFloorLog2(d1, values + i, log2 + i);
-  }
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace skeleton
-HWY_AFTER_NAMESPACE();
-
-// The table of pointers to the various implementations in HWY_NAMESPACE must
-// be compiled only once (foreach_target #includes this file multiple times).
-// HWY_ONCE is true for only one of these 'compilation passes'.
-#if HWY_ONCE
-
-namespace skeleton {
-
-// This macro declares a static array used for dynamic dispatch; it resides in
-// the same outer namespace that contains FloorLog2.
-HWY_EXPORT(FloorLog2);
-
-// This function is optional and only needed in the case of exposing it in the
-// header file. Otherwise using HWY_DYNAMIC_DISPATCH(FloorLog2) in this module
-// is equivalent to inlining this function.
-HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
-                                 const size_t count,
-                                 uint8_t* HWY_RESTRICT out) {
-  // This must reside outside of HWY_NAMESPACE because it references (calls the
-  // appropriate one from) the per-target implementations there.
-  // For static dispatch, use HWY_STATIC_DISPATCH.
-  return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out);
-}
-
-// Optional: anything to compile only once, e.g. non-SIMD implementations of
-// public functions provided by this module, can go inside #if HWY_ONCE.
-
-}  // namespace skeleton
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/examples/skeleton.h b/third_party/highway/hwy/examples/skeleton.h
deleted file mode 100644 (file)
index 381ac69..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Demo interface to target-specific code in skeleton.cc
-
-// Normal header with include guard and namespace.
-#ifndef HIGHWAY_HWY_EXAMPLES_SKELETON_H_
-#define HIGHWAY_HWY_EXAMPLES_SKELETON_H_
-
-#include <stddef.h>
-
-// Platform-specific definitions used for declaring an interface, independent of
-// the SIMD instruction set.
-#include "hwy/base.h"  // HWY_RESTRICT
-
-namespace skeleton {
-
-// Computes base-2 logarithm by converting to float. Supports dynamic dispatch.
-HWY_DLLEXPORT void CallFloorLog2(const uint8_t* HWY_RESTRICT in,
-                                 const size_t count, uint8_t* HWY_RESTRICT out);
-
-}  // namespace skeleton
-
-#endif  // HIGHWAY_HWY_EXAMPLES_SKELETON_H_
diff --git a/third_party/highway/hwy/examples/skeleton_test.cc b/third_party/highway/hwy/examples/skeleton_test.cc
deleted file mode 100644 (file)
index c7c26bf..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Example of unit test for the "skeleton" library.
-
-#include "hwy/examples/skeleton.h"
-
-#include <stdio.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "examples/skeleton_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-
-// Must come after foreach_target.h to avoid redefinition errors.
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-// Optional: factor out parts of the implementation into *-inl.h
-// (must also come after foreach_target.h to avoid redefinition errors)
-#include "hwy/examples/skeleton-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace skeleton {
-namespace HWY_NAMESPACE {
-
-namespace hn = hwy::HWY_NAMESPACE;
-
-// Calls function defined in skeleton.cc.
-struct TestFloorLog2 {
-  template <class T, class DF>
-  HWY_NOINLINE void operator()(T /*unused*/, DF df) {
-    const size_t count = 5 * hn::Lanes(df);
-    auto in = hwy::AllocateAligned<uint8_t>(count);
-    auto expected = hwy::AllocateAligned<uint8_t>(count);
-
-    hwy::RandomState rng;
-    for (size_t i = 0; i < count; ++i) {
-      expected[i] = Random32(&rng) & 7;
-      in[i] = static_cast<uint8_t>(1u << expected[i]);
-    }
-    auto out = hwy::AllocateAligned<uint8_t>(count);
-    CallFloorLog2(in.get(), count, out.get());
-    int sum = 0;
-    for (size_t i = 0; i < count; ++i) {
-      HWY_ASSERT_EQ(expected[i], out[i]);
-      sum += out[i];
-    }
-    hwy::PreventElision(sum);
-  }
-};
-
-HWY_NOINLINE void TestAllFloorLog2() {
-  hn::ForPartialVectors<TestFloorLog2>()(float());
-}
-
-// Calls function defined in skeleton-inl.h.
-struct TestSumMulAdd {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    hwy::RandomState rng;
-    const size_t count = 4096;
-    EXPECT_EQ(0, count % hn::Lanes(d));
-    auto mul = hwy::AllocateAligned<T>(count);
-    auto x = hwy::AllocateAligned<T>(count);
-    auto add = hwy::AllocateAligned<T>(count);
-    for (size_t i = 0; i < count; ++i) {
-      mul[i] = static_cast<T>(Random32(&rng) & 0xF);
-      x[i] = static_cast<T>(Random32(&rng) & 0xFF);
-      add[i] = static_cast<T>(Random32(&rng) & 0xFF);
-    }
-    double expected_sum = 0.0;
-    for (size_t i = 0; i < count; ++i) {
-      expected_sum += mul[i] * x[i] + add[i];
-    }
-
-    MulAddLoop(d, mul.get(), add.get(), count, x.get());
-    HWY_ASSERT_EQ(4344240.0, expected_sum);
-  }
-};
-
-HWY_NOINLINE void TestAllSumMulAdd() {
-  hn::ForFloatTypes(hn::ForPartialVectors<TestSumMulAdd>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace skeleton
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace skeleton {
-HWY_BEFORE_TEST(SkeletonTest);
-HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllFloorLog2);
-HWY_EXPORT_AND_TEST_P(SkeletonTest, TestAllSumMulAdd);
-}  // namespace skeleton
-
-#endif
diff --git a/third_party/highway/hwy/foreach_target.h b/third_party/highway/hwy/foreach_target.h
deleted file mode 100644 (file)
index 3929905..0000000
+++ /dev/null
@@ -1,261 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_FOREACH_TARGET_H_
-#define HIGHWAY_HWY_FOREACH_TARGET_H_
-
-// Re-includes the translation unit zero or more times to compile for any
-// targets except HWY_STATIC_TARGET. Defines unique HWY_TARGET each time so that
-// highway.h defines the corresponding macro/namespace.
-
-#include "hwy/detect_targets.h"
-
-// *_inl.h may include other headers, which requires include guards to prevent
-// repeated inclusion. The guards must be reset after compiling each target, so
-// the header is again visible. This is done by flipping HWY_TARGET_TOGGLE,
-// defining it if undefined and vice versa. This macro is initially undefined
-// so that IDEs don't gray out the contents of each header.
-#ifdef HWY_TARGET_TOGGLE
-#error "This macro must not be defined outside foreach_target.h"
-#endif
-
-#ifdef HWY_HIGHWAY_INCLUDED  // highway.h include guard
-// Trigger fixup at the bottom of this header.
-#define HWY_ALREADY_INCLUDED
-
-// The next highway.h must re-include set_macros-inl.h because the first
-// highway.h chose the static target instead of what we will set below.
-#undef HWY_SET_MACROS_PER_TARGET
-#endif
-
-// Disable HWY_EXPORT in user code until we have generated all targets. Note
-// that a subsequent highway.h will not override this definition.
-#undef HWY_ONCE
-#define HWY_ONCE (0 || HWY_IDE)
-
-// Avoid warnings on #include HWY_TARGET_INCLUDE by hiding them from the IDE;
-// also skip if only 1 target defined (no re-inclusion will be necessary).
-#if !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
-
-#if !defined(HWY_TARGET_INCLUDE)
-#error ">1 target enabled => define HWY_TARGET_INCLUDE before foreach_target.h"
-#endif
-
-#if (HWY_TARGETS & HWY_EMU128) && (HWY_STATIC_TARGET != HWY_EMU128)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_EMU128
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_SCALAR) && (HWY_STATIC_TARGET != HWY_SCALAR)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_SCALAR
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_NEON) && (HWY_STATIC_TARGET != HWY_NEON)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_NEON
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_RVV) && (HWY_STATIC_TARGET != HWY_RVV)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_RVV
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_SVE
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_SVE2
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_SVE_256) && (HWY_STATIC_TARGET != HWY_SVE_256)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_SVE_256
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_SVE2_128) && (HWY_STATIC_TARGET != HWY_SVE2_128)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_SVE2_128
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_SSSE3
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_SSE4) && (HWY_STATIC_TARGET != HWY_SSE4)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_SSE4
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_AVX2) && (HWY_STATIC_TARGET != HWY_AVX2)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_AVX2
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_AVX3) && (HWY_STATIC_TARGET != HWY_AVX3)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_AVX3
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_AVX3_DL) && (HWY_STATIC_TARGET != HWY_AVX3_DL)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_AVX3_DL
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_WASM_EMU256) && (HWY_STATIC_TARGET != HWY_WASM_EMU256)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_WASM_EMU256
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_WASM) && (HWY_STATIC_TARGET != HWY_WASM)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_WASM
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#if (HWY_TARGETS & HWY_PPC8) && (HWY_STATIC_TARGET != HWY_PPC8)
-#undef HWY_TARGET
-#define HWY_TARGET HWY_PPC8
-#include HWY_TARGET_INCLUDE
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-#endif
-
-#endif  // !HWY_IDE && (HWY_TARGETS != HWY_STATIC_TARGET)
-
-// Now that all but the static target have been generated, re-enable HWY_EXPORT.
-#undef HWY_ONCE
-#define HWY_ONCE 1
-
-// If we re-include once per enabled target, the translation unit's
-// implementation would have to be skipped via #if to avoid redefining symbols.
-// We instead skip the re-include for HWY_STATIC_TARGET, and generate its
-// implementation when resuming compilation of the translation unit.
-#undef HWY_TARGET
-#define HWY_TARGET HWY_STATIC_TARGET
-
-#ifdef HWY_ALREADY_INCLUDED
-// Revert the previous toggle to prevent redefinitions for the static target.
-#ifdef HWY_TARGET_TOGGLE
-#undef HWY_TARGET_TOGGLE
-#else
-#define HWY_TARGET_TOGGLE
-#endif
-
-// Force re-inclusion of set_macros-inl.h now that HWY_TARGET is restored.
-#ifdef HWY_SET_MACROS_PER_TARGET
-#undef HWY_SET_MACROS_PER_TARGET
-#else
-#define HWY_SET_MACROS_PER_TARGET
-#endif
-#endif
-
-#endif  // HIGHWAY_HWY_FOREACH_TARGET_H_
diff --git a/third_party/highway/hwy/highway.h b/third_party/highway/hwy/highway.h
deleted file mode 100644 (file)
index 615af4e..0000000
+++ /dev/null
@@ -1,378 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// This include guard is checked by foreach_target, so avoid the usual _H_
-// suffix to prevent copybara from renaming it. NOTE: ops/*-inl.h are included
-// after/outside this include guard.
-#ifndef HWY_HIGHWAY_INCLUDED
-#define HWY_HIGHWAY_INCLUDED
-
-// Main header required before using vector types.
-
-#include "hwy/base.h"
-#include "hwy/targets.h"
-
-namespace hwy {
-
-// API version (https://semver.org/); keep in sync with CMakeLists.txt.
-#define HWY_MAJOR 1
-#define HWY_MINOR 0
-#define HWY_PATCH 1
-
-//------------------------------------------------------------------------------
-// Shorthand for tags (defined in shared-inl.h) used to select overloads.
-// Note that ScalableTag<T> is preferred over HWY_FULL, and CappedTag<T, N> over
-// HWY_CAPPED(T, N).
-
-// HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
-// registers in the group, and is ignored on targets that do not support groups.
-#define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T>
-#define HWY_FULL2(T, LMUL) \
-  hwy::HWY_NAMESPACE::ScalableTag<T, hwy::CeilLog2(HWY_MAX(0, LMUL))>
-#define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3
-// Workaround for MSVC grouping __VA_ARGS__ into a single argument
-#define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren
-// Trailing comma avoids -pedantic false alarm
-#define HWY_CHOOSE_FULL(...) \
-  HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, ))
-#define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__)
-
-// Vector of up to MAX_N lanes. It's better to use full vectors where possible.
-#define HWY_CAPPED(T, MAX_N) hwy::HWY_NAMESPACE::CappedTag<T, MAX_N>
-
-//------------------------------------------------------------------------------
-// Export user functions for static/dynamic dispatch
-
-// Evaluates to 0 inside a translation unit if it is generating anything but the
-// static target (the last one if multiple targets are enabled). Used to prevent
-// redefinitions of HWY_EXPORT. Unless foreach_target.h is included, we only
-// compile once anyway, so this is 1 unless it is or has been included.
-#ifndef HWY_ONCE
-#define HWY_ONCE 1
-#endif
-
-// HWY_STATIC_DISPATCH(FUNC_NAME) is the namespace-qualified FUNC_NAME for
-// HWY_STATIC_TARGET (the only defined namespace unless HWY_TARGET_INCLUDE is
-// defined), and can be used to deduce the return type of Choose*.
-#if HWY_STATIC_TARGET == HWY_SCALAR
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SCALAR::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_EMU128
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_EMU128::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_RVV
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_RVV::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_WASM_EMU256
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM_EMU256::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_WASM
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_WASM::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_NEON
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_NEON::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_SVE
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_SVE2
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_SVE_256
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE_256::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_SVE2_128
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SVE2_128::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_PPC8
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_PPC8::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_SSSE3
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSSE3::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_SSE4
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_SSE4::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_AVX2
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX2::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_AVX3
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3::FUNC_NAME
-#elif HWY_STATIC_TARGET == HWY_AVX3_DL
-#define HWY_STATIC_DISPATCH(FUNC_NAME) N_AVX3_DL::FUNC_NAME
-#endif
-
-// HWY_CHOOSE_*(FUNC_NAME) expands to the function pointer for that target or
-// nullptr is that target was not compiled.
-#if HWY_TARGETS & HWY_EMU128
-#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_EMU128::FUNC_NAME
-#elif HWY_TARGETS & HWY_SCALAR
-#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &N_SCALAR::FUNC_NAME
-#else
-// When HWY_SCALAR/HWY_EMU128 are not present and other targets were disabled at
-// runtime, fall back to the baseline with HWY_STATIC_DISPATCH().
-#define HWY_CHOOSE_FALLBACK(FUNC_NAME) &HWY_STATIC_DISPATCH(FUNC_NAME)
-#endif
-
-#if HWY_TARGETS & HWY_WASM_EMU256
-#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) &N_WASM_EMU256::FUNC_NAME
-#else
-#define HWY_CHOOSE_WASM_EMU256(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_WASM
-#define HWY_CHOOSE_WASM(FUNC_NAME) &N_WASM::FUNC_NAME
-#else
-#define HWY_CHOOSE_WASM(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_RVV
-#define HWY_CHOOSE_RVV(FUNC_NAME) &N_RVV::FUNC_NAME
-#else
-#define HWY_CHOOSE_RVV(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_NEON
-#define HWY_CHOOSE_NEON(FUNC_NAME) &N_NEON::FUNC_NAME
-#else
-#define HWY_CHOOSE_NEON(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_SVE
-#define HWY_CHOOSE_SVE(FUNC_NAME) &N_SVE::FUNC_NAME
-#else
-#define HWY_CHOOSE_SVE(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_SVE2
-#define HWY_CHOOSE_SVE2(FUNC_NAME) &N_SVE2::FUNC_NAME
-#else
-#define HWY_CHOOSE_SVE2(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_SVE_256
-#define HWY_CHOOSE_SVE_256(FUNC_NAME) &N_SVE_256::FUNC_NAME
-#else
-#define HWY_CHOOSE_SVE_256(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_SVE2_128
-#define HWY_CHOOSE_SVE2_128(FUNC_NAME) &N_SVE2_128::FUNC_NAME
-#else
-#define HWY_CHOOSE_SVE2_128(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_PPC8
-#define HWY_CHOOSE_PCC8(FUNC_NAME) &N_PPC8::FUNC_NAME
-#else
-#define HWY_CHOOSE_PPC8(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_SSSE3
-#define HWY_CHOOSE_SSSE3(FUNC_NAME) &N_SSSE3::FUNC_NAME
-#else
-#define HWY_CHOOSE_SSSE3(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_SSE4
-#define HWY_CHOOSE_SSE4(FUNC_NAME) &N_SSE4::FUNC_NAME
-#else
-#define HWY_CHOOSE_SSE4(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_AVX2
-#define HWY_CHOOSE_AVX2(FUNC_NAME) &N_AVX2::FUNC_NAME
-#else
-#define HWY_CHOOSE_AVX2(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_AVX3
-#define HWY_CHOOSE_AVX3(FUNC_NAME) &N_AVX3::FUNC_NAME
-#else
-#define HWY_CHOOSE_AVX3(FUNC_NAME) nullptr
-#endif
-
-#if HWY_TARGETS & HWY_AVX3_DL
-#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) &N_AVX3_DL::FUNC_NAME
-#else
-#define HWY_CHOOSE_AVX3_DL(FUNC_NAME) nullptr
-#endif
-
-// MSVC 2017 workaround: the non-type template parameter to ChooseAndCall
-// apparently cannot be an array. Use a function pointer instead, which has the
-// disadvantage that we call the static (not best) target on the first call to
-// any HWY_DYNAMIC_DISPATCH.
-#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1915
-#define HWY_DISPATCH_WORKAROUND 1
-#else
-#define HWY_DISPATCH_WORKAROUND 0
-#endif
-
-// Provides a static member function which is what is called during the first
-// HWY_DYNAMIC_DISPATCH, where GetIndex is still zero, and instantiations of
-// this function are the first entry in the tables created by HWY_EXPORT.
-template <typename RetType, typename... Args>
-struct FunctionCache {
- public:
-  typedef RetType(FunctionType)(Args...);
-
-#if HWY_DISPATCH_WORKAROUND
-  template <FunctionType* const func>
-  static RetType ChooseAndCall(Args... args) {
-    ChosenTarget& chosen_target = GetChosenTarget();
-    chosen_target.Update(SupportedTargets());
-    return (*func)(args...);
-  }
-#else
-  // A template function that when instantiated has the same signature as the
-  // function being called. This function initializes the bit array of targets
-  // supported by the current CPU and then calls the appropriate entry within
-  // the HWY_EXPORT table. Subsequent calls via HWY_DYNAMIC_DISPATCH to any
-  // exported functions, even those defined by different translation units,
-  // will dispatch directly to the best available target.
-  template <FunctionType* const table[]>
-  static RetType ChooseAndCall(Args... args) {
-    ChosenTarget& chosen_target = GetChosenTarget();
-    chosen_target.Update(SupportedTargets());
-    return (table[chosen_target.GetIndex()])(args...);
-  }
-#endif  // HWY_DISPATCH_WORKAROUND
-};
-
-// Used to deduce the template parameters RetType and Args from a function.
-template <typename RetType, typename... Args>
-FunctionCache<RetType, Args...> DeduceFunctionCache(RetType (*)(Args...)) {
-  return FunctionCache<RetType, Args...>();
-}
-
-#define HWY_DISPATCH_TABLE(FUNC_NAME) \
-  HWY_CONCAT(FUNC_NAME, HighwayDispatchTable)
-
-// HWY_EXPORT(FUNC_NAME); expands to a static array that is used by
-// HWY_DYNAMIC_DISPATCH() to call the appropriate function at runtime. This
-// static array must be defined at the same namespace level as the function
-// it is exporting.
-// After being exported, it can be called from other parts of the same source
-// file using HWY_DYNAMIC_DISPATCH(), in particular from a function wrapper
-// like in the following example:
-//
-//   #include "hwy/highway.h"
-//   HWY_BEFORE_NAMESPACE();
-//   namespace skeleton {
-//   namespace HWY_NAMESPACE {
-//
-//   void MyFunction(int a, char b, const char* c) { ... }
-//
-//   // NOLINTNEXTLINE(google-readability-namespace-comments)
-//   }  // namespace HWY_NAMESPACE
-//   }  // namespace skeleton
-//   HWY_AFTER_NAMESPACE();
-//
-//   namespace skeleton {
-//   HWY_EXPORT(MyFunction);  // Defines the dispatch table in this scope.
-//
-//   void MyFunction(int a, char b, const char* c) {
-//     return HWY_DYNAMIC_DISPATCH(MyFunction)(a, b, c);
-//   }
-//   }  // namespace skeleton
-//
-
-#if HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
-
-// Simplified version for IDE or the dynamic dispatch case with only one target.
-// This case still uses a table, although of a single element, to provide the
-// same compile error conditions as with the dynamic dispatch case when multiple
-// targets are being compiled.
-#define HWY_EXPORT(FUNC_NAME)                                             \
-  HWY_MAYBE_UNUSED static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const \
-  HWY_DISPATCH_TABLE(FUNC_NAME)[1] = {&HWY_STATIC_DISPATCH(FUNC_NAME)}
-#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) HWY_STATIC_DISPATCH(FUNC_NAME)
-
-#else
-
-// Simplified version for MSVC 2017: function pointer instead of table.
-#if HWY_DISPATCH_WORKAROUND
-
-#define HWY_EXPORT(FUNC_NAME)                                                \
-  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
-      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
-      /* The first entry in the table initializes the global cache and       \
-       * calls the function from HWY_STATIC_TARGET. */                       \
-      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
-          FUNC_NAME)))::ChooseAndCall<&HWY_STATIC_DISPATCH(FUNC_NAME)>,      \
-      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
-      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
-  }
-
-#else
-
-// Dynamic dispatch case with one entry per dynamic target plus the fallback
-// target and the initialization wrapper.
-#define HWY_EXPORT(FUNC_NAME)                                                \
-  static decltype(&HWY_STATIC_DISPATCH(FUNC_NAME)) const HWY_DISPATCH_TABLE( \
-      FUNC_NAME)[HWY_MAX_DYNAMIC_TARGETS + 2] = {                            \
-      /* The first entry in the table initializes the global cache and       \
-       * calls the appropriate function. */                                  \
-      &decltype(hwy::DeduceFunctionCache(&HWY_STATIC_DISPATCH(               \
-          FUNC_NAME)))::ChooseAndCall<HWY_DISPATCH_TABLE(FUNC_NAME)>,        \
-      HWY_CHOOSE_TARGET_LIST(FUNC_NAME),                                     \
-      HWY_CHOOSE_FALLBACK(FUNC_NAME),                                        \
-  }
-
-#endif  // HWY_DISPATCH_WORKAROUND
-
-#define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \
-  (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()]))
-
-#endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
-
-// DEPRECATED names; please use HWY_HAVE_* instead.
-#define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64
-#define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16
-#define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64
-
-}  // namespace hwy
-
-#endif  // HWY_HIGHWAY_INCLUDED
-
-//------------------------------------------------------------------------------
-
-// NOTE: the following definitions and ops/*.h depend on HWY_TARGET, so we want
-// to include them once per target, which is ensured by the toggle check.
-// Because ops/*.h are included under it, they do not need their own guard.
-#if defined(HWY_HIGHWAY_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
-#ifdef HWY_HIGHWAY_PER_TARGET
-#undef HWY_HIGHWAY_PER_TARGET
-#else
-#define HWY_HIGHWAY_PER_TARGET
-#endif
-
-// These define ops inside namespace hwy::HWY_NAMESPACE.
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-#include "hwy/ops/x86_128-inl.h"
-#elif HWY_TARGET == HWY_AVX2
-#include "hwy/ops/x86_256-inl.h"
-#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
-#include "hwy/ops/x86_512-inl.h"
-#elif HWY_TARGET == HWY_PPC8
-#error "PPC is not yet supported"
-#elif HWY_TARGET == HWY_NEON
-#include "hwy/ops/arm_neon-inl.h"
-#elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || \
-    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-#include "hwy/ops/arm_sve-inl.h"
-#elif HWY_TARGET == HWY_WASM_EMU256
-#include "hwy/ops/wasm_256-inl.h"
-#elif HWY_TARGET == HWY_WASM
-#include "hwy/ops/wasm_128-inl.h"
-#elif HWY_TARGET == HWY_RVV
-#include "hwy/ops/rvv-inl.h"
-#elif HWY_TARGET == HWY_EMU128
-#include "hwy/ops/emu128-inl.h"
-#elif HWY_TARGET == HWY_SCALAR
-#include "hwy/ops/scalar-inl.h"
-#else
-#pragma message("HWY_TARGET does not match any known target")
-#endif  // HWY_TARGET
-
-#include "hwy/ops/generic_ops-inl.h"
-
-#endif  // HWY_HIGHWAY_PER_TARGET
diff --git a/third_party/highway/hwy/highway_export.h b/third_party/highway/hwy/highway_export.h
deleted file mode 100644 (file)
index 30edc17..0000000
+++ /dev/null
@@ -1,74 +0,0 @@
-// Pseudo-generated file to handle both cmake & bazel build system.
-
-// Initial generation done using cmake code:
-// include(GenerateExportHeader)
-// generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
-// hwy/highway_export.h)
-// code reformatted using clang-format --style=Google
-
-#ifndef HWY_DLLEXPORT_H
-#define HWY_DLLEXPORT_H
-
-#if !defined(HWY_SHARED_DEFINE)
-#define HWY_DLLEXPORT
-#define HWY_CONTRIB_DLLEXPORT
-#define HWY_TEST_DLLEXPORT
-#else  // !HWY_SHARED_DEFINE
-
-#ifndef HWY_DLLEXPORT
-#if defined(hwy_EXPORTS)
-/* We are building this library */
-#ifdef _WIN32
-#define HWY_DLLEXPORT __declspec(dllexport)
-#else
-#define HWY_DLLEXPORT __attribute__((visibility("default")))
-#endif
-#else  // defined(hwy_EXPORTS)
-/* We are using this library */
-#ifdef _WIN32
-#define HWY_DLLEXPORT __declspec(dllimport)
-#else
-#define HWY_DLLEXPORT __attribute__((visibility("default")))
-#endif
-#endif  // defined(hwy_EXPORTS)
-#endif  // HWY_DLLEXPORT
-
-#ifndef HWY_CONTRIB_DLLEXPORT
-#if defined(hwy_contrib_EXPORTS)
-/* We are building this library */
-#ifdef _WIN32
-#define HWY_CONTRIB_DLLEXPORT __declspec(dllexport)
-#else
-#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
-#endif
-#else  // defined(hwy_contrib_EXPORTS)
-/* We are using this library */
-#ifdef _WIN32
-#define HWY_CONTRIB_DLLEXPORT __declspec(dllimport)
-#else
-#define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default")))
-#endif
-#endif  // defined(hwy_contrib_EXPORTS)
-#endif  // HWY_CONTRIB_DLLEXPORT
-
-#ifndef HWY_TEST_DLLEXPORT
-#if defined(hwy_test_EXPORTS)
-/* We are building this library */
-#ifdef _WIN32
-#define HWY_TEST_DLLEXPORT __declspec(dllexport)
-#else
-#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
-#endif
-#else  // defined(hwy_test_EXPORTS)
-/* We are using this library */
-#ifdef _WIN32
-#define HWY_TEST_DLLEXPORT __declspec(dllimport)
-#else
-#define HWY_TEST_DLLEXPORT __attribute__((visibility("default")))
-#endif
-#endif  // defined(hwy_test_EXPORTS)
-#endif  // HWY_TEST_DLLEXPORT
-
-#endif  // !HWY_SHARED_DEFINE
-
-#endif /* HWY_DLLEXPORT_H */
diff --git a/third_party/highway/hwy/highway_test.cc b/third_party/highway/hwy/highway_test.cc
deleted file mode 100644 (file)
index 4838e72..0000000
+++ /dev/null
@@ -1,485 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <bitset>
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "highway_test.cc"
-#include "hwy/foreach_target.h"    // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/nanobenchmark.h"  // Unpredictable1
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <size_t kLimit, typename T>
-HWY_NOINLINE void TestCappedLimit(T /* tag */) {
-  CappedTag<T, kLimit> d;
-  // Ensure two ops compile
-  HWY_ASSERT_VEC_EQ(d, Zero(d), Set(d, T{0}));
-
-  // Ensure we do not write more than kLimit lanes
-  const size_t N = Lanes(d);
-  if (kLimit < N) {
-    auto lanes = AllocateAligned<T>(N);
-    std::fill(lanes.get(), lanes.get() + N, T{0});
-    Store(Set(d, T{1}), d, lanes.get());
-    for (size_t i = kLimit; i < N; ++i) {
-      HWY_ASSERT_EQ(lanes[i], T{0});
-    }
-  }
-}
-
-// Adapter for ForAllTypes - we are constructing our own Simd<> and thus do not
-// use ForPartialVectors etc.
-struct TestCapped {
-  template <typename T>
-  void operator()(T t) const {
-    TestCappedLimit<1>(t);
-    TestCappedLimit<3>(t);
-    TestCappedLimit<5>(t);
-    TestCappedLimit<1ull << 15>(t);
-  }
-};
-
-HWY_NOINLINE void TestAllCapped() { ForAllTypes(TestCapped()); }
-
-// For testing that ForPartialVectors reaches every possible size:
-using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>;
-
-// Monostate pattern because ForPartialVectors takes a template argument, not a
-// functor by reference.
-static NumLanesSet* NumLanesForSize(size_t sizeof_t) {
-  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
-  static NumLanesSet num_lanes[sizeof(uint64_t) + 1];
-  return num_lanes + sizeof_t;
-}
-static size_t* MaxLanesForSize(size_t sizeof_t) {
-  HWY_ASSERT(sizeof_t <= sizeof(uint64_t));
-  static size_t num_lanes[sizeof(uint64_t) + 1] = {0};
-  return num_lanes + sizeof_t;
-}
-
-struct TestMaxLanes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const size_t kMax = MaxLanes(d);  // for RVV, includes LMUL
-    HWY_ASSERT(N <= kMax);
-    HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T)));
-
-    NumLanesForSize(sizeof(T))->set(N);
-    *MaxLanesForSize(sizeof(T)) = HWY_MAX(*MaxLanesForSize(sizeof(T)), N);
-  }
-};
-
-HWY_NOINLINE void TestAllMaxLanes() {
-  ForAllTypes(ForPartialVectors<TestMaxLanes>());
-
-  // Ensure ForPartialVectors visited all powers of two [1, N].
-  for (size_t sizeof_t : {sizeof(uint8_t), sizeof(uint16_t), sizeof(uint32_t),
-                          sizeof(uint64_t)}) {
-    const size_t N = *MaxLanesForSize(sizeof_t);
-    for (size_t i = 1; i <= N; i += i) {
-      if (!NumLanesForSize(sizeof_t)->test(i)) {
-        fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n",
-                static_cast<int>(sizeof_t), static_cast<int>(i),
-                static_cast<int>(N));
-        HWY_ASSERT(false);
-      }
-    }
-  }
-}
-
-struct TestSet {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // Zero
-    const auto v0 = Zero(d);
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    std::fill(expected.get(), expected.get() + N, T(0));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), v0);
-
-    // Set
-    const auto v2 = Set(d, T(2));
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = 2;
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), v2);
-
-    // Iota
-    const auto vi = Iota(d, T(5));
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(5 + i);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), vi);
-
-    // Undefined
-    const auto vu = Undefined(d);
-    Store(vu, d, expected.get());
-  }
-};
-
-HWY_NOINLINE void TestAllSet() { ForAllTypes(ForPartialVectors<TestSet>()); }
-
-// Ensures wraparound (mod 2^bits)
-struct TestOverflow {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v1 = Set(d, T(1));
-    const auto vmax = Set(d, LimitsMax<T>());
-    const auto vmin = Set(d, LimitsMin<T>());
-    // Unsigned underflow / negative -> positive
-    HWY_ASSERT_VEC_EQ(d, vmax, Sub(vmin, v1));
-    // Unsigned overflow / positive -> negative
-    HWY_ASSERT_VEC_EQ(d, vmin, Add(vmax, v1));
-  }
-};
-
-HWY_NOINLINE void TestAllOverflow() {
-  ForIntegerTypes(ForPartialVectors<TestOverflow>());
-}
-
-struct TestClamp {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto v1 = Set(d, 1);
-    const auto v2 = Set(d, 2);
-
-    HWY_ASSERT_VEC_EQ(d, v1, Clamp(v2, v0, v1));
-    HWY_ASSERT_VEC_EQ(d, v1, Clamp(v0, v1, v2));
-  }
-};
-
-HWY_NOINLINE void TestAllClamp() {
-  ForAllTypes(ForPartialVectors<TestClamp>());
-}
-
-struct TestSignBitInteger {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto all = VecFromMask(d, Eq(v0, v0));
-    const auto vs = SignBit(d);
-    const auto other = Sub(vs, Set(d, 1));
-
-    // Shifting left by one => overflow, equal zero
-    HWY_ASSERT_VEC_EQ(d, v0, Add(vs, vs));
-    // Verify the lower bits are zero (only +/- and logical ops are available
-    // for all types)
-    HWY_ASSERT_VEC_EQ(d, all, Add(vs, other));
-  }
-};
-
-struct TestSignBitFloat {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vs = SignBit(d);
-    const auto vp = Set(d, 2.25);
-    const auto vn = Set(d, -2.25);
-    HWY_ASSERT_VEC_EQ(d, Or(vp, vs), vn);
-    HWY_ASSERT_VEC_EQ(d, AndNot(vs, vn), vp);
-    HWY_ASSERT_VEC_EQ(d, v0, vs);
-  }
-};
-
-HWY_NOINLINE void TestAllSignBit() {
-  ForIntegerTypes(ForPartialVectors<TestSignBitInteger>());
-  ForFloatTypes(ForPartialVectors<TestSignBitFloat>());
-}
-
-// inline to work around incorrect SVE codegen (only first 128 bits used).
-template <class D, class V>
-HWY_INLINE void AssertNaN(D d, VecArg<V> v, const char* file, int line) {
-  using T = TFromD<D>;
-  const size_t N = Lanes(d);
-  if (!AllTrue(d, IsNaN(v))) {
-    Print(d, "not all NaN", v, 0, N);
-    Print(d, "mask", VecFromMask(d, IsNaN(v)), 0, N);
-    const std::string type_name = TypeName(T(), N);
-    // RVV lacks PRIu64 and MSYS still has problems with %zu, so print bytes to
-    // avoid truncating doubles.
-    uint8_t bytes[HWY_MAX(sizeof(T), 8)] = {0};
-    const T lane = GetLane(v);
-    CopyBytes<sizeof(T)>(&lane, bytes);
-    Abort(file, line,
-          "Expected %s NaN, got %E (bytes %02x %02x %02x %02x %02x %02x %02x "
-          "%02x)",
-          type_name.c_str(), lane, bytes[0], bytes[1], bytes[2], bytes[3],
-          bytes[4], bytes[5], bytes[6], bytes[7]);
-  }
-}
-
-#define HWY_ASSERT_NAN(d, v) AssertNaN(d, v, __FILE__, __LINE__)
-
-struct TestNaN {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v1 = Set(d, T(Unpredictable1()));
-    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
-    HWY_ASSERT_NAN(d, nan);
-
-    // Arithmetic
-    HWY_ASSERT_NAN(d, Add(nan, v1));
-    HWY_ASSERT_NAN(d, Add(v1, nan));
-    HWY_ASSERT_NAN(d, Sub(nan, v1));
-    HWY_ASSERT_NAN(d, Sub(v1, nan));
-    HWY_ASSERT_NAN(d, Mul(nan, v1));
-    HWY_ASSERT_NAN(d, Mul(v1, nan));
-    HWY_ASSERT_NAN(d, Div(nan, v1));
-    HWY_ASSERT_NAN(d, Div(v1, nan));
-
-    // FMA
-    HWY_ASSERT_NAN(d, MulAdd(nan, v1, v1));
-    HWY_ASSERT_NAN(d, MulAdd(v1, nan, v1));
-    HWY_ASSERT_NAN(d, MulAdd(v1, v1, nan));
-    HWY_ASSERT_NAN(d, MulSub(nan, v1, v1));
-    HWY_ASSERT_NAN(d, MulSub(v1, nan, v1));
-    HWY_ASSERT_NAN(d, MulSub(v1, v1, nan));
-    HWY_ASSERT_NAN(d, NegMulAdd(nan, v1, v1));
-    HWY_ASSERT_NAN(d, NegMulAdd(v1, nan, v1));
-    HWY_ASSERT_NAN(d, NegMulAdd(v1, v1, nan));
-    HWY_ASSERT_NAN(d, NegMulSub(nan, v1, v1));
-    HWY_ASSERT_NAN(d, NegMulSub(v1, nan, v1));
-    HWY_ASSERT_NAN(d, NegMulSub(v1, v1, nan));
-
-    // Rcp/Sqrt
-    HWY_ASSERT_NAN(d, Sqrt(nan));
-
-    // Sign manipulation
-    HWY_ASSERT_NAN(d, Abs(nan));
-    HWY_ASSERT_NAN(d, Neg(nan));
-    HWY_ASSERT_NAN(d, CopySign(nan, v1));
-    HWY_ASSERT_NAN(d, CopySignToAbs(nan, v1));
-
-    // Rounding
-    HWY_ASSERT_NAN(d, Ceil(nan));
-    HWY_ASSERT_NAN(d, Floor(nan));
-    HWY_ASSERT_NAN(d, Round(nan));
-    HWY_ASSERT_NAN(d, Trunc(nan));
-
-    // Logical (And/AndNot/Xor will clear NaN!)
-    HWY_ASSERT_NAN(d, Or(nan, v1));
-
-    // Comparison
-    HWY_ASSERT(AllFalse(d, Eq(nan, v1)));
-    HWY_ASSERT(AllFalse(d, Gt(nan, v1)));
-    HWY_ASSERT(AllFalse(d, Lt(nan, v1)));
-    HWY_ASSERT(AllFalse(d, Ge(nan, v1)));
-    HWY_ASSERT(AllFalse(d, Le(nan, v1)));
-
-    // Reduction
-    HWY_ASSERT_NAN(d, SumOfLanes(d, nan));
-// TODO(janwas): re-enable after QEMU/Spike are fixed
-#if HWY_TARGET != HWY_RVV
-    HWY_ASSERT_NAN(d, MinOfLanes(d, nan));
-    HWY_ASSERT_NAN(d, MaxOfLanes(d, nan));
-#endif
-
-    // Min
-#if HWY_ARCH_X86 && (HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_EMU128)
-    // x86 SIMD returns the second operand if any input is NaN.
-    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
-    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
-    HWY_ASSERT_NAN(d, Min(v1, nan));
-    HWY_ASSERT_NAN(d, Max(v1, nan));
-#elif HWY_ARCH_WASM
-    // Should return NaN if any input is NaN, but does not for scalar.
-    // TODO(janwas): remove once this is fixed.
-#elif HWY_TARGET == HWY_NEON && HWY_ARCH_ARM_V7
-    // ARMv7 NEON returns NaN if any input is NaN.
-    HWY_ASSERT_NAN(d, Min(v1, nan));
-    HWY_ASSERT_NAN(d, Max(v1, nan));
-    HWY_ASSERT_NAN(d, Min(nan, v1));
-    HWY_ASSERT_NAN(d, Max(nan, v1));
-#else
-    // IEEE 754-2019 minimumNumber is defined as the other argument if exactly
-    // one is NaN, and qNaN if both are.
-    HWY_ASSERT_VEC_EQ(d, v1, Min(nan, v1));
-    HWY_ASSERT_VEC_EQ(d, v1, Max(nan, v1));
-    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, nan));
-    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, nan));
-#endif
-    HWY_ASSERT_NAN(d, Min(nan, nan));
-    HWY_ASSERT_NAN(d, Max(nan, nan));
-  }
-};
-
-// For functions only available for float32
-struct TestF32NaN {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v1 = Set(d, T(Unpredictable1()));
-    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
-    HWY_ASSERT_NAN(d, ApproximateReciprocal(nan));
-    HWY_ASSERT_NAN(d, ApproximateReciprocalSqrt(nan));
-    HWY_ASSERT_NAN(d, AbsDiff(nan, v1));
-    HWY_ASSERT_NAN(d, AbsDiff(v1, nan));
-  }
-};
-
-HWY_NOINLINE void TestAllNaN() {
-  ForFloatTypes(ForPartialVectors<TestNaN>());
-  ForPartialVectors<TestF32NaN>()(float());
-}
-
-struct TestIsNaN {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v1 = Set(d, T(Unpredictable1()));
-    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
-    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
-    const auto neg = Set(d, T{-1});
-    HWY_ASSERT_NAN(d, nan);
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(inf));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(CopySign(inf, neg)));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(nan));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsNaN(CopySign(nan, neg)));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(v1));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Zero(d)));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::LowestValue<T>())));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsNaN(Set(d, hwy::HighestValue<T>())));
-  }
-};
-
-HWY_NOINLINE void TestAllIsNaN() {
-  ForFloatTypes(ForPartialVectors<TestIsNaN>());
-}
-
-struct TestIsInf {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v1 = Set(d, T(Unpredictable1()));
-    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
-    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
-    const auto neg = Set(d, T{-1});
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(inf));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsInf(CopySign(inf, neg)));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(nan));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(CopySign(nan, neg)));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(v1));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Zero(d)));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::LowestValue<T>())));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsInf(Set(d, hwy::HighestValue<T>())));
-  }
-};
-
-HWY_NOINLINE void TestAllIsInf() {
-  ForFloatTypes(ForPartialVectors<TestIsInf>());
-}
-
-struct TestIsFinite {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v1 = Set(d, T(Unpredictable1()));
-    const auto inf = IfThenElse(Eq(v1, Set(d, T(1))), Inf(d), v1);
-    const auto nan = IfThenElse(Eq(v1, Set(d, T(1))), NaN(d), v1);
-    const auto neg = Set(d, T{-1});
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(inf));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(inf, neg)));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(nan));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), IsFinite(CopySign(nan, neg)));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(v1));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Zero(d)));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), IsFinite(Set(d, hwy::LowestValue<T>())));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d),
-                       IsFinite(Set(d, hwy::HighestValue<T>())));
-  }
-};
-
-HWY_NOINLINE void TestAllIsFinite() {
-  ForFloatTypes(ForPartialVectors<TestIsFinite>());
-}
-
-struct TestCopyAndAssign {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // copy V
-    const auto v3 = Iota(d, 3);
-    auto v3b(v3);
-    HWY_ASSERT_VEC_EQ(d, v3, v3b);
-
-    // assign V
-    auto v3c = Undefined(d);
-    v3c = v3;
-    HWY_ASSERT_VEC_EQ(d, v3, v3c);
-  }
-};
-
-HWY_NOINLINE void TestAllCopyAndAssign() {
-  ForAllTypes(ForPartialVectors<TestCopyAndAssign>());
-}
-
-struct TestGetLane {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    HWY_ASSERT_EQ(T(0), GetLane(Zero(d)));
-    HWY_ASSERT_EQ(T(1), GetLane(Set(d, 1)));
-  }
-};
-
-HWY_NOINLINE void TestAllGetLane() {
-  ForAllTypes(ForPartialVectors<TestGetLane>());
-}
-
-struct TestDFromV {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    using D0 = DFromV<decltype(v0)>;         // not necessarily same as D
-    const auto v0b = And(v0, Set(D0(), 1));  // but vectors can interoperate
-    HWY_ASSERT_VEC_EQ(d, v0, v0b);
-  }
-};
-
-HWY_NOINLINE void TestAllDFromV() {
-  ForAllTypes(ForPartialVectors<TestDFromV>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HighwayTest);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCapped);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSignBit);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllNaN);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsNaN);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsInf);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllIsFinite);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllCopyAndAssign);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllGetLane);
-HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllDFromV);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/hwy.version b/third_party/highway/hwy/hwy.version
deleted file mode 100644 (file)
index 9ff6be6..0000000
+++ /dev/null
@@ -1,19 +0,0 @@
-HWY_0 {
-  global:
-    extern "C++" {
-      *hwy::*;
-    };
-
-  local:
-    # Hide all the std namespace symbols. std namespace is explicitly marked
-    # as visibility(default) and header-only functions or methods (such as those
-    # from templates) should be exposed in shared libraries as weak symbols but
-    # this is only needed when we expose those types in the shared library API
-    # in any way. We don't use C++ std types in the API and we also don't
-    # support exceptions in the library.
-    # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion
-    # about this.
-    extern "C++" {
-      *std::*;
-    };
-};
diff --git a/third_party/highway/hwy/nanobenchmark.cc b/third_party/highway/hwy/nanobenchmark.cc
deleted file mode 100644 (file)
index b5acf61..0000000
+++ /dev/null
@@ -1,762 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/nanobenchmark.h"
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>    // clock_gettime
-
-#include <algorithm>  // sort
-#include <array>
-#include <atomic>
-#include <chrono>  //NOLINT
-#include <limits>
-#include <numeric>  // iota
-#include <random>
-#include <string>
-#include <vector>
-
-#if defined(_WIN32) || defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif  // NOMINMAX
-#include <windows.h>
-#endif
-
-#if defined(__APPLE__)
-#include <mach/mach.h>
-#include <mach/mach_time.h>
-#endif
-
-#if defined(__HAIKU__)
-#include <OS.h>
-#endif
-
-#include "hwy/base.h"
-#if HWY_ARCH_PPC && defined(__GLIBC__)
-#include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
-#elif HWY_ARCH_X86
-
-#if HWY_COMPILER_MSVC
-#include <intrin.h>
-#else
-#include <cpuid.h>  // NOLINT
-#endif              // HWY_COMPILER_MSVC
-
-#endif  // HWY_ARCH_X86
-
-namespace hwy {
-namespace {
-namespace timer {
-
-// Ticks := platform-specific timer values (CPU cycles on x86). Must be
-// unsigned to guarantee wraparound on overflow.
-using Ticks = uint64_t;
-
-// Start/Stop return absolute timestamps and must be placed immediately before
-// and after the region to measure. We provide separate Start/Stop functions
-// because they use different fences.
-//
-// Background: RDTSC is not 'serializing'; earlier instructions may complete
-// after it, and/or later instructions may complete before it. 'Fences' ensure
-// regions' elapsed times are independent of such reordering. The only
-// documented unprivileged serializing instruction is CPUID, which acts as a
-// full fence (no reordering across it in either direction). Unfortunately
-// the latency of CPUID varies wildly (perhaps made worse by not initializing
-// its EAX input). Because it cannot reliably be deducted from the region's
-// elapsed time, it must not be included in the region to measure (i.e.
-// between the two RDTSC).
-//
-// The newer RDTSCP is sometimes described as serializing, but it actually
-// only serves as a half-fence with release semantics. Although all
-// instructions in the region will complete before the final timestamp is
-// captured, subsequent instructions may leak into the region and increase the
-// elapsed time. Inserting another fence after the final RDTSCP would prevent
-// such reordering without affecting the measured region.
-//
-// Fortunately, such a fence exists. The LFENCE instruction is only documented
-// to delay later loads until earlier loads are visible. However, Intel's
-// reference manual says it acts as a full fence (waiting until all earlier
-// instructions have completed, and delaying later instructions until it
-// completes). AMD assigns the same behavior to MFENCE.
-//
-// We need a fence before the initial RDTSC to prevent earlier instructions
-// from leaking into the region, and arguably another after RDTSC to avoid
-// region instructions from completing before the timestamp is recorded.
-// When surrounded by fences, the additional RDTSCP half-fence provides no
-// benefit, so the initial timestamp can be recorded via RDTSC, which has
-// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
-// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
-//
-// Using Start+Start leads to higher variance and overhead than Stop+Stop.
-// However, Stop+Stop includes an LFENCE in the region measurements, which
-// adds a delay dependent on earlier loads. The combination of Start+Stop
-// is faster than Start+Start and more consistent than Stop+Stop because
-// the first LFENCE already delayed subsequent loads before the measured
-// region. This combination seems not to have been considered in prior work:
-// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
-//
-// Note: performance counters can measure 'exact' instructions-retired or
-// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
-// requires fences. Unfortunately, it is not accessible on all OSes and we
-// prefer to avoid kernel-mode drivers. Performance counters are also affected
-// by several under/over-count errata, so we use the TSC instead.
-
-// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
-// divide by InvariantTicksPerSecond.
-inline Ticks Start() {
-  Ticks t;
-#if HWY_ARCH_PPC && defined(__GLIBC__)
-  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
-#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
-  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
-  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
-#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
-  _ReadWriteBarrier();
-  _mm_lfence();
-  _ReadWriteBarrier();
-  t = __rdtsc();
-  _ReadWriteBarrier();
-  _mm_lfence();
-  _ReadWriteBarrier();
-#elif HWY_ARCH_X86_64
-  asm volatile(
-      "lfence\n\t"
-      "rdtsc\n\t"
-      "shl $32, %%rdx\n\t"
-      "or %%rdx, %0\n\t"
-      "lfence"
-      : "=a"(t)
-      :
-      // "memory" avoids reordering. rdx = TSC >> 32.
-      // "cc" = flags modified by SHL.
-      : "rdx", "memory", "cc");
-#elif HWY_ARCH_RVV
-  asm volatile("rdcycle %0" : "=r"(t));
-#elif defined(_WIN32) || defined(_WIN64)
-  LARGE_INTEGER counter;
-  (void)QueryPerformanceCounter(&counter);
-  t = counter.QuadPart;
-#elif defined(__APPLE__)
-  t = mach_absolute_time();
-#elif defined(__HAIKU__)
-  t = system_time_nsecs();  // since boot
-#else  // POSIX
-  timespec ts;
-  clock_gettime(CLOCK_MONOTONIC, &ts);
-  t = static_cast<Ticks>(ts.tv_sec * 1000000000LL + ts.tv_nsec);
-#endif
-  return t;
-}
-
-// WARNING: on x86, caller must check HasRDTSCP before using this!
-inline Ticks Stop() {
-  uint64_t t;
-#if HWY_ARCH_PPC && defined(__GLIBC__)
-  asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
-#elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
-  // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
-  asm volatile("mrs %0, cntvct_el0" : "=r"(t));
-#elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
-  _ReadWriteBarrier();
-  unsigned aux;
-  t = __rdtscp(&aux);
-  _ReadWriteBarrier();
-  _mm_lfence();
-  _ReadWriteBarrier();
-#elif HWY_ARCH_X86_64
-  // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
-  asm volatile(
-      "rdtscp\n\t"
-      "shl $32, %%rdx\n\t"
-      "or %%rdx, %0\n\t"
-      "lfence"
-      : "=a"(t)
-      :
-      // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
-      // "cc" = flags modified by SHL.
-      : "rcx", "rdx", "memory", "cc");
-#else
-  t = Start();
-#endif
-  return t;
-}
-
-}  // namespace timer
-
-namespace robust_statistics {
-
-// Sorts integral values in ascending order (e.g. for Mode). About 3x faster
-// than std::sort for input distributions with very few unique values.
-template <class T>
-void CountingSort(T* values, size_t num_values) {
-  // Unique values and their frequency (similar to flat_map).
-  using Unique = std::pair<T, int>;
-  std::vector<Unique> unique;
-  for (size_t i = 0; i < num_values; ++i) {
-    const T value = values[i];
-    const auto pos =
-        std::find_if(unique.begin(), unique.end(),
-                     [value](const Unique u) { return u.first == value; });
-    if (pos == unique.end()) {
-      unique.push_back(std::make_pair(value, 1));
-    } else {
-      ++pos->second;
-    }
-  }
-
-  // Sort in ascending order of value (pair.first).
-  std::sort(unique.begin(), unique.end());
-
-  // Write that many copies of each unique value to the array.
-  T* HWY_RESTRICT p = values;
-  for (const auto& value_count : unique) {
-    std::fill(p, p + value_count.second, value_count.first);
-    p += value_count.second;
-  }
-  NANOBENCHMARK_CHECK(p == values + num_values);
-}
-
-// @return i in [idx_begin, idx_begin + half_count) that minimizes
-// sorted[i + half_count] - sorted[i].
-template <typename T>
-size_t MinRange(const T* const HWY_RESTRICT sorted, const size_t idx_begin,
-                const size_t half_count) {
-  T min_range = std::numeric_limits<T>::max();
-  size_t min_idx = 0;
-
-  for (size_t idx = idx_begin; idx < idx_begin + half_count; ++idx) {
-    NANOBENCHMARK_CHECK(sorted[idx] <= sorted[idx + half_count]);
-    const T range = sorted[idx + half_count] - sorted[idx];
-    if (range < min_range) {
-      min_range = range;
-      min_idx = idx;
-    }
-  }
-
-  return min_idx;
-}
-
-// Returns an estimate of the mode by calling MinRange on successively
-// halved intervals. "sorted" must be in ascending order. This is the
-// Half Sample Mode estimator proposed by Bickel in "On a fast, robust
-// estimator of the mode", with complexity O(N log N). The mode is less
-// affected by outliers in highly-skewed distributions than the median.
-// The averaging operation below assumes "T" is an unsigned integer type.
-template <typename T>
-T ModeOfSorted(const T* const HWY_RESTRICT sorted, const size_t num_values) {
-  size_t idx_begin = 0;
-  size_t half_count = num_values / 2;
-  while (half_count > 1) {
-    idx_begin = MinRange(sorted, idx_begin, half_count);
-    half_count >>= 1;
-  }
-
-  const T x = sorted[idx_begin + 0];
-  if (half_count == 0) {
-    return x;
-  }
-  NANOBENCHMARK_CHECK(half_count == 1);
-  const T average = (x + sorted[idx_begin + 1] + 1) / 2;
-  return average;
-}
-
-// Returns the mode. Side effect: sorts "values".
-template <typename T>
-T Mode(T* values, const size_t num_values) {
-  CountingSort(values, num_values);
-  return ModeOfSorted(values, num_values);
-}
-
-template <typename T, size_t N>
-T Mode(T (&values)[N]) {
-  return Mode(&values[0], N);
-}
-
-// Returns the median value. Side effect: sorts "values".
-template <typename T>
-T Median(T* values, const size_t num_values) {
-  NANOBENCHMARK_CHECK(!values->empty());
-  std::sort(values, values + num_values);
-  const size_t half = num_values / 2;
-  // Odd count: return middle
-  if (num_values % 2) {
-    return values[half];
-  }
-  // Even count: return average of middle two.
-  return (values[half] + values[half - 1] + 1) / 2;
-}
-
-// Returns a robust measure of variability.
-template <typename T>
-T MedianAbsoluteDeviation(const T* values, const size_t num_values,
-                          const T median) {
-  NANOBENCHMARK_CHECK(num_values != 0);
-  std::vector<T> abs_deviations;
-  abs_deviations.reserve(num_values);
-  for (size_t i = 0; i < num_values; ++i) {
-    const int64_t abs = std::abs(static_cast<int64_t>(values[i]) -
-                                 static_cast<int64_t>(median));
-    abs_deviations.push_back(static_cast<T>(abs));
-  }
-  return Median(abs_deviations.data(), num_values);
-}
-
-}  // namespace robust_statistics
-}  // namespace
-namespace platform {
-namespace {
-
-// Prevents the compiler from eliding the computations that led to "output".
-template <class T>
-inline void PreventElision(T&& output) {
-#if HWY_COMPILER_MSVC == 0
-  // Works by indicating to the compiler that "output" is being read and
-  // modified. The +r constraint avoids unnecessary writes to memory, but only
-  // works for built-in types (typically FuncOutput).
-  asm volatile("" : "+r"(output) : : "memory");
-#else
-  // MSVC does not support inline assembly anymore (and never supported GCC's
-  // RTL constraints). Self-assignment with #pragma optimize("off") might be
-  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
-  // with volatile pointers generates inefficient code on MSVC 2017.
-  static std::atomic<T> dummy(T{});
-  dummy.store(output, std::memory_order_relaxed);
-#endif
-}
-
-// Measures the actual current frequency of Ticks. We cannot rely on the nominal
-// frequency encoded in x86 BrandString because it is misleading on M1 Rosetta,
-// and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also
-// used on RISC-V and ARM64.
-HWY_MAYBE_UNUSED double MeasureNominalClockRate() {
-  double max_ticks_per_sec = 0.0;
-  // Arbitrary, enough to ignore 2 outliers without excessive init time.
-  for (int rep = 0; rep < 3; ++rep) {
-    auto time0 = std::chrono::steady_clock::now();
-    using Time = decltype(time0);
-    const timer::Ticks ticks0 = timer::Start();
-    const Time time_min = time0 + std::chrono::milliseconds(10);
-
-    Time time1;
-    timer::Ticks ticks1;
-    for (;;) {
-      time1 = std::chrono::steady_clock::now();
-      // Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
-      // another codepath, just use Start instead. now() presumably has its own
-      // fence-like behavior.
-      ticks1 = timer::Start();  // Do not use Stop, see comment above
-      if (time1 >= time_min) break;
-    }
-
-    const double dticks = static_cast<double>(ticks1 - ticks0);
-    std::chrono::duration<double, std::ratio<1>> dtime = time1 - time0;
-    const double ticks_per_sec = dticks / dtime.count();
-    max_ticks_per_sec = std::max(max_ticks_per_sec, ticks_per_sec);
-  }
-  return max_ticks_per_sec;
-}
-
-#if HWY_ARCH_X86
-
-void Cpuid(const uint32_t level, const uint32_t count,
-           uint32_t* HWY_RESTRICT abcd) {
-#if HWY_COMPILER_MSVC
-  int regs[4];
-  __cpuidex(regs, level, count);
-  for (int i = 0; i < 4; ++i) {
-    abcd[i] = regs[i];
-  }
-#else
-  uint32_t a;
-  uint32_t b;
-  uint32_t c;
-  uint32_t d;
-  __cpuid_count(level, count, a, b, c, d);
-  abcd[0] = a;
-  abcd[1] = b;
-  abcd[2] = c;
-  abcd[3] = d;
-#endif
-}
-
-bool HasRDTSCP() {
-  uint32_t abcd[4];
-  Cpuid(0x80000001U, 0, abcd);         // Extended feature flags
-  return (abcd[3] & (1u << 27)) != 0;  // RDTSCP
-}
-
-std::string BrandString() {
-  char brand_string[49];
-  std::array<uint32_t, 4> abcd;
-
-  // Check if brand string is supported (it is on all reasonable Intel/AMD)
-  Cpuid(0x80000000U, 0, abcd.data());
-  if (abcd[0] < 0x80000004U) {
-    return std::string();
-  }
-
-  for (size_t i = 0; i < 3; ++i) {
-    Cpuid(static_cast<uint32_t>(0x80000002U + i), 0, abcd.data());
-    CopyBytes<sizeof(abcd)>(&abcd[0], brand_string + i * 16);  // not same size
-  }
-  brand_string[48] = 0;
-  return brand_string;
-}
-
-#endif  // HWY_ARCH_X86
-
-}  // namespace
-
-HWY_DLLEXPORT double InvariantTicksPerSecond() {
-#if HWY_ARCH_PPC && defined(__GLIBC__)
-  return static_cast<double>(__ppc_get_timebase_freq());
-#elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
-  // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
-  static const double freq = MeasureNominalClockRate();
-  return freq;
-#elif defined(_WIN32) || defined(_WIN64)
-  LARGE_INTEGER freq;
-  (void)QueryPerformanceFrequency(&freq);
-  return static_cast<double>(freq.QuadPart);
-#elif defined(__APPLE__)
-  // https://developer.apple.com/library/mac/qa/qa1398/_index.html
-  mach_timebase_info_data_t timebase;
-  (void)mach_timebase_info(&timebase);
-  return static_cast<double>(timebase.denom) / timebase.numer * 1E9;
-#else
-  return 1E9;  // Haiku and clock_gettime return nanoseconds.
-#endif
-}
-
-HWY_DLLEXPORT double Now() {
-  static const double mul = 1.0 / InvariantTicksPerSecond();
-  return static_cast<double>(timer::Start()) * mul;
-}
-
-HWY_DLLEXPORT uint64_t TimerResolution() {
-#if HWY_ARCH_X86
-  bool can_use_stop = platform::HasRDTSCP();
-#else
-  constexpr bool can_use_stop = true;
-#endif
-
-  // Nested loop avoids exceeding stack/L1 capacity.
-  timer::Ticks repetitions[Params::kTimerSamples];
-  for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) {
-    timer::Ticks samples[Params::kTimerSamples];
-    if (can_use_stop) {
-      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
-        const timer::Ticks t0 = timer::Start();
-        const timer::Ticks t1 = timer::Stop();  // we checked HasRDTSCP above
-        samples[i] = t1 - t0;
-      }
-    } else {
-      for (size_t i = 0; i < Params::kTimerSamples; ++i) {
-        const timer::Ticks t0 = timer::Start();
-        const timer::Ticks t1 = timer::Start();  // do not use Stop, see above
-        samples[i] = t1 - t0;
-      }
-    }
-    repetitions[rep] = robust_statistics::Mode(samples);
-  }
-  return robust_statistics::Mode(repetitions);
-}
-
-}  // namespace platform
-namespace {
-
-static const timer::Ticks timer_resolution = platform::TimerResolution();
-
-// Estimates the expected value of "lambda" values with a variable number of
-// samples until the variability "rel_mad" is less than "max_rel_mad".
-template <class Lambda>
-timer::Ticks SampleUntilStable(const double max_rel_mad, double* rel_mad,
-                               const Params& p, const Lambda& lambda) {
-  // Choose initial samples_per_eval based on a single estimated duration.
-  timer::Ticks t0 = timer::Start();
-  lambda();
-  timer::Ticks t1 = timer::Stop();  // Caller checks HasRDTSCP
-  timer::Ticks est = t1 - t0;
-  static const double ticks_per_second = platform::InvariantTicksPerSecond();
-  const size_t ticks_per_eval =
-      static_cast<size_t>(ticks_per_second * p.seconds_per_eval);
-  size_t samples_per_eval = est == 0
-                                ? p.min_samples_per_eval
-                                : static_cast<size_t>(ticks_per_eval / est);
-  samples_per_eval = HWY_MAX(samples_per_eval, p.min_samples_per_eval);
-
-  std::vector<timer::Ticks> samples;
-  samples.reserve(1 + samples_per_eval);
-  samples.push_back(est);
-
-  // Percentage is too strict for tiny differences, so also allow a small
-  // absolute "median absolute deviation".
-  const timer::Ticks max_abs_mad = (timer_resolution + 99) / 100;
-  *rel_mad = 0.0;  // ensure initialized
-
-  for (size_t eval = 0; eval < p.max_evals; ++eval, samples_per_eval *= 2) {
-    samples.reserve(samples.size() + samples_per_eval);
-    for (size_t i = 0; i < samples_per_eval; ++i) {
-      t0 = timer::Start();
-      lambda();
-      t1 = timer::Stop();  // Caller checks HasRDTSCP
-      samples.push_back(t1 - t0);
-    }
-
-    if (samples.size() >= p.min_mode_samples) {
-      est = robust_statistics::Mode(samples.data(), samples.size());
-    } else {
-      // For "few" (depends also on the variance) samples, Median is safer.
-      est = robust_statistics::Median(samples.data(), samples.size());
-    }
-    NANOBENCHMARK_CHECK(est != 0);
-
-    // Median absolute deviation (mad) is a robust measure of 'variability'.
-    const timer::Ticks abs_mad = robust_statistics::MedianAbsoluteDeviation(
-        samples.data(), samples.size(), est);
-    *rel_mad = static_cast<double>(abs_mad) / static_cast<double>(est);
-
-    if (*rel_mad <= max_rel_mad || abs_mad <= max_abs_mad) {
-      if (p.verbose) {
-        printf("%6" PRIu64 " samples => %5" PRIu64 " (abs_mad=%4" PRIu64
-               ", rel_mad=%4.2f%%)\n",
-               static_cast<uint64_t>(samples.size()),
-               static_cast<uint64_t>(est), static_cast<uint64_t>(abs_mad),
-               *rel_mad * 100.0);
-      }
-      return est;
-    }
-  }
-
-  if (p.verbose) {
-    printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6" PRIu64
-           " samples.\n",
-           *rel_mad * 100.0, max_rel_mad * 100.0,
-           static_cast<uint64_t>(samples.size()));
-  }
-  return est;
-}
-
-using InputVec = std::vector<FuncInput>;
-
-// Returns vector of unique input values.
-InputVec UniqueInputs(const FuncInput* inputs, const size_t num_inputs) {
-  InputVec unique(inputs, inputs + num_inputs);
-  std::sort(unique.begin(), unique.end());
-  unique.erase(std::unique(unique.begin(), unique.end()), unique.end());
-  return unique;
-}
-
-// Returns how often we need to call func for sufficient precision.
-size_t NumSkip(const Func func, const uint8_t* arg, const InputVec& unique,
-               const Params& p) {
-  // Min elapsed ticks for any input.
-  timer::Ticks min_duration = ~timer::Ticks(0);
-
-  for (const FuncInput input : unique) {
-    double rel_mad;
-    const timer::Ticks total = SampleUntilStable(
-        p.target_rel_mad, &rel_mad, p,
-        [func, arg, input]() { platform::PreventElision(func(arg, input)); });
-    min_duration = HWY_MIN(min_duration, total - timer_resolution);
-  }
-
-  // Number of repetitions required to reach the target resolution.
-  const size_t max_skip = p.precision_divisor;
-  // Number of repetitions given the estimated duration.
-  const size_t num_skip =
-      min_duration == 0
-          ? 0
-          : static_cast<size_t>((max_skip + min_duration - 1) / min_duration);
-  if (p.verbose) {
-    printf("res=%" PRIu64 " max_skip=%" PRIu64 " min_dur=%" PRIu64
-           " num_skip=%" PRIu64 "\n",
-           static_cast<uint64_t>(timer_resolution),
-           static_cast<uint64_t>(max_skip), static_cast<uint64_t>(min_duration),
-           static_cast<uint64_t>(num_skip));
-  }
-  return num_skip;
-}
-
-// Replicates inputs until we can omit "num_skip" occurrences of an input.
-InputVec ReplicateInputs(const FuncInput* inputs, const size_t num_inputs,
-                         const size_t num_unique, const size_t num_skip,
-                         const Params& p) {
-  InputVec full;
-  if (num_unique == 1) {
-    full.assign(p.subset_ratio * num_skip, inputs[0]);
-    return full;
-  }
-
-  full.reserve(p.subset_ratio * num_skip * num_inputs);
-  for (size_t i = 0; i < p.subset_ratio * num_skip; ++i) {
-    full.insert(full.end(), inputs, inputs + num_inputs);
-  }
-  std::mt19937 rng;
-  std::shuffle(full.begin(), full.end(), rng);
-  return full;
-}
-
-// Copies the "full" to "subset" in the same order, but with "num_skip"
-// randomly selected occurrences of "input_to_skip" removed.
-void FillSubset(const InputVec& full, const FuncInput input_to_skip,
-                const size_t num_skip, InputVec* subset) {
-  const size_t count =
-      static_cast<size_t>(std::count(full.begin(), full.end(), input_to_skip));
-  // Generate num_skip random indices: which occurrence to skip.
-  std::vector<uint32_t> omit(count);
-  std::iota(omit.begin(), omit.end(), 0);
-  // omit[] is the same on every call, but that's OK because they identify the
-  // Nth instance of input_to_skip, so the position within full[] differs.
-  std::mt19937 rng;
-  std::shuffle(omit.begin(), omit.end(), rng);
-  omit.resize(num_skip);
-  std::sort(omit.begin(), omit.end());
-
-  uint32_t occurrence = ~0u;  // 0 after preincrement
-  size_t idx_omit = 0;        // cursor within omit[]
-  size_t idx_subset = 0;      // cursor within *subset
-  for (const FuncInput next : full) {
-    if (next == input_to_skip) {
-      ++occurrence;
-      // Haven't removed enough already
-      if (idx_omit < num_skip) {
-        // This one is up for removal
-        if (occurrence == omit[idx_omit]) {
-          ++idx_omit;
-          continue;
-        }
-      }
-    }
-    if (idx_subset < subset->size()) {
-      (*subset)[idx_subset++] = next;
-    }
-  }
-  NANOBENCHMARK_CHECK(idx_subset == subset->size());
-  NANOBENCHMARK_CHECK(idx_omit == omit.size());
-  NANOBENCHMARK_CHECK(occurrence == count - 1);
-}
-
-// Returns total ticks elapsed for all inputs.
-timer::Ticks TotalDuration(const Func func, const uint8_t* arg,
-                           const InputVec* inputs, const Params& p,
-                           double* max_rel_mad) {
-  double rel_mad;
-  const timer::Ticks duration =
-      SampleUntilStable(p.target_rel_mad, &rel_mad, p, [func, arg, inputs]() {
-        for (const FuncInput input : *inputs) {
-          platform::PreventElision(func(arg, input));
-        }
-      });
-  *max_rel_mad = HWY_MAX(*max_rel_mad, rel_mad);
-  return duration;
-}
-
-// (Nearly) empty Func for measuring timer overhead/resolution.
-HWY_NOINLINE FuncOutput EmptyFunc(const void* /*arg*/, const FuncInput input) {
-  return input;
-}
-
-// Returns overhead of accessing inputs[] and calling a function; this will
-// be deducted from future TotalDuration return values.
-timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs,
-                      const Params& p) {
-  double rel_mad;
-  // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
-  return SampleUntilStable(0.0, &rel_mad, p, [arg, inputs]() {
-    for (const FuncInput input : *inputs) {
-      platform::PreventElision(EmptyFunc(arg, input));
-    }
-  });
-}
-
-}  // namespace
-
-HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; }
-
-HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
-                             const FuncInput* inputs, const size_t num_inputs,
-                             Result* results, const Params& p) {
-  NANOBENCHMARK_CHECK(num_inputs != 0);
-
-#if HWY_ARCH_X86
-  if (!platform::HasRDTSCP()) {
-    fprintf(stderr, "CPU '%s' does not support RDTSCP, skipping benchmark.\n",
-            platform::BrandString().c_str());
-    return 0;
-  }
-#endif
-
-  const InputVec& unique = UniqueInputs(inputs, num_inputs);
-
-  const size_t num_skip = NumSkip(func, arg, unique, p);  // never 0
-  if (num_skip == 0) return 0;  // NumSkip already printed error message
-  // (slightly less work on x86 to cast from signed integer)
-  const float mul = 1.0f / static_cast<float>(static_cast<int>(num_skip));
-
-  const InputVec& full =
-      ReplicateInputs(inputs, num_inputs, unique.size(), num_skip, p);
-  InputVec subset(full.size() - num_skip);
-
-  const timer::Ticks overhead = Overhead(arg, &full, p);
-  const timer::Ticks overhead_skip = Overhead(arg, &subset, p);
-  if (overhead < overhead_skip) {
-    fprintf(stderr, "Measurement failed: overhead %" PRIu64 " < %" PRIu64 "\n",
-            static_cast<uint64_t>(overhead),
-            static_cast<uint64_t>(overhead_skip));
-    return 0;
-  }
-
-  if (p.verbose) {
-    printf("#inputs=%5" PRIu64 ",%5" PRIu64 " overhead=%5" PRIu64 ",%5" PRIu64
-           "\n",
-           static_cast<uint64_t>(full.size()),
-           static_cast<uint64_t>(subset.size()),
-           static_cast<uint64_t>(overhead),
-           static_cast<uint64_t>(overhead_skip));
-  }
-
-  double max_rel_mad = 0.0;
-  const timer::Ticks total = TotalDuration(func, arg, &full, p, &max_rel_mad);
-
-  for (size_t i = 0; i < unique.size(); ++i) {
-    FillSubset(full, unique[i], num_skip, &subset);
-    const timer::Ticks total_skip =
-        TotalDuration(func, arg, &subset, p, &max_rel_mad);
-
-    if (total < total_skip) {
-      fprintf(stderr, "Measurement failed: total %" PRIu64 " < %" PRIu64 "\n",
-              static_cast<uint64_t>(total), static_cast<uint64_t>(total_skip));
-      return 0;
-    }
-
-    const timer::Ticks duration =
-        (total - overhead) - (total_skip - overhead_skip);
-    results[i].input = unique[i];
-    results[i].ticks = static_cast<float>(duration) * mul;
-    results[i].variability = static_cast<float>(max_rel_mad);
-  }
-
-  return unique.size();
-}
-
-}  // namespace hwy
diff --git a/third_party/highway/hwy/nanobenchmark.h b/third_party/highway/hwy/nanobenchmark.h
deleted file mode 100644 (file)
index f0910b4..0000000
+++ /dev/null
@@ -1,194 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
-#define HIGHWAY_HWY_NANOBENCHMARK_H_
-
-// Benchmarks functions of a single integer argument with realistic branch
-// prediction hit rates. Uses a robust estimator to summarize the measurements.
-// The precision is about 0.2%.
-//
-// Examples: see nanobenchmark_test.cc.
-//
-// Background: Microbenchmarks such as http://github.com/google/benchmark
-// can measure elapsed times on the order of a microsecond. Shorter functions
-// are typically measured by repeating them thousands of times and dividing
-// the total elapsed time by this count. Unfortunately, repetition (especially
-// with the same input parameter!) influences the runtime. In time-critical
-// code, it is reasonable to expect warm instruction/data caches and TLBs,
-// but a perfect record of which branches will be taken is unrealistic.
-// Unless the application also repeatedly invokes the measured function with
-// the same parameter, the benchmark is measuring something very different -
-// a best-case result, almost as if the parameter were made a compile-time
-// constant. This may lead to erroneous conclusions about branch-heavy
-// algorithms outperforming branch-free alternatives.
-//
-// Our approach differs in three ways. Adding fences to the timer functions
-// reduces variability due to instruction reordering, improving the timer
-// resolution to about 40 CPU cycles. However, shorter functions must still
-// be invoked repeatedly. For more realistic branch prediction performance,
-// we vary the input parameter according to a user-specified distribution.
-// Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
-// loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
-// central tendency of the measurement samples with the "half sample mode",
-// which is more robust to outliers and skewed data than the mean or median.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/highway_export.h"
-
-// Enables sanity checks that verify correct operation at the cost of
-// longer benchmark runs.
-#ifndef NANOBENCHMARK_ENABLE_CHECKS
-#define NANOBENCHMARK_ENABLE_CHECKS 0
-#endif
-
-#define NANOBENCHMARK_CHECK_ALWAYS(condition)                             \
-  while (!(condition)) {                                                  \
-    fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
-    abort();                                                              \
-  }
-
-#if NANOBENCHMARK_ENABLE_CHECKS
-#define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
-#else
-#define NANOBENCHMARK_CHECK(condition)
-#endif
-
-namespace hwy {
-
-namespace platform {
-
-// Returns tick rate, useful for converting measurements to seconds. Invariant
-// means the tick counter frequency is independent of CPU throttling or sleep.
-// This call may be expensive, callers should cache the result.
-HWY_DLLEXPORT double InvariantTicksPerSecond();
-
-// Returns current timestamp [in seconds] relative to an unspecified origin.
-// Features: monotonic (no negative elapsed time), steady (unaffected by system
-// time changes), high-resolution (on the order of microseconds).
-HWY_DLLEXPORT double Now();
-
-// Returns ticks elapsed in back to back timer calls, i.e. a function of the
-// timer resolution (minimum measurable difference) and overhead.
-// This call is expensive, callers should cache the result.
-HWY_DLLEXPORT uint64_t TimerResolution();
-
-}  // namespace platform
-
-// Returns 1, but without the compiler knowing what the value is. This prevents
-// optimizing out code.
-HWY_DLLEXPORT int Unpredictable1();
-
-// Input influencing the function being measured (e.g. number of bytes to copy).
-using FuncInput = size_t;
-
-// "Proof of work" returned by Func to ensure the compiler does not elide it.
-using FuncOutput = uint64_t;
-
-// Function to measure: either 1) a captureless lambda or function with two
-// arguments or 2) a lambda with capture, in which case the first argument
-// is reserved for use by MeasureClosure.
-using Func = FuncOutput (*)(const void*, FuncInput);
-
-// Internal parameters that determine precision/resolution/measuring time.
-struct Params {
-  // For measuring timer overhead/resolution. Used in a nested loop =>
-  // quadratic time, acceptable because we know timer overhead is "low".
-  // constexpr because this is used to define array bounds.
-  static constexpr size_t kTimerSamples = 256;
-
-  // Best-case precision, expressed as a divisor of the timer resolution.
-  // Larger => more calls to Func and higher precision.
-  size_t precision_divisor = 1024;
-
-  // Ratio between full and subset input distribution sizes. Cannot be less
-  // than 2; larger values increase measurement time but more faithfully
-  // model the given input distribution.
-  size_t subset_ratio = 2;
-
-  // Together with the estimated Func duration, determines how many times to
-  // call Func before checking the sample variability. Larger values increase
-  // measurement time, memory/cache use and precision.
-  double seconds_per_eval = 4E-3;
-
-  // The minimum number of samples before estimating the central tendency.
-  size_t min_samples_per_eval = 7;
-
-  // The mode is better than median for estimating the central tendency of
-  // skewed/fat-tailed distributions, but it requires sufficient samples
-  // relative to the width of half-ranges.
-  size_t min_mode_samples = 64;
-
-  // Maximum permissible variability (= median absolute deviation / center).
-  double target_rel_mad = 0.002;
-
-  // Abort after this many evals without reaching target_rel_mad. This
-  // prevents infinite loops.
-  size_t max_evals = 9;
-
-  // Whether to print additional statistics to stdout.
-  bool verbose = true;
-};
-
-// Measurement result for each unique input.
-struct Result {
-  FuncInput input;
-
-  // Robust estimate (mode or median) of duration.
-  float ticks;
-
-  // Measure of variability (median absolute deviation relative to "ticks").
-  float variability;
-};
-
-// Precisely measures the number of ticks elapsed when calling "func" with the
-// given inputs, shuffled to ensure realistic branch prediction hit rates.
-//
-// "func" returns a 'proof of work' to ensure its computations are not elided.
-// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
-// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
-//   "func". The values should be chosen to maximize coverage of "func". This
-//   represents a distribution, so a value's frequency should reflect its
-//   probability in the real application. Order does not matter; for example, a
-//   uniform distribution over [0, 4) could be represented as {3,0,2,1}.
-// Returns how many Result were written to "results": one per unique input, or
-//   zero if the measurement failed (an error message goes to stderr).
-HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg,
-                             const FuncInput* inputs, const size_t num_inputs,
-                             Result* results, const Params& p = Params());
-
-// Calls operator() of the given closure (lambda function).
-template <class Closure>
-static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
-  return (*f)(input);
-}
-
-// Same as Measure, except "closure" is typically a lambda function of
-// FuncInput -> FuncOutput with a capture list.
-template <class Closure>
-static inline size_t MeasureClosure(const Closure& closure,
-                                    const FuncInput* inputs,
-                                    const size_t num_inputs, Result* results,
-                                    const Params& p = Params()) {
-  return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
-                 reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
-                 results, p);
-}
-
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_NANOBENCHMARK_H_
diff --git a/third_party/highway/hwy/nanobenchmark_test.cc b/third_party/highway/hwy/nanobenchmark_test.cc
deleted file mode 100644 (file)
index 0d153a1..0000000
+++ /dev/null
@@ -1,94 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/nanobenchmark.h"
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include <random>
-
-#include "hwy/tests/test_util-inl.h"
-
-namespace hwy {
-namespace {
-
-// Governs duration of test; avoid timeout in debug builds.
-#if HWY_IS_DEBUG_BUILD
-constexpr size_t kMaxEvals = 3;
-#else
-constexpr size_t kMaxEvals = 4;
-#endif
-
-FuncOutput Div(const void*, FuncInput in) {
-  // Here we're measuring the throughput because benchmark invocations are
-  // independent. Any dividend will do; the divisor is nonzero.
-  return 0xFFFFF / in;
-}
-
-template <size_t N>
-void MeasureDiv(const FuncInput (&inputs)[N]) {
-  printf("Measuring integer division (output on final two lines)\n");
-  Result results[N];
-  Params params;
-  params.max_evals = kMaxEvals;
-  const size_t num_results = Measure(&Div, nullptr, inputs, N, results, params);
-  for (size_t i = 0; i < num_results; ++i) {
-    printf("%5" PRIu64 ": %6.2f ticks; MAD=%4.2f%%\n",
-           static_cast<uint64_t>(results[i].input), results[i].ticks,
-           results[i].variability * 100.0);
-  }
-}
-
-std::mt19937 rng;
-
-// A function whose runtime depends on rng.
-FuncOutput Random(const void* /*arg*/, FuncInput in) {
-  const size_t r = rng() & 0xF;
-  FuncOutput ret = static_cast<FuncOutput>(in);
-  for (size_t i = 0; i < r; ++i) {
-    ret /= ((rng() & 1) + 2);
-  }
-  return ret;
-}
-
-// Ensure the measured variability is high.
-template <size_t N>
-void MeasureRandom(const FuncInput (&inputs)[N]) {
-  Result results[N];
-  Params p;
-  p.max_evals = kMaxEvals;
-  p.verbose = false;
-  const size_t num_results = Measure(&Random, nullptr, inputs, N, results, p);
-  for (size_t i = 0; i < num_results; ++i) {
-    NANOBENCHMARK_CHECK(results[i].variability > 1E-3);
-  }
-}
-
-TEST(NanobenchmarkTest, RunAll) {
-  const int unpredictable = Unpredictable1();  // == 1, unknown to compiler.
-  static const FuncInput inputs[] = {static_cast<FuncInput>(unpredictable) + 2,
-                                     static_cast<FuncInput>(unpredictable + 9)};
-
-  MeasureDiv(inputs);
-  MeasureRandom(inputs);
-}
-
-}  // namespace
-}  // namespace hwy
diff --git a/third_party/highway/hwy/ops/arm_neon-inl.h b/third_party/highway/hwy/ops/arm_neon-inl.h
deleted file mode 100644 (file)
index 87edbcf..0000000
+++ /dev/null
@@ -1,6504 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// 128-bit ARM64 NEON vectors and operations.
-// External include guard in highway.h - see comment there.
-
-// ARM NEON intrinsics are documented at:
-// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/base.h"  // before HWY_DIAGNOSTICS
-
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
-#include <arm_neon.h>
-HWY_DIAGNOSTICS(pop)
-
-#include "hwy/ops/shared-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-namespace detail {  // for code folding and Raw128
-
-// Macros used to define single and double function calls for multiple types
-// for full and half vectors. These macros are undefined at the end of the file.
-
-// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
-#define HWY_NEON_BUILD_TPL_1
-#define HWY_NEON_BUILD_TPL_2
-#define HWY_NEON_BUILD_TPL_3
-
-// HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
-// extend it to int32x4x2_t packs.
-#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
-#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
-#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
-
-// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
-#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
-#define HWY_NEON_BUILD_PARAM_2(type, size) \
-  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
-#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
-  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
-      const Vec128<type##_t, size> c
-
-// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
-// function.
-#define HWY_NEON_BUILD_ARG_1 a.raw
-#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
-#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
-
-// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
-// the __VA_ARGS__ have been expanded. This allows "func" to be a macro on
-// itself like with some of the library "functions" such as vshlq_u8. For
-// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
-// "a, b" (without the quotes) will end up expanding "vshlq_u8(a, b)" if needed.
-// Directly writing vshlq_u8(MY_PARAMS) would fail since vshlq_u8() macro
-// expects two arguments.
-#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
-
-// Main macro definition that defines a single function for the given type and
-// size of vector, using the underlying (prefix##infix##suffix) function and
-// the template, return type, parameters and arguments defined by the "args"
-// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
-#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
-  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
-  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
-      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
-    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
-        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
-  }
-
-// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
-// called "name" using the set of neon functions starting with the given
-// "prefix" for all the variants of certain types, as specified next to each
-// macro. For example, the prefix "vsub" can be used to define the operator-
-// using args=2.
-
-// uint8_t
-#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
-  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)     \
-  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)     \
-  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)     \
-  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
-
-// int8_t
-#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
-  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)     \
-  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)     \
-  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)     \
-  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
-
-// uint16_t
-#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
-  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
-  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
-  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
-
-// int16_t
-#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
-  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
-  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
-  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
-
-// uint32_t
-#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
-  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
-  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
-
-// int32_t
-#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
-  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
-  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
-
-// uint64_t
-#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
-  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
-
-// int64_t
-#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
-  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
-
-// float
-#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
-  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
-  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
-
-// double
-#if HWY_ARCH_ARM_A64
-#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
-  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
-#else
-#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
-#endif
-
-// float and double
-
-#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)         \
-  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
-
-// Helper macros to define for more than one type.
-// uint8_t, uint16_t and uint32_t
-#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
-  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
-  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
-
-// int8_t, int16_t and int32_t
-#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
-  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
-  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
-
-// uint8_t, uint16_t, uint32_t and uint64_t
-#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
-  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
-
-// int8_t, int16_t, int32_t and int64_t
-#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
-  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
-
-// All int*_t and uint*_t up to 64
-#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
-  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
-
-// All previous types.
-#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
-  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
-
-#define HWY_NEON_DEF_FUNCTION_UIF81632(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
-  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
-  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
-
-// Emulation of some intrinsics on armv7.
-#if HWY_ARCH_ARM_V7
-#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
-#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
-#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
-#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
-#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
-#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
-#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
-#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
-#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
-#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
-#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
-#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
-#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
-#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
-#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
-#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
-#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
-#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
-#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
-#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
-#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
-#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
-#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
-#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
-#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
-#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
-#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
-#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
-#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
-#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
-#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
-#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
-#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
-#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
-#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
-#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
-#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
-#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
-#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
-#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
-#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
-#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
-#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
-#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
-#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
-#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
-#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
-#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
-#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
-#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
-#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
-#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
-#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
-#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
-#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
-#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
-#endif
-
-// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 overloads
-// for all vector types, even those (bfloat16_t) where the underlying vector is
-// the same as others (uint16_t).
-template <typename T, size_t N>
-struct Tuple2;
-template <typename T, size_t N>
-struct Tuple3;
-template <typename T, size_t N>
-struct Tuple4;
-
-template <>
-struct Tuple2<uint8_t, 16> {
-  uint8x16x2_t raw;
-};
-template <size_t N>
-struct Tuple2<uint8_t, N> {
-  uint8x8x2_t raw;
-};
-template <>
-struct Tuple2<int8_t, 16> {
-  int8x16x2_t raw;
-};
-template <size_t N>
-struct Tuple2<int8_t, N> {
-  int8x8x2_t raw;
-};
-template <>
-struct Tuple2<uint16_t, 8> {
-  uint16x8x2_t raw;
-};
-template <size_t N>
-struct Tuple2<uint16_t, N> {
-  uint16x4x2_t raw;
-};
-template <>
-struct Tuple2<int16_t, 8> {
-  int16x8x2_t raw;
-};
-template <size_t N>
-struct Tuple2<int16_t, N> {
-  int16x4x2_t raw;
-};
-template <>
-struct Tuple2<uint32_t, 4> {
-  uint32x4x2_t raw;
-};
-template <size_t N>
-struct Tuple2<uint32_t, N> {
-  uint32x2x2_t raw;
-};
-template <>
-struct Tuple2<int32_t, 4> {
-  int32x4x2_t raw;
-};
-template <size_t N>
-struct Tuple2<int32_t, N> {
-  int32x2x2_t raw;
-};
-template <>
-struct Tuple2<uint64_t, 2> {
-  uint64x2x2_t raw;
-};
-template <size_t N>
-struct Tuple2<uint64_t, N> {
-  uint64x1x2_t raw;
-};
-template <>
-struct Tuple2<int64_t, 2> {
-  int64x2x2_t raw;
-};
-template <size_t N>
-struct Tuple2<int64_t, N> {
-  int64x1x2_t raw;
-};
-
-template <>
-struct Tuple2<float16_t, 8> {
-  uint16x8x2_t raw;
-};
-template <size_t N>
-struct Tuple2<float16_t, N> {
-  uint16x4x2_t raw;
-};
-template <>
-struct Tuple2<bfloat16_t, 8> {
-  uint16x8x2_t raw;
-};
-template <size_t N>
-struct Tuple2<bfloat16_t, N> {
-  uint16x4x2_t raw;
-};
-
-template <>
-struct Tuple2<float32_t, 4> {
-  float32x4x2_t raw;
-};
-template <size_t N>
-struct Tuple2<float32_t, N> {
-  float32x2x2_t raw;
-};
-#if HWY_ARCH_ARM_A64
-template <>
-struct Tuple2<float64_t, 2> {
-  float64x2x2_t raw;
-};
-template <size_t N>
-struct Tuple2<float64_t, N> {
-  float64x1x2_t raw;
-};
-#endif  // HWY_ARCH_ARM_A64
-
-template <>
-struct Tuple3<uint8_t, 16> {
-  uint8x16x3_t raw;
-};
-template <size_t N>
-struct Tuple3<uint8_t, N> {
-  uint8x8x3_t raw;
-};
-template <>
-struct Tuple3<int8_t, 16> {
-  int8x16x3_t raw;
-};
-template <size_t N>
-struct Tuple3<int8_t, N> {
-  int8x8x3_t raw;
-};
-template <>
-struct Tuple3<uint16_t, 8> {
-  uint16x8x3_t raw;
-};
-template <size_t N>
-struct Tuple3<uint16_t, N> {
-  uint16x4x3_t raw;
-};
-template <>
-struct Tuple3<int16_t, 8> {
-  int16x8x3_t raw;
-};
-template <size_t N>
-struct Tuple3<int16_t, N> {
-  int16x4x3_t raw;
-};
-template <>
-struct Tuple3<uint32_t, 4> {
-  uint32x4x3_t raw;
-};
-template <size_t N>
-struct Tuple3<uint32_t, N> {
-  uint32x2x3_t raw;
-};
-template <>
-struct Tuple3<int32_t, 4> {
-  int32x4x3_t raw;
-};
-template <size_t N>
-struct Tuple3<int32_t, N> {
-  int32x2x3_t raw;
-};
-template <>
-struct Tuple3<uint64_t, 2> {
-  uint64x2x3_t raw;
-};
-template <size_t N>
-struct Tuple3<uint64_t, N> {
-  uint64x1x3_t raw;
-};
-template <>
-struct Tuple3<int64_t, 2> {
-  int64x2x3_t raw;
-};
-template <size_t N>
-struct Tuple3<int64_t, N> {
-  int64x1x3_t raw;
-};
-
-template <>
-struct Tuple3<float16_t, 8> {
-  uint16x8x3_t raw;
-};
-template <size_t N>
-struct Tuple3<float16_t, N> {
-  uint16x4x3_t raw;
-};
-template <>
-struct Tuple3<bfloat16_t, 8> {
-  uint16x8x3_t raw;
-};
-template <size_t N>
-struct Tuple3<bfloat16_t, N> {
-  uint16x4x3_t raw;
-};
-
-template <>
-struct Tuple3<float32_t, 4> {
-  float32x4x3_t raw;
-};
-template <size_t N>
-struct Tuple3<float32_t, N> {
-  float32x2x3_t raw;
-};
-#if HWY_ARCH_ARM_A64
-template <>
-struct Tuple3<float64_t, 2> {
-  float64x2x3_t raw;
-};
-template <size_t N>
-struct Tuple3<float64_t, N> {
-  float64x1x3_t raw;
-};
-#endif  // HWY_ARCH_ARM_A64
-
-template <>
-struct Tuple4<uint8_t, 16> {
-  uint8x16x4_t raw;
-};
-template <size_t N>
-struct Tuple4<uint8_t, N> {
-  uint8x8x4_t raw;
-};
-template <>
-struct Tuple4<int8_t, 16> {
-  int8x16x4_t raw;
-};
-template <size_t N>
-struct Tuple4<int8_t, N> {
-  int8x8x4_t raw;
-};
-template <>
-struct Tuple4<uint16_t, 8> {
-  uint16x8x4_t raw;
-};
-template <size_t N>
-struct Tuple4<uint16_t, N> {
-  uint16x4x4_t raw;
-};
-template <>
-struct Tuple4<int16_t, 8> {
-  int16x8x4_t raw;
-};
-template <size_t N>
-struct Tuple4<int16_t, N> {
-  int16x4x4_t raw;
-};
-template <>
-struct Tuple4<uint32_t, 4> {
-  uint32x4x4_t raw;
-};
-template <size_t N>
-struct Tuple4<uint32_t, N> {
-  uint32x2x4_t raw;
-};
-template <>
-struct Tuple4<int32_t, 4> {
-  int32x4x4_t raw;
-};
-template <size_t N>
-struct Tuple4<int32_t, N> {
-  int32x2x4_t raw;
-};
-template <>
-struct Tuple4<uint64_t, 2> {
-  uint64x2x4_t raw;
-};
-template <size_t N>
-struct Tuple4<uint64_t, N> {
-  uint64x1x4_t raw;
-};
-template <>
-struct Tuple4<int64_t, 2> {
-  int64x2x4_t raw;
-};
-template <size_t N>
-struct Tuple4<int64_t, N> {
-  int64x1x4_t raw;
-};
-
-template <>
-struct Tuple4<float16_t, 8> {
-  uint16x8x4_t raw;
-};
-template <size_t N>
-struct Tuple4<float16_t, N> {
-  uint16x4x4_t raw;
-};
-template <>
-struct Tuple4<bfloat16_t, 8> {
-  uint16x8x4_t raw;
-};
-template <size_t N>
-struct Tuple4<bfloat16_t, N> {
-  uint16x4x4_t raw;
-};
-
-template <>
-struct Tuple4<float32_t, 4> {
-  float32x4x4_t raw;
-};
-template <size_t N>
-struct Tuple4<float32_t, N> {
-  float32x2x4_t raw;
-};
-#if HWY_ARCH_ARM_A64
-template <>
-struct Tuple4<float64_t, 2> {
-  float64x2x4_t raw;
-};
-template <size_t N>
-struct Tuple4<float64_t, N> {
-  float64x1x4_t raw;
-};
-#endif  // HWY_ARCH_ARM_A64
-
-template <typename T, size_t N>
-struct Raw128;
-
-// 128
-template <>
-struct Raw128<uint8_t, 16> {
-  using type = uint8x16_t;
-};
-
-template <>
-struct Raw128<uint16_t, 8> {
-  using type = uint16x8_t;
-};
-
-template <>
-struct Raw128<uint32_t, 4> {
-  using type = uint32x4_t;
-};
-
-template <>
-struct Raw128<uint64_t, 2> {
-  using type = uint64x2_t;
-};
-
-template <>
-struct Raw128<int8_t, 16> {
-  using type = int8x16_t;
-};
-
-template <>
-struct Raw128<int16_t, 8> {
-  using type = int16x8_t;
-};
-
-template <>
-struct Raw128<int32_t, 4> {
-  using type = int32x4_t;
-};
-
-template <>
-struct Raw128<int64_t, 2> {
-  using type = int64x2_t;
-};
-
-template <>
-struct Raw128<float16_t, 8> {
-  using type = uint16x8_t;
-};
-
-template <>
-struct Raw128<bfloat16_t, 8> {
-  using type = uint16x8_t;
-};
-
-template <>
-struct Raw128<float, 4> {
-  using type = float32x4_t;
-};
-
-#if HWY_ARCH_ARM_A64
-template <>
-struct Raw128<double, 2> {
-  using type = float64x2_t;
-};
-#endif
-
-// 64
-template <>
-struct Raw128<uint8_t, 8> {
-  using type = uint8x8_t;
-};
-
-template <>
-struct Raw128<uint16_t, 4> {
-  using type = uint16x4_t;
-};
-
-template <>
-struct Raw128<uint32_t, 2> {
-  using type = uint32x2_t;
-};
-
-template <>
-struct Raw128<uint64_t, 1> {
-  using type = uint64x1_t;
-};
-
-template <>
-struct Raw128<int8_t, 8> {
-  using type = int8x8_t;
-};
-
-template <>
-struct Raw128<int16_t, 4> {
-  using type = int16x4_t;
-};
-
-template <>
-struct Raw128<int32_t, 2> {
-  using type = int32x2_t;
-};
-
-template <>
-struct Raw128<int64_t, 1> {
-  using type = int64x1_t;
-};
-
-template <>
-struct Raw128<float16_t, 4> {
-  using type = uint16x4_t;
-};
-
-template <>
-struct Raw128<bfloat16_t, 4> {
-  using type = uint16x4_t;
-};
-
-template <>
-struct Raw128<float, 2> {
-  using type = float32x2_t;
-};
-
-#if HWY_ARCH_ARM_A64
-template <>
-struct Raw128<double, 1> {
-  using type = float64x1_t;
-};
-#endif
-
-// 32 (same as 64)
-template <>
-struct Raw128<uint8_t, 4> : public Raw128<uint8_t, 8> {};
-
-template <>
-struct Raw128<uint16_t, 2> : public Raw128<uint16_t, 4> {};
-
-template <>
-struct Raw128<uint32_t, 1> : public Raw128<uint32_t, 2> {};
-
-template <>
-struct Raw128<int8_t, 4> : public Raw128<int8_t, 8> {};
-
-template <>
-struct Raw128<int16_t, 2> : public Raw128<int16_t, 4> {};
-
-template <>
-struct Raw128<int32_t, 1> : public Raw128<int32_t, 2> {};
-
-template <>
-struct Raw128<float16_t, 2> : public Raw128<float16_t, 4> {};
-
-template <>
-struct Raw128<bfloat16_t, 2> : public Raw128<bfloat16_t, 4> {};
-
-template <>
-struct Raw128<float, 1> : public Raw128<float, 2> {};
-
-// 16 (same as 64)
-template <>
-struct Raw128<uint8_t, 2> : public Raw128<uint8_t, 8> {};
-
-template <>
-struct Raw128<uint16_t, 1> : public Raw128<uint16_t, 4> {};
-
-template <>
-struct Raw128<int8_t, 2> : public Raw128<int8_t, 8> {};
-
-template <>
-struct Raw128<int16_t, 1> : public Raw128<int16_t, 4> {};
-
-template <>
-struct Raw128<float16_t, 1> : public Raw128<float16_t, 4> {};
-
-template <>
-struct Raw128<bfloat16_t, 1> : public Raw128<bfloat16_t, 4> {};
-
-// 8 (same as 64)
-template <>
-struct Raw128<uint8_t, 1> : public Raw128<uint8_t, 8> {};
-
-template <>
-struct Raw128<int8_t, 1> : public Raw128<int8_t, 8> {};
-
-}  // namespace detail
-
-template <typename T, size_t N = 16 / sizeof(T)>
-class Vec128 {
-  using Raw = typename detail::Raw128<T, N>::type;
-
- public:
-  HWY_INLINE Vec128() {}
-  Vec128(const Vec128&) = default;
-  Vec128& operator=(const Vec128&) = default;
-  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
-
-  // Compound assignment. Only usable if there is a corresponding non-member
-  // binary operator overload. For example, only f32 and f64 support division.
-  HWY_INLINE Vec128& operator*=(const Vec128 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec128& operator/=(const Vec128 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec128& operator+=(const Vec128 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec128& operator-=(const Vec128 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec128& operator&=(const Vec128 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec128& operator|=(const Vec128 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec128& operator^=(const Vec128 other) {
-    return *this = (*this ^ other);
-  }
-
-  Raw raw;
-};
-
-template <typename T>
-using Vec64 = Vec128<T, 8 / sizeof(T)>;
-
-template <typename T>
-using Vec32 = Vec128<T, 4 / sizeof(T)>;
-
-// FF..FF or 0.
-template <typename T, size_t N = 16 / sizeof(T)>
-class Mask128 {
-  // ARM C Language Extensions return and expect unsigned type.
-  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;
-
- public:
-  HWY_INLINE Mask128() {}
-  Mask128(const Mask128&) = default;
-  Mask128& operator=(const Mask128&) = default;
-  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}
-
-  Raw raw;
-};
-
-template <typename T>
-using Mask64 = Mask128<T, 8 / sizeof(T)>;
-
-namespace detail {
-
-// Deduce Simd<T, N, 0> from Vec128<T, N>
-struct DeduceD {
-  template <typename T, size_t N>
-  Simd<T, N, 0> operator()(Vec128<T, N>) const {
-    return Simd<T, N, 0>();
-  }
-};
-
-}  // namespace detail
-
-template <class V>
-using DFromV = decltype(detail::DeduceD()(V()));
-
-template <class V>
-using TFromV = TFromD<DFromV<V>>;
-
-// ------------------------------ BitCast
-
-namespace detail {
-
-// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
-// vreinterpret*_u8_*() set of functions.
-#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
-#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
-  Vec128<uint8_t, size * sizeof(type##_t)>
-#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
-#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw
-
-// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
-template <size_t N>
-HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
-  return v;
-}
-
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
-                                 HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
-
-// Special cases for [b]float16_t, which have the same Raw as uint16_t.
-template <size_t N>
-HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
-  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
-}
-template <size_t N>
-HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
-  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
-}
-
-#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
-#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
-#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
-#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
-
-template <size_t N>
-HWY_INLINE Vec128<uint8_t, N> BitCastFromByte(Simd<uint8_t, N, 0> /* tag */,
-                                              Vec128<uint8_t, N> v) {
-  return v;
-}
-
-// 64-bit or less:
-
-template <size_t N, HWY_IF_LE64(int8_t, N)>
-HWY_INLINE Vec128<int8_t, N> BitCastFromByte(Simd<int8_t, N, 0> /* tag */,
-                                             Vec128<uint8_t, N> v) {
-  return Vec128<int8_t, N>(vreinterpret_s8_u8(v.raw));
-}
-template <size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_INLINE Vec128<uint16_t, N> BitCastFromByte(Simd<uint16_t, N, 0> /* tag */,
-                                               Vec128<uint8_t, N * 2> v) {
-  return Vec128<uint16_t, N>(vreinterpret_u16_u8(v.raw));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_INLINE Vec128<int16_t, N> BitCastFromByte(Simd<int16_t, N, 0> /* tag */,
-                                              Vec128<uint8_t, N * 2> v) {
-  return Vec128<int16_t, N>(vreinterpret_s16_u8(v.raw));
-}
-template <size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_INLINE Vec128<uint32_t, N> BitCastFromByte(Simd<uint32_t, N, 0> /* tag */,
-                                               Vec128<uint8_t, N * 4> v) {
-  return Vec128<uint32_t, N>(vreinterpret_u32_u8(v.raw));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_INLINE Vec128<int32_t, N> BitCastFromByte(Simd<int32_t, N, 0> /* tag */,
-                                              Vec128<uint8_t, N * 4> v) {
-  return Vec128<int32_t, N>(vreinterpret_s32_u8(v.raw));
-}
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_INLINE Vec128<float, N> BitCastFromByte(Simd<float, N, 0> /* tag */,
-                                            Vec128<uint8_t, N * 4> v) {
-  return Vec128<float, N>(vreinterpret_f32_u8(v.raw));
-}
-HWY_INLINE Vec64<uint64_t> BitCastFromByte(Full64<uint64_t> /* tag */,
-                                           Vec128<uint8_t, 1 * 8> v) {
-  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
-}
-HWY_INLINE Vec64<int64_t> BitCastFromByte(Full64<int64_t> /* tag */,
-                                          Vec128<uint8_t, 1 * 8> v) {
-  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
-}
-#if HWY_ARCH_ARM_A64
-HWY_INLINE Vec64<double> BitCastFromByte(Full64<double> /* tag */,
-                                         Vec128<uint8_t, 1 * 8> v) {
-  return Vec64<double>(vreinterpret_f64_u8(v.raw));
-}
-#endif
-
-// 128-bit full:
-
-HWY_INLINE Vec128<int8_t> BitCastFromByte(Full128<int8_t> /* tag */,
-                                          Vec128<uint8_t> v) {
-  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
-}
-HWY_INLINE Vec128<uint16_t> BitCastFromByte(Full128<uint16_t> /* tag */,
-                                            Vec128<uint8_t> v) {
-  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
-}
-HWY_INLINE Vec128<int16_t> BitCastFromByte(Full128<int16_t> /* tag */,
-                                           Vec128<uint8_t> v) {
-  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
-}
-HWY_INLINE Vec128<uint32_t> BitCastFromByte(Full128<uint32_t> /* tag */,
-                                            Vec128<uint8_t> v) {
-  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
-}
-HWY_INLINE Vec128<int32_t> BitCastFromByte(Full128<int32_t> /* tag */,
-                                           Vec128<uint8_t> v) {
-  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
-}
-HWY_INLINE Vec128<float> BitCastFromByte(Full128<float> /* tag */,
-                                         Vec128<uint8_t> v) {
-  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
-}
-HWY_INLINE Vec128<uint64_t> BitCastFromByte(Full128<uint64_t> /* tag */,
-                                            Vec128<uint8_t> v) {
-  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
-}
-HWY_INLINE Vec128<int64_t> BitCastFromByte(Full128<int64_t> /* tag */,
-                                           Vec128<uint8_t> v) {
-  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
-}
-
-#if HWY_ARCH_ARM_A64
-HWY_INLINE Vec128<double> BitCastFromByte(Full128<double> /* tag */,
-                                          Vec128<uint8_t> v) {
-  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
-}
-#endif
-
-// Special cases for [b]float16_t, which have the same Raw as uint16_t.
-template <size_t N>
-HWY_INLINE Vec128<float16_t, N> BitCastFromByte(Simd<float16_t, N, 0> /* tag */,
-                                                Vec128<uint8_t, N * 2> v) {
-  return Vec128<float16_t, N>(BitCastFromByte(Simd<uint16_t, N, 0>(), v).raw);
-}
-template <size_t N>
-HWY_INLINE Vec128<bfloat16_t, N> BitCastFromByte(
-    Simd<bfloat16_t, N, 0> /* tag */, Vec128<uint8_t, N * 2> v) {
-  return Vec128<bfloat16_t, N>(BitCastFromByte(Simd<uint16_t, N, 0>(), v).raw);
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, typename FromT>
-HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
-                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
-}
-
-// ------------------------------ Set
-
-// Returns a vector with all lanes set to "t".
-#define HWY_NEON_BUILD_TPL_HWY_SET1
-#define HWY_NEON_BUILD_RET_HWY_SET1(type, size) Vec128<type##_t, size>
-#define HWY_NEON_BUILD_PARAM_HWY_SET1(type, size) \
-  Simd<type##_t, size, 0> /* tag */, const type##_t t
-#define HWY_NEON_BUILD_ARG_HWY_SET1 t
-
-HWY_NEON_DEF_FUNCTION_ALL_TYPES(Set, vdup, _n_, HWY_SET1)
-
-#undef HWY_NEON_BUILD_TPL_HWY_SET1
-#undef HWY_NEON_BUILD_RET_HWY_SET1
-#undef HWY_NEON_BUILD_PARAM_HWY_SET1
-#undef HWY_NEON_BUILD_ARG_HWY_SET1
-
-// Returns an all-zero vector.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Zero(Simd<T, N, 0> d) {
-  return Set(d, 0);
-}
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, N> Zero(Simd<bfloat16_t, N, 0> /* tag */) {
-  return Vec128<bfloat16_t, N>(Zero(Simd<uint16_t, N, 0>()).raw);
-}
-
-template <class D>
-using VFromD = decltype(Zero(D()));
-
-// Returns a vector with uninitialized elements.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /*d*/) {
-  HWY_DIAGNOSTICS(push)
-  HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
-#if HWY_COMPILER_GCC_ACTUAL
-  HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
-#endif
-  typename detail::Raw128<T, N>::type a;
-  return Vec128<T, N>(a);
-  HWY_DIAGNOSTICS(pop)
-}
-
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, size_t N, typename T2>
-Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
-  HWY_ALIGN T lanes[16 / sizeof(T)];
-  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
-  }
-  return Load(d, lanes);
-}
-
-// ------------------------------ GetLane
-
-namespace detail {
-#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
-#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
-#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
-#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane
-
-HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
-
-#undef HWY_NEON_BUILD_TPL_HWY_GET
-#undef HWY_NEON_BUILD_RET_HWY_GET
-#undef HWY_NEON_BUILD_PARAM_HWY_GET
-#undef HWY_NEON_BUILD_ARG_HWY_GET
-
-}  // namespace detail
-
-template <class V>
-HWY_API TFromV<V> GetLane(const V v) {
-  return detail::GetLane<0>(v);
-}
-
-// ------------------------------ ExtractLane
-
-// Requires one overload per vector length because GetLane<3> is a compile error
-// if v is a uint32x2_t.
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  return detail::GetLane<0>(v);
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::GetLane<0>(v);
-      case 1:
-        return detail::GetLane<1>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[2];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::GetLane<0>(v);
-      case 1:
-        return detail::GetLane<1>(v);
-      case 2:
-        return detail::GetLane<2>(v);
-      case 3:
-        return detail::GetLane<3>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[4];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::GetLane<0>(v);
-      case 1:
-        return detail::GetLane<1>(v);
-      case 2:
-        return detail::GetLane<2>(v);
-      case 3:
-        return detail::GetLane<3>(v);
-      case 4:
-        return detail::GetLane<4>(v);
-      case 5:
-        return detail::GetLane<5>(v);
-      case 6:
-        return detail::GetLane<6>(v);
-      case 7:
-        return detail::GetLane<7>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[8];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::GetLane<0>(v);
-      case 1:
-        return detail::GetLane<1>(v);
-      case 2:
-        return detail::GetLane<2>(v);
-      case 3:
-        return detail::GetLane<3>(v);
-      case 4:
-        return detail::GetLane<4>(v);
-      case 5:
-        return detail::GetLane<5>(v);
-      case 6:
-        return detail::GetLane<6>(v);
-      case 7:
-        return detail::GetLane<7>(v);
-      case 8:
-        return detail::GetLane<8>(v);
-      case 9:
-        return detail::GetLane<9>(v);
-      case 10:
-        return detail::GetLane<10>(v);
-      case 11:
-        return detail::GetLane<11>(v);
-      case 12:
-        return detail::GetLane<12>(v);
-      case 13:
-        return detail::GetLane<13>(v);
-      case 14:
-        return detail::GetLane<14>(v);
-      case 15:
-        return detail::GetLane<15>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[16];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-// ------------------------------ InsertLane
-
-namespace detail {
-#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
-#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
-#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
-  Vec128<type##_t, size> v, type##_t t
-#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane
-
-HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
-
-#undef HWY_NEON_BUILD_TPL_HWY_INSERT
-#undef HWY_NEON_BUILD_RET_HWY_INSERT
-#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
-#undef HWY_NEON_BUILD_ARG_HWY_INSERT
-
-}  // namespace detail
-
-// Requires one overload per vector length because InsertLane<3> may be a
-// compile error.
-
-template <typename T>
-HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  return Set(DFromV<decltype(v)>(), t);
-}
-
-template <typename T>
-HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[2];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[4];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-      case 4:
-        return detail::InsertLane<4>(v, t);
-      case 5:
-        return detail::InsertLane<5>(v, t);
-      case 6:
-        return detail::InsertLane<6>(v, t);
-      case 7:
-        return detail::InsertLane<7>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[8];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-      case 4:
-        return detail::InsertLane<4>(v, t);
-      case 5:
-        return detail::InsertLane<5>(v, t);
-      case 6:
-        return detail::InsertLane<6>(v, t);
-      case 7:
-        return detail::InsertLane<7>(v, t);
-      case 8:
-        return detail::InsertLane<8>(v, t);
-      case 9:
-        return detail::InsertLane<9>(v, t);
-      case 10:
-        return detail::InsertLane<10>(v, t);
-      case 11:
-        return detail::InsertLane<11>(v, t);
-      case 12:
-        return detail::InsertLane<12>(v, t);
-      case 13:
-        return detail::InsertLane<13>(v, t);
-      case 14:
-        return detail::InsertLane<14>(v, t);
-      case 15:
-        return detail::InsertLane<15>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[16];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Addition
-HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)
-
-// ------------------------------ Subtraction
-HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)
-
-// ------------------------------ SumsOf8
-
-HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
-  return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
-}
-HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
-  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
-}
-
-// ------------------------------ SaturatedAdd
-// Only defined for uint8_t, uint16_t and their signed versions, as in other
-// architectures.
-
-// Returns a + b clamped to the destination range.
-HWY_NEON_DEF_FUNCTION_INT_8(SaturatedAdd, vqadd, _, 2)
-HWY_NEON_DEF_FUNCTION_INT_16(SaturatedAdd, vqadd, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedAdd, vqadd, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedAdd, vqadd, _, 2)
-
-// ------------------------------ SaturatedSub
-
-// Returns a - b clamped to the destination range.
-HWY_NEON_DEF_FUNCTION_INT_8(SaturatedSub, vqsub, _, 2)
-HWY_NEON_DEF_FUNCTION_INT_16(SaturatedSub, vqsub, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_8(SaturatedSub, vqsub, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_16(SaturatedSub, vqsub, _, 2)
-
-// Not part of API, used in implementation.
-namespace detail {
-HWY_NEON_DEF_FUNCTION_UINT_32(SaturatedSub, vqsub, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_64(SaturatedSub, vqsub, _, 2)
-HWY_NEON_DEF_FUNCTION_INT_32(SaturatedSub, vqsub, _, 2)
-HWY_NEON_DEF_FUNCTION_INT_64(SaturatedSub, vqsub, _, 2)
-}  // namespace detail
-
-// ------------------------------ Average
-
-// Returns (a + b + 1) / 2
-HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)
-
-// ------------------------------ Neg
-
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)  // i64 implemented below
-
-HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) {
-#if HWY_ARCH_ARM_A64
-  return Vec64<int64_t>(vneg_s64(v.raw));
-#else
-  return Zero(Full64<int64_t>()) - v;
-#endif
-}
-
-HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
-#if HWY_ARCH_ARM_A64
-  return Vec128<int64_t>(vnegq_s64(v.raw));
-#else
-  return Zero(Full128<int64_t>()) - v;
-#endif
-}
-
-// ------------------------------ ShiftLeft
-
-// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
-#pragma push_macro("HWY_NEON_DEF_FUNCTION")
-#undef HWY_NEON_DEF_FUNCTION
-#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)   \
-  template <int kBits>                                                         \
-  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) {        \
-    return kBits == 0 ? v                                                      \
-                      : Vec128<type##_t, size>(HWY_NEON_EVAL(                  \
-                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
-  }
-
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)
-
-HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
-HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
-
-#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
-
-// ------------------------------ RotateRight (ShiftRight, Or)
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
-  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
-  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
-}
-
-// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
-// mechanism for checking for extensions to ARMv8.
-
-// ------------------------------ Shl
-
-HWY_API Vec128<uint8_t> operator<<(const Vec128<uint8_t> v,
-                                   const Vec128<uint8_t> bits) {
-  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
-}
-template <size_t N, HWY_IF_LE64(uint8_t, N)>
-HWY_API Vec128<uint8_t, N> operator<<(const Vec128<uint8_t, N> v,
-                                      const Vec128<uint8_t, N> bits) {
-  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
-}
-
-HWY_API Vec128<uint16_t> operator<<(const Vec128<uint16_t> v,
-                                    const Vec128<uint16_t> bits) {
-  return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
-}
-template <size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> operator<<(const Vec128<uint16_t, N> v,
-                                       const Vec128<uint16_t, N> bits) {
-  return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
-}
-
-HWY_API Vec128<uint32_t> operator<<(const Vec128<uint32_t> v,
-                                    const Vec128<uint32_t> bits) {
-  return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
-}
-template <size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> operator<<(const Vec128<uint32_t, N> v,
-                                       const Vec128<uint32_t, N> bits) {
-  return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
-}
-
-HWY_API Vec128<uint64_t> operator<<(const Vec128<uint64_t> v,
-                                    const Vec128<uint64_t> bits) {
-  return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
-}
-HWY_API Vec64<uint64_t> operator<<(const Vec64<uint64_t> v,
-                                   const Vec64<uint64_t> bits) {
-  return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
-}
-
-HWY_API Vec128<int8_t> operator<<(const Vec128<int8_t> v,
-                                  const Vec128<int8_t> bits) {
-  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
-}
-template <size_t N, HWY_IF_LE64(int8_t, N)>
-HWY_API Vec128<int8_t, N> operator<<(const Vec128<int8_t, N> v,
-                                     const Vec128<int8_t, N> bits) {
-  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
-}
-
-HWY_API Vec128<int16_t> operator<<(const Vec128<int16_t> v,
-                                   const Vec128<int16_t> bits) {
-  return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> operator<<(const Vec128<int16_t, N> v,
-                                      const Vec128<int16_t, N> bits) {
-  return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
-}
-
-HWY_API Vec128<int32_t> operator<<(const Vec128<int32_t> v,
-                                   const Vec128<int32_t> bits) {
-  return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> operator<<(const Vec128<int32_t, N> v,
-                                      const Vec128<int32_t, N> bits) {
-  return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
-}
-
-HWY_API Vec128<int64_t> operator<<(const Vec128<int64_t> v,
-                                   const Vec128<int64_t> bits) {
-  return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
-}
-HWY_API Vec64<int64_t> operator<<(const Vec64<int64_t> v,
-                                  const Vec64<int64_t> bits) {
-  return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
-}
-
-// ------------------------------ Shr (Neg)
-
-HWY_API Vec128<uint8_t> operator>>(const Vec128<uint8_t> v,
-                                   const Vec128<uint8_t> bits) {
-  const int8x16_t neg_bits = Neg(BitCast(Full128<int8_t>(), bits)).raw;
-  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
-}
-template <size_t N, HWY_IF_LE64(uint8_t, N)>
-HWY_API Vec128<uint8_t, N> operator>>(const Vec128<uint8_t, N> v,
-                                      const Vec128<uint8_t, N> bits) {
-  const int8x8_t neg_bits = Neg(BitCast(Simd<int8_t, N, 0>(), bits)).raw;
-  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
-}
-
-HWY_API Vec128<uint16_t> operator>>(const Vec128<uint16_t> v,
-                                    const Vec128<uint16_t> bits) {
-  const int16x8_t neg_bits = Neg(BitCast(Full128<int16_t>(), bits)).raw;
-  return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
-}
-template <size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> v,
-                                       const Vec128<uint16_t, N> bits) {
-  const int16x4_t neg_bits = Neg(BitCast(Simd<int16_t, N, 0>(), bits)).raw;
-  return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
-}
-
-HWY_API Vec128<uint32_t> operator>>(const Vec128<uint32_t> v,
-                                    const Vec128<uint32_t> bits) {
-  const int32x4_t neg_bits = Neg(BitCast(Full128<int32_t>(), bits)).raw;
-  return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
-}
-template <size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> v,
-                                       const Vec128<uint32_t, N> bits) {
-  const int32x2_t neg_bits = Neg(BitCast(Simd<int32_t, N, 0>(), bits)).raw;
-  return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
-}
-
-HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
-                                    const Vec128<uint64_t> bits) {
-  const int64x2_t neg_bits = Neg(BitCast(Full128<int64_t>(), bits)).raw;
-  return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
-}
-HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
-                                   const Vec64<uint64_t> bits) {
-  const int64x1_t neg_bits = Neg(BitCast(Full64<int64_t>(), bits)).raw;
-  return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
-}
-
-HWY_API Vec128<int8_t> operator>>(const Vec128<int8_t> v,
-                                  const Vec128<int8_t> bits) {
-  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
-}
-template <size_t N, HWY_IF_LE64(int8_t, N)>
-HWY_API Vec128<int8_t, N> operator>>(const Vec128<int8_t, N> v,
-                                     const Vec128<int8_t, N> bits) {
-  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
-}
-
-HWY_API Vec128<int16_t> operator>>(const Vec128<int16_t> v,
-                                   const Vec128<int16_t> bits) {
-  return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
-                                      const Vec128<int16_t, N> bits) {
-  return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
-}
-
-HWY_API Vec128<int32_t> operator>>(const Vec128<int32_t> v,
-                                   const Vec128<int32_t> bits) {
-  return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
-                                      const Vec128<int32_t, N> bits) {
-  return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
-}
-
-HWY_API Vec128<int64_t> operator>>(const Vec128<int64_t> v,
-                                   const Vec128<int64_t> bits) {
-  return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
-}
-HWY_API Vec64<int64_t> operator>>(const Vec64<int64_t> v,
-                                  const Vec64<int64_t> bits) {
-  return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
-}
-
-// ------------------------------ ShiftLeftSame (Shl)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
-  return v << Set(Simd<T, N, 0>(), static_cast<T>(bits));
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
-  return v >> Set(Simd<T, N, 0>(), static_cast<T>(bits));
-}
-
-// ------------------------------ Integer multiplication
-
-// Unsigned
-HWY_API Vec128<uint16_t> operator*(const Vec128<uint16_t> a,
-                                   const Vec128<uint16_t> b) {
-  return Vec128<uint16_t>(vmulq_u16(a.raw, b.raw));
-}
-HWY_API Vec128<uint32_t> operator*(const Vec128<uint32_t> a,
-                                   const Vec128<uint32_t> b) {
-  return Vec128<uint32_t>(vmulq_u32(a.raw, b.raw));
-}
-
-template <size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
-                                      const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>(vmul_u16(a.raw, b.raw));
-}
-template <size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
-                                      const Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>(vmul_u32(a.raw, b.raw));
-}
-
-// Signed
-HWY_API Vec128<int16_t> operator*(const Vec128<int16_t> a,
-                                  const Vec128<int16_t> b) {
-  return Vec128<int16_t>(vmulq_s16(a.raw, b.raw));
-}
-HWY_API Vec128<int32_t> operator*(const Vec128<int32_t> a,
-                                  const Vec128<int32_t> b) {
-  return Vec128<int32_t>(vmulq_s32(a.raw, b.raw));
-}
-
-template <size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
-                                     const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>(vmul_s16(a.raw, b.raw));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
-                                     const Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>(vmul_s32(a.raw, b.raw));
-}
-
-// Returns the upper 16 bits of a * b in each lane.
-HWY_API Vec128<int16_t> MulHigh(const Vec128<int16_t> a,
-                                const Vec128<int16_t> b) {
-  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
-#if HWY_ARCH_ARM_A64
-  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
-#else
-  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
-#endif
-  return Vec128<int16_t>(
-      vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
-}
-HWY_API Vec128<uint16_t> MulHigh(const Vec128<uint16_t> a,
-                                 const Vec128<uint16_t> b) {
-  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
-#if HWY_ARCH_ARM_A64
-  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
-#else
-  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
-#endif
-  return Vec128<uint16_t>(
-      vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
-}
-
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
-                                   const Vec128<int16_t, N> b) {
-  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
-  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
-}
-template <size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
-                                    const Vec128<uint16_t, N> b) {
-  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
-  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
-}
-
-HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
-  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
-                                           Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
-}
-
-// ------------------------------ Floating-point mul / div
-
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
-
-// Approximate reciprocal
-HWY_API Vec128<float> ApproximateReciprocal(const Vec128<float> v) {
-  return Vec128<float>(vrecpeq_f32(v.raw));
-}
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
-  return Vec128<float, N>(vrecpe_f32(v.raw));
-}
-
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
-#else
-// Not defined on armv7: approximate
-namespace detail {
-
-HWY_INLINE Vec128<float> ReciprocalNewtonRaphsonStep(
-    const Vec128<float> recip, const Vec128<float> divisor) {
-  return Vec128<float>(vrecpsq_f32(recip.raw, divisor.raw));
-}
-template <size_t N>
-HWY_INLINE Vec128<float, N> ReciprocalNewtonRaphsonStep(
-    const Vec128<float, N> recip, Vec128<float, N> divisor) {
-  return Vec128<float, N>(vrecps_f32(recip.raw, divisor.raw));
-}
-
-}  // namespace detail
-
-template <size_t N>
-HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
-                                   const Vec128<float, N> b) {
-  auto x = ApproximateReciprocal(b);
-  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
-  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
-  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
-  return a * x;
-}
-#endif
-
-// ------------------------------ Absolute value of difference.
-
-HWY_API Vec128<float> AbsDiff(const Vec128<float> a, const Vec128<float> b) {
-  return Vec128<float>(vabdq_f32(a.raw, b.raw));
-}
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
-                                 const Vec128<float, N> b) {
-  return Vec128<float, N>(vabd_f32(a.raw, b.raw));
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-// Returns add + mul * x
-#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
-                                const Vec128<float, N> x,
-                                const Vec128<float, N> add) {
-  return Vec128<float, N>(vfma_f32(add.raw, mul.raw, x.raw));
-}
-HWY_API Vec128<float> MulAdd(const Vec128<float> mul, const Vec128<float> x,
-                             const Vec128<float> add) {
-  return Vec128<float>(vfmaq_f32(add.raw, mul.raw, x.raw));
-}
-#else
-// Emulate FMA for floats.
-template <size_t N>
-HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
-                                const Vec128<float, N> x,
-                                const Vec128<float, N> add) {
-  return mul * x + add;
-}
-#endif
-
-#if HWY_ARCH_ARM_A64
-HWY_API Vec64<double> MulAdd(const Vec64<double> mul, const Vec64<double> x,
-                             const Vec64<double> add) {
-  return Vec64<double>(vfma_f64(add.raw, mul.raw, x.raw));
-}
-HWY_API Vec128<double> MulAdd(const Vec128<double> mul, const Vec128<double> x,
-                              const Vec128<double> add) {
-  return Vec128<double>(vfmaq_f64(add.raw, mul.raw, x.raw));
-}
-#endif
-
-// Returns add - mul * x
-#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> add) {
-  return Vec128<float, N>(vfms_f32(add.raw, mul.raw, x.raw));
-}
-HWY_API Vec128<float> NegMulAdd(const Vec128<float> mul, const Vec128<float> x,
-                                const Vec128<float> add) {
-  return Vec128<float>(vfmsq_f32(add.raw, mul.raw, x.raw));
-}
-#else
-// Emulate FMA for floats.
-template <size_t N>
-HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> add) {
-  return add - mul * x;
-}
-#endif
-
-#if HWY_ARCH_ARM_A64
-HWY_API Vec64<double> NegMulAdd(const Vec64<double> mul, const Vec64<double> x,
-                                const Vec64<double> add) {
-  return Vec64<double>(vfms_f64(add.raw, mul.raw, x.raw));
-}
-HWY_API Vec128<double> NegMulAdd(const Vec128<double> mul,
-                                 const Vec128<double> x,
-                                 const Vec128<double> add) {
-  return Vec128<double>(vfmsq_f64(add.raw, mul.raw, x.raw));
-}
-#endif
-
-// Returns mul * x - sub
-template <size_t N>
-HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
-                                const Vec128<float, N> x,
-                                const Vec128<float, N> sub) {
-  return MulAdd(mul, x, Neg(sub));
-}
-
-// Returns -mul * x - sub
-template <size_t N>
-HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> sub) {
-  return Neg(MulAdd(mul, x, sub));
-}
-
-#if HWY_ARCH_ARM_A64
-template <size_t N>
-HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
-                                 const Vec128<double, N> x,
-                                 const Vec128<double, N> sub) {
-  return MulAdd(mul, x, Neg(sub));
-}
-template <size_t N>
-HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
-                                    const Vec128<double, N> x,
-                                    const Vec128<double, N> sub) {
-  return Neg(MulAdd(mul, x, sub));
-}
-#endif
-
-// ------------------------------ Floating-point square root (IfThenZeroElse)
-
-// Approximate reciprocal square root
-HWY_API Vec128<float> ApproximateReciprocalSqrt(const Vec128<float> v) {
-  return Vec128<float>(vrsqrteq_f32(v.raw));
-}
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
-  return Vec128<float, N>(vrsqrte_f32(v.raw));
-}
-
-// Full precision square root
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
-#else
-namespace detail {
-
-HWY_INLINE Vec128<float> ReciprocalSqrtStep(const Vec128<float> root,
-                                            const Vec128<float> recip) {
-  return Vec128<float>(vrsqrtsq_f32(root.raw, recip.raw));
-}
-template <size_t N>
-HWY_INLINE Vec128<float, N> ReciprocalSqrtStep(const Vec128<float, N> root,
-                                               Vec128<float, N> recip) {
-  return Vec128<float, N>(vrsqrts_f32(root.raw, recip.raw));
-}
-
-}  // namespace detail
-
-// Not defined on armv7: approximate
-template <size_t N>
-HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
-  auto recip = ApproximateReciprocalSqrt(v);
-
-  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
-  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
-  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
-
-  const auto root = v * recip;
-  return IfThenZeroElse(v == Zero(Simd<float, N, 0>()), root);
-}
-#endif
-
-// ================================================== LOGICAL
-
-// ------------------------------ Not
-
-// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION.
-template <typename T>
-HWY_API Vec128<T> Not(const Vec128<T> v) {
-  const Full128<T> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  using V8 = decltype(Zero(d8));
-  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
-}
-
-// ------------------------------ And
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)
-
-// Uses the u32/64 defined above.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, BitCast(du, a) & BitCast(du, b));
-}
-
-// ------------------------------ AndNot
-
-namespace detail {
-// reversed_andnot returns a & ~b.
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
-}  // namespace detail
-
-// Returns ~not_mask & mask.
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
-                            const Vec128<T, N> mask) {
-  return detail::reversed_andnot(mask, not_mask);
-}
-
-// Uses the u32/64 defined above.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
-                            const Vec128<T, N> mask) {
-  const DFromV<decltype(mask)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  VFromD<decltype(du)> ret =
-      detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
-  return BitCast(d, ret);
-}
-
-// ------------------------------ Or
-
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)
-
-// Uses the u32/64 defined above.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, BitCast(du, a) | BitCast(du, b));
-}
-
-// ------------------------------ Xor
-
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)
-
-// Uses the u32/64 defined above.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
-}
-
-// ------------------------------ Or3
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
-  return Or(o1, Or(o2, o3));
-}
-
-// ------------------------------ OrAnd
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
-  return Or(o, And(a1, a2));
-}
-
-// ------------------------------ IfVecThenElse
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return IfThenElse(MaskFromVec(mask), yes, no);
-}
-
-// ------------------------------ Operator overloads (internal-only if float)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return And(a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Or(a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ PopulationCount
-
-#ifdef HWY_NATIVE_POPCNT
-#undef HWY_NATIVE_POPCNT
-#else
-#define HWY_NATIVE_POPCNT
-#endif
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) {
-  const Full128<uint8_t> d8;
-  return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
-                                        Vec128<T, N> v) {
-  const Simd<uint8_t, N, 0> d8;
-  return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
-}
-
-// ARM lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
-template <typename T>
-HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) {
-  const Full128<uint8_t> d8;
-  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
-  return Vec128<T>(vpaddlq_u8(bytes));
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
-                                        Vec128<T, N> v) {
-  const Repartition<uint8_t, Simd<T, N, 0>> d8;
-  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
-  return Vec128<T, N>(vpaddl_u8(bytes));
-}
-
-template <typename T>
-HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) {
-  const Full128<uint8_t> d8;
-  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
-  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
-                                        Vec128<T, N> v) {
-  const Repartition<uint8_t, Simd<T, N, 0>> d8;
-  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
-  return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
-}
-
-template <typename T>
-HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) {
-  const Full128<uint8_t> d8;
-  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
-  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
-                                        Vec128<T, N> v) {
-  const Repartition<uint8_t, Simd<T, N, 0>> d8;
-  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
-  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
-  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-// ================================================== SIGN
-
-// ------------------------------ Abs
-
-// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
-HWY_API Vec128<int8_t> Abs(const Vec128<int8_t> v) {
-  return Vec128<int8_t>(vabsq_s8(v.raw));
-}
-HWY_API Vec128<int16_t> Abs(const Vec128<int16_t> v) {
-  return Vec128<int16_t>(vabsq_s16(v.raw));
-}
-HWY_API Vec128<int32_t> Abs(const Vec128<int32_t> v) {
-  return Vec128<int32_t>(vabsq_s32(v.raw));
-}
-// i64 is implemented after BroadcastSignBit.
-HWY_API Vec128<float> Abs(const Vec128<float> v) {
-  return Vec128<float>(vabsq_f32(v.raw));
-}
-
-template <size_t N, HWY_IF_LE64(int8_t, N)>
-HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
-  return Vec128<int8_t, N>(vabs_s8(v.raw));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>(vabs_s16(v.raw));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>(vabs_s32(v.raw));
-}
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
-  return Vec128<float, N>(vabs_f32(v.raw));
-}
-
-#if HWY_ARCH_ARM_A64
-HWY_API Vec128<double> Abs(const Vec128<double> v) {
-  return Vec128<double>(vabsq_f64(v.raw));
-}
-
-HWY_API Vec64<double> Abs(const Vec64<double> v) {
-  return Vec64<double>(vabs_f64(v.raw));
-}
-#endif
-
-// ------------------------------ CopySign
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
-                              const Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  const auto msb = SignBit(Simd<T, N, 0>());
-  return Or(AndNot(msb, magn), And(msb, sign));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
-                                   const Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
-}
-
-// ------------------------------ BroadcastSignBit
-
-template <typename T, size_t N, HWY_IF_SIGNED(T)>
-HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
-  return ShiftRight<sizeof(T) * 8 - 1>(v);
-}
-
-// ================================================== MASK
-
-// ------------------------------ To/from vector
-
-// Mask and Vec have the same representation (true = FF..FF).
-template <typename T, size_t N>
-HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
-  const Simd<MakeUnsigned<T>, N, 0> du;
-  return Mask128<T, N>(BitCast(du, v).raw);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> d, const Mask128<T, N> v) {
-  return BitCast(d, Vec128<MakeUnsigned<T>, N>(v.raw));
-}
-
-// ------------------------------ RebindMask
-
-template <typename TFrom, typename TTo, size_t N>
-HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> dto, Mask128<TFrom, N> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return MaskFromVec(BitCast(dto, VecFromMask(Simd<TFrom, N, 0>(), m)));
-}
-
-// ------------------------------ IfThenElse(mask, yes, no) = mask ? b : a.
-
-#define HWY_NEON_BUILD_TPL_HWY_IF
-#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
-#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size)                         \
-  const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
-      const Vec128<type##_t, size> no
-#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw
-
-HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)
-
-#undef HWY_NEON_BUILD_TPL_HWY_IF
-#undef HWY_NEON_BUILD_RET_HWY_IF
-#undef HWY_NEON_BUILD_PARAM_HWY_IF
-#undef HWY_NEON_BUILD_ARG_HWY_IF
-
-// mask ? yes : 0
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
-                                    const Vec128<T, N> yes) {
-  return yes & VecFromMask(Simd<T, N, 0>(), mask);
-}
-
-// mask ? 0 : no
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
-                                    const Vec128<T, N> no) {
-  return AndNot(VecFromMask(Simd<T, N, 0>(), mask), no);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
-                                        Vec128<T, N> no) {
-  static_assert(IsSigned<T>(), "Only works for signed/float");
-  const Simd<T, N, 0> d;
-  const RebindToSigned<decltype(d)> di;
-
-  Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
-  return IfThenElse(m, yes, no);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  const auto zero = Zero(Simd<T, N, 0>());
-  return Max(zero, v);
-}
-
-// ------------------------------ Mask logical
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
-  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-// ================================================== COMPARE
-
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
-
-// ------------------------------ Shuffle2301 (for i64 compares)
-
-// Swap 32-bit halves in 64-bits
-HWY_API Vec64<uint32_t> Shuffle2301(const Vec64<uint32_t> v) {
-  return Vec64<uint32_t>(vrev64_u32(v.raw));
-}
-HWY_API Vec64<int32_t> Shuffle2301(const Vec64<int32_t> v) {
-  return Vec64<int32_t>(vrev64_s32(v.raw));
-}
-HWY_API Vec64<float> Shuffle2301(const Vec64<float> v) {
-  return Vec64<float>(vrev64_f32(v.raw));
-}
-HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>(vrev64q_u32(v.raw));
-}
-HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
-  return Vec128<int32_t>(vrev64q_s32(v.raw));
-}
-HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
-  return Vec128<float>(vrev64q_f32(v.raw));
-}
-
-#define HWY_NEON_BUILD_TPL_HWY_COMPARE
-#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
-#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
-  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
-#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw
-
-// ------------------------------ Equality
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
-#else
-// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
-#endif
-
-// ------------------------------ Strict inequality (signed, float)
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
-#else
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
-#endif
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)
-
-// ------------------------------ Weak inequality (float)
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)
-
-#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
-#undef HWY_NEON_BUILD_RET_HWY_COMPARE
-#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
-#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
-
-// ------------------------------ ARMv7 i64 compare (Shuffle2301, Eq)
-
-#if HWY_ARCH_ARM_V7
-
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
-                                       const Vec128<int64_t, N> b) {
-  const Simd<int32_t, N * 2, 0> d32;
-  const Simd<int64_t, N, 0> d64;
-  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
-  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
-  return MaskFromVec(BitCast(d64, cmp64));
-}
-
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
-                                        const Vec128<uint64_t, N> b) {
-  const Simd<uint32_t, N * 2, 0> d32;
-  const Simd<uint64_t, N, 0> d64;
-  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
-  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
-  return MaskFromVec(BitCast(d64, cmp64));
-}
-
-HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
-                                   const Vec128<int64_t> b) {
-  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
-  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
-}
-HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
-                                      const Vec64<int64_t> b) {
-  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
-  return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
-}
-
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
-                                       const Vec128<uint64_t, N> b) {
-  const DFromV<decltype(a)> du;
-  const RebindToSigned<decltype(du)> di;
-  const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
-  return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
-}
-
-#endif
-
-// ------------------------------ operator!= (operator==)
-
-// Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
-#pragma push_macro("HWY_NEON_DEF_FUNCTION")
-#undef HWY_NEON_DEF_FUNCTION
-// This cannot have _any_ template argument (in x86_128 we can at least have N
-// as an argument), otherwise it is not more specialized than rewritten
-// operator== in C++20, leading to compile errors.
-#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
-  HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a,             \
-                                       Vec128<type##_t, size> b) {           \
-    return Not(a == b);                                                      \
-  }
-
-HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)
-
-#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
-
-// ------------------------------ Reversed comparisons
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
-  return operator<(b, a);
-}
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
-  return operator<=(b, a);
-}
-
-// ------------------------------ FirstN (Iota, Lt)
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
-  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
-  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
-}
-
-// ------------------------------ TestBit (Eq)
-
-#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
-#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
-#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
-  Vec128<type##_t, size> v, Vec128<type##_t, size> bit
-#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw
-
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
-#else
-// No 64-bit versions on armv7
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
-
-template <size_t N>
-HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
-                                     Vec128<uint64_t, N> bit) {
-  return (v & bit) == bit;
-}
-template <size_t N>
-HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
-                                    Vec128<int64_t, N> bit) {
-  return (v & bit) == bit;
-}
-
-#endif
-#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
-#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
-#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
-#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
-
-// ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit)
-HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
-#if HWY_ARCH_ARM_A64
-  return Vec128<int64_t>(vabsq_s64(v.raw));
-#else
-  const auto zero = Zero(Full128<int64_t>());
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
-#if HWY_ARCH_ARM_A64
-  return Vec64<int64_t>(vabs_s64(v.raw));
-#else
-  const auto zero = Zero(Full64<int64_t>());
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-
-// ------------------------------ Min (IfThenElse, BroadcastSignBit)
-
-// Unsigned
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)
-
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
-                                const Vec128<uint64_t, N> b) {
-#if HWY_ARCH_ARM_A64
-  return IfThenElse(b < a, b, a);
-#else
-  const DFromV<decltype(a)> du;
-  const RebindToSigned<decltype(du)> di;
-  return BitCast(du, BitCast(di, a) - BitCast(di, detail::SaturatedSub(a, b)));
-#endif
-}
-
-// Signed
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)
-
-template <size_t N>
-HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
-                               const Vec128<int64_t, N> b) {
-#if HWY_ARCH_ARM_A64
-  return IfThenElse(b < a, b, a);
-#else
-  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
-  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
-#endif
-}
-
-// Float: IEEE minimumNumber on v8, otherwise NaN if any is NaN.
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vminnm, _, 2)
-#else
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
-#endif
-
-// ------------------------------ Max (IfThenElse, BroadcastSignBit)
-
-// Unsigned (no u64)
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)
-
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
-                                const Vec128<uint64_t, N> b) {
-#if HWY_ARCH_ARM_A64
-  return IfThenElse(b < a, a, b);
-#else
-  const DFromV<decltype(a)> du;
-  const RebindToSigned<decltype(du)> di;
-  return BitCast(du, BitCast(di, b) + BitCast(di, detail::SaturatedSub(a, b)));
-#endif
-}
-
-// Signed (no i64)
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)
-
-template <size_t N>
-HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
-                               const Vec128<int64_t, N> b) {
-#if HWY_ARCH_ARM_A64
-  return IfThenElse(b < a, a, b);
-#else
-  const Vec128<int64_t, N> sign = detail::SaturatedSub(a, b);
-  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
-#endif
-}
-
-// Float: IEEE maximumNumber on v8, otherwise NaN if any is NaN.
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmaxnm, _, 2)
-#else
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
-#endif
-
-// ================================================== MEMORY
-
-// ------------------------------ Load 128
-
-HWY_API Vec128<uint8_t> LoadU(Full128<uint8_t> /* tag */,
-                              const uint8_t* HWY_RESTRICT unaligned) {
-  return Vec128<uint8_t>(vld1q_u8(unaligned));
-}
-HWY_API Vec128<uint16_t> LoadU(Full128<uint16_t> /* tag */,
-                               const uint16_t* HWY_RESTRICT unaligned) {
-  return Vec128<uint16_t>(vld1q_u16(unaligned));
-}
-HWY_API Vec128<uint32_t> LoadU(Full128<uint32_t> /* tag */,
-                               const uint32_t* HWY_RESTRICT unaligned) {
-  return Vec128<uint32_t>(vld1q_u32(unaligned));
-}
-HWY_API Vec128<uint64_t> LoadU(Full128<uint64_t> /* tag */,
-                               const uint64_t* HWY_RESTRICT unaligned) {
-  return Vec128<uint64_t>(vld1q_u64(unaligned));
-}
-HWY_API Vec128<int8_t> LoadU(Full128<int8_t> /* tag */,
-                             const int8_t* HWY_RESTRICT unaligned) {
-  return Vec128<int8_t>(vld1q_s8(unaligned));
-}
-HWY_API Vec128<int16_t> LoadU(Full128<int16_t> /* tag */,
-                              const int16_t* HWY_RESTRICT unaligned) {
-  return Vec128<int16_t>(vld1q_s16(unaligned));
-}
-HWY_API Vec128<int32_t> LoadU(Full128<int32_t> /* tag */,
-                              const int32_t* HWY_RESTRICT unaligned) {
-  return Vec128<int32_t>(vld1q_s32(unaligned));
-}
-HWY_API Vec128<int64_t> LoadU(Full128<int64_t> /* tag */,
-                              const int64_t* HWY_RESTRICT unaligned) {
-  return Vec128<int64_t>(vld1q_s64(unaligned));
-}
-HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
-                            const float* HWY_RESTRICT unaligned) {
-  return Vec128<float>(vld1q_f32(unaligned));
-}
-#if HWY_ARCH_ARM_A64
-HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
-                             const double* HWY_RESTRICT unaligned) {
-  return Vec128<double>(vld1q_f64(unaligned));
-}
-#endif
-
-// ------------------------------ Load 64
-
-HWY_API Vec64<uint8_t> LoadU(Full64<uint8_t> /* tag */,
-                             const uint8_t* HWY_RESTRICT p) {
-  return Vec64<uint8_t>(vld1_u8(p));
-}
-HWY_API Vec64<uint16_t> LoadU(Full64<uint16_t> /* tag */,
-                              const uint16_t* HWY_RESTRICT p) {
-  return Vec64<uint16_t>(vld1_u16(p));
-}
-HWY_API Vec64<uint32_t> LoadU(Full64<uint32_t> /* tag */,
-                              const uint32_t* HWY_RESTRICT p) {
-  return Vec64<uint32_t>(vld1_u32(p));
-}
-HWY_API Vec64<uint64_t> LoadU(Full64<uint64_t> /* tag */,
-                              const uint64_t* HWY_RESTRICT p) {
-  return Vec64<uint64_t>(vld1_u64(p));
-}
-HWY_API Vec64<int8_t> LoadU(Full64<int8_t> /* tag */,
-                            const int8_t* HWY_RESTRICT p) {
-  return Vec64<int8_t>(vld1_s8(p));
-}
-HWY_API Vec64<int16_t> LoadU(Full64<int16_t> /* tag */,
-                             const int16_t* HWY_RESTRICT p) {
-  return Vec64<int16_t>(vld1_s16(p));
-}
-HWY_API Vec64<int32_t> LoadU(Full64<int32_t> /* tag */,
-                             const int32_t* HWY_RESTRICT p) {
-  return Vec64<int32_t>(vld1_s32(p));
-}
-HWY_API Vec64<int64_t> LoadU(Full64<int64_t> /* tag */,
-                             const int64_t* HWY_RESTRICT p) {
-  return Vec64<int64_t>(vld1_s64(p));
-}
-HWY_API Vec64<float> LoadU(Full64<float> /* tag */,
-                           const float* HWY_RESTRICT p) {
-  return Vec64<float>(vld1_f32(p));
-}
-#if HWY_ARCH_ARM_A64
-HWY_API Vec64<double> LoadU(Full64<double> /* tag */,
-                            const double* HWY_RESTRICT p) {
-  return Vec64<double>(vld1_f64(p));
-}
-#endif
-// ------------------------------ Load 32
-
-// Actual 32-bit broadcast load - used to implement the other lane types
-// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
-HWY_API Vec32<uint32_t> LoadU(Full32<uint32_t> /*tag*/,
-                              const uint32_t* HWY_RESTRICT p) {
-  return Vec32<uint32_t>(vld1_dup_u32(p));
-}
-HWY_API Vec32<int32_t> LoadU(Full32<int32_t> /*tag*/,
-                             const int32_t* HWY_RESTRICT p) {
-  return Vec32<int32_t>(vld1_dup_s32(p));
-}
-HWY_API Vec32<float> LoadU(Full32<float> /*tag*/, const float* HWY_RESTRICT p) {
-  return Vec32<float>(vld1_dup_f32(p));
-}
-
-template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
-HWY_API Vec32<T> LoadU(Full32<T> d, const T* HWY_RESTRICT p) {
-  const Repartition<uint32_t, decltype(d)> d32;
-  uint32_t buf;
-  CopyBytes<4>(p, &buf);
-  return BitCast(d, LoadU(d32, &buf));
-}
-
-// ------------------------------ Load 16
-
-// Actual 16-bit broadcast load - used to implement the other lane types
-// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
-HWY_API Vec128<uint16_t, 1> LoadU(Simd<uint16_t, 1, 0> /*tag*/,
-                                  const uint16_t* HWY_RESTRICT p) {
-  return Vec128<uint16_t, 1>(vld1_dup_u16(p));
-}
-HWY_API Vec128<int16_t, 1> LoadU(Simd<int16_t, 1, 0> /*tag*/,
-                                 const int16_t* HWY_RESTRICT p) {
-  return Vec128<int16_t, 1>(vld1_dup_s16(p));
-}
-
-template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
-HWY_API Vec128<T, 2> LoadU(Simd<T, 2, 0> d, const T* HWY_RESTRICT p) {
-  const Repartition<uint16_t, decltype(d)> d16;
-  uint16_t buf;
-  CopyBytes<2>(p, &buf);
-  return BitCast(d, LoadU(d16, &buf));
-}
-
-// ------------------------------ Load 8
-
-HWY_API Vec128<uint8_t, 1> LoadU(Simd<uint8_t, 1, 0>,
-                                 const uint8_t* HWY_RESTRICT p) {
-  return Vec128<uint8_t, 1>(vld1_dup_u8(p));
-}
-
-HWY_API Vec128<int8_t, 1> LoadU(Simd<int8_t, 1, 0>,
-                                const int8_t* HWY_RESTRICT p) {
-  return Vec128<int8_t, 1>(vld1_dup_s8(p));
-}
-
-// [b]float16_t use the same Raw as uint16_t, so forward to that.
-template <size_t N>
-HWY_API Vec128<float16_t, N> LoadU(Simd<float16_t, N, 0> d,
-                                   const float16_t* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)> du16;
-  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
-  return Vec128<float16_t, N>(LoadU(du16, pu16).raw);
-}
-template <size_t N>
-HWY_API Vec128<bfloat16_t, N> LoadU(Simd<bfloat16_t, N, 0> d,
-                                    const bfloat16_t* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)> du16;
-  const auto pu16 = reinterpret_cast<const uint16_t*>(p);
-  return Vec128<bfloat16_t, N>(LoadU(du16, pu16).raw);
-}
-
-// On ARM, Load is the same as LoadU.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Load(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
-  return LoadU(d, p);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
-                                const T* HWY_RESTRICT aligned) {
-  return IfThenElseZero(m, Load(d, aligned));
-}
-
-// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
-                                const T* const HWY_RESTRICT p) {
-  return LoadU(d, p);
-}
-
-// ------------------------------ Store 128
-
-HWY_API void StoreU(const Vec128<uint8_t> v, Full128<uint8_t> /* tag */,
-                    uint8_t* HWY_RESTRICT unaligned) {
-  vst1q_u8(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<uint16_t> v, Full128<uint16_t> /* tag */,
-                    uint16_t* HWY_RESTRICT unaligned) {
-  vst1q_u16(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<uint32_t> v, Full128<uint32_t> /* tag */,
-                    uint32_t* HWY_RESTRICT unaligned) {
-  vst1q_u32(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<uint64_t> v, Full128<uint64_t> /* tag */,
-                    uint64_t* HWY_RESTRICT unaligned) {
-  vst1q_u64(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<int8_t> v, Full128<int8_t> /* tag */,
-                    int8_t* HWY_RESTRICT unaligned) {
-  vst1q_s8(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<int16_t> v, Full128<int16_t> /* tag */,
-                    int16_t* HWY_RESTRICT unaligned) {
-  vst1q_s16(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<int32_t> v, Full128<int32_t> /* tag */,
-                    int32_t* HWY_RESTRICT unaligned) {
-  vst1q_s32(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<int64_t> v, Full128<int64_t> /* tag */,
-                    int64_t* HWY_RESTRICT unaligned) {
-  vst1q_s64(unaligned, v.raw);
-}
-HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
-                    float* HWY_RESTRICT unaligned) {
-  vst1q_f32(unaligned, v.raw);
-}
-#if HWY_ARCH_ARM_A64
-HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
-                    double* HWY_RESTRICT unaligned) {
-  vst1q_f64(unaligned, v.raw);
-}
-#endif
-
-// ------------------------------ Store 64
-
-HWY_API void StoreU(const Vec64<uint8_t> v, Full64<uint8_t> /* tag */,
-                    uint8_t* HWY_RESTRICT p) {
-  vst1_u8(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<uint16_t> v, Full64<uint16_t> /* tag */,
-                    uint16_t* HWY_RESTRICT p) {
-  vst1_u16(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<uint32_t> v, Full64<uint32_t> /* tag */,
-                    uint32_t* HWY_RESTRICT p) {
-  vst1_u32(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<uint64_t> v, Full64<uint64_t> /* tag */,
-                    uint64_t* HWY_RESTRICT p) {
-  vst1_u64(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<int8_t> v, Full64<int8_t> /* tag */,
-                    int8_t* HWY_RESTRICT p) {
-  vst1_s8(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<int16_t> v, Full64<int16_t> /* tag */,
-                    int16_t* HWY_RESTRICT p) {
-  vst1_s16(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<int32_t> v, Full64<int32_t> /* tag */,
-                    int32_t* HWY_RESTRICT p) {
-  vst1_s32(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<int64_t> v, Full64<int64_t> /* tag */,
-                    int64_t* HWY_RESTRICT p) {
-  vst1_s64(p, v.raw);
-}
-HWY_API void StoreU(const Vec64<float> v, Full64<float> /* tag */,
-                    float* HWY_RESTRICT p) {
-  vst1_f32(p, v.raw);
-}
-#if HWY_ARCH_ARM_A64
-HWY_API void StoreU(const Vec64<double> v, Full64<double> /* tag */,
-                    double* HWY_RESTRICT p) {
-  vst1_f64(p, v.raw);
-}
-#endif
-
-// ------------------------------ Store 32
-
-HWY_API void StoreU(const Vec32<uint32_t> v, Full32<uint32_t>,
-                    uint32_t* HWY_RESTRICT p) {
-  vst1_lane_u32(p, v.raw, 0);
-}
-HWY_API void StoreU(const Vec32<int32_t> v, Full32<int32_t>,
-                    int32_t* HWY_RESTRICT p) {
-  vst1_lane_s32(p, v.raw, 0);
-}
-HWY_API void StoreU(const Vec32<float> v, Full32<float>,
-                    float* HWY_RESTRICT p) {
-  vst1_lane_f32(p, v.raw, 0);
-}
-
-template <typename T, HWY_IF_LANE_SIZE_LT(T, 4)>
-HWY_API void StoreU(const Vec32<T> v, Full32<T> d, T* HWY_RESTRICT p) {
-  const Repartition<uint32_t, decltype(d)> d32;
-  const uint32_t buf = GetLane(BitCast(d32, v));
-  CopyBytes<4>(&buf, p);
-}
-
-// ------------------------------ Store 16
-
-HWY_API void StoreU(const Vec128<uint16_t, 1> v, Simd<uint16_t, 1, 0>,
-                    uint16_t* HWY_RESTRICT p) {
-  vst1_lane_u16(p, v.raw, 0);
-}
-HWY_API void StoreU(const Vec128<int16_t, 1> v, Simd<int16_t, 1, 0>,
-                    int16_t* HWY_RESTRICT p) {
-  vst1_lane_s16(p, v.raw, 0);
-}
-
-template <typename T, HWY_IF_LANE_SIZE_LT(T, 2)>
-HWY_API void StoreU(const Vec128<T, 2> v, Simd<T, 2, 0> d, T* HWY_RESTRICT p) {
-  const Repartition<uint16_t, decltype(d)> d16;
-  const uint16_t buf = GetLane(BitCast(d16, v));
-  CopyBytes<2>(&buf, p);
-}
-
-// ------------------------------ Store 8
-
-HWY_API void StoreU(const Vec128<uint8_t, 1> v, Simd<uint8_t, 1, 0>,
-                    uint8_t* HWY_RESTRICT p) {
-  vst1_lane_u8(p, v.raw, 0);
-}
-HWY_API void StoreU(const Vec128<int8_t, 1> v, Simd<int8_t, 1, 0>,
-                    int8_t* HWY_RESTRICT p) {
-  vst1_lane_s8(p, v.raw, 0);
-}
-
-// [b]float16_t use the same Raw as uint16_t, so forward to that.
-template <size_t N>
-HWY_API void StoreU(Vec128<float16_t, N> v, Simd<float16_t, N, 0> d,
-                    float16_t* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)> du16;
-  const auto pu16 = reinterpret_cast<uint16_t*>(p);
-  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
-}
-template <size_t N>
-HWY_API void StoreU(Vec128<bfloat16_t, N> v, Simd<bfloat16_t, N, 0> d,
-                    bfloat16_t* HWY_RESTRICT p) {
-  const RebindToUnsigned<decltype(d)> du16;
-  const auto pu16 = reinterpret_cast<uint16_t*>(p);
-  return StoreU(Vec128<uint16_t, N>(v.raw), du16, pu16);
-}
-
-// On ARM, Store is the same as StoreU.
-template <typename T, size_t N>
-HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT aligned) {
-  StoreU(v, d, aligned);
-}
-
-template <typename T, size_t N>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
-                          T* HWY_RESTRICT p) {
-  // Treat as unsigned so that we correctly support float16.
-  const RebindToUnsigned<decltype(d)> du;
-  const auto blended =
-      IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
-  StoreU(BitCast(d, blended), d, p);
-}
-
-// ------------------------------ Non-temporal stores
-
-// Same as aligned stores on non-x86.
-
-template <typename T, size_t N>
-HWY_API void Stream(const Vec128<T, N> v, Simd<T, N, 0> d,
-                    T* HWY_RESTRICT aligned) {
-  Store(v, d, aligned);
-}
-
-// ================================================== CONVERT
-
-// ------------------------------ Promotions (part w/ narrow lanes -> full)
-
-// Unsigned: zero-extend to full vector.
-HWY_API Vec128<uint16_t> PromoteTo(Full128<uint16_t> /* tag */,
-                                   const Vec64<uint8_t> v) {
-  return Vec128<uint16_t>(vmovl_u8(v.raw));
-}
-HWY_API Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
-                                   const Vec32<uint8_t> v) {
-  uint16x8_t a = vmovl_u8(v.raw);
-  return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
-}
-HWY_API Vec128<uint32_t> PromoteTo(Full128<uint32_t> /* tag */,
-                                   const Vec64<uint16_t> v) {
-  return Vec128<uint32_t>(vmovl_u16(v.raw));
-}
-HWY_API Vec128<uint64_t> PromoteTo(Full128<uint64_t> /* tag */,
-                                   const Vec64<uint32_t> v) {
-  return Vec128<uint64_t>(vmovl_u32(v.raw));
-}
-HWY_API Vec128<int16_t> PromoteTo(Full128<int16_t> d, const Vec64<uint8_t> v) {
-  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
-}
-HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> d, const Vec32<uint8_t> v) {
-  uint16x8_t a = vmovl_u8(v.raw);
-  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
-}
-HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> d, const Vec64<uint16_t> v) {
-  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
-}
-
-// Unsigned: zero-extend to half vector.
-template <size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
-                                      const Vec128<uint8_t, N> v) {
-  return Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw)));
-}
-template <size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
-                                      const Vec128<uint8_t, N> v) {
-  uint16x8_t a = vmovl_u8(v.raw);
-  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(vget_low_u16(a))));
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
-                                      const Vec128<uint16_t, N> v) {
-  return Vec128<uint32_t, N>(vget_low_u32(vmovl_u16(v.raw)));
-}
-template <size_t N, HWY_IF_LE64(uint64_t, N)>
-HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
-                                      const Vec128<uint32_t, N> v) {
-  return Vec128<uint64_t, N>(vget_low_u64(vmovl_u32(v.raw)));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> d,
-                                     const Vec128<uint8_t, N> v) {
-  return BitCast(d, Vec128<uint16_t, N>(vget_low_u16(vmovl_u8(v.raw))));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<uint8_t, N> v) {
-  uint16x8_t a = vmovl_u8(v.raw);
-  uint32x4_t b = vmovl_u16(vget_low_u16(a));
-  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(b)));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<uint16_t, N> v) {
-  uint32x4_t a = vmovl_u16(v.raw);
-  return Vec128<int32_t, N>(vget_low_s32(vreinterpretq_s32_u32(a)));
-}
-
-// Signed: replicate sign bit to full vector.
-HWY_API Vec128<int16_t> PromoteTo(Full128<int16_t> /* tag */,
-                                  const Vec64<int8_t> v) {
-  return Vec128<int16_t>(vmovl_s8(v.raw));
-}
-HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
-                                  const Vec32<int8_t> v) {
-  int16x8_t a = vmovl_s8(v.raw);
-  return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
-}
-HWY_API Vec128<int32_t> PromoteTo(Full128<int32_t> /* tag */,
-                                  const Vec64<int16_t> v) {
-  return Vec128<int32_t>(vmovl_s16(v.raw));
-}
-HWY_API Vec128<int64_t> PromoteTo(Full128<int64_t> /* tag */,
-                                  const Vec64<int32_t> v) {
-  return Vec128<int64_t>(vmovl_s32(v.raw));
-}
-
-// Signed: replicate sign bit to half vector.
-template <size_t N>
-HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
-                                     const Vec128<int8_t, N> v) {
-  return Vec128<int16_t, N>(vget_low_s16(vmovl_s8(v.raw)));
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<int8_t, N> v) {
-  int16x8_t a = vmovl_s8(v.raw);
-  int32x4_t b = vmovl_s16(vget_low_s16(a));
-  return Vec128<int32_t, N>(vget_low_s32(b));
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<int16_t, N> v) {
-  return Vec128<int32_t, N>(vget_low_s32(vmovl_s16(v.raw)));
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
-                                     const Vec128<int32_t, N> v) {
-  return Vec128<int64_t, N>(vget_low_s64(vmovl_s32(v.raw)));
-}
-
-#if __ARM_FP & 2
-
-HWY_API Vec128<float> PromoteTo(Full128<float> /* tag */,
-                                const Vec128<float16_t, 4> v) {
-  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
-  return Vec128<float>(f32);
-}
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<float16_t, N> v) {
-  const float32x4_t f32 = vcvt_f32_f16(vreinterpret_f16_u16(v.raw));
-  return Vec128<float, N>(vget_low_f32(f32));
-}
-
-#else
-
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
-                                   const Vec128<float16_t, N> v) {
-  const RebindToSigned<decltype(df32)> di32;
-  const RebindToUnsigned<decltype(df32)> du32;
-  // Expand to u32 so we can shift.
-  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
-  const auto sign = ShiftRight<15>(bits16);
-  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
-  const auto mantissa = bits16 & Set(du32, 0x3FF);
-  const auto subnormal =
-      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
-                        Set(df32, 1.0f / 16384 / 1024));
-
-  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
-  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
-  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
-  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
-  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
-}
-
-#endif
-
-#if HWY_ARCH_ARM_A64
-
-HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
-                                 const Vec64<float> v) {
-  return Vec128<double>(vcvt_f64_f32(v.raw));
-}
-
-HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
-                                const Vec32<float> v) {
-  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
-}
-
-HWY_API Vec128<double> PromoteTo(Full128<double> /* tag */,
-                                 const Vec64<int32_t> v) {
-  const int64x2_t i64 = vmovl_s32(v.raw);
-  return Vec128<double>(vcvtq_f64_s64(i64));
-}
-
-HWY_API Vec64<double> PromoteTo(Full64<double> /* tag */,
-                                const Vec32<int32_t> v) {
-  const int64x1_t i64 = vget_low_s64(vmovl_s32(v.raw));
-  return Vec64<double>(vcvt_f64_s64(i64));
-}
-
-#endif
-
-// ------------------------------ Demotions (full -> part w/ narrow lanes)
-
-// From full vector to half or quarter
-HWY_API Vec64<uint16_t> DemoteTo(Full64<uint16_t> /* tag */,
-                                 const Vec128<int32_t> v) {
-  return Vec64<uint16_t>(vqmovun_s32(v.raw));
-}
-HWY_API Vec64<int16_t> DemoteTo(Full64<int16_t> /* tag */,
-                                const Vec128<int32_t> v) {
-  return Vec64<int16_t>(vqmovn_s32(v.raw));
-}
-HWY_API Vec32<uint8_t> DemoteTo(Full32<uint8_t> /* tag */,
-                                const Vec128<int32_t> v) {
-  const uint16x4_t a = vqmovun_s32(v.raw);
-  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
-}
-HWY_API Vec64<uint8_t> DemoteTo(Full64<uint8_t> /* tag */,
-                                const Vec128<int16_t> v) {
-  return Vec64<uint8_t>(vqmovun_s16(v.raw));
-}
-HWY_API Vec32<int8_t> DemoteTo(Full32<int8_t> /* tag */,
-                               const Vec128<int32_t> v) {
-  const int16x4_t a = vqmovn_s32(v.raw);
-  return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
-}
-HWY_API Vec64<int8_t> DemoteTo(Full64<int8_t> /* tag */,
-                               const Vec128<int16_t> v) {
-  return Vec64<int8_t>(vqmovn_s16(v.raw));
-}
-
-// From half vector to partial half
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
-                                     const Vec128<int32_t, N> v) {
-  return Vec128<uint16_t, N>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  return Vec128<int16_t, N>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
-  return Vec128<uint8_t, N>(vqmovn_u16(vcombine_u16(a, a)));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
-                                    const Vec128<int16_t, N> v) {
-  return Vec128<uint8_t, N>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
-                                   const Vec128<int32_t, N> v) {
-  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
-  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(a, a)));
-}
-template <size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
-                                   const Vec128<int16_t, N> v) {
-  return Vec128<int8_t, N>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
-}
-
-#if __ARM_FP & 2
-
-HWY_API Vec128<float16_t, 4> DemoteTo(Full64<float16_t> /* tag */,
-                                      const Vec128<float> v) {
-  return Vec128<float16_t, 4>{vreinterpret_u16_f16(vcvt_f16_f32(v.raw))};
-}
-template <size_t N>
-HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
-                                      const Vec128<float, N> v) {
-  const float16x4_t f16 = vcvt_f16_f32(vcombine_f32(v.raw, v.raw));
-  return Vec128<float16_t, N>(vreinterpret_u16_f16(f16));
-}
-
-#else
-
-template <size_t N>
-HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
-                                      const Vec128<float, N> v) {
-  const RebindToUnsigned<decltype(df16)> du16;
-  const Rebind<uint32_t, decltype(du16)> du;
-  const RebindToSigned<decltype(du)> di;
-  const auto bits32 = BitCast(du, v);
-  const auto sign = ShiftRight<31>(bits32);
-  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
-  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
-
-  const auto k15 = Set(di, 15);
-  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
-  const auto is_tiny = exp < Set(di, -24);
-
-  const auto is_subnormal = exp < Set(di, -14);
-  const auto biased_exp16 =
-      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
-  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
-  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
-                     (mantissa32 >> (Set(du, 13) + sub_exp));
-  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
-                                     ShiftRight<13>(mantissa32));  // <1024
-
-  const auto sign16 = ShiftLeft<15>(sign);
-  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
-  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
-  return Vec128<float16_t, N>(DemoteTo(du16, bits16).raw);
-}
-
-#endif
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
-                                       const Vec128<float, N> v) {
-  const Rebind<int32_t, decltype(dbf16)> di32;
-  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
-  const Rebind<uint16_t, decltype(dbf16)> du16;
-  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
-  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
-}
-
-#if HWY_ARCH_ARM_A64
-
-HWY_API Vec64<float> DemoteTo(Full64<float> /* tag */, const Vec128<double> v) {
-  return Vec64<float>(vcvt_f32_f64(v.raw));
-}
-HWY_API Vec32<float> DemoteTo(Full32<float> /* tag */, const Vec64<double> v) {
-  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
-}
-
-HWY_API Vec64<int32_t> DemoteTo(Full64<int32_t> /* tag */,
-                                const Vec128<double> v) {
-  const int64x2_t i64 = vcvtq_s64_f64(v.raw);
-  return Vec64<int32_t>(vqmovn_s64(i64));
-}
-HWY_API Vec32<int32_t> DemoteTo(Full32<int32_t> /* tag */,
-                                const Vec64<double> v) {
-  const int64x1_t i64 = vcvt_s64_f64(v.raw);
-  // There is no i64x1 -> i32x1 narrow, so expand to int64x2_t first.
-  const int64x2_t i64x2 = vcombine_s64(i64, i64);
-  return Vec32<int32_t>(vqmovn_s64(i64x2));
-}
-
-#endif
-
-HWY_API Vec32<uint8_t> U8FromU32(const Vec128<uint32_t> v) {
-  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
-  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
-  return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
-}
-template <size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
-  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
-  const uint8x8_t w = vuzp1_u8(org_v, org_v);
-  return Vec128<uint8_t, N>(vuzp1_u8(w, w));
-}
-
-// In the following DemoteTo functions, |b| is purposely undefined.
-// The value a needs to be extended to 128 bits so that vqmovn can be
-// used and |b| is undefined so that no extra overhead is introduced.
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
-                                    const Vec128<int32_t> v) {
-  Vec128<uint16_t, N> a = DemoteTo(Simd<uint16_t, N, 0>(), v);
-  Vec128<uint16_t, N> b;
-  uint16x8_t c = vcombine_u16(a.raw, b.raw);
-  return Vec128<uint8_t, N>(vqmovn_u16(c));
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
-                                   const Vec128<int32_t> v) {
-  Vec128<int16_t, N> a = DemoteTo(Simd<int16_t, N, 0>(), v);
-  Vec128<int16_t, N> b;
-  int16x8_t c = vcombine_s16(a.raw, b.raw);
-  return Vec128<int8_t, N>(vqmovn_s16(c));
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// ------------------------------ Convert integer <=> floating-point
-
-HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
-                                const Vec128<int32_t> v) {
-  return Vec128<float>(vcvtq_f32_s32(v.raw));
-}
-template <size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<int32_t, N> v) {
-  return Vec128<float, N>(vcvt_f32_s32(v.raw));
-}
-
-HWY_API Vec128<float> ConvertTo(Full128<float> /* tag */,
-                                const Vec128<uint32_t> v) {
-  return Vec128<float>(vcvtq_f32_u32(v.raw));
-}
-template <size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<uint32_t, N> v) {
-  return Vec128<float, N>(vcvt_f32_u32(v.raw));
-}
-
-// Truncates (rounds toward zero).
-HWY_API Vec128<int32_t> ConvertTo(Full128<int32_t> /* tag */,
-                                  const Vec128<float> v) {
-  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
-}
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<float, N> v) {
-  return Vec128<int32_t, N>(vcvt_s32_f32(v.raw));
-}
-
-#if HWY_ARCH_ARM_A64
-
-HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
-                                 const Vec128<int64_t> v) {
-  return Vec128<double>(vcvtq_f64_s64(v.raw));
-}
-HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
-                                const Vec64<int64_t> v) {
-  return Vec64<double>(vcvt_f64_s64(v.raw));
-}
-
-HWY_API Vec128<double> ConvertTo(Full128<double> /* tag */,
-                                 const Vec128<uint64_t> v) {
-  return Vec128<double>(vcvtq_f64_u64(v.raw));
-}
-HWY_API Vec64<double> ConvertTo(Full64<double> /* tag */,
-                                const Vec64<uint64_t> v) {
-  return Vec64<double>(vcvt_f64_u64(v.raw));
-}
-
-// Truncates (rounds toward zero).
-HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> /* tag */,
-                                  const Vec128<double> v) {
-  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
-}
-HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> /* tag */,
-                                 const Vec64<double> v) {
-  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
-}
-
-#endif
-
-// ------------------------------ Round (IfThenElse, mask, logical)
-
-#if HWY_ARCH_ARM_A64
-// Toward nearest integer
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)
-
-// Toward zero, aka truncate
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)
-
-// Toward +infinity, aka ceiling
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)
-
-// Toward -infinity, aka floor
-HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
-#else
-
-// ------------------------------ Trunc
-
-// ARMv7 only supports truncation to integer. We can either convert back to
-// float (3 floating-point and 2 logic operations) or manipulate the binary32
-// representation, clearing the lowest 23-exp mantissa bits. This requires 9
-// integer operations and 3 constants, which is likely more expensive.
-
-namespace detail {
-
-// The original value is already the desired result if NaN or the magnitude is
-// large (i.e. the value is already an integer).
-template <size_t N>
-HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
-  return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
-}
-
-}  // namespace detail
-
-template <size_t N>
-HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
-  const DFromV<decltype(v)> df;
-  const RebindToSigned<decltype(df)> di;
-
-  const auto integer = ConvertTo(di, v);  // round toward 0
-  const auto int_f = ConvertTo(df, integer);
-
-  return IfThenElse(detail::UseInt(v), int_f, v);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
-  const DFromV<decltype(v)> df;
-
-  // ARMv7 also lacks a native NearestInt, but we can instead rely on rounding
-  // (we assume the current mode is nearest-even) after addition with a large
-  // value such that no mantissa bits remain. We may need a compiler flag for
-  // precise floating-point to prevent this from being "optimized" out.
-  const auto max = Set(df, MantissaEnd<float>());
-  const auto large = CopySignToAbs(max, v);
-  const auto added = large + v;
-  const auto rounded = added - large;
-
-  // Keep original if NaN or the magnitude is large (already an int).
-  return IfThenElse(Abs(v) < max, rounded, v);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
-  const DFromV<decltype(v)> df;
-  const RebindToSigned<decltype(df)> di;
-
-  const auto integer = ConvertTo(di, v);  // round toward 0
-  const auto int_f = ConvertTo(df, integer);
-
-  // Truncating a positive non-integer ends up smaller; if so, add 1.
-  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
-
-  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
-  const DFromV<decltype(v)> df;
-  const RebindToSigned<decltype(df)> di;
-
-  const auto integer = ConvertTo(di, v);  // round toward 0
-  const auto int_f = ConvertTo(df, integer);
-
-  // Truncating a negative non-integer ends up larger; if so, subtract 1.
-  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
-
-  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
-}
-
-#endif
-
-// ------------------------------ NearestInt (Round)
-
-#if HWY_ARCH_ARM_A64
-
-HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
-  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
-}
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
-  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
-}
-
-#else
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
-  const RebindToSigned<DFromV<decltype(v)>> di;
-  return ConvertTo(di, Round(v));
-}
-
-#endif
-
-// ------------------------------ Floating-point classification
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
-  return v != v;
-}
-
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
-  const Simd<T, N, 0> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
-// ================================================== SWIZZLE
-
-// ------------------------------ LowerHalf
-
-// <= 64 bit: just return different type
-template <typename T, size_t N, HWY_IF_LE64(uint8_t, N)>
-HWY_API Vec128<T, N / 2> LowerHalf(const Vec128<T, N> v) {
-  return Vec128<T, N / 2>(v.raw);
-}
-
-HWY_API Vec64<uint8_t> LowerHalf(const Vec128<uint8_t> v) {
-  return Vec64<uint8_t>(vget_low_u8(v.raw));
-}
-HWY_API Vec64<uint16_t> LowerHalf(const Vec128<uint16_t> v) {
-  return Vec64<uint16_t>(vget_low_u16(v.raw));
-}
-HWY_API Vec64<uint32_t> LowerHalf(const Vec128<uint32_t> v) {
-  return Vec64<uint32_t>(vget_low_u32(v.raw));
-}
-HWY_API Vec64<uint64_t> LowerHalf(const Vec128<uint64_t> v) {
-  return Vec64<uint64_t>(vget_low_u64(v.raw));
-}
-HWY_API Vec64<int8_t> LowerHalf(const Vec128<int8_t> v) {
-  return Vec64<int8_t>(vget_low_s8(v.raw));
-}
-HWY_API Vec64<int16_t> LowerHalf(const Vec128<int16_t> v) {
-  return Vec64<int16_t>(vget_low_s16(v.raw));
-}
-HWY_API Vec64<int32_t> LowerHalf(const Vec128<int32_t> v) {
-  return Vec64<int32_t>(vget_low_s32(v.raw));
-}
-HWY_API Vec64<int64_t> LowerHalf(const Vec128<int64_t> v) {
-  return Vec64<int64_t>(vget_low_s64(v.raw));
-}
-HWY_API Vec64<float> LowerHalf(const Vec128<float> v) {
-  return Vec64<float>(vget_low_f32(v.raw));
-}
-#if HWY_ARCH_ARM_A64
-HWY_API Vec64<double> LowerHalf(const Vec128<double> v) {
-  return Vec64<double>(vget_low_f64(v.raw));
-}
-#endif
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
-                                   Vec128<T, N> v) {
-  return LowerHalf(v);
-}
-
-// ------------------------------ CombineShiftRightBytes
-
-// 128-bit
-template <int kBytes, typename T, class V128 = Vec128<T>>
-HWY_API V128 CombineShiftRightBytes(Full128<T> d, V128 hi, V128 lo) {
-  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
-  const Repartition<uint8_t, decltype(d)> d8;
-  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
-  return BitCast(d, Vec128<uint8_t>(v8));
-}
-
-// 64-bit
-template <int kBytes, typename T>
-HWY_API Vec64<T> CombineShiftRightBytes(Full64<T> d, Vec64<T> hi, Vec64<T> lo) {
-  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
-  const Repartition<uint8_t, decltype(d)> d8;
-  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
-  return BitCast(d, VFromD<decltype(d8)>(v8));
-}
-
-// <= 32-bit defined after ShiftLeftBytes.
-
-// ------------------------------ Shift vector by constant #bytes
-
-namespace detail {
-
-// Partially specialize because kBytes = 0 and >= size are compile errors;
-// callers replace the latter with 0xFF for easier specialization.
-template <int kBytes>
-struct ShiftLeftBytesT {
-  // Full
-  template <class T>
-  HWY_INLINE Vec128<T> operator()(const Vec128<T> v) {
-    const Full128<T> d;
-    return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
-  }
-
-  // Partial
-  template <class T, size_t N, HWY_IF_LE64(T, N)>
-  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
-    // Expand to 64-bit so we only use the native EXT instruction.
-    const Full64<T> d64;
-    const auto zero64 = Zero(d64);
-    const decltype(zero64) v64(v.raw);
-    return Vec128<T, N>(
-        CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
-  }
-};
-template <>
-struct ShiftLeftBytesT<0> {
-  template <class T, size_t N>
-  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
-    return v;
-  }
-};
-template <>
-struct ShiftLeftBytesT<0xFF> {
-  template <class T, size_t N>
-  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> /* v */) {
-    return Zero(Simd<T, N, 0>());
-  }
-};
-
-template <int kBytes>
-struct ShiftRightBytesT {
-  template <class T, size_t N>
-  HWY_INLINE Vec128<T, N> operator()(Vec128<T, N> v) {
-    const Simd<T, N, 0> d;
-    // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
-    if (N * sizeof(T) < 8) {
-      constexpr size_t kReg = N * sizeof(T) == 16 ? 16 : 8;
-      const Simd<T, kReg / sizeof(T), 0> dreg;
-      v = Vec128<T, N>(
-          IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
-    }
-    return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
-  }
-};
-template <>
-struct ShiftRightBytesT<0> {
-  template <class T, size_t N>
-  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
-    return v;
-  }
-};
-template <>
-struct ShiftRightBytesT<0xFF> {
-  template <class T, size_t N>
-  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> /* v */) {
-    return Zero(Simd<T, N, 0>());
-  }
-};
-
-}  // namespace detail
-
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  return detail::ShiftLeftBytesT < kBytes >= N * sizeof(T) ? 0xFF
-                                                           : kBytes > ()(v);
-}
-
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
-  return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
-}
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
-  return ShiftLeftLanes<kLanes>(Simd<T, N, 0>(), v);
-}
-
-// 0x01..0F, kBytes = 1 => 0x0001..0E
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  return detail::ShiftRightBytesT < kBytes >= N * sizeof(T) ? 0xFF
-                                                            : kBytes > ()(v);
-}
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
-}
-
-// Calls ShiftLeftBytes
-template <int kBytes, typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API Vec128<T, N> CombineShiftRightBytes(Simd<T, N, 0> d, Vec128<T, N> hi,
-                                            Vec128<T, N> lo) {
-  constexpr size_t kSize = N * sizeof(T);
-  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Full64<uint8_t> d_full8;
-  const Repartition<T, decltype(d_full8)> d_full;
-  using V64 = VFromD<decltype(d_full8)>;
-  const V64 hi64(BitCast(d8, hi).raw);
-  // Move into most-significant bytes
-  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
-  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
-  // After casting to full 64-bit vector of correct type, shrink to 32-bit
-  return Vec128<T, N>(BitCast(d_full, r).raw);
-}
-
-// ------------------------------ UpperHalf (ShiftRightBytes)
-
-// Full input
-HWY_API Vec64<uint8_t> UpperHalf(Full64<uint8_t> /* tag */,
-                                 const Vec128<uint8_t> v) {
-  return Vec64<uint8_t>(vget_high_u8(v.raw));
-}
-HWY_API Vec64<uint16_t> UpperHalf(Full64<uint16_t> /* tag */,
-                                  const Vec128<uint16_t> v) {
-  return Vec64<uint16_t>(vget_high_u16(v.raw));
-}
-HWY_API Vec64<uint32_t> UpperHalf(Full64<uint32_t> /* tag */,
-                                  const Vec128<uint32_t> v) {
-  return Vec64<uint32_t>(vget_high_u32(v.raw));
-}
-HWY_API Vec64<uint64_t> UpperHalf(Full64<uint64_t> /* tag */,
-                                  const Vec128<uint64_t> v) {
-  return Vec64<uint64_t>(vget_high_u64(v.raw));
-}
-HWY_API Vec64<int8_t> UpperHalf(Full64<int8_t> /* tag */,
-                                const Vec128<int8_t> v) {
-  return Vec64<int8_t>(vget_high_s8(v.raw));
-}
-HWY_API Vec64<int16_t> UpperHalf(Full64<int16_t> /* tag */,
-                                 const Vec128<int16_t> v) {
-  return Vec64<int16_t>(vget_high_s16(v.raw));
-}
-HWY_API Vec64<int32_t> UpperHalf(Full64<int32_t> /* tag */,
-                                 const Vec128<int32_t> v) {
-  return Vec64<int32_t>(vget_high_s32(v.raw));
-}
-HWY_API Vec64<int64_t> UpperHalf(Full64<int64_t> /* tag */,
-                                 const Vec128<int64_t> v) {
-  return Vec64<int64_t>(vget_high_s64(v.raw));
-}
-HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
-  return Vec64<float>(vget_high_f32(v.raw));
-}
-#if HWY_ARCH_ARM_A64
-HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */,
-                                const Vec128<double> v) {
-  return Vec64<double>(vget_high_f64(v.raw));
-}
-#endif
-
-// Partial
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
-                                         Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto vu = BitCast(du, v);
-  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
-  return Vec128<T, (N + 1) / 2>(upper.raw);
-}
-
-// ------------------------------ Broadcast/splat any lane
-
-#if HWY_ARCH_ARM_A64
-// Unsigned
-template <int kLane>
-HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
-}
-template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
-}
-template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
-}
-// Vec64<uint64_t> is defined below.
-
-// Signed
-template <int kLane>
-HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
-}
-template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
-}
-template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
-}
-// Vec64<int64_t> is defined below.
-
-// Float
-template <int kLane>
-HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
-}
-template <int kLane, size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<double> Broadcast(const Vec128<double> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec64<double> Broadcast(const Vec64<double> v) {
-  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
-  return v;
-}
-
-#else
-// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.
-
-// Unsigned
-template <int kLane>
-HWY_API Vec128<uint16_t> Broadcast(const Vec128<uint16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
-}
-template <int kLane, size_t N, HWY_IF_LE64(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<uint32_t> Broadcast(const Vec128<uint32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
-}
-template <int kLane, size_t N, HWY_IF_LE64(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<uint64_t> Broadcast(const Vec128<uint64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
-}
-// Vec64<uint64_t> is defined below.
-
-// Signed
-template <int kLane>
-HWY_API Vec128<int16_t> Broadcast(const Vec128<int16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
-}
-template <int kLane, size_t N, HWY_IF_LE64(int16_t, N)>
-HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<int32_t> Broadcast(const Vec128<int32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
-}
-template <int kLane, size_t N, HWY_IF_LE64(int32_t, N)>
-HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
-}
-template <int kLane>
-HWY_API Vec128<int64_t> Broadcast(const Vec128<int64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
-}
-// Vec64<int64_t> is defined below.
-
-// Float
-template <int kLane>
-HWY_API Vec128<float> Broadcast(const Vec128<float> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
-}
-template <int kLane, size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
-}
-
-#endif
-
-template <int kLane>
-HWY_API Vec64<uint64_t> Broadcast(const Vec64<uint64_t> v) {
-  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
-  return v;
-}
-template <int kLane>
-HWY_API Vec64<int64_t> Broadcast(const Vec64<int64_t> v) {
-  static_assert(0 <= kLane && kLane < 1, "Invalid lane");
-  return v;
-}
-
-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T, size_t N>
-struct Indices128 {
-  typename detail::Raw128<T, N>::type raw;
-};
-
-template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
-HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-#if HWY_IS_DEBUG_BUILD
-  const Rebind<TI, decltype(d)> di;
-  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
-              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
-#endif
-
-  const Repartition<uint8_t, decltype(d)> d8;
-  using V8 = VFromD<decltype(d8)>;
-  const Repartition<uint16_t, decltype(d)> d16;
-
-  // Broadcast each lane index to all bytes of T and shift to bytes
-  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
-  if (sizeof(T) == 4) {
-    alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
-        0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
-    const V8 lane_indices =
-        TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
-    const V8 byte_indices =
-        BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
-    alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
-                                                      0, 1, 2, 3, 0, 1, 2, 3};
-    const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
-    return Indices128<T, N>{BitCast(d, sum).raw};
-  } else {
-    alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
-        0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
-    const V8 lane_indices =
-        TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
-    const V8 byte_indices =
-        BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
-    alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
-                                                      0, 1, 2, 3, 4, 5, 6, 7};
-    const V8 sum = Add(byte_indices, Load(d8, kByteOffsets));
-    return Indices128<T, N>{BitCast(d, sum).raw};
-  }
-}
-
-template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
-HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
-  const Rebind<TI, decltype(d)> di;
-  return IndicesFromVec(d, LoadU(di, idx));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  return BitCast(
-      d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
-}
-
-// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
-
-// Single lane: no change
-template <typename T>
-HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
-  return v;
-}
-
-// Two lanes: shuffle
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
-  return Vec128<T, 2>(Shuffle2301(v));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
-  return Shuffle01(v);
-}
-
-// Four lanes: shuffle
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
-  return Shuffle0123(v);
-}
-
-// 16-bit
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
-  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
-}
-
-// ------------------------------ Reverse2
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Vec128<uint16_t, N>(vrev32_u16(BitCast(du, v).raw)));
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Vec128<uint32_t, N>(vrev64_u32(BitCast(du, v).raw)));
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> Reverse2(Full128<T> d, const Vec128<T> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return Shuffle01(v);
-}
-
-// ------------------------------ Reverse4
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2), HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Vec128<uint16_t, N>(vrev64_u16(BitCast(du, v).raw)));
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T> Reverse4(Full128<T> d, const Vec128<T> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return Shuffle0123(v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
-  HWY_ASSERT(0);  // don't have 8 u64 lanes
-}
-
-// ------------------------------ Reverse8
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  return Reverse(d, v);
-}
-
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
-  HWY_ASSERT(0);  // don't have 8 lanes unless 16-bit
-}
-
-// ------------------------------ Other shuffles (TableLookupBytes)
-
-// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
-// Shuffle0321 rotates one lane to the right (the previous least-significant
-// lane is now most-significant). These could also be implemented via
-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
-
-// Swap 64-bit halves
-template <typename T>
-HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
-  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
-}
-template <typename T>
-HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
-  return CombineShiftRightBytes<8>(Full128<T>(), v, v);
-}
-
-// Rotate right 32 bits
-template <typename T>
-HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
-  return CombineShiftRightBytes<4>(Full128<T>(), v, v);
-}
-
-// Rotate left 32 bits
-template <typename T>
-HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
-  return CombineShiftRightBytes<12>(Full128<T>(), v, v);
-}
-
-// Reverse
-template <typename T>
-HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
-  return Shuffle2301(Shuffle1032(v));
-}
-
-// ------------------------------ InterleaveLower
-
-// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
-// the least-significant lane) and "b". To concatenate two half-width integers
-// into one, use ZipLower/Upper instead (also works with scalar).
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveLower, vzip1, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveLower, vzip1, _, 2)
-
-#if HWY_ARCH_ARM_A64
-// N=1 makes no sense (in that case, there would be no upper/lower).
-HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
-                                         const Vec128<uint64_t> b) {
-  return Vec128<uint64_t>(vzip1q_u64(a.raw, b.raw));
-}
-HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
-                                        const Vec128<int64_t> b) {
-  return Vec128<int64_t>(vzip1q_s64(a.raw, b.raw));
-}
-HWY_API Vec128<double> InterleaveLower(const Vec128<double> a,
-                                       const Vec128<double> b) {
-  return Vec128<double>(vzip1q_f64(a.raw, b.raw));
-}
-#else
-// ARMv7 emulation.
-HWY_API Vec128<uint64_t> InterleaveLower(const Vec128<uint64_t> a,
-                                         const Vec128<uint64_t> b) {
-  return CombineShiftRightBytes<8>(Full128<uint64_t>(), b, Shuffle01(a));
-}
-HWY_API Vec128<int64_t> InterleaveLower(const Vec128<int64_t> a,
-                                        const Vec128<int64_t> b) {
-  return CombineShiftRightBytes<8>(Full128<int64_t>(), b, Shuffle01(a));
-}
-#endif
-
-// Floats
-HWY_API Vec128<float> InterleaveLower(const Vec128<float> a,
-                                      const Vec128<float> b) {
-  return Vec128<float>(vzip1q_f32(a.raw, b.raw));
-}
-template <size_t N, HWY_IF_LE64(float, N)>
-HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
-                                         const Vec128<float, N> b) {
-  return Vec128<float, N>(vzip1_f32(a.raw, b.raw));
-}
-
-// < 64 bit parts
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw);
-}
-
-// Additional overload for the optional Simd<> tag.
-template <typename T, size_t N, class V = Vec128<T, N>>
-HWY_API V InterleaveLower(Simd<T, N, 0> /* tag */, V a, V b) {
-  return InterleaveLower(a, b);
-}
-
-// ------------------------------ InterleaveUpper (UpperHalf)
-
-// All functions inside detail lack the required D parameter.
-namespace detail {
-HWY_NEON_DEF_FUNCTION_INT_8_16_32(InterleaveUpper, vzip2, _, 2)
-HWY_NEON_DEF_FUNCTION_UINT_8_16_32(InterleaveUpper, vzip2, _, 2)
-
-#if HWY_ARCH_ARM_A64
-// N=1 makes no sense (in that case, there would be no upper/lower).
-HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
-                                         const Vec128<uint64_t> b) {
-  return Vec128<uint64_t>(vzip2q_u64(a.raw, b.raw));
-}
-HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
-  return Vec128<int64_t>(vzip2q_s64(a.raw, b.raw));
-}
-HWY_API Vec128<double> InterleaveUpper(Vec128<double> a, Vec128<double> b) {
-  return Vec128<double>(vzip2q_f64(a.raw, b.raw));
-}
-#else
-// ARMv7 emulation.
-HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
-                                         const Vec128<uint64_t> b) {
-  return CombineShiftRightBytes<8>(Full128<uint64_t>(), Shuffle01(b), a);
-}
-HWY_API Vec128<int64_t> InterleaveUpper(Vec128<int64_t> a, Vec128<int64_t> b) {
-  return CombineShiftRightBytes<8>(Full128<int64_t>(), Shuffle01(b), a);
-}
-#endif
-
-HWY_API Vec128<float> InterleaveUpper(Vec128<float> a, Vec128<float> b) {
-  return Vec128<float>(vzip2q_f32(a.raw, b.raw));
-}
-HWY_API Vec64<float> InterleaveUpper(const Vec64<float> a,
-                                     const Vec64<float> b) {
-  return Vec64<float>(vzip2_f32(a.raw, b.raw));
-}
-
-}  // namespace detail
-
-// Full register
-template <typename T, size_t N, HWY_IF_GE64(T, N), class V = Vec128<T, N>>
-HWY_API V InterleaveUpper(Simd<T, N, 0> /* tag */, V a, V b) {
-  return detail::InterleaveUpper(a, b);
-}
-
-// Partial
-template <typename T, size_t N, HWY_IF_LE32(T, N), class V = Vec128<T, N>>
-HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
-  const Half<decltype(d)> d2;
-  return InterleaveLower(d, V(UpperHalf(d2, a).raw), V(UpperHalf(d2, b).raw));
-}
-
-// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
-
-// Same as Interleave*, except that the return lanes are double-width integers;
-// this is necessary because the single-lane scalar cannot return two values.
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipLower(V a, V b) {
-  return BitCast(DW(), InterleaveLower(a, b));
-}
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveLower(D(), a, b));
-}
-
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveUpper(D(), a, b));
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-template <size_t N>
-HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
-                                                   Vec128<bfloat16_t, 2 * N> a,
-                                                   Vec128<bfloat16_t, 2 * N> b,
-                                                   const Vec128<float, N> sum0,
-                                                   Vec128<float, N>& sum1) {
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
-  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ================================================== COMBINE
-
-// ------------------------------ Combine (InterleaveLower)
-
-// Full result
-HWY_API Vec128<uint8_t> Combine(Full128<uint8_t> /* tag */, Vec64<uint8_t> hi,
-                                Vec64<uint8_t> lo) {
-  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
-}
-HWY_API Vec128<uint16_t> Combine(Full128<uint16_t> /* tag */,
-                                 Vec64<uint16_t> hi, Vec64<uint16_t> lo) {
-  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
-}
-HWY_API Vec128<uint32_t> Combine(Full128<uint32_t> /* tag */,
-                                 Vec64<uint32_t> hi, Vec64<uint32_t> lo) {
-  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
-}
-HWY_API Vec128<uint64_t> Combine(Full128<uint64_t> /* tag */,
-                                 Vec64<uint64_t> hi, Vec64<uint64_t> lo) {
-  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
-}
-
-HWY_API Vec128<int8_t> Combine(Full128<int8_t> /* tag */, Vec64<int8_t> hi,
-                               Vec64<int8_t> lo) {
-  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
-}
-HWY_API Vec128<int16_t> Combine(Full128<int16_t> /* tag */, Vec64<int16_t> hi,
-                                Vec64<int16_t> lo) {
-  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
-}
-HWY_API Vec128<int32_t> Combine(Full128<int32_t> /* tag */, Vec64<int32_t> hi,
-                                Vec64<int32_t> lo) {
-  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
-}
-HWY_API Vec128<int64_t> Combine(Full128<int64_t> /* tag */, Vec64<int64_t> hi,
-                                Vec64<int64_t> lo) {
-  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
-}
-
-HWY_API Vec128<float> Combine(Full128<float> /* tag */, Vec64<float> hi,
-                              Vec64<float> lo) {
-  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
-}
-#if HWY_ARCH_ARM_A64
-HWY_API Vec128<double> Combine(Full128<double> /* tag */, Vec64<double> hi,
-                               Vec64<double> lo) {
-  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
-}
-#endif
-
-// < 64bit input, <= 64 bit result
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi,
-                             Vec128<T, N / 2> lo) {
-  // First double N (only lower halves will be used).
-  const Vec128<T, N> hi2(hi.raw);
-  const Vec128<T, N> lo2(lo.raw);
-  // Repartition to two unsigned lanes (each the size of the valid input).
-  const Simd<UnsignedFromSize<N * sizeof(T) / 2>, 2, 0> du;
-  return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
-}
-
-// ------------------------------ ZeroExtendVector (Combine)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
-  return Combine(d, Zero(Half<decltype(d)>()), lo);
-}
-
-// ------------------------------ ConcatLowerLower
-
-// 64 or 128-bit input: just interleave
-template <typename T, size_t N, HWY_IF_GE64(T, N)>
-HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  // Treat half-width input as a single lane and interleave them.
-  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
-  return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
-}
-
-namespace detail {
-#if HWY_ARCH_ARM_A64
-HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveEven, vtrn1, _, 2)
-HWY_NEON_DEF_FUNCTION_UIF81632(InterleaveOdd, vtrn2, _, 2)
-#else
-
-// vtrn returns a struct with even and odd result.
-#define HWY_NEON_BUILD_TPL_HWY_TRN
-#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
-// Pass raw args so we can accept uint16x2 args, for which there is no
-// corresponding uint16x2x2 return type.
-#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
-  Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
-#define HWY_NEON_BUILD_ARG_HWY_TRN a, b
-
-// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
-// for full and half vectors.
-HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
-HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)
-#endif
-}  // namespace detail
-
-// <= 32-bit input/output
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API Vec128<T, N> ConcatLowerLower(const Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  // Treat half-width input as two lanes and take every second one.
-  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
-#if HWY_ARCH_ARM_A64
-  return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
-#else
-  using VU = VFromD<decltype(du)>;
-  return BitCast(
-      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
-                .val[0]));
-#endif
-}
-
-// ------------------------------ ConcatUpperUpper
-
-// 64 or 128-bit input: just interleave
-template <typename T, size_t N, HWY_IF_GE64(T, N)>
-HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  // Treat half-width input as a single lane and interleave them.
-  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
-  return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
-}
-
-// <= 32-bit input/output
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API Vec128<T, N> ConcatUpperUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  // Treat half-width input as two lanes and take every second one.
-  const Repartition<UnsignedFromSize<N * sizeof(T) / 2>, decltype(d)> du;
-#if HWY_ARCH_ARM_A64
-  return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
-#else
-  using VU = VFromD<decltype(du)>;
-  return BitCast(
-      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
-                .val[1]));
-#endif
-}
-
-// ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
-
-// 64 or 128-bit input: extract from concatenated
-template <typename T, size_t N, HWY_IF_GE64(T, N)>
-HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  return CombineShiftRightBytes<N * sizeof(T) / 2>(d, hi, lo);
-}
-
-// <= 32-bit input/output
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API Vec128<T, N> ConcatLowerUpper(const Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  constexpr size_t kSize = N * sizeof(T);
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Full64<uint8_t> d8x8;
-  const Full64<T> d64;
-  using V8x8 = VFromD<decltype(d8x8)>;
-  const V8x8 hi8x8(BitCast(d8, hi).raw);
-  // Move into most-significant bytes
-  const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
-  const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
-  // Back to original lane type, then shrink N.
-  return Vec128<T, N>(BitCast(d64, r).raw);
-}
-
-// ------------------------------ ConcatUpperLower
-
-// Works for all N.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
-}
-
-// ------------------------------ ConcatOdd (InterleaveUpper)
-
-namespace detail {
-// There is no vuzpq_u64.
-HWY_NEON_DEF_FUNCTION_UIF81632(ConcatEven, vuzp1, _, 2)
-HWY_NEON_DEF_FUNCTION_UIF81632(ConcatOdd, vuzp2, _, 2)
-}  // namespace detail
-
-// Full/half vector
-template <typename T, size_t N,
-          hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
-HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
-                               Vec128<T, N> lo) {
-  return detail::ConcatOdd(lo, hi);
-}
-
-// 8-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> d, Vec128<T, 4> hi,
-                               Vec128<T, 4> lo) {
-  const Twice<decltype(d)> d2;
-  const Repartition<uint16_t, decltype(d2)> dw2;
-  const VFromD<decltype(d2)> hi2(hi.raw);
-  const VFromD<decltype(d2)> lo2(lo.raw);
-  const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
-  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
-  // vcopy_lane_u16, but that's A64-only.
-  return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
-}
-
-// Any type x2
-template <typename T>
-HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
-                               Vec128<T, 2> lo) {
-  return InterleaveUpper(d, lo, hi);
-}
-
-// ------------------------------ ConcatEven (InterleaveLower)
-
-// Full/half vector
-template <typename T, size_t N,
-          hwy::EnableIf<N != 2 && sizeof(T) * N >= 8>* = nullptr>
-HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
-                                Vec128<T, N> lo) {
-  return detail::ConcatEven(lo, hi);
-}
-
-// 8-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> d, Vec128<T, 4> hi,
-                                Vec128<T, 4> lo) {
-  const Twice<decltype(d)> d2;
-  const Repartition<uint16_t, decltype(d2)> dw2;
-  const VFromD<decltype(d2)> hi2(hi.raw);
-  const VFromD<decltype(d2)> lo2(lo.raw);
-  const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
-  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
-  // vcopy_lane_u16, but that's A64-only.
-  return Vec128<T, 4>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
-}
-
-// Any type x2
-template <typename T>
-HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
-                                Vec128<T, 2> lo) {
-  return InterleaveLower(d, lo, hi);
-}
-
-// ------------------------------ DupEven (InterleaveLower)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
-#if HWY_ARCH_ARM_A64
-  return detail::InterleaveEven(v, v);
-#else
-  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
-  return InterleaveLower(Simd<T, N, 0>(), v, v);
-}
-
-// ------------------------------ DupOdd (InterleaveUpper)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
-#if HWY_ARCH_ARM_A64
-  return detail::InterleaveOdd(v, v);
-#else
-  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
-  return InterleaveUpper(Simd<T, N, 0>(), v, v);
-}
-
-// ------------------------------ OddEven (IfThenElse)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  alignas(16) constexpr uint8_t kBytes[16] = {
-      ((0 / sizeof(T)) & 1) ? 0 : 0xFF,  ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
-      ((2 / sizeof(T)) & 1) ? 0 : 0xFF,  ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
-      ((4 / sizeof(T)) & 1) ? 0 : 0xFF,  ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
-      ((6 / sizeof(T)) & 1) ? 0 : 0xFF,  ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
-      ((8 / sizeof(T)) & 1) ? 0 : 0xFF,  ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
-      ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
-      ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
-      ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
-  };
-  const auto vec = BitCast(d, Load(d8, kBytes));
-  return IfThenElse(MaskFromVec(vec), b, a);
-}
-
-// ------------------------------ OddEvenBlocks
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
-  return even;
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
-  return v;
-}
-
-// ------------------------------ ReverseBlocks
-
-// Single block: no change
-template <typename T>
-HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
-  return v;
-}
-
-// ------------------------------ ReorderDemote2To (OddEven)
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
-    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-// ================================================== CRYPTO
-
-#if defined(__ARM_FEATURE_AES) || \
-    (HWY_HAVE_RUNTIME_DISPATCH && HWY_ARCH_ARM_A64)
-
-// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
-#ifdef HWY_NATIVE_AES
-#undef HWY_NATIVE_AES
-#else
-#define HWY_NATIVE_AES
-#endif
-
-HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
-                                 Vec128<uint8_t> round_key) {
-  // NOTE: it is important that AESE and AESMC be consecutive instructions so
-  // they can be fused. AESE includes AddRoundKey, which is a different ordering
-  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
-  // round key (the compiler will hopefully optimize this for multiple rounds).
-  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
-         round_key;
-}
-
-HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
-                                     Vec128<uint8_t> round_key) {
-  return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
-}
-
-HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
-}
-
-HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  return Vec128<uint64_t>(
-      (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
-}
-
-#endif  // __ARM_FEATURE_AES
-
-// ================================================== MISC
-
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
-                                   const Vec128<bfloat16_t, N> v) {
-  const Rebind<uint16_t, decltype(df32)> du16;
-  const RebindToSigned<decltype(df32)> di32;
-  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
-}
-
-// ------------------------------ Truncations
-
-template <typename From, typename To, HWY_IF_UNSIGNED(From),
-          HWY_IF_UNSIGNED(To),
-          hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
-HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
-                                 const Vec128<From, 1> v) {
-  const Repartition<To, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  return Vec128<To, 1>{v1.raw};
-}
-
-HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
-                                      const Vec128<uint64_t, 2> v) {
-  const Repartition<uint8_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = detail::ConcatEven(v1, v1);
-  const auto v3 = detail::ConcatEven(v2, v2);
-  const auto v4 = detail::ConcatEven(v3, v3);
-  return LowerHalf(LowerHalf(LowerHalf(v4)));
-}
-
-HWY_API Vec32<uint16_t> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
-                                   const Vec128<uint64_t, 2> v) {
-  const Repartition<uint16_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = detail::ConcatEven(v1, v1);
-  const auto v3 = detail::ConcatEven(v2, v2);
-  return LowerHalf(LowerHalf(v3));
-}
-
-HWY_API Vec64<uint32_t> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
-                                   const Vec128<uint64_t, 2> v) {
-  const Repartition<uint32_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = detail::ConcatEven(v1, v1);
-  return LowerHalf(v2);
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint32_t, N> v) {
-  const Repartition<uint8_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = detail::ConcatEven(v1, v1);
-  const auto v3 = detail::ConcatEven(v2, v2);
-  return LowerHalf(LowerHalf(v3));
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
-                                       const Vec128<uint32_t, N> v) {
-  const Repartition<uint16_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = detail::ConcatEven(v1, v1);
-  return LowerHalf(v2);
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint16_t, N> v) {
-  const Repartition<uint8_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = detail::ConcatEven(v1, v1);
-  return LowerHalf(v2);
-}
-
-// ------------------------------ MulEven (ConcatEven)
-
-// Multiplies even lanes (0, 2 ..) and places the double-wide result into
-// even and the upper half into its odd neighbor lane.
-HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
-  const Full128<int32_t> d;
-  int32x4_t a_packed = ConcatEven(d, a, a).raw;
-  int32x4_t b_packed = ConcatEven(d, b, b).raw;
-  return Vec128<int64_t>(
-      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
-}
-HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
-  const Full128<uint32_t> d;
-  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
-  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
-  return Vec128<uint64_t>(
-      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
-}
-
-template <size_t N>
-HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
-                                             const Vec128<int32_t, N> b) {
-  const DFromV<decltype(a)> d;
-  int32x2_t a_packed = ConcatEven(d, a, a).raw;
-  int32x2_t b_packed = ConcatEven(d, b, b).raw;
-  return Vec128<int64_t, (N + 1) / 2>(
-      vget_low_s64(vmull_s32(a_packed, b_packed)));
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
-                                              const Vec128<uint32_t, N> b) {
-  const DFromV<decltype(a)> d;
-  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
-  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
-  return Vec128<uint64_t, (N + 1) / 2>(
-      vget_low_u64(vmull_u32(a_packed, b_packed)));
-}
-
-HWY_INLINE Vec128<uint64_t> MulEven(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  uint64_t hi;
-  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
-  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
-}
-
-HWY_INLINE Vec128<uint64_t> MulOdd(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  uint64_t hi;
-  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
-  return Vec128<uint64_t>(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
-}
-
-// ------------------------------ TableLookupBytes (Combine, LowerHalf)
-
-// Both full
-template <typename T, typename TI>
-HWY_API Vec128<TI> TableLookupBytes(const Vec128<T> bytes,
-                                    const Vec128<TI> from) {
-  const Full128<TI> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-#if HWY_ARCH_ARM_A64
-  return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
-                                               BitCast(d8, from).raw)));
-#else
-  uint8x16_t table0 = BitCast(d8, bytes).raw;
-  uint8x8x2_t table;
-  table.val[0] = vget_low_u8(table0);
-  table.val[1] = vget_high_u8(table0);
-  uint8x16_t idx = BitCast(d8, from).raw;
-  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
-  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
-  return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
-#endif
-}
-
-// Partial index vector
-template <typename T, typename TI, size_t NI, HWY_IF_LE64(TI, NI)>
-HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T> bytes,
-                                        const Vec128<TI, NI> from) {
-  const Full128<TI> d_full;
-  const Vec64<TI> from64(from.raw);
-  const auto idx_full = Combine(d_full, from64, from64);
-  const auto out_full = TableLookupBytes(bytes, idx_full);
-  return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
-}
-
-// Partial table vector
-template <typename T, size_t N, typename TI, HWY_IF_LE64(T, N)>
-HWY_API Vec128<TI> TableLookupBytes(const Vec128<T, N> bytes,
-                                    const Vec128<TI> from) {
-  const Full128<T> d_full;
-  return TableLookupBytes(Combine(d_full, bytes, bytes), from);
-}
-
-// Partial both
-template <typename T, size_t N, typename TI, size_t NI, HWY_IF_LE64(T, N),
-          HWY_IF_LE64(TI, NI)>
-HWY_API VFromD<Repartition<T, Simd<TI, NI, 0>>> TableLookupBytes(
-    Vec128<T, N> bytes, Vec128<TI, NI> from) {
-  const Simd<T, N, 0> d;
-  const Simd<TI, NI, 0> d_idx;
-  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
-  // uint8x8
-  const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
-  const auto from8 = BitCast(d_idx8, from);
-  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
-  return BitCast(d_idx, v8);
-}
-
-// For all vector widths; ARM anyway zeroes if >= 0x10.
-template <class V, class VI>
-HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
-  return TableLookupBytes(bytes, from);
-}
-
-// ------------------------------ Scatter (Store)
-
-template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
-HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
-                           T* HWY_RESTRICT base,
-                           const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(16) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(16) Offset offset_lanes[N];
-  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
-
-  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
-  }
-}
-
-template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
-HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
-                          const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(16) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(16) Index index_lanes[N];
-  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
-
-  for (size_t i = 0; i < N; ++i) {
-    base[index_lanes[i]] = lanes[i];
-  }
-}
-
-// ------------------------------ Gather (Load/Store)
-
-template <typename T, size_t N, typename Offset>
-HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
-                                  const T* HWY_RESTRICT base,
-                                  const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(16) Offset offset_lanes[N];
-  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
-
-  alignas(16) T lanes[N];
-  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
-  }
-  return Load(d, lanes);
-}
-
-template <typename T, size_t N, typename Index>
-HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(16) Index index_lanes[N];
-  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
-
-  alignas(16) T lanes[N];
-  for (size_t i = 0; i < N; ++i) {
-    lanes[i] = base[index_lanes[i]];
-  }
-  return Load(d, lanes);
-}
-
-// ------------------------------ Reductions
-
-namespace detail {
-
-// N=1 for any T: no-op
-template <typename T>
-HWY_INLINE Vec128<T, 1> SumOfLanes(const Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-
-// u32/i32/f32: N=2
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<T, 2> SumOfLanes(const Vec128<T, 2> v10) {
-  return v10 + Shuffle2301(v10);
-}
-template <typename T>
-HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return Min(v10, Shuffle2301(v10));
-}
-template <typename T>
-HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return Max(v10, Shuffle2301(v10));
-}
-
-// full vectors
-#if HWY_ARCH_ARM_A64
-HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>(vdupq_n_u32(vaddvq_u32(v.raw)));
-}
-HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
-  return Vec128<int32_t>(vdupq_n_s32(vaddvq_s32(v.raw)));
-}
-HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
-  return Vec128<float>(vdupq_n_f32(vaddvq_f32(v.raw)));
-}
-HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
-  return Vec128<uint64_t>(vdupq_n_u64(vaddvq_u64(v.raw)));
-}
-HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
-  return Vec128<int64_t>(vdupq_n_s64(vaddvq_s64(v.raw)));
-}
-HWY_INLINE Vec128<double> SumOfLanes(const Vec128<double> v) {
-  return Vec128<double>(vdupq_n_f64(vaddvq_f64(v.raw)));
-}
-#else
-// ARMv7 version for everything except doubles.
-HWY_INLINE Vec128<uint32_t> SumOfLanes(const Vec128<uint32_t> v) {
-  uint32x4x2_t v0 = vuzpq_u32(v.raw, v.raw);
-  uint32x4_t c0 = vaddq_u32(v0.val[0], v0.val[1]);
-  uint32x4x2_t v1 = vuzpq_u32(c0, c0);
-  return Vec128<uint32_t>(vaddq_u32(v1.val[0], v1.val[1]));
-}
-HWY_INLINE Vec128<int32_t> SumOfLanes(const Vec128<int32_t> v) {
-  int32x4x2_t v0 = vuzpq_s32(v.raw, v.raw);
-  int32x4_t c0 = vaddq_s32(v0.val[0], v0.val[1]);
-  int32x4x2_t v1 = vuzpq_s32(c0, c0);
-  return Vec128<int32_t>(vaddq_s32(v1.val[0], v1.val[1]));
-}
-HWY_INLINE Vec128<float> SumOfLanes(const Vec128<float> v) {
-  float32x4x2_t v0 = vuzpq_f32(v.raw, v.raw);
-  float32x4_t c0 = vaddq_f32(v0.val[0], v0.val[1]);
-  float32x4x2_t v1 = vuzpq_f32(c0, c0);
-  return Vec128<float>(vaddq_f32(v1.val[0], v1.val[1]));
-}
-HWY_INLINE Vec128<uint64_t> SumOfLanes(const Vec128<uint64_t> v) {
-  return v + Shuffle01(v);
-}
-HWY_INLINE Vec128<int64_t> SumOfLanes(const Vec128<int64_t> v) {
-  return v + Shuffle01(v);
-}
-#endif
-
-template <typename T>
-HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Min(v20_31_20_31, v31_20_31_20);
-}
-template <typename T>
-HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Max(v20_31_20_31, v31_20_31_20);
-}
-
-// For u64/i64[/f64].
-template <typename T>
-HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Min(v10, v01);
-}
-template <typename T>
-HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Max(v10, v01);
-}
-
-template <size_t N, HWY_IF_GE32(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                       Vec128<uint16_t, N> v) {
-  const Simd<uint16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <size_t N, HWY_IF_GE32(int16_t, N)>
-HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                      Vec128<int16_t, N> v) {
-  const Simd<int16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-template <size_t N, HWY_IF_GE32(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                       Vec128<uint16_t, N> v) {
-  const Simd<uint16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <size_t N, HWY_IF_GE32(int16_t, N)>
-HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                      Vec128<int16_t, N> v) {
-  const Simd<int16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::SumOfLanes(v);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-// ------------------------------ LoadMaskBits (TestBit)
-
-namespace detail {
-
-// Helper function to set 64 bits and potentially return a smaller vector. The
-// overload is required to call the q vs non-q intrinsics. Note that 8-bit
-// LoadMaskBits only requires 16 bits, but 64 avoids casting.
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE Vec128<T, N> Set64(Simd<T, N, 0> /* tag */, uint64_t mask_bits) {
-  const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
-  return Vec128<T, N>(BitCast(Full64<T>(), v64).raw);
-}
-template <typename T>
-HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
-  return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  // Easier than Set(), which would require an >8-bit type, which would not
-  // compile for T=uint8_t, N=1.
-  const auto vmask_bits = Set64(du, mask_bits);
-
-  // Replicate bytes 8x such that each byte contains the bit that governs it.
-  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
-                                             1, 1, 1, 1, 1, 1, 1, 1};
-  const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));
-
-  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
-                                            1, 2, 4, 8, 16, 32, 64, 128};
-  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
-  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
-  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
-  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
-  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
-                                   const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
-  return detail::LoadMaskBits(d, mask_bits);
-}
-
-// ------------------------------ Mask
-
-namespace detail {
-
-// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
-// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
-template <typename T>
-HWY_INLINE uint64_t NibblesFromMask(const Full128<T> d, Mask128<T> mask) {
-  const Full128<uint16_t> du16;
-  const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
-  const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
-  return GetLane(BitCast(Full64<uint64_t>(), nib));
-}
-
-template <typename T>
-HWY_INLINE uint64_t NibblesFromMask(const Full64<T> d, Mask64<T> mask) {
-  // There is no vshrn_n_u16 for uint16x4, so zero-extend.
-  const Twice<decltype(d)> d2;
-  const Vec128<T> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
-  // No need to mask, upper half is zero thanks to ZeroExtendVector.
-  return NibblesFromMask(d2, MaskFromVec(v128));
-}
-
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_INLINE uint64_t NibblesFromMask(Simd<T, N, 0> /*d*/, Mask128<T, N> mask) {
-  const Mask64<T> mask64(mask.raw);
-  const uint64_t nib = NibblesFromMask(Full64<T>(), mask64);
-  // Clear nibbles from upper half of 64-bits
-  constexpr size_t kBytes = sizeof(T) * N;
-  return nib & ((1ull << (kBytes * 4)) - 1);
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
-                                 const Mask128<T> mask) {
-  alignas(16) constexpr uint8_t kSliceLanes[16] = {
-      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
-  };
-  const Full128<uint8_t> du;
-  const Vec128<uint8_t> values =
-      BitCast(du, VecFromMask(Full128<T>(), mask)) & Load(du, kSliceLanes);
-
-#if HWY_ARCH_ARM_A64
-  // Can't vaddv - we need two separate bytes (16 bits).
-  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
-  const uint8x8_t x4 = vpadd_u8(x2, x2);
-  const uint8x8_t x8 = vpadd_u8(x4, x4);
-  return vget_lane_u64(vreinterpret_u64_u8(x8), 0);
-#else
-  // Don't have vpaddq, so keep doubling lane size.
-  const uint16x8_t x2 = vpaddlq_u8(values.raw);
-  const uint32x4_t x4 = vpaddlq_u16(x2);
-  const uint64x2_t x8 = vpaddlq_u32(x4);
-  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
-  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
-  alignas(8) constexpr uint8_t kSliceLanes[8] = {1,    2,    4,    8,
-                                                 0x10, 0x20, 0x40, 0x80};
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const Vec128<uint8_t, N> slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
-  const Vec128<uint8_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-
-#if HWY_ARCH_ARM_A64
-  return vaddv_u8(values.raw);
-#else
-  const uint16x4_t x2 = vpaddl_u8(values.raw);
-  const uint32x2_t x4 = vpaddl_u16(x2);
-  const uint64x1_t x8 = vpaddl_u32(x4);
-  return vget_lane_u64(x8, 0);
-#endif
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
-                                 const Mask128<T> mask) {
-  alignas(16) constexpr uint16_t kSliceLanes[8] = {1,    2,    4,    8,
-                                                   0x10, 0x20, 0x40, 0x80};
-  const Full128<T> d;
-  const Full128<uint16_t> du;
-  const Vec128<uint16_t> values =
-      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
-#if HWY_ARCH_ARM_A64
-  return vaddvq_u16(values.raw);
-#else
-  const uint32x4_t x2 = vpaddlq_u16(values.raw);
-  const uint64x2_t x4 = vpaddlq_u32(x2);
-  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
-  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
-  alignas(8) constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const Vec128<uint16_t, N> slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
-  const Vec128<uint16_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-#if HWY_ARCH_ARM_A64
-  return vaddv_u16(values.raw);
-#else
-  const uint32x2_t x2 = vpaddl_u16(values.raw);
-  const uint64x1_t x4 = vpaddl_u32(x2);
-  return vget_lane_u64(x4, 0);
-#endif
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
-                                 const Mask128<T> mask) {
-  alignas(16) constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
-  const Full128<T> d;
-  const Full128<uint32_t> du;
-  const Vec128<uint32_t> values =
-      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
-#if HWY_ARCH_ARM_A64
-  return vaddvq_u32(values.raw);
-#else
-  const uint64x2_t x2 = vpaddlq_u32(values.raw);
-  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
-  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
-  alignas(8) constexpr uint32_t kSliceLanes[2] = {1, 2};
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const Vec128<uint32_t, N> slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
-  const Vec128<uint32_t, N> values = BitCast(du, VecFromMask(d, mask)) & slice;
-#if HWY_ARCH_ARM_A64
-  return vaddv_u32(values.raw);
-#else
-  const uint64x1_t x2 = vpaddl_u32(values.raw);
-  return vget_lane_u64(x2, 0);
-#endif
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
-  alignas(16) constexpr uint64_t kSliceLanes[2] = {1, 2};
-  const Full128<T> d;
-  const Full128<uint64_t> du;
-  const Vec128<uint64_t> values =
-      BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
-#if HWY_ARCH_ARM_A64
-  return vaddvq_u64(values.raw);
-#else
-  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
-#endif
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
-                                 const Mask128<T, 1> m) {
-  const Full64<T> d;
-  const Full64<uint64_t> du;
-  const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
-  return vget_lane_u64(values.raw, 0);
-}
-
-// Returns the lowest N for the BitsFromMask result.
-template <typename T, size_t N>
-constexpr uint64_t OnlyActive(uint64_t bits) {
-  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
-  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
-}
-
-// Returns number of lanes whose mask is set.
-//
-// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
-// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
-// changes each lane to 1 (if mask set) or 0.
-// NOTE: PopCount also operates on vectors, so we still have to do horizontal
-// sums separately. We specialize CountTrue for full vectors (negating instead
-// of PopCount because it avoids an extra shift), and use PopCount of
-// NibblesFromMask for partial vectors.
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> mask) {
-  const Full128<int8_t> di;
-  const int8x16_t ones =
-      vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
-
-#if HWY_ARCH_ARM_A64
-  return static_cast<size_t>(vaddvq_s8(ones));
-#else
-  const int16x8_t x2 = vpaddlq_s8(ones);
-  const int32x4_t x4 = vpaddlq_s16(x2);
-  const int64x2_t x8 = vpaddlq_s32(x4);
-  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
-#endif
-}
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> mask) {
-  const Full128<int16_t> di;
-  const int16x8_t ones =
-      vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
-
-#if HWY_ARCH_ARM_A64
-  return static_cast<size_t>(vaddvq_s16(ones));
-#else
-  const int32x4_t x2 = vpaddlq_s16(ones);
-  const int64x2_t x4 = vpaddlq_s32(x2);
-  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
-#endif
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> mask) {
-  const Full128<int32_t> di;
-  const int32x4_t ones =
-      vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
-
-#if HWY_ARCH_ARM_A64
-  return static_cast<size_t>(vaddvq_s32(ones));
-#else
-  const int64x2_t x2 = vpaddlq_s32(ones);
-  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
-#endif
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> mask) {
-#if HWY_ARCH_ARM_A64
-  const Full128<int64_t> di;
-  const int64x2_t ones =
-      vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
-  return static_cast<size_t>(vaddvq_s64(ones));
-#else
-  const Full128<uint64_t> du;
-  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
-  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
-  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
-#endif
-}
-
-}  // namespace detail
-
-// Full
-template <typename T>
-HWY_API size_t CountTrue(Full128<T> /* tag */, const Mask128<T> mask) {
-  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
-}
-
-// Partial
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API size_t CountTrue(Simd<T, N, 0> d, const Mask128<T, N> mask) {
-  constexpr int kDiv = 4 * sizeof(T);
-  return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
-}
-template <typename T, size_t N>
-HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> d,
-                               const Mask128<T, N> mask) {
-  const uint64_t nib = detail::NibblesFromMask(d, mask);
-  if (nib == 0) return -1;
-  constexpr int kDiv = 4 * sizeof(T);
-  return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
-}
-
-// `p` points to at least 8 writable bytes.
-template <typename T, size_t N>
-HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
-                             uint8_t* bits) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  const size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(&mask_bits, bits);
-  return kNumBytes;
-}
-
-template <typename T, size_t N>
-HWY_API bool AllFalse(const Simd<T, N, 0> d, const Mask128<T, N> m) {
-  return detail::NibblesFromMask(d, m) == 0;
-}
-
-// Full
-template <typename T>
-HWY_API bool AllTrue(const Full128<T> d, const Mask128<T> m) {
-  return detail::NibblesFromMask(d, m) == ~0ull;
-}
-// Partial
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API bool AllTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
-  constexpr size_t kBytes = sizeof(T) * N;
-  return detail::NibblesFromMask(d, m) == (1ull << (kBytes * 4)) - 1;
-}
-
-// ------------------------------ Compress
-
-template <typename T>
-struct CompressIsPartition {
-  enum { value = 1 };
-};
-
-namespace detail {
-
-// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
-HWY_INLINE Vec128<uint8_t> Load8Bytes(Full128<uint8_t> /*d*/,
-                                      const uint8_t* bytes) {
-  return Vec128<uint8_t>(vreinterpretq_u8_u64(
-      vld1q_dup_u64(reinterpret_cast<const uint64_t*>(bytes))));
-}
-
-// Load 8 bytes and return half-reg with N <= 8 bytes.
-template <size_t N, HWY_IF_LE64(uint8_t, N)>
-HWY_INLINE Vec128<uint8_t, N> Load8Bytes(Simd<uint8_t, N, 0> d,
-                                         const uint8_t* bytes) {
-  return Load(d, bytes);
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
-                                    const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 256);
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Simd<uint16_t, N, 0> du;
-
-  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
-  // indices for VTBL (one vector's worth for each of 256 combinations of
-  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
-  // store lane indices and convert to byte indices (2*lane + 0..1), with the
-  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
-  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
-  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
-  // is likely more costly than the higher cache footprint from storing bytes.
-  alignas(16) constexpr uint8_t table[256 * 8] = {
-      // PrintCompress16x8Tables
-      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
-      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0, 6,  2,  4,  8,  10, 12, 14,  //
-      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0, 2,  6,  4,  8,  10, 12, 14,  //
-      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0, 4,  6,  2,  8,  10, 12, 14,  //
-      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0, 8,  2,  4,  6,  10, 12, 14,  //
-      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0, 2,  8,  4,  6,  10, 12, 14,  //
-      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0, 4,  8,  2,  6,  10, 12, 14,  //
-      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0, 2,  4,  8,  6,  10, 12, 14,  //
-      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0, 6,  8,  2,  4,  10, 12, 14,  //
-      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0, 2,  6,  8,  4,  10, 12, 14,  //
-      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0, 4,  6,  8,  2,  10, 12, 14,  //
-      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0, 10, 2,  4,  6,  8,  12, 14,  //
-      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0, 2,  10, 4,  6,  8,  12, 14,  //
-      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0, 4,  10, 2,  6,  8,  12, 14,  //
-      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0, 2,  4,  10, 6,  8,  12, 14,  //
-      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0, 6,  10, 2,  4,  8,  12, 14,  //
-      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0, 2,  6,  10, 4,  8,  12, 14,  //
-      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0, 4,  6,  10, 2,  8,  12, 14,  //
-      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0, 2,  4,  6,  10, 8,  12, 14,  //
-      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0, 8,  10, 2,  4,  6,  12, 14,  //
-      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0, 2,  8,  10, 4,  6,  12, 14,  //
-      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0, 4,  8,  10, 2,  6,  12, 14,  //
-      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0, 2,  4,  8,  10, 6,  12, 14,  //
-      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0, 6,  8,  10, 2,  4,  12, 14,  //
-      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0, 2,  6,  8,  10, 4,  12, 14,  //
-      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0, 4,  6,  8,  10, 2,  12, 14,  //
-      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0, 12, 2,  4,  6,  8,  10, 14,  //
-      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0, 2,  12, 4,  6,  8,  10, 14,  //
-      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0, 4,  12, 2,  6,  8,  10, 14,  //
-      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0, 2,  4,  12, 6,  8,  10, 14,  //
-      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0, 6,  12, 2,  4,  8,  10, 14,  //
-      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0, 2,  6,  12, 4,  8,  10, 14,  //
-      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0, 4,  6,  12, 2,  8,  10, 14,  //
-      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0, 2,  4,  6,  12, 8,  10, 14,  //
-      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0, 8,  12, 2,  4,  6,  10, 14,  //
-      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0, 2,  8,  12, 4,  6,  10, 14,  //
-      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0, 4,  8,  12, 2,  6,  10, 14,  //
-      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0, 2,  4,  8,  12, 6,  10, 14,  //
-      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0, 6,  8,  12, 2,  4,  10, 14,  //
-      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0, 2,  6,  8,  12, 4,  10, 14,  //
-      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0, 4,  6,  8,  12, 2,  10, 14,  //
-      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0, 2,  4,  6,  8,  12, 10, 14,  //
-      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0, 10, 12, 2,  4,  6,  8,  14,  //
-      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0, 2,  10, 12, 4,  6,  8,  14,  //
-      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0, 4,  10, 12, 2,  6,  8,  14,  //
-      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0, 2,  4,  10, 12, 6,  8,  14,  //
-      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0, 6,  10, 12, 2,  4,  8,  14,  //
-      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0, 2,  6,  10, 12, 4,  8,  14,  //
-      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0, 4,  6,  10, 12, 2,  8,  14,  //
-      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0, 2,  4,  6,  10, 12, 8,  14,  //
-      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0, 8,  10, 12, 2,  4,  6,  14,  //
-      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0, 2,  8,  10, 12, 4,  6,  14,  //
-      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0, 4,  8,  10, 12, 2,  6,  14,  //
-      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0, 2,  4,  8,  10, 12, 6,  14,  //
-      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0, 6,  8,  10, 12, 2,  4,  14,  //
-      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0, 2,  6,  8,  10, 12, 4,  14,  //
-      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0, 4,  6,  8,  10, 12, 2,  14,  //
-      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0, 14, 2,  4,  6,  8,  10, 12,  //
-      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0, 2,  14, 4,  6,  8,  10, 12,  //
-      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0, 4,  14, 2,  6,  8,  10, 12,  //
-      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0, 2,  4,  14, 6,  8,  10, 12,  //
-      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0, 6,  14, 2,  4,  8,  10, 12,  //
-      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0, 2,  6,  14, 4,  8,  10, 12,  //
-      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0, 4,  6,  14, 2,  8,  10, 12,  //
-      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0, 2,  4,  6,  14, 8,  10, 12,  //
-      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0, 8,  14, 2,  4,  6,  10, 12,  //
-      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0, 2,  8,  14, 4,  6,  10, 12,  //
-      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0, 4,  8,  14, 2,  6,  10, 12,  //
-      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0, 2,  4,  8,  14, 6,  10, 12,  //
-      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0, 6,  8,  14, 2,  4,  10, 12,  //
-      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0, 2,  6,  8,  14, 4,  10, 12,  //
-      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0, 4,  6,  8,  14, 2,  10, 12,  //
-      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0, 2,  4,  6,  8,  14, 10, 12,  //
-      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0, 10, 14, 2,  4,  6,  8,  12,  //
-      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0, 2,  10, 14, 4,  6,  8,  12,  //
-      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0, 4,  10, 14, 2,  6,  8,  12,  //
-      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0, 2,  4,  10, 14, 6,  8,  12,  //
-      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0, 6,  10, 14, 2,  4,  8,  12,  //
-      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0, 2,  6,  10, 14, 4,  8,  12,  //
-      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0, 4,  6,  10, 14, 2,  8,  12,  //
-      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0, 2,  4,  6,  10, 14, 8,  12,  //
-      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0, 8,  10, 14, 2,  4,  6,  12,  //
-      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0, 2,  8,  10, 14, 4,  6,  12,  //
-      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0, 4,  8,  10, 14, 2,  6,  12,  //
-      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0, 2,  4,  8,  10, 14, 6,  12,  //
-      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0, 6,  8,  10, 14, 2,  4,  12,  //
-      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0, 2,  6,  8,  10, 14, 4,  12,  //
-      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0, 4,  6,  8,  10, 14, 2,  12,  //
-      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0, 2,  4,  6,  8,  10, 14, 12,  //
-      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0, 12, 14, 2,  4,  6,  8,  10,  //
-      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0, 2,  12, 14, 4,  6,  8,  10,  //
-      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0, 4,  12, 14, 2,  6,  8,  10,  //
-      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0, 2,  4,  12, 14, 6,  8,  10,  //
-      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0, 6,  12, 14, 2,  4,  8,  10,  //
-      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0, 2,  6,  12, 14, 4,  8,  10,  //
-      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0, 4,  6,  12, 14, 2,  8,  10,  //
-      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0, 2,  4,  6,  12, 14, 8,  10,  //
-      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0, 8,  12, 14, 2,  4,  6,  10,  //
-      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0, 2,  8,  12, 14, 4,  6,  10,  //
-      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0, 4,  8,  12, 14, 2,  6,  10,  //
-      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0, 2,  4,  8,  12, 14, 6,  10,  //
-      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0, 6,  8,  12, 14, 2,  4,  10,  //
-      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0, 2,  6,  8,  12, 14, 4,  10,  //
-      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0, 4,  6,  8,  12, 14, 2,  10,  //
-      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0, 2,  4,  6,  8,  12, 14, 10,  //
-      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0, 10, 12, 14, 2,  4,  6,  8,   //
-      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0, 2,  10, 12, 14, 4,  6,  8,   //
-      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0, 4,  10, 12, 14, 2,  6,  8,   //
-      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0, 2,  4,  10, 12, 14, 6,  8,   //
-      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0, 6,  10, 12, 14, 2,  4,  8,   //
-      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0, 2,  6,  10, 12, 14, 4,  8,   //
-      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0, 4,  6,  10, 12, 14, 2,  8,   //
-      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0, 2,  4,  6,  10, 12, 14, 8,   //
-      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0, 8,  10, 12, 14, 2,  4,  6,   //
-      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0, 2,  8,  10, 12, 14, 4,  6,   //
-      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0, 4,  8,  10, 12, 14, 2,  6,   //
-      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0, 2,  4,  8,  10, 12, 14, 6,   //
-      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0, 6,  8,  10, 12, 14, 2,  4,   //
-      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0, 2,  6,  8,  10, 12, 14, 4,   //
-      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0, 4,  6,  8,  10, 12, 14, 2,   //
-      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0, 2,  4,  6,  8,  10, 12, 14};
-
-  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
-  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
-  return BitCast(d, pairs + Set(du, 0x0100));
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
-                                       const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 256);
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Simd<uint16_t, N, 0> du;
-
-  // ARM does not provide an equivalent of AVX2 permutevar, so we need byte
-  // indices for VTBL (one vector's worth for each of 256 combinations of
-  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
-  // store lane indices and convert to byte indices (2*lane + 0..1), with the
-  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
-  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
-  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
-  // is likely more costly than the higher cache footprint from storing bytes.
-  alignas(16) constexpr uint8_t table[256 * 8] = {
-      // PrintCompressNot16x8Tables
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
-      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
-      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
-      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
-      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
-      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
-      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
-      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
-      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
-      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
-      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
-      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
-      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
-      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
-      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
-      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
-      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
-      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
-      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
-      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
-      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
-      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
-      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
-      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
-      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
-      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
-      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
-      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
-      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
-      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
-      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
-      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
-      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
-      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
-      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
-      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
-      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
-      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
-      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
-      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
-      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
-      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
-      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
-      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
-      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
-      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
-      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
-      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
-      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
-      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
-      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
-      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
-      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
-      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
-      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
-      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
-      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
-      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
-      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
-      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
-      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
-      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
-      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
-      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
-      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
-      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
-      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
-      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
-      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
-      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
-      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
-      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
-      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
-      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
-      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
-      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
-      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
-      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
-      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
-      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
-      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
-      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
-      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
-      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
-      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
-      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
-      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
-      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
-      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
-      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
-      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
-      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
-      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
-      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
-      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
-      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
-      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
-      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
-      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
-      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
-      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
-      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
-      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
-      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
-      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
-      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
-      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
-      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
-      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
-      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
-      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
-      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
-      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
-      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
-      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
-      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
-      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
-      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
-      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
-      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
-      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
-
-  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
-  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
-  return BitCast(d, pairs + Set(du, 0x0100));
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
-                                    const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 16);
-
-  // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
-      // PrintCompress32x4Tables
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
-      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
-      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
-      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
-      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
-      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
-      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
-                                       const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 16);
-
-  // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
-      // PrintCompressNot32x4Tables
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
-      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
-      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
-      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
-      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
-      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
-      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
-      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
-      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
-      12, 13, 14, 15};
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
-                                    const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[64] = {
-      // PrintCompress64x2Tables
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
-                                       const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
-      // PrintCompressNot64x2Tables
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-#endif
-
-// Helper function called by both Compress and CompressStore - avoids a
-// redundant BitsFromMask in the latter.
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
-  const auto idx =
-      detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
-  using D = Simd<T, N, 0>;
-  const RebindToSigned<D> di;
-  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
-  const auto idx =
-      detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
-  using D = Simd<T, N, 0>;
-  const RebindToSigned<D> di;
-  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-}
-
-}  // namespace detail
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-// Two lanes: conditional swap
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
-  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
-  const Simd<T, N, 0> d;
-  const Vec128<T, N> m = VecFromMask(d, mask);
-  const Vec128<T, N> maskL = DupEven(m);
-  const Vec128<T, N> maskH = DupOdd(m);
-  const Vec128<T, N> swap = AndNot(maskL, maskH);
-  return IfVecThenElse(swap, Shuffle01(v), v);
-}
-
-// General case
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
-  return detail::Compress(v, detail::BitsFromMask(mask));
-}
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-// Two lanes: conditional swap
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
-  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
-  const Full128<T> d;
-  const Vec128<T> m = VecFromMask(d, mask);
-  const Vec128<T> maskL = DupEven(m);
-  const Vec128<T> maskH = DupOdd(m);
-  const Vec128<T> swap = AndNot(maskH, maskL);
-  return IfVecThenElse(swap, Shuffle01(v), v);
-}
-
-// General case
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
-  // For partial vectors, we cannot pull the Not() into the table because
-  // BitsFromMask clears the upper bits.
-  if (N < 16 / sizeof(T)) {
-    return detail::Compress(v, detail::BitsFromMask(Not(mask)));
-  }
-  return detail::CompressNot(v, detail::BitsFromMask(mask));
-}
-
-// ------------------------------ CompressBlocksNot
-HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
-                                           Mask128<uint64_t> /* m */) {
-  return v;
-}
-
-// ------------------------------ CompressBits
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
-                                     const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return detail::Compress(v, mask_bits);
-}
-
-// ------------------------------ CompressStore
-template <typename T, size_t N>
-HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
-                             Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  StoreU(detail::Compress(v, mask_bits), d, unaligned);
-  return PopCount(mask_bits);
-}
-
-// ------------------------------ CompressBlendedStore
-template <typename T, size_t N>
-HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                                    Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
-  using TU = TFromD<decltype(du)>;
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  const size_t count = PopCount(mask_bits);
-  const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
-  const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
-  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
-  return count;
-}
-
-// ------------------------------ CompressBitsStore
-
-template <typename T, size_t N>
-HWY_API size_t CompressBitsStore(Vec128<T, N> v,
-                                 const uint8_t* HWY_RESTRICT bits,
-                                 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  StoreU(detail::Compress(v, mask_bits), d, unaligned);
-  return PopCount(mask_bits);
-}
-
-// ------------------------------ LoadInterleaved2
-
-// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#else
-#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#endif
-
-namespace detail {
-#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
-#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from
-
-#if HWY_ARCH_ARM_A64
-#define HWY_IF_LOAD_INT(T, N) HWY_IF_GE64(T, N)
-#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
-#else
-// Exclude 64x2 and f64x1, which are only supported on aarch64
-#define HWY_IF_LOAD_INT(T, N) \
-  hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
-#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
-  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
-  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)       \
-  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
-  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
-#endif  // HWY_ARCH_ARM_A64
-
-// Must return raw tuple because Tuple2 lack a ctor, and we cannot use
-// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
-// void.
-#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
-  decltype(Tuple2<type##_t, size>().raw)
-// Tuple tag arg allows overloading (cannot just overload on return type)
-#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
-  const type##_t *from, Tuple2<type##_t, size>
-HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
-#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
-#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
-
-#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
-  decltype(Tuple3<type##_t, size>().raw)
-#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
-  const type##_t *from, Tuple3<type##_t, size>
-HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
-#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
-#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
-
-#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
-  decltype(Tuple4<type##_t, size>().raw)
-#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
-  const type##_t *from, Tuple4<type##_t, size>
-HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
-#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
-#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
-
-#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
-#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
-#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
-}  // namespace detail
-
-template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
-HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
-                              const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
-                              Vec128<T, N>& v1) {
-  auto raw = detail::LoadInterleaved2(unaligned, detail::Tuple2<T, N>());
-  v0 = Vec128<T, N>(raw.val[0]);
-  v1 = Vec128<T, N>(raw.val[1]);
-}
-
-// <= 32 bits: avoid loading more than N bytes by copying to buffer
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void LoadInterleaved2(Simd<T, N, 0> /*tag*/,
-                              const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
-                              Vec128<T, N>& v1) {
-  // The smallest vector registers are 64-bits and we want space for two.
-  alignas(16) T buf[2 * 8 / sizeof(T)] = {};
-  CopyBytes<N * 2 * sizeof(T)>(unaligned, buf);
-  auto raw = detail::LoadInterleaved2(buf, detail::Tuple2<T, N>());
-  v0 = Vec128<T, N>(raw.val[0]);
-  v1 = Vec128<T, N>(raw.val[1]);
-}
-
-#if HWY_ARCH_ARM_V7
-// 64x2: split into two 64x1
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void LoadInterleaved2(Full128<T> d, T* HWY_RESTRICT unaligned,
-                              Vec128<T>& v0, Vec128<T>& v1) {
-  const Half<decltype(d)> dh;
-  VFromD<decltype(dh)> v00, v10, v01, v11;
-  LoadInterleaved2(dh, unaligned, v00, v10);
-  LoadInterleaved2(dh, unaligned + 2, v01, v11);
-  v0 = Combine(d, v01, v00);
-  v1 = Combine(d, v11, v10);
-}
-#endif  // HWY_ARCH_ARM_V7
-
-// ------------------------------ LoadInterleaved3
-
-template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
-                              const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
-                              Vec128<T, N>& v1, Vec128<T, N>& v2) {
-  auto raw = detail::LoadInterleaved3(unaligned, detail::Tuple3<T, N>());
-  v0 = Vec128<T, N>(raw.val[0]);
-  v1 = Vec128<T, N>(raw.val[1]);
-  v2 = Vec128<T, N>(raw.val[2]);
-}
-
-// <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> /*tag*/,
-                              const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
-                              Vec128<T, N>& v1, Vec128<T, N>& v2) {
-  // The smallest vector registers are 64-bits and we want space for three.
-  alignas(16) T buf[3 * 8 / sizeof(T)] = {};
-  CopyBytes<N * 3 * sizeof(T)>(unaligned, buf);
-  auto raw = detail::LoadInterleaved3(buf, detail::Tuple3<T, N>());
-  v0 = Vec128<T, N>(raw.val[0]);
-  v1 = Vec128<T, N>(raw.val[1]);
-  v2 = Vec128<T, N>(raw.val[2]);
-}
-
-#if HWY_ARCH_ARM_V7
-// 64x2: split into two 64x1
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void LoadInterleaved3(Full128<T> d, const T* HWY_RESTRICT unaligned,
-                              Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
-  const Half<decltype(d)> dh;
-  VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
-  LoadInterleaved3(dh, unaligned, v00, v10, v20);
-  LoadInterleaved3(dh, unaligned + 3, v01, v11, v21);
-  v0 = Combine(d, v01, v00);
-  v1 = Combine(d, v11, v10);
-  v2 = Combine(d, v21, v20);
-}
-#endif  // HWY_ARCH_ARM_V7
-
-// ------------------------------ LoadInterleaved4
-
-template <typename T, size_t N, HWY_IF_LOAD_INT(T, N)>
-HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
-                              const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
-                              Vec128<T, N>& v1, Vec128<T, N>& v2,
-                              Vec128<T, N>& v3) {
-  auto raw = detail::LoadInterleaved4(unaligned, detail::Tuple4<T, N>());
-  v0 = Vec128<T, N>(raw.val[0]);
-  v1 = Vec128<T, N>(raw.val[1]);
-  v2 = Vec128<T, N>(raw.val[2]);
-  v3 = Vec128<T, N>(raw.val[3]);
-}
-
-// <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void LoadInterleaved4(Simd<T, N, 0> /*tag*/,
-                              const T* HWY_RESTRICT unaligned, Vec128<T, N>& v0,
-                              Vec128<T, N>& v1, Vec128<T, N>& v2,
-                              Vec128<T, N>& v3) {
-  alignas(16) T buf[4 * 8 / sizeof(T)] = {};
-  CopyBytes<N * 4 * sizeof(T)>(unaligned, buf);
-  auto raw = detail::LoadInterleaved4(buf, detail::Tuple4<T, N>());
-  v0 = Vec128<T, N>(raw.val[0]);
-  v1 = Vec128<T, N>(raw.val[1]);
-  v2 = Vec128<T, N>(raw.val[2]);
-  v3 = Vec128<T, N>(raw.val[3]);
-}
-
-#if HWY_ARCH_ARM_V7
-// 64x2: split into two 64x1
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void LoadInterleaved4(Full128<T> d, const T* HWY_RESTRICT unaligned,
-                              Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
-                              Vec128<T>& v3) {
-  const Half<decltype(d)> dh;
-  VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
-  LoadInterleaved4(dh, unaligned, v00, v10, v20, v30);
-  LoadInterleaved4(dh, unaligned + 4, v01, v11, v21, v31);
-  v0 = Combine(d, v01, v00);
-  v1 = Combine(d, v11, v10);
-  v2 = Combine(d, v21, v20);
-  v3 = Combine(d, v31, v30);
-}
-#endif  // HWY_ARCH_ARM_V7
-
-#undef HWY_IF_LOAD_INT
-
-// ------------------------------ StoreInterleaved2
-
-namespace detail {
-#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
-#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
-#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
-
-#if HWY_ARCH_ARM_A64
-#define HWY_IF_STORE_INT(T, N) HWY_IF_GE64(T, N)
-#define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
-#else
-// Exclude 64x2 and f64x1, which are only supported on aarch64
-#define HWY_IF_STORE_INT(T, N) \
-  hwy::EnableIf<N * sizeof(T) >= 8 && (N == 1 || sizeof(T) < 8)>* = nullptr
-#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
-  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)     \
-  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)    \
-  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)        \
-  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)  \
-  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
-#endif  // HWY_ARCH_ARM_A64
-
-#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
-  Tuple2<type##_t, size> tup, type##_t *to
-HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT)
-#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
-
-#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
-  Tuple3<type##_t, size> tup, type##_t *to
-HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT)
-#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
-
-#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
-  Tuple4<type##_t, size> tup, type##_t *to
-HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT)
-#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT
-
-#undef HWY_NEON_DEF_FUNCTION_STORE_INT
-#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
-#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
-#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
-}  // namespace detail
-
-template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
-HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
-  detail::StoreInterleaved2(tup, unaligned);
-}
-
-// <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  alignas(16) T buf[2 * 8 / sizeof(T)];
-  detail::Tuple2<T, N> tup = {{{v0.raw, v1.raw}}};
-  detail::StoreInterleaved2(tup, buf);
-  CopyBytes<N * 2 * sizeof(T)>(buf, unaligned);
-}
-
-#if HWY_ARCH_ARM_V7
-// 64x2: split into two 64x1
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void StoreInterleaved2(const Vec128<T> v0, const Vec128<T> v1,
-                               Full128<T> d, T* HWY_RESTRICT unaligned) {
-  const Half<decltype(d)> dh;
-  StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, unaligned);
-  StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, unaligned + 2);
-}
-#endif  // HWY_ARCH_ARM_V7
-
-// ------------------------------ StoreInterleaved3
-
-template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
-HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
-  detail::StoreInterleaved3(tup, unaligned);
-}
-
-// <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               const Vec128<T, N> v2, Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  alignas(16) T buf[3 * 8 / sizeof(T)];
-  detail::Tuple3<T, N> tup = {{{v0.raw, v1.raw, v2.raw}}};
-  detail::StoreInterleaved3(tup, buf);
-  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
-}
-
-#if HWY_ARCH_ARM_V7
-// 64x2: split into two 64x1
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void StoreInterleaved3(const Vec128<T> v0, const Vec128<T> v1,
-                               const Vec128<T> v2, Full128<T> d,
-                               T* HWY_RESTRICT unaligned) {
-  const Half<decltype(d)> dh;
-  StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
-                    unaligned);
-  StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
-                    unaligned + 3);
-}
-#endif  // HWY_ARCH_ARM_V7
-
-// ------------------------------ StoreInterleaved4
-
-template <typename T, size_t N, HWY_IF_STORE_INT(T, N)>
-HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               const Vec128<T, N> v2, const Vec128<T, N> v3,
-                               Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
-  detail::StoreInterleaved4(tup, unaligned);
-}
-
-// <= 32 bits: avoid writing more than N bytes by copying to buffer
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               const Vec128<T, N> v2, const Vec128<T, N> v3,
-                               Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  alignas(16) T buf[4 * 8 / sizeof(T)];
-  detail::Tuple4<T, N> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
-  detail::StoreInterleaved4(tup, buf);
-  CopyBytes<N * 4 * sizeof(T)>(buf, unaligned);
-}
-
-#if HWY_ARCH_ARM_V7
-// 64x2: split into two 64x1
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void StoreInterleaved4(const Vec128<T> v0, const Vec128<T> v1,
-                               const Vec128<T> v2, const Vec128<T> v3,
-                               Full128<T> d, T* HWY_RESTRICT unaligned) {
-  const Half<decltype(d)> dh;
-  StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
-                    LowerHalf(dh, v3), dh, unaligned);
-  StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
-                    UpperHalf(dh, v3), dh, unaligned + 4);
-}
-#endif  // HWY_ARCH_ARM_V7
-
-#undef HWY_IF_STORE_INT
-
-// ------------------------------ Lt128
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
-                               Vec128<T, N> b) {
-  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
-  // Truth table of Eq and Lt for Hi and Lo u64.
-  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
-  // =H =L cH cL  | out = cH | (=H & cL)
-  //  0  0  0  0  |  0
-  //  0  0  0  1  |  0
-  //  0  0  1  0  |  1
-  //  0  0  1  1  |  1
-  //  0  1  0  0  |  0
-  //  0  1  0  1  |  0
-  //  0  1  1  0  |  1
-  //  1  0  0  0  |  0
-  //  1  0  0  1  |  1
-  //  1  1  0  0  |  0
-  const Mask128<T, N> eqHL = Eq(a, b);
-  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
-  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
-  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
-  // comparison result leftwards requires only 4. IfThenElse compiles to the
-  // same code as OrAnd().
-  const Vec128<T, N> ltLx = DupEven(ltHL);
-  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
-  return MaskFromVec(DupOdd(outHx));
-}
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
-                                    Vec128<T, N> b) {
-  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
-  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
-}
-
-// ------------------------------ Eq128
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
-                               Vec128<T, N> b) {
-  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
-  const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
-  return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
-}
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
-                                    Vec128<T, N> b) {
-  const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
-  return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
-}
-
-// ------------------------------ Min128, Max128 (Lt128)
-
-// Without a native OddEven, it seems infeasible to go faster than Lt128.
-template <class D>
-HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128(d, a, b), a, b);
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128(d, b, a), a, b);
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128Upper(d, a, b), a, b);
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128Upper(d, b, a), a, b);
-}
-
-namespace detail {  // for code folding
-#if HWY_ARCH_ARM_V7
-#undef vuzp1_s8
-#undef vuzp1_u8
-#undef vuzp1_s16
-#undef vuzp1_u16
-#undef vuzp1_s32
-#undef vuzp1_u32
-#undef vuzp1_f32
-#undef vuzp1q_s8
-#undef vuzp1q_u8
-#undef vuzp1q_s16
-#undef vuzp1q_u16
-#undef vuzp1q_s32
-#undef vuzp1q_u32
-#undef vuzp1q_f32
-#undef vuzp2_s8
-#undef vuzp2_u8
-#undef vuzp2_s16
-#undef vuzp2_u16
-#undef vuzp2_s32
-#undef vuzp2_u32
-#undef vuzp2_f32
-#undef vuzp2q_s8
-#undef vuzp2q_u8
-#undef vuzp2q_s16
-#undef vuzp2q_u16
-#undef vuzp2q_s32
-#undef vuzp2q_u32
-#undef vuzp2q_f32
-#undef vzip1_s8
-#undef vzip1_u8
-#undef vzip1_s16
-#undef vzip1_u16
-#undef vzip1_s32
-#undef vzip1_u32
-#undef vzip1_f32
-#undef vzip1q_s8
-#undef vzip1q_u8
-#undef vzip1q_s16
-#undef vzip1q_u16
-#undef vzip1q_s32
-#undef vzip1q_u32
-#undef vzip1q_f32
-#undef vzip2_s8
-#undef vzip2_u8
-#undef vzip2_s16
-#undef vzip2_u16
-#undef vzip2_s32
-#undef vzip2_u32
-#undef vzip2_f32
-#undef vzip2q_s8
-#undef vzip2q_u8
-#undef vzip2q_s16
-#undef vzip2q_u16
-#undef vzip2q_s32
-#undef vzip2q_u32
-#undef vzip2q_f32
-#endif
-
-#undef HWY_NEON_BUILD_ARG_1
-#undef HWY_NEON_BUILD_ARG_2
-#undef HWY_NEON_BUILD_ARG_3
-#undef HWY_NEON_BUILD_PARAM_1
-#undef HWY_NEON_BUILD_PARAM_2
-#undef HWY_NEON_BUILD_PARAM_3
-#undef HWY_NEON_BUILD_RET_1
-#undef HWY_NEON_BUILD_RET_2
-#undef HWY_NEON_BUILD_RET_3
-#undef HWY_NEON_BUILD_TPL_1
-#undef HWY_NEON_BUILD_TPL_2
-#undef HWY_NEON_BUILD_TPL_3
-#undef HWY_NEON_DEF_FUNCTION
-#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
-#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
-#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
-#undef HWY_NEON_DEF_FUNCTION_INTS
-#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
-#undef HWY_NEON_DEF_FUNCTION_INT_16
-#undef HWY_NEON_DEF_FUNCTION_INT_32
-#undef HWY_NEON_DEF_FUNCTION_INT_8
-#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
-#undef HWY_NEON_DEF_FUNCTION_TPL
-#undef HWY_NEON_DEF_FUNCTION_UIF81632
-#undef HWY_NEON_DEF_FUNCTION_UINTS
-#undef HWY_NEON_DEF_FUNCTION_UINT_16
-#undef HWY_NEON_DEF_FUNCTION_UINT_32
-#undef HWY_NEON_DEF_FUNCTION_UINT_8
-#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
-#undef HWY_NEON_EVAL
-}  // namespace detail
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/arm_sve-inl.h b/third_party/highway/hwy/ops/arm_sve-inl.h
deleted file mode 100644 (file)
index 5491620..0000000
+++ /dev/null
@@ -1,3040 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// ARM SVE[2] vectors (length not known at compile time).
-// External include guard in highway.h - see comment there.
-
-#include <arm_sve.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/base.h"
-#include "hwy/ops/shared-inl.h"
-
-// If running on hardware whose vector length is known to be a power of two, we
-// can skip fixups for non-power of two sizes.
-#undef HWY_SVE_IS_POW2
-#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-#define HWY_SVE_IS_POW2 1
-#else
-#define HWY_SVE_IS_POW2 0
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <class V>
-struct DFromV_t {};  // specialized in macros
-template <class V>
-using DFromV = typename DFromV_t<RemoveConst<V>>::type;
-
-template <class V>
-using TFromV = TFromD<DFromV<V>>;
-
-// ================================================== MACROS
-
-// Generate specializations and function definitions using X macros. Although
-// harder to read and debug, writing everything manually is too bulky.
-
-namespace detail {  // for code folding
-
-// Unsigned:
-#define HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP) X_MACRO(uint, u, 8, 8, NAME, OP)
-#define HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP) X_MACRO(uint, u, 16, 8, NAME, OP)
-#define HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP) \
-  X_MACRO(uint, u, 32, 16, NAME, OP)
-#define HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP) \
-  X_MACRO(uint, u, 64, 32, NAME, OP)
-
-// Signed:
-#define HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP) X_MACRO(int, s, 8, 8, NAME, OP)
-#define HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP) X_MACRO(int, s, 16, 8, NAME, OP)
-#define HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP) X_MACRO(int, s, 32, 16, NAME, OP)
-#define HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP) X_MACRO(int, s, 64, 32, NAME, OP)
-
-// Float:
-#define HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP) \
-  X_MACRO(float, f, 16, 16, NAME, OP)
-#define HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP) \
-  X_MACRO(float, f, 32, 16, NAME, OP)
-#define HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP) \
-  X_MACRO(float, f, 64, 32, NAME, OP)
-
-// For all element sizes:
-#define HWY_SVE_FOREACH_U(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH_I(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH_F(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_F16(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
-
-// Commonly used type categories for a given element size:
-#define HWY_SVE_FOREACH_UI08(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_U08(X_MACRO, NAME, OP)        \
-  HWY_SVE_FOREACH_I08(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH_UI16(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_U16(X_MACRO, NAME, OP)        \
-  HWY_SVE_FOREACH_I16(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_U32(X_MACRO, NAME, OP)        \
-  HWY_SVE_FOREACH_I32(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_U64(X_MACRO, NAME, OP)        \
-  HWY_SVE_FOREACH_I64(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH_UIF3264(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_UI32(X_MACRO, NAME, OP)          \
-  HWY_SVE_FOREACH_UI64(X_MACRO, NAME, OP)          \
-  HWY_SVE_FOREACH_F32(X_MACRO, NAME, OP)           \
-  HWY_SVE_FOREACH_F64(X_MACRO, NAME, OP)
-
-// Commonly used type categories:
-#define HWY_SVE_FOREACH_UI(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)        \
-  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH_IF(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)        \
-  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
-
-#define HWY_SVE_FOREACH(X_MACRO, NAME, OP) \
-  HWY_SVE_FOREACH_U(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_I(X_MACRO, NAME, OP)     \
-  HWY_SVE_FOREACH_F(X_MACRO, NAME, OP)
-
-// Assemble types for use in x-macros
-#define HWY_SVE_T(BASE, BITS) BASE##BITS##_t
-#define HWY_SVE_D(BASE, BITS, N, POW2) Simd<HWY_SVE_T(BASE, BITS), N, POW2>
-#define HWY_SVE_V(BASE, BITS) sv##BASE##BITS##_t
-
-}  // namespace detail
-
-#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP) \
-  template <>                                            \
-  struct DFromV_t<HWY_SVE_V(BASE, BITS)> {               \
-    using type = ScalableTag<HWY_SVE_T(BASE, BITS)>;     \
-  };
-
-HWY_SVE_FOREACH(HWY_SPECIALIZE, _, _)
-#undef HWY_SPECIALIZE
-
-// Note: _x (don't-care value for inactive lanes) avoids additional MOVPRFX
-// instructions, and we anyway only use it when the predicate is ptrue.
-
-// vector = f(vector), e.g. Not
-#define HWY_SVE_RETV_ARGPV(BASE, CHAR, BITS, HALF, NAME, OP)    \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);   \
-  }
-#define HWY_SVE_RETV_ARGV(BASE, CHAR, BITS, HALF, NAME, OP)     \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
-    return sv##OP##_##CHAR##BITS(v);                            \
-  }
-
-// vector = f(vector, scalar), e.g. detail::AddN
-#define HWY_SVE_RETV_ARGPVN(BASE, CHAR, BITS, HALF, NAME, OP)    \
-  HWY_API HWY_SVE_V(BASE, BITS)                                  \
-      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) {   \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
-  }
-#define HWY_SVE_RETV_ARGVN(BASE, CHAR, BITS, HALF, NAME, OP)   \
-  HWY_API HWY_SVE_V(BASE, BITS)                                \
-      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
-    return sv##OP##_##CHAR##BITS(a, b);                        \
-  }
-
-// vector = f(vector, vector), e.g. Add
-#define HWY_SVE_RETV_ARGPVV(BASE, CHAR, BITS, HALF, NAME, OP)    \
-  HWY_API HWY_SVE_V(BASE, BITS)                                  \
-      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {   \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), a, b); \
-  }
-#define HWY_SVE_RETV_ARGVV(BASE, CHAR, BITS, HALF, NAME, OP)   \
-  HWY_API HWY_SVE_V(BASE, BITS)                                \
-      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
-    return sv##OP##_##CHAR##BITS(a, b);                        \
-  }
-
-// ------------------------------ Lanes
-
-namespace detail {
-
-// Returns actual lanes of a hardware vector without rounding to a power of two.
-HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<1> /* tag */) {
-  return svcntb_pat(SV_ALL);
-}
-HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<2> /* tag */) {
-  return svcnth_pat(SV_ALL);
-}
-HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<4> /* tag */) {
-  return svcntw_pat(SV_ALL);
-}
-HWY_INLINE size_t AllHardwareLanes(hwy::SizeTag<8> /* tag */) {
-  return svcntd_pat(SV_ALL);
-}
-
-// All-true mask from a macro
-#define HWY_SVE_ALL_PTRUE(BITS) svptrue_pat_b##BITS(SV_ALL)
-
-#if HWY_SVE_IS_POW2
-#define HWY_SVE_PTRUE(BITS) HWY_SVE_ALL_PTRUE(BITS)
-#else
-#define HWY_SVE_PTRUE(BITS) svptrue_pat_b##BITS(SV_POW2)
-
-// Returns actual lanes of a hardware vector, rounded down to a power of two.
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE size_t HardwareLanes() {
-  return svcntb_pat(SV_POW2);
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE size_t HardwareLanes() {
-  return svcnth_pat(SV_POW2);
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE size_t HardwareLanes() {
-  return svcntw_pat(SV_POW2);
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE size_t HardwareLanes() {
-  return svcntd_pat(SV_POW2);
-}
-
-#endif  // HWY_SVE_IS_POW2
-
-}  // namespace detail
-
-// Returns actual number of lanes after capping by N and shifting. May return 0
-// (e.g. for "1/8th" of a u32x4 - would be 1 for 1/8th of u32x8).
-#if HWY_TARGET == HWY_SVE_256
-template <typename T, size_t N, int kPow2>
-HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
-  return HWY_MIN(detail::ScaleByPower(32 / sizeof(T), kPow2), N);
-}
-#elif HWY_TARGET == HWY_SVE2_128
-template <typename T, size_t N, int kPow2>
-HWY_API constexpr size_t Lanes(Simd<T, N, kPow2> /* d */) {
-  return HWY_MIN(detail::ScaleByPower(16 / sizeof(T), kPow2), N);
-}
-#else
-template <typename T, size_t N, int kPow2>
-HWY_API size_t Lanes(Simd<T, N, kPow2> d) {
-  const size_t actual = detail::HardwareLanes<T>();
-  // Common case of full vectors: avoid any extra instructions.
-  if (detail::IsFull(d)) return actual;
-  return HWY_MIN(detail::ScaleByPower(actual, kPow2), N);
-}
-#endif  // HWY_TARGET
-
-// ================================================== MASK INIT
-
-// One mask bit per byte; only the one belonging to the lowest byte is valid.
-
-// ------------------------------ FirstN
-#define HWY_SVE_FIRSTN(BASE, CHAR, BITS, HALF, NAME, OP)                       \
-  template <size_t N, int kPow2>                                               \
-  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, size_t count) {     \
-    const size_t limit = detail::IsFull(d) ? count : HWY_MIN(Lanes(d), count); \
-    return sv##OP##_b##BITS##_u32(uint32_t{0}, static_cast<uint32_t>(limit));  \
-  }
-HWY_SVE_FOREACH(HWY_SVE_FIRSTN, FirstN, whilelt)
-#undef HWY_SVE_FIRSTN
-
-namespace detail {
-
-#define HWY_SVE_WRAP_PTRUE(BASE, CHAR, BITS, HALF, NAME, OP)            \
-  template <size_t N, int kPow2>                                        \
-  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {      \
-    return HWY_SVE_PTRUE(BITS);                                         \
-  }                                                                     \
-  template <size_t N, int kPow2>                                        \
-  HWY_API svbool_t All##NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) { \
-    return HWY_SVE_ALL_PTRUE(BITS);                                     \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_WRAP_PTRUE, PTrue, ptrue)  // return all-true
-#undef HWY_SVE_WRAP_PTRUE
-
-HWY_API svbool_t PFalse() { return svpfalse_b(); }
-
-// Returns all-true if d is HWY_FULL or FirstN(N) after capping N.
-//
-// This is used in functions that load/store memory; other functions (e.g.
-// arithmetic) can ignore d and use PTrue instead.
-template <class D>
-svbool_t MakeMask(D d) {
-  return IsFull(d) ? PTrue(d) : FirstN(d, Lanes(d));
-}
-
-}  // namespace detail
-
-// ================================================== INIT
-
-// ------------------------------ Set
-// vector = f(d, scalar), e.g. Set
-#define HWY_SVE_SET(BASE, CHAR, BITS, HALF, NAME, OP)                         \
-  template <size_t N, int kPow2>                                              \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
-                                     HWY_SVE_T(BASE, BITS) arg) {             \
-    return sv##OP##_##CHAR##BITS(arg);                                        \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_SET, Set, dup_n)
-#undef HWY_SVE_SET
-
-// Required for Zero and VFromD
-template <size_t N, int kPow2>
-svuint16_t Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg) {
-  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
-}
-
-template <class D>
-using VFromD = decltype(Set(D(), TFromD<D>()));
-
-// ------------------------------ Zero
-
-template <class D>
-VFromD<D> Zero(D d) {
-  return Set(d, 0);
-}
-
-// ------------------------------ Undefined
-
-#define HWY_SVE_UNDEFINED(BASE, CHAR, BITS, HALF, NAME, OP) \
-  template <size_t N, int kPow2>                            \
-  HWY_API HWY_SVE_V(BASE, BITS)                             \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */) {       \
-    return sv##OP##_##CHAR##BITS();                         \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_UNDEFINED, Undefined, undef)
-
-// ------------------------------ BitCast
-
-namespace detail {
-
-// u8: no change
-#define HWY_SVE_CAST_NOP(BASE, CHAR, BITS, HALF, NAME, OP)                \
-  HWY_API HWY_SVE_V(BASE, BITS) BitCastToByte(HWY_SVE_V(BASE, BITS) v) {  \
-    return v;                                                             \
-  }                                                                       \
-  template <size_t N, int kPow2>                                          \
-  HWY_API HWY_SVE_V(BASE, BITS) BitCastFromByte(                          \
-      HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
-    return v;                                                             \
-  }
-
-// All other types
-#define HWY_SVE_CAST(BASE, CHAR, BITS, HALF, NAME, OP)                        \
-  HWY_INLINE svuint8_t BitCastToByte(HWY_SVE_V(BASE, BITS) v) {               \
-    return sv##OP##_u8_##CHAR##BITS(v);                                       \
-  }                                                                           \
-  template <size_t N, int kPow2>                                              \
-  HWY_INLINE HWY_SVE_V(BASE, BITS)                                            \
-      BitCastFromByte(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svuint8_t v) { \
-    return sv##OP##_##CHAR##BITS##_u8(v);                                     \
-  }
-
-HWY_SVE_FOREACH_U08(HWY_SVE_CAST_NOP, _, _)
-HWY_SVE_FOREACH_I08(HWY_SVE_CAST, _, reinterpret)
-HWY_SVE_FOREACH_UI16(HWY_SVE_CAST, _, reinterpret)
-HWY_SVE_FOREACH_UI32(HWY_SVE_CAST, _, reinterpret)
-HWY_SVE_FOREACH_UI64(HWY_SVE_CAST, _, reinterpret)
-HWY_SVE_FOREACH_F(HWY_SVE_CAST, _, reinterpret)
-
-#undef HWY_SVE_CAST_NOP
-#undef HWY_SVE_CAST
-
-template <size_t N, int kPow2>
-HWY_INLINE svuint16_t BitCastFromByte(Simd<bfloat16_t, N, kPow2> /* d */,
-                                      svuint8_t v) {
-  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
-}
-
-}  // namespace detail
-
-template <class D, class FromV>
-HWY_API VFromD<D> BitCast(D d, FromV v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
-}
-
-// ================================================== LOGICAL
-
-// detail::*N() functions accept a scalar argument to avoid extra Set().
-
-// ------------------------------ Not
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPV, Not, not )  // NOLINT
-
-// ------------------------------ And
-
-namespace detail {
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, AndN, and_n)
-}  // namespace detail
-
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, And, and)
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V And(const V a, const V b) {
-  const DFromV<V> df;
-  const RebindToUnsigned<decltype(df)> du;
-  return BitCast(df, And(BitCast(du, a), BitCast(du, b)));
-}
-
-// ------------------------------ Or
-
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Or, orr)
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V Or(const V a, const V b) {
-  const DFromV<V> df;
-  const RebindToUnsigned<decltype(df)> du;
-  return BitCast(df, Or(BitCast(du, a), BitCast(du, b)));
-}
-
-// ------------------------------ Xor
-
-namespace detail {
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, XorN, eor_n)
-}  // namespace detail
-
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Xor, eor)
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V Xor(const V a, const V b) {
-  const DFromV<V> df;
-  const RebindToUnsigned<decltype(df)> du;
-  return BitCast(df, Xor(BitCast(du, a), BitCast(du, b)));
-}
-
-// ------------------------------ AndNot
-
-namespace detail {
-#define HWY_SVE_RETV_ARGPVN_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_API HWY_SVE_V(BASE, BITS)                                    \
-      NAME(HWY_SVE_T(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN_SWAP, AndNotN, bic_n)
-#undef HWY_SVE_RETV_ARGPVN_SWAP
-}  // namespace detail
-
-#define HWY_SVE_RETV_ARGPVV_SWAP(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_API HWY_SVE_V(BASE, BITS)                                    \
-      NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) {     \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), b, a);   \
-  }
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV_SWAP, AndNot, bic)
-#undef HWY_SVE_RETV_ARGPVV_SWAP
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V AndNot(const V a, const V b) {
-  const DFromV<V> df;
-  const RebindToUnsigned<decltype(df)> du;
-  return BitCast(df, AndNot(BitCast(du, a), BitCast(du, b)));
-}
-
-// ------------------------------ Or3
-template <class V>
-HWY_API V Or3(V o1, V o2, V o3) {
-  return Or(o1, Or(o2, o3));
-}
-
-// ------------------------------ OrAnd
-template <class V>
-HWY_API V OrAnd(const V o, const V a1, const V a2) {
-  return Or(o, And(a1, a2));
-}
-
-// ------------------------------ PopulationCount
-
-#ifdef HWY_NATIVE_POPCNT
-#undef HWY_NATIVE_POPCNT
-#else
-#define HWY_NATIVE_POPCNT
-#endif
-
-// Need to return original type instead of unsigned.
-#define HWY_SVE_POPCNT(BASE, CHAR, BITS, HALF, NAME, OP)               \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {        \
-    return BitCast(DFromV<decltype(v)>(),                              \
-                   sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v)); \
-  }
-HWY_SVE_FOREACH_UI(HWY_SVE_POPCNT, PopulationCount, cnt)
-#undef HWY_SVE_POPCNT
-
-// ================================================== SIGN
-
-// ------------------------------ Neg
-HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Neg, neg)
-
-// ------------------------------ Abs
-HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPV, Abs, abs)
-
-// ------------------------------ CopySign[ToAbs]
-
-template <class V>
-HWY_API V CopySign(const V magn, const V sign) {
-  const auto msb = SignBit(DFromV<V>());
-  return Or(AndNot(msb, magn), And(msb, sign));
-}
-
-template <class V>
-HWY_API V CopySignToAbs(const V abs, const V sign) {
-  const auto msb = SignBit(DFromV<V>());
-  return Or(abs, And(msb, sign));
-}
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Add
-
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN, AddN, add_n)
-}  // namespace detail
-
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Add, add)
-
-// ------------------------------ Sub
-
-namespace detail {
-// Can't use HWY_SVE_RETV_ARGPVN because caller wants to specify pg.
-#define HWY_SVE_RETV_ARGPVN_MASK(BASE, CHAR, BITS, HALF, NAME, OP)          \
-  HWY_API HWY_SVE_V(BASE, BITS)                                             \
-      NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
-    return sv##OP##_##CHAR##BITS##_z(pg, a, b);                             \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVN_MASK, SubN, sub_n)
-#undef HWY_SVE_RETV_ARGPVN_MASK
-}  // namespace detail
-
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGPVV, Sub, sub)
-
-// ------------------------------ SumsOf8
-HWY_API svuint64_t SumsOf8(const svuint8_t v) {
-  const ScalableTag<uint32_t> du32;
-  const ScalableTag<uint64_t> du64;
-  const svbool_t pg = detail::PTrue(du64);
-
-  const svuint32_t sums_of_4 = svdot_n_u32(Zero(du32), v, 1);
-  // Compute pairwise sum of u32 and extend to u64.
-  // TODO(janwas): on SVE2, we can instead use svaddp.
-  const svuint64_t hi = svlsr_n_u64_x(pg, BitCast(du64, sums_of_4), 32);
-  // Isolate the lower 32 bits (to be added to the upper 32 and zero-extended)
-  const svuint64_t lo = svextw_u64_x(pg, BitCast(du64, sums_of_4));
-  return Add(hi, lo);
-}
-
-// ------------------------------ SaturatedAdd
-
-HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)
-HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedAdd, qadd)
-
-// ------------------------------ SaturatedSub
-
-HWY_SVE_FOREACH_UI08(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)
-HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGVV, SaturatedSub, qsub)
-
-// ------------------------------ AbsDiff
-HWY_SVE_FOREACH_IF(HWY_SVE_RETV_ARGPVV, AbsDiff, abd)
-
-// ------------------------------ ShiftLeft[Same]
-
-#define HWY_SVE_SHIFT_N(BASE, CHAR, BITS, HALF, NAME, OP)               \
-  template <int kBits>                                                  \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) {         \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, kBits);    \
-  }                                                                     \
-  HWY_API HWY_SVE_V(BASE, BITS)                                         \
-      NAME##Same(HWY_SVE_V(BASE, BITS) v, HWY_SVE_T(uint, BITS) bits) { \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v, bits);     \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT_N, ShiftLeft, lsl_n)
-
-// ------------------------------ ShiftRight[Same]
-
-HWY_SVE_FOREACH_U(HWY_SVE_SHIFT_N, ShiftRight, lsr_n)
-HWY_SVE_FOREACH_I(HWY_SVE_SHIFT_N, ShiftRight, asr_n)
-
-#undef HWY_SVE_SHIFT_N
-
-// ------------------------------ RotateRight
-
-// TODO(janwas): svxar on SVE2
-template <int kBits, class V>
-HWY_API V RotateRight(const V v) {
-  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
-}
-
-// ------------------------------ Shl/r
-
-#define HWY_SVE_SHIFT(BASE, CHAR, BITS, HALF, NAME, OP)           \
-  HWY_API HWY_SVE_V(BASE, BITS)                                   \
-      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(BASE, BITS) bits) { \
-    const RebindToUnsigned<DFromV<decltype(v)>> du;               \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v,      \
-                                     BitCast(du, bits));          \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_SHIFT, Shl, lsl)
-
-HWY_SVE_FOREACH_U(HWY_SVE_SHIFT, Shr, lsr)
-HWY_SVE_FOREACH_I(HWY_SVE_SHIFT, Shr, asr)
-
-#undef HWY_SVE_SHIFT
-
-// ------------------------------ Min/Max
-
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Min, min)
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVV, Max, max)
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Min, minnm)
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Max, maxnm)
-
-namespace detail {
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MinN, min_n)
-HWY_SVE_FOREACH_UI(HWY_SVE_RETV_ARGPVN, MaxN, max_n)
-}  // namespace detail
-
-// ------------------------------ Mul
-HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, Mul, mul)
-HWY_SVE_FOREACH_UIF3264(HWY_SVE_RETV_ARGPVV, Mul, mul)
-
-// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
-#ifdef HWY_NATIVE_I64MULLO
-#undef HWY_NATIVE_I64MULLO
-#else
-#define HWY_NATIVE_I64MULLO
-#endif
-
-// ------------------------------ MulHigh
-HWY_SVE_FOREACH_UI16(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
-namespace detail {
-HWY_SVE_FOREACH_UI32(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
-HWY_SVE_FOREACH_U64(HWY_SVE_RETV_ARGPVV, MulHigh, mulh)
-}  // namespace detail
-
-// ------------------------------ MulFixedPoint15
-HWY_API svint16_t MulFixedPoint15(svint16_t a, svint16_t b) {
-#if HWY_TARGET == HWY_SVE2
-  return svqrdmulh_s16(a, b);
-#else
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-
-  const svuint16_t lo = BitCast(du, Mul(a, b));
-  const svint16_t hi = MulHigh(a, b);
-  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
-  // carry that into the result. Instead isolate the top two bits because only
-  // they can influence the result.
-  const svuint16_t lo_top2 = ShiftRight<14>(lo);
-  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
-  const svuint16_t rounding = ShiftRight<1>(detail::AddN(lo_top2, 1));
-  return Add(Add(hi, hi), BitCast(d, rounding));
-#endif
-}
-
-// ------------------------------ Div
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPVV, Div, div)
-
-// ------------------------------ ApproximateReciprocal
-HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocal, recpe)
-
-// ------------------------------ Sqrt
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Sqrt, sqrt)
-
-// ------------------------------ ApproximateReciprocalSqrt
-HWY_SVE_FOREACH_F32(HWY_SVE_RETV_ARGV, ApproximateReciprocalSqrt, rsqrte)
-
-// ------------------------------ MulAdd
-#define HWY_SVE_FMA(BASE, CHAR, BITS, HALF, NAME, OP)                   \
-  HWY_API HWY_SVE_V(BASE, BITS)                                         \
-      NAME(HWY_SVE_V(BASE, BITS) mul, HWY_SVE_V(BASE, BITS) x,          \
-           HWY_SVE_V(BASE, BITS) add) {                                 \
-    return sv##OP##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), x, mul, add); \
-  }
-
-HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulAdd, mad)
-
-// ------------------------------ NegMulAdd
-HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulAdd, msb)
-
-// ------------------------------ MulSub
-HWY_SVE_FOREACH_F(HWY_SVE_FMA, MulSub, nmsb)
-
-// ------------------------------ NegMulSub
-HWY_SVE_FOREACH_F(HWY_SVE_FMA, NegMulSub, nmad)
-
-#undef HWY_SVE_FMA
-
-// ------------------------------ Round etc.
-
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Round, rintn)
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Floor, rintm)
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Ceil, rintp)
-HWY_SVE_FOREACH_F(HWY_SVE_RETV_ARGPV, Trunc, rintz)
-
-// ================================================== MASK
-
-// ------------------------------ RebindMask
-template <class D, typename MFrom>
-HWY_API svbool_t RebindMask(const D /*d*/, const MFrom mask) {
-  return mask;
-}
-
-// ------------------------------ Mask logical
-
-HWY_API svbool_t Not(svbool_t m) {
-  // We don't know the lane type, so assume 8-bit. For larger types, this will
-  // de-canonicalize the predicate, i.e. set bits to 1 even though they do not
-  // correspond to the lowest byte in the lane. Per ARM, such bits are ignored.
-  return svnot_b_z(HWY_SVE_PTRUE(8), m);
-}
-HWY_API svbool_t And(svbool_t a, svbool_t b) {
-  return svand_b_z(b, b, a);  // same order as AndNot for consistency
-}
-HWY_API svbool_t AndNot(svbool_t a, svbool_t b) {
-  return svbic_b_z(b, b, a);  // reversed order like NEON
-}
-HWY_API svbool_t Or(svbool_t a, svbool_t b) {
-  return svsel_b(a, a, b);  // a ? true : b
-}
-HWY_API svbool_t Xor(svbool_t a, svbool_t b) {
-  return svsel_b(a, svnand_b_z(a, a, b), b);  // a ? !(a & b) : b.
-}
-
-// ------------------------------ CountTrue
-
-#define HWY_SVE_COUNT_TRUE(BASE, CHAR, BITS, HALF, NAME, OP)           \
-  template <size_t N, int kPow2>                                       \
-  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d, svbool_t m) { \
-    return sv##OP##_b##BITS(detail::MakeMask(d), m);                   \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE, CountTrue, cntp)
-#undef HWY_SVE_COUNT_TRUE
-
-// For 16-bit Compress: full vector, not limited to SV_POW2.
-namespace detail {
-
-#define HWY_SVE_COUNT_TRUE_FULL(BASE, CHAR, BITS, HALF, NAME, OP)            \
-  template <size_t N, int kPow2>                                             \
-  HWY_API size_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, svbool_t m) { \
-    return sv##OP##_b##BITS(svptrue_b##BITS(), m);                           \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_COUNT_TRUE_FULL, CountTrueFull, cntp)
-#undef HWY_SVE_COUNT_TRUE_FULL
-
-}  // namespace detail
-
-// ------------------------------ AllFalse
-template <class D>
-HWY_API bool AllFalse(D d, svbool_t m) {
-  return !svptest_any(detail::MakeMask(d), m);
-}
-
-// ------------------------------ AllTrue
-template <class D>
-HWY_API bool AllTrue(D d, svbool_t m) {
-  return CountTrue(d, m) == Lanes(d);
-}
-
-// ------------------------------ FindFirstTrue
-template <class D>
-HWY_API intptr_t FindFirstTrue(D d, svbool_t m) {
-  return AllFalse(d, m) ? intptr_t{-1}
-                        : static_cast<intptr_t>(
-                              CountTrue(d, svbrkb_b_z(detail::MakeMask(d), m)));
-}
-
-// ------------------------------ IfThenElse
-#define HWY_SVE_IF_THEN_ELSE(BASE, CHAR, BITS, HALF, NAME, OP)                \
-  HWY_API HWY_SVE_V(BASE, BITS)                                               \
-      NAME(svbool_t m, HWY_SVE_V(BASE, BITS) yes, HWY_SVE_V(BASE, BITS) no) { \
-    return sv##OP##_##CHAR##BITS(m, yes, no);                                 \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_IF_THEN_ELSE, IfThenElse, sel)
-#undef HWY_SVE_IF_THEN_ELSE
-
-// ------------------------------ IfThenElseZero
-template <class V>
-HWY_API V IfThenElseZero(const svbool_t mask, const V yes) {
-  return IfThenElse(mask, yes, Zero(DFromV<V>()));
-}
-
-// ------------------------------ IfThenZeroElse
-template <class V>
-HWY_API V IfThenZeroElse(const svbool_t mask, const V no) {
-  return IfThenElse(mask, Zero(DFromV<V>()), no);
-}
-
-// ================================================== COMPARE
-
-// mask = f(vector, vector)
-#define HWY_SVE_COMPARE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
-  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_V(BASE, BITS) b) { \
-    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
-  }
-#define HWY_SVE_COMPARE_N(BASE, CHAR, BITS, HALF, NAME, OP)                 \
-  HWY_API svbool_t NAME(HWY_SVE_V(BASE, BITS) a, HWY_SVE_T(BASE, BITS) b) { \
-    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(BITS), a, b);                \
-  }
-
-// ------------------------------ Eq
-HWY_SVE_FOREACH(HWY_SVE_COMPARE, Eq, cmpeq)
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, EqN, cmpeq_n)
-}  // namespace detail
-
-// ------------------------------ Ne
-HWY_SVE_FOREACH(HWY_SVE_COMPARE, Ne, cmpne)
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, NeN, cmpne_n)
-}  // namespace detail
-
-// ------------------------------ Lt
-HWY_SVE_FOREACH(HWY_SVE_COMPARE, Lt, cmplt)
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_COMPARE_N, LtN, cmplt_n)
-}  // namespace detail
-
-// ------------------------------ Le
-HWY_SVE_FOREACH_F(HWY_SVE_COMPARE, Le, cmple)
-
-#undef HWY_SVE_COMPARE
-#undef HWY_SVE_COMPARE_N
-
-// ------------------------------ Gt/Ge (swapped order)
-template <class V>
-HWY_API svbool_t Gt(const V a, const V b) {
-  return Lt(b, a);
-}
-template <class V>
-HWY_API svbool_t Ge(const V a, const V b) {
-  return Le(b, a);
-}
-
-// ------------------------------ TestBit
-template <class V>
-HWY_API svbool_t TestBit(const V a, const V bit) {
-  return detail::NeN(And(a, bit), 0);
-}
-
-// ------------------------------ MaskFromVec (Ne)
-template <class V>
-HWY_API svbool_t MaskFromVec(const V v) {
-  return detail::NeN(v, static_cast<TFromV<V>>(0));
-}
-
-// ------------------------------ VecFromMask
-template <class D>
-HWY_API VFromD<D> VecFromMask(const D d, svbool_t mask) {
-  const RebindToSigned<D> di;
-  // This generates MOV imm, whereas svdup_n_s8_z generates MOV scalar, which
-  // requires an extra instruction plus M0 pipeline.
-  return BitCast(d, IfThenElseZero(mask, Set(di, -1)));
-}
-
-// ------------------------------ IfVecThenElse (MaskFromVec, IfThenElse)
-
-#if HWY_TARGET == HWY_SVE2
-
-#define HWY_SVE_IF_VEC(BASE, CHAR, BITS, HALF, NAME, OP)          \
-  HWY_API HWY_SVE_V(BASE, BITS)                                   \
-      NAME(HWY_SVE_V(BASE, BITS) mask, HWY_SVE_V(BASE, BITS) yes, \
-           HWY_SVE_V(BASE, BITS) no) {                            \
-    return sv##OP##_##CHAR##BITS(yes, no, mask);                  \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_IF_VEC, IfVecThenElse, bsl)
-#undef HWY_SVE_IF_VEC
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(
-      d, IfVecThenElse(BitCast(du, mask), BitCast(du, yes), BitCast(du, no)));
-}
-
-#else
-
-template <class V>
-HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
-  return Or(And(mask, yes), AndNot(mask, no));
-}
-
-#endif  // HWY_TARGET == HWY_SVE2
-
-// ------------------------------ Floating-point classification (Ne)
-
-template <class V>
-HWY_API svbool_t IsNaN(const V v) {
-  return Ne(v, v);  // could also use cmpuo
-}
-
-template <class V>
-HWY_API svbool_t IsInf(const V v) {
-  using T = TFromV<V>;
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, detail::EqN(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
-}
-
-// Returns whether normal/subnormal/zero.
-template <class V>
-HWY_API svbool_t IsFinite(const V v) {
-  using T = TFromV<V>;
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, detail::LtN(exp, hwy::MaxExponentField<T>()));
-}
-
-// ================================================== MEMORY
-
-// ------------------------------ Load/MaskedLoad/LoadDup128/Store/Stream
-
-#define HWY_SVE_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)     \
-  template <size_t N, int kPow2>                           \
-  HWY_API HWY_SVE_V(BASE, BITS)                            \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,              \
-           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
-    return sv##OP##_##CHAR##BITS(detail::MakeMask(d), p);  \
-  }
-
-#define HWY_SVE_MASKED_LOAD(BASE, CHAR, BITS, HALF, NAME, OP)   \
-  template <size_t N, int kPow2>                                \
-  HWY_API HWY_SVE_V(BASE, BITS)                                 \
-      NAME(svbool_t m, HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
-           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {      \
-    return sv##OP##_##CHAR##BITS(m, p);                         \
-  }
-
-#define HWY_SVE_LOAD_DUP128(BASE, CHAR, BITS, HALF, NAME, OP) \
-  template <size_t N, int kPow2>                              \
-  HWY_API HWY_SVE_V(BASE, BITS)                               \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,           \
-           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {    \
-    /* All-true predicate to load all 128 bits. */            \
-    return sv##OP##_##CHAR##BITS(HWY_SVE_PTRUE(8), p);        \
-  }
-
-#define HWY_SVE_STORE(BASE, CHAR, BITS, HALF, NAME, OP)       \
-  template <size_t N, int kPow2>                              \
-  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                  \
-                    HWY_SVE_D(BASE, BITS, N, kPow2) d,        \
-                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) { \
-    sv##OP##_##CHAR##BITS(detail::MakeMask(d), p, v);         \
-  }
-
-#define HWY_SVE_BLENDED_STORE(BASE, CHAR, BITS, HALF, NAME, OP) \
-  template <size_t N, int kPow2>                                \
-  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v, svbool_t m,        \
-                    HWY_SVE_D(BASE, BITS, N, kPow2) /* d */,    \
-                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT p) {   \
-    sv##OP##_##CHAR##BITS(m, p, v);                             \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_LOAD, Load, ld1)
-HWY_SVE_FOREACH(HWY_SVE_MASKED_LOAD, MaskedLoad, ld1)
-HWY_SVE_FOREACH(HWY_SVE_LOAD_DUP128, LoadDup128, ld1rq)
-HWY_SVE_FOREACH(HWY_SVE_STORE, Store, st1)
-HWY_SVE_FOREACH(HWY_SVE_STORE, Stream, stnt1)
-HWY_SVE_FOREACH(HWY_SVE_BLENDED_STORE, BlendedStore, st1)
-
-#undef HWY_SVE_LOAD
-#undef HWY_SVE_MASKED_LOAD
-#undef HWY_SVE_LOAD_DUP128
-#undef HWY_SVE_STORE
-#undef HWY_SVE_BLENDED_STORE
-
-// BF16 is the same as svuint16_t because BF16 is optional before v8.6.
-template <size_t N, int kPow2>
-HWY_API svuint16_t Load(Simd<bfloat16_t, N, kPow2> d,
-                        const bfloat16_t* HWY_RESTRICT p) {
-  return Load(RebindToUnsigned<decltype(d)>(),
-              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
-}
-
-template <size_t N, int kPow2>
-HWY_API void Store(svuint16_t v, Simd<bfloat16_t, N, kPow2> d,
-                   bfloat16_t* HWY_RESTRICT p) {
-  Store(v, RebindToUnsigned<decltype(d)>(),
-        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
-}
-
-// ------------------------------ Load/StoreU
-
-// SVE only requires lane alignment, not natural alignment of the entire
-// vector.
-template <class D>
-HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-template <class V, class D>
-HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
-  Store(v, d, p);
-}
-
-// ------------------------------ ScatterOffset/Index
-
-#define HWY_SVE_SCATTER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
-  template <size_t N, int kPow2>                                             \
-  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v,                                 \
-                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                       \
-                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,               \
-                    HWY_SVE_V(int, BITS) offset) {                           \
-    sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, offset, \
-                                          v);                                \
-  }
-
-#define HWY_SVE_SCATTER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)                \
-  template <size_t N, int kPow2>                                               \
-  HWY_API void NAME(                                                           \
-      HWY_SVE_V(BASE, BITS) v, HWY_SVE_D(BASE, BITS, N, kPow2) d,              \
-      HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base, HWY_SVE_V(int, BITS) index) { \
-    sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, index, v); \
-  }
-
-HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_OFFSET, ScatterOffset, st1_scatter)
-HWY_SVE_FOREACH_UIF3264(HWY_SVE_SCATTER_INDEX, ScatterIndex, st1_scatter)
-#undef HWY_SVE_SCATTER_OFFSET
-#undef HWY_SVE_SCATTER_INDEX
-
-// ------------------------------ GatherOffset/Index
-
-#define HWY_SVE_GATHER_OFFSET(BASE, CHAR, BITS, HALF, NAME, OP)             \
-  template <size_t N, int kPow2>                                            \
-  HWY_API HWY_SVE_V(BASE, BITS)                                             \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                               \
-           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                 \
-           HWY_SVE_V(int, BITS) offset) {                                   \
-    return sv##OP##_s##BITS##offset_##CHAR##BITS(detail::MakeMask(d), base, \
-                                                 offset);                   \
-  }
-#define HWY_SVE_GATHER_INDEX(BASE, CHAR, BITS, HALF, NAME, OP)             \
-  template <size_t N, int kPow2>                                           \
-  HWY_API HWY_SVE_V(BASE, BITS)                                            \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                              \
-           const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT base,                \
-           HWY_SVE_V(int, BITS) index) {                                   \
-    return sv##OP##_s##BITS##index_##CHAR##BITS(detail::MakeMask(d), base, \
-                                                index);                    \
-  }
-
-HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_OFFSET, GatherOffset, ld1_gather)
-HWY_SVE_FOREACH_UIF3264(HWY_SVE_GATHER_INDEX, GatherIndex, ld1_gather)
-#undef HWY_SVE_GATHER_OFFSET
-#undef HWY_SVE_GATHER_INDEX
-
-// ------------------------------ LoadInterleaved2
-
-// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#else
-#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#endif
-
-#define HWY_SVE_LOAD2(BASE, CHAR, BITS, HALF, NAME, OP)                       \
-  template <size_t N, int kPow2>                                              \
-  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
-                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
-                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1) { \
-    const sv##BASE##BITS##x2_t tuple =                                        \
-        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
-    v0 = svget2(tuple, 0);                                                    \
-    v1 = svget2(tuple, 1);                                                    \
-  }
-HWY_SVE_FOREACH(HWY_SVE_LOAD2, LoadInterleaved2, ld2)
-
-#undef HWY_SVE_LOAD2
-
-// ------------------------------ LoadInterleaved3
-
-#define HWY_SVE_LOAD3(BASE, CHAR, BITS, HALF, NAME, OP)                     \
-  template <size_t N, int kPow2>                                            \
-  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                      \
-                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,   \
-                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1, \
-                    HWY_SVE_V(BASE, BITS) & v2) {                           \
-    const sv##BASE##BITS##x3_t tuple =                                      \
-        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);              \
-    v0 = svget3(tuple, 0);                                                  \
-    v1 = svget3(tuple, 1);                                                  \
-    v2 = svget3(tuple, 2);                                                  \
-  }
-HWY_SVE_FOREACH(HWY_SVE_LOAD3, LoadInterleaved3, ld3)
-
-#undef HWY_SVE_LOAD3
-
-// ------------------------------ LoadInterleaved4
-
-#define HWY_SVE_LOAD4(BASE, CHAR, BITS, HALF, NAME, OP)                       \
-  template <size_t N, int kPow2>                                              \
-  HWY_API void NAME(HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
-                    const HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned,     \
-                    HWY_SVE_V(BASE, BITS) & v0, HWY_SVE_V(BASE, BITS) & v1,   \
-                    HWY_SVE_V(BASE, BITS) & v2, HWY_SVE_V(BASE, BITS) & v3) { \
-    const sv##BASE##BITS##x4_t tuple =                                        \
-        sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned);                \
-    v0 = svget4(tuple, 0);                                                    \
-    v1 = svget4(tuple, 1);                                                    \
-    v2 = svget4(tuple, 2);                                                    \
-    v3 = svget4(tuple, 3);                                                    \
-  }
-HWY_SVE_FOREACH(HWY_SVE_LOAD4, LoadInterleaved4, ld4)
-
-#undef HWY_SVE_LOAD4
-
-// ------------------------------ StoreInterleaved2
-
-#define HWY_SVE_STORE2(BASE, CHAR, BITS, HALF, NAME, OP)                 \
-  template <size_t N, int kPow2>                                         \
-  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,  \
-                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                   \
-                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {    \
-    const sv##BASE##BITS##x2_t tuple = svcreate2##_##CHAR##BITS(v0, v1); \
-    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, tuple);        \
-  }
-HWY_SVE_FOREACH(HWY_SVE_STORE2, StoreInterleaved2, st2)
-
-#undef HWY_SVE_STORE2
-
-// ------------------------------ StoreInterleaved3
-
-#define HWY_SVE_STORE3(BASE, CHAR, BITS, HALF, NAME, OP)                      \
-  template <size_t N, int kPow2>                                              \
-  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1,       \
-                    HWY_SVE_V(BASE, BITS) v2,                                 \
-                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                        \
-                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {         \
-    const sv##BASE##BITS##x3_t triple = svcreate3##_##CHAR##BITS(v0, v1, v2); \
-    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, triple);            \
-  }
-HWY_SVE_FOREACH(HWY_SVE_STORE3, StoreInterleaved3, st3)
-
-#undef HWY_SVE_STORE3
-
-// ------------------------------ StoreInterleaved4
-
-#define HWY_SVE_STORE4(BASE, CHAR, BITS, HALF, NAME, OP)                \
-  template <size_t N, int kPow2>                                        \
-  HWY_API void NAME(HWY_SVE_V(BASE, BITS) v0, HWY_SVE_V(BASE, BITS) v1, \
-                    HWY_SVE_V(BASE, BITS) v2, HWY_SVE_V(BASE, BITS) v3, \
-                    HWY_SVE_D(BASE, BITS, N, kPow2) d,                  \
-                    HWY_SVE_T(BASE, BITS) * HWY_RESTRICT unaligned) {   \
-    const sv##BASE##BITS##x4_t quad =                                   \
-        svcreate4##_##CHAR##BITS(v0, v1, v2, v3);                       \
-    sv##OP##_##CHAR##BITS(detail::MakeMask(d), unaligned, quad);        \
-  }
-HWY_SVE_FOREACH(HWY_SVE_STORE4, StoreInterleaved4, st4)
-
-#undef HWY_SVE_STORE4
-
-// ================================================== CONVERT
-
-// ------------------------------ PromoteTo
-
-// Same sign
-#define HWY_SVE_PROMOTE_TO(BASE, CHAR, BITS, HALF, NAME, OP)                \
-  template <size_t N, int kPow2>                                            \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(                                       \
-      HWY_SVE_D(BASE, BITS, N, kPow2) /* tag */, HWY_SVE_V(BASE, HALF) v) { \
-    return sv##OP##_##CHAR##BITS(v);                                        \
-  }
-
-HWY_SVE_FOREACH_UI16(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
-HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
-HWY_SVE_FOREACH_UI64(HWY_SVE_PROMOTE_TO, PromoteTo, unpklo)
-
-// 2x
-template <size_t N, int kPow2>
-HWY_API svuint32_t PromoteTo(Simd<uint32_t, N, kPow2> dto, svuint8_t vfrom) {
-  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
-  return PromoteTo(dto, PromoteTo(d2, vfrom));
-}
-template <size_t N, int kPow2>
-HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svint8_t vfrom) {
-  const RepartitionToWide<DFromV<decltype(vfrom)>> d2;
-  return PromoteTo(dto, PromoteTo(d2, vfrom));
-}
-
-// Sign change
-template <size_t N, int kPow2>
-HWY_API svint16_t PromoteTo(Simd<int16_t, N, kPow2> dto, svuint8_t vfrom) {
-  const RebindToUnsigned<decltype(dto)> du;
-  return BitCast(dto, PromoteTo(du, vfrom));
-}
-template <size_t N, int kPow2>
-HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint16_t vfrom) {
-  const RebindToUnsigned<decltype(dto)> du;
-  return BitCast(dto, PromoteTo(du, vfrom));
-}
-template <size_t N, int kPow2>
-HWY_API svint32_t PromoteTo(Simd<int32_t, N, kPow2> dto, svuint8_t vfrom) {
-  const Repartition<uint16_t, DFromV<decltype(vfrom)>> du16;
-  const Repartition<int16_t, decltype(du16)> di16;
-  return PromoteTo(dto, BitCast(di16, PromoteTo(du16, vfrom)));
-}
-
-// ------------------------------ PromoteTo F
-
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipLower, zip1)
-}  // namespace detail
-
-template <size_t N, int kPow2>
-HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> /* d */,
-                              const svfloat16_t v) {
-  // svcvt* expects inputs in even lanes, whereas Highway wants lower lanes, so
-  // first replicate each lane once.
-  const svfloat16_t vv = detail::ZipLower(v, v);
-  return svcvt_f32_f16_x(detail::PTrue(Simd<float16_t, N, kPow2>()), vv);
-}
-
-template <size_t N, int kPow2>
-HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
-                              const svfloat32_t v) {
-  const svfloat32_t vv = detail::ZipLower(v, v);
-  return svcvt_f64_f32_x(detail::PTrue(Simd<float32_t, N, kPow2>()), vv);
-}
-
-template <size_t N, int kPow2>
-HWY_API svfloat64_t PromoteTo(Simd<float64_t, N, kPow2> /* d */,
-                              const svint32_t v) {
-  const svint32_t vv = detail::ZipLower(v, v);
-  return svcvt_f64_s32_x(detail::PTrue(Simd<int32_t, N, kPow2>()), vv);
-}
-
-// For 16-bit Compress
-namespace detail {
-HWY_SVE_FOREACH_UI32(HWY_SVE_PROMOTE_TO, PromoteUpperTo, unpkhi)
-#undef HWY_SVE_PROMOTE_TO
-
-template <size_t N, int kPow2>
-HWY_API svfloat32_t PromoteUpperTo(Simd<float, N, kPow2> df, svfloat16_t v) {
-  const RebindToUnsigned<decltype(df)> du;
-  const RepartitionToNarrow<decltype(du)> dn;
-  return BitCast(df, PromoteUpperTo(du, BitCast(dn, v)));
-}
-
-}  // namespace detail
-
-// ------------------------------ DemoteTo U
-
-namespace detail {
-
-// Saturates unsigned vectors to half/quarter-width TN.
-template <typename TN, class VU>
-VU SaturateU(VU v) {
-  return detail::MinN(v, static_cast<TFromV<VU>>(LimitsMax<TN>()));
-}
-
-// Saturates unsigned vectors to half/quarter-width TN.
-template <typename TN, class VI>
-VI SaturateI(VI v) {
-  return detail::MinN(detail::MaxN(v, LimitsMin<TN>()), LimitsMax<TN>());
-}
-
-}  // namespace detail
-
-template <size_t N, int kPow2>
-HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint16_t v) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  using TN = TFromD<decltype(dn)>;
-  // First clamp negative numbers to zero and cast to unsigned.
-  const svuint16_t clamped = BitCast(du, detail::MaxN(v, 0));
-  // Saturate to unsigned-max and halve the width.
-  const svuint8_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
-  return svuzp1_u8(vn, vn);
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint16_t DemoteTo(Simd<uint16_t, N, kPow2> dn, const svint32_t v) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  using TN = TFromD<decltype(dn)>;
-  // First clamp negative numbers to zero and cast to unsigned.
-  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
-  // Saturate to unsigned-max and halve the width.
-  const svuint16_t vn = BitCast(dn, detail::SaturateU<TN>(clamped));
-  return svuzp1_u16(vn, vn);
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint8_t DemoteTo(Simd<uint8_t, N, kPow2> dn, const svint32_t v) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  const RepartitionToNarrow<decltype(du)> d2;
-  using TN = TFromD<decltype(dn)>;
-  // First clamp negative numbers to zero and cast to unsigned.
-  const svuint32_t clamped = BitCast(du, detail::MaxN(v, 0));
-  // Saturate to unsigned-max and quarter the width.
-  const svuint16_t cast16 = BitCast(d2, detail::SaturateU<TN>(clamped));
-  const svuint8_t x2 = BitCast(dn, svuzp1_u16(cast16, cast16));
-  return svuzp1_u8(x2, x2);
-}
-
-HWY_API svuint8_t U8FromU32(const svuint32_t v) {
-  const DFromV<svuint32_t> du32;
-  const RepartitionToNarrow<decltype(du32)> du16;
-  const RepartitionToNarrow<decltype(du16)> du8;
-
-  const svuint16_t cast16 = BitCast(du16, v);
-  const svuint16_t x2 = svuzp1_u16(cast16, cast16);
-  const svuint8_t cast8 = BitCast(du8, x2);
-  return svuzp1_u8(cast8, cast8);
-}
-
-// ------------------------------ Truncations
-
-template <size_t N, int kPow2>
-HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
-                             const svuint64_t v) {
-  const DFromV<svuint8_t> d;
-  const svuint8_t v1 = BitCast(d, v);
-  const svuint8_t v2 = svuzp1_u8(v1, v1);
-  const svuint8_t v3 = svuzp1_u8(v2, v2);
-  return svuzp1_u8(v3, v3);
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
-                              const svuint64_t v) {
-  const DFromV<svuint16_t> d;
-  const svuint16_t v1 = BitCast(d, v);
-  const svuint16_t v2 = svuzp1_u16(v1, v1);
-  return svuzp1_u16(v2, v2);
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint32_t TruncateTo(Simd<uint32_t, N, kPow2> /* tag */,
-                              const svuint64_t v) {
-  const DFromV<svuint32_t> d;
-  const svuint32_t v1 = BitCast(d, v);
-  return svuzp1_u32(v1, v1);
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
-                             const svuint32_t v) {
-  const DFromV<svuint8_t> d;
-  const svuint8_t v1 = BitCast(d, v);
-  const svuint8_t v2 = svuzp1_u8(v1, v1);
-  return svuzp1_u8(v2, v2);
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint16_t TruncateTo(Simd<uint16_t, N, kPow2> /* tag */,
-                              const svuint32_t v) {
-  const DFromV<svuint16_t> d;
-  const svuint16_t v1 = BitCast(d, v);
-  return svuzp1_u16(v1, v1);
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint8_t TruncateTo(Simd<uint8_t, N, kPow2> /* tag */,
-                             const svuint16_t v) {
-  const DFromV<svuint8_t> d;
-  const svuint8_t v1 = BitCast(d, v);
-  return svuzp1_u8(v1, v1);
-}
-
-// ------------------------------ DemoteTo I
-
-template <size_t N, int kPow2>
-HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint16_t v) {
-#if HWY_TARGET == HWY_SVE2
-  const svint8_t vn = BitCast(dn, svqxtnb_s16(v));
-#else
-  using TN = TFromD<decltype(dn)>;
-  const svint8_t vn = BitCast(dn, detail::SaturateI<TN>(v));
-#endif
-  return svuzp1_s8(vn, vn);
-}
-
-template <size_t N, int kPow2>
-HWY_API svint16_t DemoteTo(Simd<int16_t, N, kPow2> dn, const svint32_t v) {
-#if HWY_TARGET == HWY_SVE2
-  const svint16_t vn = BitCast(dn, svqxtnb_s32(v));
-#else
-  using TN = TFromD<decltype(dn)>;
-  const svint16_t vn = BitCast(dn, detail::SaturateI<TN>(v));
-#endif
-  return svuzp1_s16(vn, vn);
-}
-
-template <size_t N, int kPow2>
-HWY_API svint8_t DemoteTo(Simd<int8_t, N, kPow2> dn, const svint32_t v) {
-  const RepartitionToWide<decltype(dn)> d2;
-#if HWY_TARGET == HWY_SVE2
-  const svint16_t cast16 = BitCast(d2, svqxtnb_s16(svqxtnb_s32(v)));
-#else
-  using TN = TFromD<decltype(dn)>;
-  const svint16_t cast16 = BitCast(d2, detail::SaturateI<TN>(v));
-#endif
-  const svint8_t v2 = BitCast(dn, svuzp1_s16(cast16, cast16));
-  return BitCast(dn, svuzp1_s8(v2, v2));
-}
-
-// ------------------------------ ConcatEven/ConcatOdd
-
-// WARNING: the upper half of these needs fixing up (uzp1/uzp2 use the
-// full vector length, not rounded down to a power of two as we require).
-namespace detail {
-
-#define HWY_SVE_CONCAT_EVERY_SECOND(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_INLINE HWY_SVE_V(BASE, BITS)                                    \
-      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) {      \
-    return sv##OP##_##CHAR##BITS(lo, hi);                             \
-  }
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEven, uzp1)
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOdd, uzp2)
-#if defined(__ARM_FEATURE_SVE_MATMUL_FP64)
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatEvenBlocks, uzp1q)
-HWY_SVE_FOREACH(HWY_SVE_CONCAT_EVERY_SECOND, ConcatOddBlocks, uzp2q)
-#endif
-#undef HWY_SVE_CONCAT_EVERY_SECOND
-
-// Used to slide up / shift whole register left; mask indicates which range
-// to take from lo, and the rest is filled from hi starting at its lowest.
-#define HWY_SVE_SPLICE(BASE, CHAR, BITS, HALF, NAME, OP)                   \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(                                      \
-      HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo, svbool_t mask) { \
-    return sv##OP##_##CHAR##BITS(mask, lo, hi);                            \
-  }
-HWY_SVE_FOREACH(HWY_SVE_SPLICE, Splice, splice)
-#undef HWY_SVE_SPLICE
-
-}  // namespace detail
-
-template <class D>
-HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
-#if HWY_SVE_IS_POW2
-  (void)d;
-  return detail::ConcatOdd(hi, lo);
-#else
-  const VFromD<D> hi_odd = detail::ConcatOdd(hi, hi);
-  const VFromD<D> lo_odd = detail::ConcatOdd(lo, lo);
-  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
-#endif
-}
-
-template <class D>
-HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
-#if HWY_SVE_IS_POW2
-  (void)d;
-  return detail::ConcatEven(hi, lo);
-#else
-  const VFromD<D> hi_odd = detail::ConcatEven(hi, hi);
-  const VFromD<D> lo_odd = detail::ConcatEven(lo, lo);
-  return detail::Splice(hi_odd, lo_odd, FirstN(d, Lanes(d) / 2));
-#endif
-}
-
-// ------------------------------ DemoteTo F
-
-template <size_t N, int kPow2>
-HWY_API svfloat16_t DemoteTo(Simd<float16_t, N, kPow2> d, const svfloat32_t v) {
-  const svfloat16_t in_even = svcvt_f16_f32_x(detail::PTrue(d), v);
-  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
-}
-
-template <size_t N, int kPow2>
-HWY_API svuint16_t DemoteTo(Simd<bfloat16_t, N, kPow2> /* d */, svfloat32_t v) {
-  const svuint16_t in_even = BitCast(ScalableTag<uint16_t>(), v);
-  return detail::ConcatOdd(in_even, in_even);  // can ignore upper half of vec
-}
-
-template <size_t N, int kPow2>
-HWY_API svfloat32_t DemoteTo(Simd<float32_t, N, kPow2> d, const svfloat64_t v) {
-  const svfloat32_t in_even = svcvt_f32_f64_x(detail::PTrue(d), v);
-  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
-}
-
-template <size_t N, int kPow2>
-HWY_API svint32_t DemoteTo(Simd<int32_t, N, kPow2> d, const svfloat64_t v) {
-  const svint32_t in_even = svcvt_s32_f64_x(detail::PTrue(d), v);
-  return detail::ConcatEven(in_even, in_even);  // only low 1/2 of result valid
-}
-
-// ------------------------------ ConvertTo F
-
-#define HWY_SVE_CONVERT(BASE, CHAR, BITS, HALF, NAME, OP)                     \
-  /* signed integers */                                                       \
-  template <size_t N, int kPow2>                                              \
-  HWY_API HWY_SVE_V(BASE, BITS)                                               \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(int, BITS) v) { \
-    return sv##OP##_##CHAR##BITS##_s##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
-  }                                                                           \
-  /* unsigned integers */                                                     \
-  template <size_t N, int kPow2>                                              \
-  HWY_API HWY_SVE_V(BASE, BITS)                                               \
-      NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, HWY_SVE_V(uint, BITS) v) { \
-    return sv##OP##_##CHAR##BITS##_u##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
-  }                                                                           \
-  /* Truncates (rounds toward zero). */                                       \
-  template <size_t N, int kPow2>                                              \
-  HWY_API HWY_SVE_V(int, BITS)                                                \
-      NAME(HWY_SVE_D(int, BITS, N, kPow2) /* d */, HWY_SVE_V(BASE, BITS) v) { \
-    return sv##OP##_s##BITS##_##CHAR##BITS##_x(HWY_SVE_PTRUE(BITS), v);       \
-  }
-
-// API only requires f32 but we provide f64 for use by Iota.
-HWY_SVE_FOREACH_F(HWY_SVE_CONVERT, ConvertTo, cvt)
-#undef HWY_SVE_CONVERT
-
-// ------------------------------ NearestInt (Round, ConvertTo)
-template <class VF, class DI = RebindToSigned<DFromV<VF>>>
-HWY_API VFromD<DI> NearestInt(VF v) {
-  // No single instruction, round then truncate.
-  return ConvertTo(DI(), Round(v));
-}
-
-// ------------------------------ Iota (Add, ConvertTo)
-
-#define HWY_SVE_IOTA(BASE, CHAR, BITS, HALF, NAME, OP)                        \
-  template <size_t N, int kPow2>                                              \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /* d */, \
-                                     HWY_SVE_T(BASE, BITS) first) {           \
-    return sv##OP##_##CHAR##BITS(first, 1);                                   \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_IOTA, Iota, index)
-#undef HWY_SVE_IOTA
-
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
-  const RebindToSigned<D> di;
-  return detail::AddN(ConvertTo(d, Iota(di, 0)), first);
-}
-
-// ------------------------------ InterleaveLower
-
-template <class D, class V>
-HWY_API V InterleaveLower(D d, const V a, const V b) {
-  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-#if HWY_TARGET == HWY_SVE2_128
-  (void)d;
-  return detail::ZipLower(a, b);
-#else
-  // Move lower halves of blocks to lower half of vector.
-  const Repartition<uint64_t, decltype(d)> d64;
-  const auto a64 = BitCast(d64, a);
-  const auto b64 = BitCast(d64, b);
-  const auto a_blocks = detail::ConcatEven(a64, a64);  // only lower half needed
-  const auto b_blocks = detail::ConcatEven(b64, b64);
-  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
-#endif
-}
-
-template <class V>
-HWY_API V InterleaveLower(const V a, const V b) {
-  return InterleaveLower(DFromV<V>(), a, b);
-}
-
-// ------------------------------ InterleaveUpper
-
-// Only use zip2 if vector are a powers of two, otherwise getting the actual
-// "upper half" requires MaskUpperHalf.
-#if HWY_TARGET == HWY_SVE2_128
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, ZipUpper, zip2)
-}  // namespace detail
-#endif
-
-// Full vector: guaranteed to have at least one block
-template <class D, class V = VFromD<D>,
-          hwy::EnableIf<detail::IsFull(D())>* = nullptr>
-HWY_API V InterleaveUpper(D d, const V a, const V b) {
-#if HWY_TARGET == HWY_SVE2_128
-  (void)d;
-  return detail::ZipUpper(a, b);
-#else
-  // Move upper halves of blocks to lower half of vector.
-  const Repartition<uint64_t, decltype(d)> d64;
-  const auto a64 = BitCast(d64, a);
-  const auto b64 = BitCast(d64, b);
-  const auto a_blocks = detail::ConcatOdd(a64, a64);  // only lower half needed
-  const auto b_blocks = detail::ConcatOdd(b64, b64);
-  return detail::ZipLower(BitCast(d, a_blocks), BitCast(d, b_blocks));
-#endif
-}
-
-// Capped/fraction: need runtime check
-template <class D, class V = VFromD<D>,
-          hwy::EnableIf<!detail::IsFull(D())>* = nullptr>
-HWY_API V InterleaveUpper(D d, const V a, const V b) {
-  // Less than one block: treat as capped
-  if (Lanes(d) * sizeof(TFromD<D>) < 16) {
-    const Half<decltype(d)> d2;
-    return InterleaveLower(d, UpperHalf(d2, a), UpperHalf(d2, b));
-  }
-  return InterleaveUpper(DFromV<V>(), a, b);
-}
-
-// ================================================== COMBINE
-
-namespace detail {
-
-#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
-template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
-svbool_t MaskLowerHalf(D d) {
-  switch (Lanes(d)) {
-    case 32:
-      return svptrue_pat_b8(SV_VL16);
-    case 16:
-      return svptrue_pat_b8(SV_VL8);
-    case 8:
-      return svptrue_pat_b8(SV_VL4);
-    case 4:
-      return svptrue_pat_b8(SV_VL2);
-    default:
-      return svptrue_pat_b8(SV_VL1);
-  }
-}
-template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
-svbool_t MaskLowerHalf(D d) {
-  switch (Lanes(d)) {
-    case 16:
-      return svptrue_pat_b16(SV_VL8);
-    case 8:
-      return svptrue_pat_b16(SV_VL4);
-    case 4:
-      return svptrue_pat_b16(SV_VL2);
-    default:
-      return svptrue_pat_b16(SV_VL1);
-  }
-}
-template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
-svbool_t MaskLowerHalf(D d) {
-  switch (Lanes(d)) {
-    case 8:
-      return svptrue_pat_b32(SV_VL4);
-    case 4:
-      return svptrue_pat_b32(SV_VL2);
-    default:
-      return svptrue_pat_b32(SV_VL1);
-  }
-}
-template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-svbool_t MaskLowerHalf(D d) {
-  switch (Lanes(d)) {
-    case 4:
-      return svptrue_pat_b64(SV_VL2);
-    default:
-      return svptrue_pat_b64(SV_VL1);
-  }
-}
-#endif
-#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
-template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
-svbool_t MaskLowerHalf(D d) {
-  switch (Lanes(d)) {
-    case 16:
-      return svptrue_pat_b8(SV_VL8);
-    case 8:
-      return svptrue_pat_b8(SV_VL4);
-    case 4:
-      return svptrue_pat_b8(SV_VL2);
-    case 2:
-    case 1:
-    default:
-      return svptrue_pat_b8(SV_VL1);
-  }
-}
-template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
-svbool_t MaskLowerHalf(D d) {
-  switch (Lanes(d)) {
-    case 8:
-      return svptrue_pat_b16(SV_VL4);
-    case 4:
-      return svptrue_pat_b16(SV_VL2);
-    case 2:
-    case 1:
-    default:
-      return svptrue_pat_b16(SV_VL1);
-  }
-}
-template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
-svbool_t MaskLowerHalf(D d) {
-  return svptrue_pat_b32(Lanes(d) == 4 ? SV_VL2 : SV_VL1);
-}
-template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-svbool_t MaskLowerHalf(D /*d*/) {
-  return svptrue_pat_b64(SV_VL1);
-}
-#endif  // HWY_TARGET == HWY_SVE2_128
-#if HWY_TARGET != HWY_SVE_256 && HWY_TARGET != HWY_SVE2_128
-template <class D>
-svbool_t MaskLowerHalf(D d) {
-  return FirstN(d, Lanes(d) / 2);
-}
-#endif
-
-template <class D>
-svbool_t MaskUpperHalf(D d) {
-  // TODO(janwas): WHILEGE on pow2 SVE2
-  if (HWY_SVE_IS_POW2 && IsFull(d)) {
-    return Not(MaskLowerHalf(d));
-  }
-
-  // For Splice to work as intended, make sure bits above Lanes(d) are zero.
-  return AndNot(MaskLowerHalf(d), detail::MakeMask(d));
-}
-
-// Right-shift vector pair by constexpr; can be used to slide down (=N) or up
-// (=Lanes()-N).
-#define HWY_SVE_EXT(BASE, CHAR, BITS, HALF, NAME, OP)            \
-  template <size_t kIndex>                                       \
-  HWY_API HWY_SVE_V(BASE, BITS)                                  \
-      NAME(HWY_SVE_V(BASE, BITS) hi, HWY_SVE_V(BASE, BITS) lo) { \
-    return sv##OP##_##CHAR##BITS(lo, hi, kIndex);                \
-  }
-HWY_SVE_FOREACH(HWY_SVE_EXT, Ext, ext)
-#undef HWY_SVE_EXT
-
-}  // namespace detail
-
-// ------------------------------ ConcatUpperLower
-template <class D, class V>
-HWY_API V ConcatUpperLower(const D d, const V hi, const V lo) {
-  return IfThenElse(detail::MaskLowerHalf(d), lo, hi);
-}
-
-// ------------------------------ ConcatLowerLower
-template <class D, class V>
-HWY_API V ConcatLowerLower(const D d, const V hi, const V lo) {
-  if (detail::IsFull(d)) {
-#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
-    return detail::ConcatEvenBlocks(hi, lo);
-#endif
-#if HWY_TARGET == HWY_SVE2_128
-    const Repartition<uint64_t, D> du64;
-    const auto lo64 = BitCast(du64, lo);
-    return BitCast(d, InterleaveLower(du64, lo64, BitCast(du64, hi)));
-#endif
-  }
-  return detail::Splice(hi, lo, detail::MaskLowerHalf(d));
-}
-
-// ------------------------------ ConcatLowerUpper
-template <class D, class V>
-HWY_API V ConcatLowerUpper(const D d, const V hi, const V lo) {
-#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
-  if (detail::IsFull(d)) {
-    return detail::Ext<Lanes(d) / 2>(hi, lo);
-  }
-#endif
-  return detail::Splice(hi, lo, detail::MaskUpperHalf(d));
-}
-
-// ------------------------------ ConcatUpperUpper
-template <class D, class V>
-HWY_API V ConcatUpperUpper(const D d, const V hi, const V lo) {
-  if (detail::IsFull(d)) {
-#if defined(__ARM_FEATURE_SVE_MATMUL_FP64) && HWY_TARGET == HWY_SVE_256
-    return detail::ConcatOddBlocks(hi, lo);
-#endif
-#if HWY_TARGET == HWY_SVE2_128
-    const Repartition<uint64_t, D> du64;
-    const auto lo64 = BitCast(du64, lo);
-    return BitCast(d, InterleaveUpper(du64, lo64, BitCast(du64, hi)));
-#endif
-  }
-  const svbool_t mask_upper = detail::MaskUpperHalf(d);
-  const V lo_upper = detail::Splice(lo, lo, mask_upper);
-  return IfThenElse(mask_upper, hi, lo_upper);
-}
-
-// ------------------------------ Combine
-template <class D, class V2>
-HWY_API VFromD<D> Combine(const D d, const V2 hi, const V2 lo) {
-  return ConcatLowerLower(d, hi, lo);
-}
-
-// ------------------------------ ZeroExtendVector
-template <class D, class V>
-HWY_API V ZeroExtendVector(const D d, const V lo) {
-  return Combine(d, Zero(Half<D>()), lo);
-}
-
-// ------------------------------ Lower/UpperHalf
-
-template <class D2, class V>
-HWY_API V LowerHalf(D2 /* tag */, const V v) {
-  return v;
-}
-
-template <class V>
-HWY_API V LowerHalf(const V v) {
-  return v;
-}
-
-template <class D2, class V>
-HWY_API V UpperHalf(const D2 d2, const V v) {
-#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128  // constexpr Lanes
-  return detail::Ext<Lanes(d2)>(v, v);
-#else
-  return detail::Splice(v, v, detail::MaskUpperHalf(Twice<decltype(d2)>()));
-#endif
-}
-
-// ================================================== REDUCE
-
-// These return T, whereas the Highway op returns a broadcasted vector.
-namespace detail {
-#define HWY_SVE_REDUCE_ADD(BASE, CHAR, BITS, HALF, NAME, OP)                   \
-  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) {   \
-    /* The intrinsic returns [u]int64_t; truncate to T so we can broadcast. */ \
-    using T = HWY_SVE_T(BASE, BITS);                                           \
-    using TU = MakeUnsigned<T>;                                                \
-    constexpr uint64_t kMask = LimitsMax<TU>();                                \
-    return static_cast<T>(static_cast<TU>(                                     \
-        static_cast<uint64_t>(sv##OP##_##CHAR##BITS(pg, v)) & kMask));         \
-  }
-
-#define HWY_SVE_REDUCE(BASE, CHAR, BITS, HALF, NAME, OP)                     \
-  HWY_API HWY_SVE_T(BASE, BITS) NAME(svbool_t pg, HWY_SVE_V(BASE, BITS) v) { \
-    return sv##OP##_##CHAR##BITS(pg, v);                                     \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE_ADD, SumOfLanes, addv)
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, SumOfLanes, addv)
-
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MinOfLanes, minv)
-HWY_SVE_FOREACH_UI(HWY_SVE_REDUCE, MaxOfLanes, maxv)
-// NaN if all are
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MinOfLanes, minnmv)
-HWY_SVE_FOREACH_F(HWY_SVE_REDUCE, MaxOfLanes, maxnmv)
-
-#undef HWY_SVE_REDUCE
-#undef HWY_SVE_REDUCE_ADD
-}  // namespace detail
-
-template <class D, class V>
-V SumOfLanes(D d, V v) {
-  return Set(d, detail::SumOfLanes(detail::MakeMask(d), v));
-}
-
-template <class D, class V>
-V MinOfLanes(D d, V v) {
-  return Set(d, detail::MinOfLanes(detail::MakeMask(d), v));
-}
-
-template <class D, class V>
-V MaxOfLanes(D d, V v) {
-  return Set(d, detail::MaxOfLanes(detail::MakeMask(d), v));
-}
-
-
-// ================================================== SWIZZLE
-
-// ------------------------------ GetLane
-
-namespace detail {
-#define HWY_SVE_GET_LANE(BASE, CHAR, BITS, HALF, NAME, OP) \
-  HWY_INLINE HWY_SVE_T(BASE, BITS)                         \
-      NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) {       \
-    return sv##OP##_##CHAR##BITS(mask, v);                 \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_GET_LANE, GetLane, lasta)
-#undef HWY_SVE_GET_LANE
-}  // namespace detail
-
-template <class V>
-HWY_API TFromV<V> GetLane(V v) {
-  return detail::GetLane(v, detail::PFalse());
-}
-
-// ------------------------------ ExtractLane
-template <class V>
-HWY_API TFromV<V> ExtractLane(V v, size_t i) {
-  return detail::GetLane(v, FirstN(DFromV<V>(), i));
-}
-
-// ------------------------------ InsertLane (IfThenElse)
-template <class V>
-HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
-  const DFromV<V> d;
-  const auto is_i = detail::EqN(Iota(d, 0), static_cast<TFromV<V>>(i));
-  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
-}
-
-// ------------------------------ DupEven
-
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveEven, trn1)
-}  // namespace detail
-
-template <class V>
-HWY_API V DupEven(const V v) {
-  return detail::InterleaveEven(v, v);
-}
-
-// ------------------------------ DupOdd
-
-namespace detail {
-HWY_SVE_FOREACH(HWY_SVE_RETV_ARGVV, InterleaveOdd, trn2)
-}  // namespace detail
-
-template <class V>
-HWY_API V DupOdd(const V v) {
-  return detail::InterleaveOdd(v, v);
-}
-
-// ------------------------------ OddEven
-
-#if HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_SVE2
-
-#define HWY_SVE_ODD_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)          \
-  HWY_API HWY_SVE_V(BASE, BITS)                                     \
-      NAME(HWY_SVE_V(BASE, BITS) odd, HWY_SVE_V(BASE, BITS) even) { \
-    return sv##OP##_##CHAR##BITS(even, odd, /*xor=*/0);             \
-  }
-
-HWY_SVE_FOREACH_UI(HWY_SVE_ODD_EVEN, OddEven, eortb_n)
-#undef HWY_SVE_ODD_EVEN
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V OddEven(const V odd, const V even) {
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, OddEven(BitCast(du, odd), BitCast(du, even)));
-}
-
-#else
-
-template <class V>
-HWY_API V OddEven(const V odd, const V even) {
-  const auto odd_in_even = detail::Ext<1>(odd, odd);
-  return detail::InterleaveEven(even, odd_in_even);
-}
-
-#endif  // HWY_TARGET
-
-// ------------------------------ OddEvenBlocks
-template <class V>
-HWY_API V OddEvenBlocks(const V odd, const V even) {
-  const DFromV<V> d;
-#if HWY_TARGET == HWY_SVE_256
-  return ConcatUpperLower(d, odd, even);
-#elif HWY_TARGET == HWY_SVE2_128
-  (void)odd;
-  (void)d;
-  return even;
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  constexpr size_t kShift = CeilLog2(16 / sizeof(TU));
-  const auto idx_block = ShiftRight<kShift>(Iota(du, 0));
-  const auto lsb = detail::AndN(idx_block, static_cast<TU>(1));
-  const svbool_t is_even = detail::EqN(lsb, static_cast<TU>(0));
-  return IfThenElse(is_even, even, odd);
-#endif
-}
-
-// ------------------------------ TableLookupLanes
-
-template <class D, class VI>
-HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
-  using TI = TFromV<VI>;
-  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index/lane size mismatch");
-  const RebindToUnsigned<D> du;
-  const auto indices = BitCast(du, vec);
-#if HWY_IS_DEBUG_BUILD
-  HWY_DASSERT(AllTrue(du, detail::LtN(indices, static_cast<TI>(Lanes(d)))));
-#else
-  (void)d;
-#endif
-  return indices;
-}
-
-template <class D, typename TI>
-HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
-  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
-  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
-}
-
-// <32bit are not part of Highway API, but used in Broadcast.
-#define HWY_SVE_TABLE(BASE, CHAR, BITS, HALF, NAME, OP)          \
-  HWY_API HWY_SVE_V(BASE, BITS)                                  \
-      NAME(HWY_SVE_V(BASE, BITS) v, HWY_SVE_V(uint, BITS) idx) { \
-    return sv##OP##_##CHAR##BITS(v, idx);                        \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_TABLE, TableLookupLanes, tbl)
-#undef HWY_SVE_TABLE
-
-// ------------------------------ SwapAdjacentBlocks (TableLookupLanes)
-
-namespace detail {
-
-template <typename T, size_t N, int kPow2>
-constexpr size_t LanesPerBlock(Simd<T, N, kPow2> /* tag */) {
-  // We might have a capped vector smaller than a block, so honor that.
-  return HWY_MIN(16 / sizeof(T), detail::ScaleByPower(N, kPow2));
-}
-
-}  // namespace detail
-
-template <class V>
-HWY_API V SwapAdjacentBlocks(const V v) {
-  const DFromV<V> d;
-#if HWY_TARGET == HWY_SVE_256
-  return ConcatLowerUpper(d, v, v);
-#elif HWY_TARGET == HWY_SVE2_128
-  (void)d;
-  return v;
-#else
-  const RebindToUnsigned<decltype(d)> du;
-  constexpr auto kLanesPerBlock =
-      static_cast<TFromD<decltype(du)>>(detail::LanesPerBlock(d));
-  const VFromD<decltype(du)> idx = detail::XorN(Iota(du, 0), kLanesPerBlock);
-  return TableLookupLanes(v, idx);
-#endif
-}
-
-// ------------------------------ Reverse
-
-namespace detail {
-
-#define HWY_SVE_REVERSE(BASE, CHAR, BITS, HALF, NAME, OP)       \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
-    return sv##OP##_##CHAR##BITS(v);                            \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_REVERSE, ReverseFull, rev)
-#undef HWY_SVE_REVERSE
-
-}  // namespace detail
-
-template <class D, class V>
-HWY_API V Reverse(D d, V v) {
-  using T = TFromD<D>;
-  const auto reversed = detail::ReverseFull(v);
-  if (HWY_SVE_IS_POW2 && detail::IsFull(d)) return reversed;
-  // Shift right to remove extra (non-pow2 and remainder) lanes.
-  // TODO(janwas): on SVE2, use WHILEGE.
-  // Avoids FirstN truncating to the return vector size. Must also avoid Not
-  // because that is limited to SV_POW2.
-  const ScalableTag<T> dfull;
-  const svbool_t all_true = detail::AllPTrue(dfull);
-  const size_t all_lanes = detail::AllHardwareLanes(hwy::SizeTag<sizeof(T)>());
-  const svbool_t mask =
-      svnot_b_z(all_true, FirstN(dfull, all_lanes - Lanes(d)));
-  return detail::Splice(reversed, reversed, mask);
-}
-
-// ------------------------------ Reverse2
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
-HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  const RepartitionToWide<decltype(du)> dw;
-  return BitCast(d, svrevh_u32_x(detail::PTrue(d), BitCast(dw, v)));
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
-HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  const RepartitionToWide<decltype(du)> dw;
-  return BitCast(d, svrevw_u64_x(detail::PTrue(d), BitCast(dw, v)));
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {  // 3210
-#if HWY_TARGET == HWY_SVE2_128
-  if (detail::IsFull(d)) {
-    return detail::Ext<1>(v, v);
-  }
-#endif
-  (void)d;
-  const auto odd_in_even = detail::Ext<1>(v, v);  // x321
-  return detail::InterleaveEven(odd_in_even, v);  // 2301
-}
-// ------------------------------ Reverse4 (TableLookupLanes)
-template <class D>
-HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
-  if (HWY_TARGET == HWY_SVE_256 && sizeof(TFromD<D>) == 8 &&
-      detail::IsFull(d)) {
-    return detail::ReverseFull(v);
-  }
-  // TODO(janwas): is this approach faster than Shuffle0123?
-  const RebindToUnsigned<decltype(d)> du;
-  const auto idx = detail::XorN(Iota(du, 0), 3);
-  return TableLookupLanes(v, idx);
-}
-
-// ------------------------------ Reverse8 (TableLookupLanes)
-template <class D>
-HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
-  const RebindToUnsigned<decltype(d)> du;
-  const auto idx = detail::XorN(Iota(du, 0), 7);
-  return TableLookupLanes(v, idx);
-}
-
-// ------------------------------ Compress (PromoteTo)
-
-template <typename T>
-struct CompressIsPartition {
-#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-  // Optimization for 64-bit lanes (could also be applied to 32-bit, but that
-  // requires a larger table).
-  enum { value = (sizeof(T) == 8) };
-#else
-  enum { value = 0 };
-#endif  // HWY_TARGET == HWY_SVE_256
-};
-
-#define HWY_SVE_COMPRESS(BASE, CHAR, BITS, HALF, NAME, OP)                     \
-  HWY_API HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v, svbool_t mask) { \
-    return sv##OP##_##CHAR##BITS(mask, v);                                     \
-  }
-
-#if HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-HWY_SVE_FOREACH_UI32(HWY_SVE_COMPRESS, Compress, compact)
-HWY_SVE_FOREACH_F32(HWY_SVE_COMPRESS, Compress, compact)
-#else
-HWY_SVE_FOREACH_UIF3264(HWY_SVE_COMPRESS, Compress, compact)
-#endif
-#undef HWY_SVE_COMPRESS
-
-#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
-template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
-HWY_API V Compress(V v, svbool_t mask) {
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du64;
-
-  // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
-  // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
-  // SetTableIndices.
-  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
-  const size_t offset = detail::SumOfLanes(mask, bits);
-
-  // See CompressIsPartition.
-  alignas(16) static constexpr uint64_t table[4 * 16] = {
-      // PrintCompress64x4Tables
-      0, 1, 2, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 2, 0, 1, 3, 0, 2,
-      1, 3, 1, 2, 0, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 3, 1, 2, 1, 3, 0, 2,
-      0, 1, 3, 2, 2, 3, 0, 1, 0, 2, 3, 1, 1, 2, 3, 0, 0, 1, 2, 3};
-  return TableLookupLanes(v, SetTableIndices(d, table + offset));
-}
-#endif  // HWY_TARGET == HWY_SVE_256
-#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
-template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
-HWY_API V Compress(V v, svbool_t mask) {
-  // If mask == 10: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
-  // swaps upper/lower (the lower half is set to the upper half, and the
-  // remaining upper half is filled from the lower half of the second v), and
-  // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot keep 10
-  // unchanged and map everything else to 00.
-  const svbool_t maskLL = svzip1_b64(mask, mask);  // broadcast lower lane
-  return detail::Splice(v, v, AndNot(maskLL, mask));
-}
-#endif  // HWY_TARGET == HWY_SVE_256
-
-template <class V, HWY_IF_LANE_SIZE_V(V, 2)>
-HWY_API V Compress(V v, svbool_t mask16) {
-  static_assert(!IsSame<V, svfloat16_t>(), "Must use overload");
-  const DFromV<V> d16;
-
-  // Promote vector and mask to 32-bit
-  const RepartitionToWide<decltype(d16)> dw;
-  const auto v32L = PromoteTo(dw, v);
-  const auto v32H = detail::PromoteUpperTo(dw, v);
-  const svbool_t mask32L = svunpklo_b(mask16);
-  const svbool_t mask32H = svunpkhi_b(mask16);
-
-  const auto compressedL = Compress(v32L, mask32L);
-  const auto compressedH = Compress(v32H, mask32H);
-
-  // Demote to 16-bit (already in range) - separately so we can splice
-  const V evenL = BitCast(d16, compressedL);
-  const V evenH = BitCast(d16, compressedH);
-  const V v16L = detail::ConcatEven(evenL, evenL);  // only lower half needed
-  const V v16H = detail::ConcatEven(evenH, evenH);
-
-  // We need to combine two vectors of non-constexpr length, so the only option
-  // is Splice, which requires us to synthesize a mask. NOTE: this function uses
-  // full vectors (SV_ALL instead of SV_POW2), hence we need unmasked svcnt.
-  const size_t countL = detail::CountTrueFull(dw, mask32L);
-  const auto compressed_maskL = FirstN(d16, countL);
-  return detail::Splice(v16H, v16L, compressed_maskL);
-}
-
-// Must treat float16_t as integers so we can ConcatEven.
-HWY_API svfloat16_t Compress(svfloat16_t v, svbool_t mask16) {
-  const DFromV<decltype(v)> df;
-  const RebindToSigned<decltype(df)> di;
-  return BitCast(df, Compress(BitCast(di, v), mask16));
-}
-
-// ------------------------------ CompressNot
-
-template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 8)>
-HWY_API V CompressNot(V v, const svbool_t mask) {
-  return Compress(v, Not(mask));
-}
-
-template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
-HWY_API V CompressNot(V v, svbool_t mask) {
-#if HWY_TARGET == HWY_SVE2_128 || HWY_IDE
-  // If mask == 01: swap via splice. A mask of 00 or 11 leaves v unchanged, 10
-  // swaps upper/lower (the lower half is set to the upper half, and the
-  // remaining upper half is filled from the lower half of the second v), and
-  // 01 is invalid because it would ConcatLowerLower. zip1 and AndNot map
-  // 01 to 10, and everything else to 00.
-  const svbool_t maskLL = svzip1_b64(mask, mask);  // broadcast lower lane
-  return detail::Splice(v, v, AndNot(mask, maskLL));
-#endif
-#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du64;
-
-  // Convert mask into bitfield via horizontal sum (faster than ORV) of masked
-  // bits 1, 2, 4, 8. Pre-multiply by N so we can use it as an offset for
-  // SetTableIndices.
-  const svuint64_t bits = Shl(Set(du64, 1), Iota(du64, 2));
-  const size_t offset = detail::SumOfLanes(mask, bits);
-
-  // See CompressIsPartition.
-  alignas(16) static constexpr uint64_t table[4 * 16] = {
-      // PrintCompressNot64x4Tables
-      0, 1, 2, 3, 1, 2, 3, 0, 0, 2, 3, 1, 2, 3, 0, 1, 0, 1, 3, 2, 1, 3,
-      0, 2, 0, 3, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 1, 2, 0, 3, 0, 2, 1, 3,
-      2, 0, 1, 3, 0, 1, 2, 3, 1, 0, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
-  return TableLookupLanes(v, SetTableIndices(d, table + offset));
-#endif  // HWY_TARGET == HWY_SVE_256
-
-  return Compress(v, Not(mask));
-}
-
-// ------------------------------ CompressBlocksNot
-HWY_API svuint64_t CompressBlocksNot(svuint64_t v, svbool_t mask) {
-#if HWY_TARGET == HWY_SVE2_128
-  (void)mask;
-  return v;
-#endif
-#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
-  uint64_t bits = 0;  // predicate reg is 32-bit
-  CopyBytes<4>(&mask, &bits);  // not same size - 64-bit more efficient
-  // Concatenate LSB for upper and lower blocks, pre-scale by 4 for table idx.
-  const size_t offset = ((bits & 1) ? 4u : 0u) + ((bits & 0x10000) ? 8u : 0u);
-  // See CompressIsPartition. Manually generated; flip halves if mask = [0, 1].
-  alignas(16) static constexpr uint64_t table[4 * 4] = {0, 1, 2, 3, 2, 3, 0, 1,
-                                                        0, 1, 2, 3, 0, 1, 2, 3};
-  const ScalableTag<uint64_t> d;
-  return TableLookupLanes(v, SetTableIndices(d, table + offset));
-#endif
-
-  return CompressNot(v, mask);
-}
-
-// ------------------------------ CompressStore
-template <class V, class D>
-HWY_API size_t CompressStore(const V v, const svbool_t mask, const D d,
-                             TFromD<D>* HWY_RESTRICT unaligned) {
-  StoreU(Compress(v, mask), d, unaligned);
-  return CountTrue(d, mask);
-}
-
-// ------------------------------ CompressBlendedStore
-template <class V, class D>
-HWY_API size_t CompressBlendedStore(const V v, const svbool_t mask, const D d,
-                                    TFromD<D>* HWY_RESTRICT unaligned) {
-  const size_t count = CountTrue(d, mask);
-  const svbool_t store_mask = FirstN(d, count);
-  BlendedStore(Compress(v, mask), store_mask, d, unaligned);
-  return count;
-}
-
-// ================================================== BLOCKWISE
-
-// ------------------------------ CombineShiftRightBytes
-
-// Prevent accidentally using these for 128-bit vectors - should not be
-// necessary.
-#if HWY_TARGET != HWY_SVE2_128
-namespace detail {
-
-// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
-// offsets are implicitly relative to the start of their 128-bit block.
-template <class D, class V>
-HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
-  using T = MakeUnsigned<TFromD<D>>;
-  return detail::AndNotN(static_cast<T>(LanesPerBlock(d) - 1), iota0);
-}
-
-template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 1)>
-svbool_t FirstNPerBlock(D d) {
-  const RebindToUnsigned<decltype(d)> du;
-  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
-  const svuint8_t idx_mod =
-      svdupq_n_u8(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
-                  3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
-                  6 % kLanesPerBlock, 7 % kLanesPerBlock, 8 % kLanesPerBlock,
-                  9 % kLanesPerBlock, 10 % kLanesPerBlock, 11 % kLanesPerBlock,
-                  12 % kLanesPerBlock, 13 % kLanesPerBlock, 14 % kLanesPerBlock,
-                  15 % kLanesPerBlock);
-  return detail::LtN(BitCast(du, idx_mod), kLanes);
-}
-template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 2)>
-svbool_t FirstNPerBlock(D d) {
-  const RebindToUnsigned<decltype(d)> du;
-  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
-  const svuint16_t idx_mod =
-      svdupq_n_u16(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
-                   3 % kLanesPerBlock, 4 % kLanesPerBlock, 5 % kLanesPerBlock,
-                   6 % kLanesPerBlock, 7 % kLanesPerBlock);
-  return detail::LtN(BitCast(du, idx_mod), kLanes);
-}
-template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 4)>
-svbool_t FirstNPerBlock(D d) {
-  const RebindToUnsigned<decltype(d)> du;
-  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
-  const svuint32_t idx_mod =
-      svdupq_n_u32(0 % kLanesPerBlock, 1 % kLanesPerBlock, 2 % kLanesPerBlock,
-                   3 % kLanesPerBlock);
-  return detail::LtN(BitCast(du, idx_mod), kLanes);
-}
-template <size_t kLanes, class D, HWY_IF_LANE_SIZE_D(D, 8)>
-svbool_t FirstNPerBlock(D d) {
-  const RebindToUnsigned<decltype(d)> du;
-  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
-  const svuint64_t idx_mod =
-      svdupq_n_u64(0 % kLanesPerBlock, 1 % kLanesPerBlock);
-  return detail::LtN(BitCast(du, idx_mod), kLanes);
-}
-
-}  // namespace detail
-#endif  // HWY_TARGET != HWY_SVE2_128
-
-template <size_t kBytes, class D, class V = VFromD<D>>
-HWY_API V CombineShiftRightBytes(const D d, const V hi, const V lo) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  const auto hi8 = BitCast(d8, hi);
-  const auto lo8 = BitCast(d8, lo);
-#if HWY_TARGET == HWY_SVE2_128
-  return BitCast(d, detail::Ext<kBytes>(hi8, lo8));
-#else
-  const auto hi_up = detail::Splice(hi8, hi8, FirstN(d8, 16 - kBytes));
-  const auto lo_down = detail::Ext<kBytes>(lo8, lo8);
-  const svbool_t is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
-  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
-#endif
-}
-
-// ------------------------------ Shuffle2301
-template <class V>
-HWY_API V Shuffle2301(const V v) {
-  const DFromV<V> d;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  return Reverse2(d, v);
-}
-
-// ------------------------------ Shuffle2103
-template <class V>
-HWY_API V Shuffle2103(const V v) {
-  const DFromV<V> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  const svuint8_t v8 = BitCast(d8, v);
-  return BitCast(d, CombineShiftRightBytes<12>(d8, v8, v8));
-}
-
-// ------------------------------ Shuffle0321
-template <class V>
-HWY_API V Shuffle0321(const V v) {
-  const DFromV<V> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  const svuint8_t v8 = BitCast(d8, v);
-  return BitCast(d, CombineShiftRightBytes<4>(d8, v8, v8));
-}
-
-// ------------------------------ Shuffle1032
-template <class V>
-HWY_API V Shuffle1032(const V v) {
-  const DFromV<V> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  const svuint8_t v8 = BitCast(d8, v);
-  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
-}
-
-// ------------------------------ Shuffle01
-template <class V>
-HWY_API V Shuffle01(const V v) {
-  const DFromV<V> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
-  const svuint8_t v8 = BitCast(d8, v);
-  return BitCast(d, CombineShiftRightBytes<8>(d8, v8, v8));
-}
-
-// ------------------------------ Shuffle0123
-template <class V>
-HWY_API V Shuffle0123(const V v) {
-  return Shuffle2301(Shuffle1032(v));
-}
-
-// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
-template <class D, class V = VFromD<D>>
-HWY_API V ReverseBlocks(D d, V v) {
-#if HWY_TARGET == HWY_SVE_256
-  if (detail::IsFull(d)) {
-    return SwapAdjacentBlocks(v);
-  } else if (detail::IsFull(Twice<D>())) {
-    return v;
-  }
-#elif HWY_TARGET == HWY_SVE2_128
-  (void)d;
-  return v;
-#endif
-  const Repartition<uint64_t, D> du64;
-  return BitCast(d, Shuffle01(Reverse(du64, BitCast(du64, v))));
-}
-
-// ------------------------------ TableLookupBytes
-
-template <class V, class VI>
-HWY_API VI TableLookupBytes(const V v, const VI idx) {
-  const DFromV<VI> d;
-  const Repartition<uint8_t, decltype(d)> du8;
-#if HWY_TARGET == HWY_SVE2_128
-  return BitCast(d, TableLookupLanes(BitCast(du8, v), BitCast(du8, idx)));
-#else
-  const auto offsets128 = detail::OffsetsOf128BitBlocks(du8, Iota(du8, 0));
-  const auto idx8 = Add(BitCast(du8, idx), offsets128);
-  return BitCast(d, TableLookupLanes(BitCast(du8, v), idx8));
-#endif
-}
-
-template <class V, class VI>
-HWY_API VI TableLookupBytesOr0(const V v, const VI idx) {
-  const DFromV<VI> d;
-  // Mask size must match vector type, so cast everything to this type.
-  const Repartition<int8_t, decltype(d)> di8;
-
-  auto idx8 = BitCast(di8, idx);
-  const auto msb = detail::LtN(idx8, 0);
-
-  const auto lookup = TableLookupBytes(BitCast(di8, v), idx8);
-  return BitCast(d, IfThenZeroElse(msb, lookup));
-}
-
-// ------------------------------ Broadcast
-
-#if HWY_TARGET == HWY_SVE2_128
-namespace detail {
-#define HWY_SVE_BROADCAST(BASE, CHAR, BITS, HALF, NAME, OP)        \
-  template <int kLane>                                             \
-  HWY_INLINE HWY_SVE_V(BASE, BITS) NAME(HWY_SVE_V(BASE, BITS) v) { \
-    return sv##OP##_##CHAR##BITS(v, kLane);                        \
-  }
-
-HWY_SVE_FOREACH(HWY_SVE_BROADCAST, Broadcast, dup_lane)
-#undef HWY_SVE_BROADCAST
-}  // namespace detail
-#endif
-
-template <int kLane, class V>
-HWY_API V Broadcast(const V v) {
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du;
-  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(du);
-  static_assert(0 <= kLane && kLane < kLanesPerBlock, "Invalid lane");
-#if HWY_TARGET == HWY_SVE2_128
-  return detail::Broadcast<kLane>(v);
-#else
-  auto idx = detail::OffsetsOf128BitBlocks(du, Iota(du, 0));
-  if (kLane != 0) {
-    idx = detail::AddN(idx, kLane);
-  }
-  return TableLookupLanes(v, idx);
-#endif
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <size_t kLanes, class D, class V = VFromD<D>>
-HWY_API V ShiftLeftLanes(D d, const V v) {
-  const auto zero = Zero(d);
-  const auto shifted = detail::Splice(v, zero, FirstN(d, kLanes));
-#if HWY_TARGET == HWY_SVE2_128
-  return shifted;
-#else
-  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
-  return IfThenElse(detail::FirstNPerBlock<kLanes>(d), zero, shifted);
-#endif
-}
-
-template <size_t kLanes, class V>
-HWY_API V ShiftLeftLanes(const V v) {
-  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
-}
-
-// ------------------------------ ShiftRightLanes
-template <size_t kLanes, class D, class V = VFromD<D>>
-HWY_API V ShiftRightLanes(D d, V v) {
-  // For capped/fractional vectors, clear upper lanes so we shift in zeros.
-  if (!detail::IsFull(d)) {
-    v = IfThenElseZero(detail::MakeMask(d), v);
-  }
-
-#if HWY_TARGET == HWY_SVE2_128
-  return detail::Ext<kLanes>(Zero(d), v);
-#else
-  const auto shifted = detail::Ext<kLanes>(v, v);
-  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
-  constexpr size_t kLanesPerBlock = detail::LanesPerBlock(d);
-  const svbool_t mask = detail::FirstNPerBlock<kLanesPerBlock - kLanes>(d);
-  return IfThenElseZero(mask, shifted);
-#endif
-}
-
-// ------------------------------ ShiftLeftBytes
-
-template <int kBytes, class D, class V = VFromD<D>>
-HWY_API V ShiftLeftBytes(const D d, const V v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
-}
-
-template <int kBytes, class V>
-HWY_API V ShiftLeftBytes(const V v) {
-  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
-}
-
-// ------------------------------ ShiftRightBytes
-template <int kBytes, class D, class V = VFromD<D>>
-HWY_API V ShiftRightBytes(const D d, const V v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
-}
-
-// ------------------------------ ZipLower
-
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
-  const RepartitionToNarrow<DW> dn;
-  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
-  return BitCast(dw, InterleaveLower(dn, a, b));
-}
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipLower(const V a, const V b) {
-  return BitCast(DW(), InterleaveLower(D(), a, b));
-}
-
-// ------------------------------ ZipUpper
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
-  const RepartitionToNarrow<DW> dn;
-  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
-  return BitCast(dw, InterleaveUpper(dn, a, b));
-}
-
-// ================================================== Ops with dependencies
-
-// ------------------------------ PromoteTo bfloat16 (ZipLower)
-template <size_t N, int kPow2>
-HWY_API svfloat32_t PromoteTo(Simd<float32_t, N, kPow2> df32,
-                              const svuint16_t v) {
-  return BitCast(df32, detail::ZipLower(svdup_n_u16(0), v));
-}
-
-// ------------------------------ ReorderDemote2To (OddEven)
-template <size_t N, int kPow2>
-HWY_API svuint16_t ReorderDemote2To(Simd<bfloat16_t, N, kPow2> dbf16,
-                                    svfloat32_t a, svfloat32_t b) {
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const svuint32_t b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-// ------------------------------ ZeroIfNegative (Lt, IfThenElse)
-template <class V>
-HWY_API V ZeroIfNegative(const V v) {
-  return IfThenZeroElse(detail::LtN(v, 0), v);
-}
-
-// ------------------------------ BroadcastSignBit (ShiftRight)
-template <class V>
-HWY_API V BroadcastSignBit(const V v) {
-  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
-}
-
-// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
-template <class V>
-HWY_API V IfNegativeThenElse(V v, V yes, V no) {
-  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
-  const DFromV<V> d;
-  const RebindToSigned<decltype(d)> di;
-
-  const svbool_t m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
-  return IfThenElse(m, yes, no);
-}
-
-// ------------------------------ AverageRound (ShiftRight)
-
-#if HWY_TARGET == HWY_SVE2
-HWY_SVE_FOREACH_U08(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
-HWY_SVE_FOREACH_U16(HWY_SVE_RETV_ARGPVV, AverageRound, rhadd)
-#else
-template <class V>
-V AverageRound(const V a, const V b) {
-  return ShiftRight<1>(detail::AddN(Add(a, b), 1));
-}
-#endif  // HWY_TARGET == HWY_SVE2
-
-// ------------------------------ LoadMaskBits (TestBit)
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
-HWY_INLINE svbool_t LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
-  const RebindToUnsigned<D> du;
-  const svuint8_t iota = Iota(du, 0);
-
-  // Load correct number of bytes (bits/8) with 7 zeros after each.
-  const svuint8_t bytes = BitCast(du, svld1ub_u64(detail::PTrue(d), bits));
-  // Replicate bytes 8x such that each byte contains the bit that governs it.
-  const svuint8_t rep8 = svtbl_u8(bytes, detail::AndNotN(7, iota));
-
-  const svuint8_t bit =
-      svdupq_n_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
-  return TestBit(rep8, bit);
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 2)>
-HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
-                                 const uint8_t* HWY_RESTRICT bits) {
-  const RebindToUnsigned<D> du;
-  const Repartition<uint8_t, D> du8;
-
-  // There may be up to 128 bits; avoid reading past the end.
-  const svuint8_t bytes = svld1(FirstN(du8, (Lanes(du) + 7) / 8), bits);
-
-  // Replicate bytes 16x such that each lane contains the bit that governs it.
-  const svuint8_t rep16 = svtbl_u8(bytes, ShiftRight<4>(Iota(du8, 0)));
-
-  const svuint16_t bit = svdupq_n_u16(1, 2, 4, 8, 16, 32, 64, 128);
-  return TestBit(BitCast(du, rep16), bit);
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
-HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
-                                 const uint8_t* HWY_RESTRICT bits) {
-  const RebindToUnsigned<D> du;
-  const Repartition<uint8_t, D> du8;
-
-  // Upper bound = 2048 bits / 32 bit = 64 bits; at least 8 bytes are readable,
-  // so we can skip computing the actual length (Lanes(du)+7)/8.
-  const svuint8_t bytes = svld1(FirstN(du8, 8), bits);
-
-  // Replicate bytes 32x such that each lane contains the bit that governs it.
-  const svuint8_t rep32 = svtbl_u8(bytes, ShiftRight<5>(Iota(du8, 0)));
-
-  // 1, 2, 4, 8, 16, 32, 64, 128,  1, 2 ..
-  const svuint32_t bit = Shl(Set(du, 1), detail::AndN(Iota(du, 0), 7));
-
-  return TestBit(BitCast(du, rep32), bit);
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_INLINE svbool_t LoadMaskBits(D /* tag */,
-                                 const uint8_t* HWY_RESTRICT bits) {
-  const RebindToUnsigned<D> du;
-
-  // Max 2048 bits = 32 lanes = 32 input bits; replicate those into each lane.
-  // The "at least 8 byte" guarantee in quick_reference ensures this is safe.
-  uint32_t mask_bits;
-  CopyBytes<4>(bits, &mask_bits);  // copy from bytes
-  const auto vbits = Set(du, mask_bits);
-
-  // 2 ^ {0,1, .., 31}, will not have more lanes than that.
-  const svuint64_t bit = Shl(Set(du, 1), Iota(du, 0));
-
-  return TestBit(vbits, bit);
-}
-
-// ------------------------------ StoreMaskBits
-
-namespace detail {
-
-// For each mask lane (governing lane type T), store 1 or 0 in BYTE lanes.
-template <class T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  return svdup_n_u8_z(m, 1);
-}
-template <class T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  const ScalableTag<uint8_t> d8;
-  const svuint8_t b16 = BitCast(d8, svdup_n_u16_z(m, 1));
-  return detail::ConcatEven(b16, b16);  // only lower half needed
-}
-template <class T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  return U8FromU32(svdup_n_u32_z(m, 1));
-}
-template <class T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE svuint8_t BoolFromMask(svbool_t m) {
-  const ScalableTag<uint32_t> d32;
-  const svuint32_t b64 = BitCast(d32, svdup_n_u64_z(m, 1));
-  return U8FromU32(detail::ConcatEven(b64, b64));  // only lower half needed
-}
-
-// Compacts groups of 8 u8 into 8 contiguous bits in a 64-bit lane.
-HWY_INLINE svuint64_t BitsFromBool(svuint8_t x) {
-  const ScalableTag<uint8_t> d8;
-  const ScalableTag<uint16_t> d16;
-  const ScalableTag<uint32_t> d32;
-  const ScalableTag<uint64_t> d64;
-  // TODO(janwas): could use SVE2 BDEP, but it's optional.
-  x = Or(x, BitCast(d8, ShiftRight<7>(BitCast(d16, x))));
-  x = Or(x, BitCast(d8, ShiftRight<14>(BitCast(d32, x))));
-  x = Or(x, BitCast(d8, ShiftRight<28>(BitCast(d64, x))));
-  return BitCast(d64, x);
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 writable bytes.
-// TODO(janwas): specialize for HWY_SVE_256
-template <class D>
-HWY_API size_t StoreMaskBits(D d, svbool_t m, uint8_t* bits) {
-  svuint64_t bits_in_u64 =
-      detail::BitsFromBool(detail::BoolFromMask<TFromD<D>>(m));
-
-  const size_t num_bits = Lanes(d);
-  const size_t num_bytes = (num_bits + 8 - 1) / 8;  // Round up, see below
-
-  // Truncate each u64 to 8 bits and store to u8.
-  svst1b_u64(FirstN(ScalableTag<uint64_t>(), num_bytes), bits, bits_in_u64);
-
-  // Non-full byte, need to clear the undefined upper bits. Can happen for
-  // capped/fractional vectors or large T and small hardware vectors.
-  if (num_bits < 8) {
-    const int mask = static_cast<int>((1ull << num_bits) - 1);
-    bits[0] = static_cast<uint8_t>(bits[0] & mask);
-  }
-  // Else: we wrote full bytes because num_bits is a power of two >= 8.
-
-  return num_bytes;
-}
-
-// ------------------------------ CompressBits (LoadMaskBits)
-template <class V>
-HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
-  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
-}
-
-// ------------------------------ CompressBitsStore (LoadMaskBits)
-template <class D>
-HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
-                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
-  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
-}
-
-// ------------------------------ MulEven (InterleaveEven)
-
-#if HWY_TARGET == HWY_SVE2
-namespace detail {
-#define HWY_SVE_MUL_EVEN(BASE, CHAR, BITS, HALF, NAME, OP)     \
-  HWY_API HWY_SVE_V(BASE, BITS)                                \
-      NAME(HWY_SVE_V(BASE, HALF) a, HWY_SVE_V(BASE, HALF) b) { \
-    return sv##OP##_##CHAR##BITS(a, b);                        \
-  }
-
-HWY_SVE_FOREACH_UI64(HWY_SVE_MUL_EVEN, MulEven, mullb)
-#undef HWY_SVE_MUL_EVEN
-}  // namespace detail
-#endif
-
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> MulEven(const V a, const V b) {
-#if HWY_TARGET == HWY_SVE2
-  return BitCast(DW(), detail::MulEven(a, b));
-#else
-  const auto lo = Mul(a, b);
-  const auto hi = detail::MulHigh(a, b);
-  return BitCast(DW(), detail::InterleaveEven(lo, hi));
-#endif
-}
-
-HWY_API svuint64_t MulEven(const svuint64_t a, const svuint64_t b) {
-  const auto lo = Mul(a, b);
-  const auto hi = detail::MulHigh(a, b);
-  return detail::InterleaveEven(lo, hi);
-}
-
-HWY_API svuint64_t MulOdd(const svuint64_t a, const svuint64_t b) {
-  const auto lo = Mul(a, b);
-  const auto hi = detail::MulHigh(a, b);
-  return detail::InterleaveOdd(lo, hi);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <size_t N, int kPow2>
-HWY_API svfloat32_t ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
-                                              svuint16_t a, svuint16_t b,
-                                              const svfloat32_t sum0,
-                                              svfloat32_t& sum1) {
-  // TODO(janwas): svbfmlalb_f32 if __ARM_FEATURE_SVE_BF16.
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const svuint16_t zero = Zero(du16);
-  const svuint32_t a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const svuint32_t a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const svuint32_t b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const svuint32_t b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ------------------------------ AESRound / CLMul
-
-#if defined(__ARM_FEATURE_SVE2_AES) ||                         \
-    ((HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128) && \
-     HWY_HAVE_RUNTIME_DISPATCH)
-
-// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
-#ifdef HWY_NATIVE_AES
-#undef HWY_NATIVE_AES
-#else
-#define HWY_NATIVE_AES
-#endif
-
-HWY_API svuint8_t AESRound(svuint8_t state, svuint8_t round_key) {
-  // It is not clear whether E and MC fuse like they did on NEON.
-  const svuint8_t zero = svdup_n_u8(0);
-  return Xor(svaesmc_u8(svaese_u8(state, zero)), round_key);
-}
-
-HWY_API svuint8_t AESLastRound(svuint8_t state, svuint8_t round_key) {
-  return Xor(svaese_u8(state, svdup_n_u8(0)), round_key);
-}
-
-HWY_API svuint64_t CLMulLower(const svuint64_t a, const svuint64_t b) {
-  return svpmullb_pair(a, b);
-}
-
-HWY_API svuint64_t CLMulUpper(const svuint64_t a, const svuint64_t b) {
-  return svpmullt_pair(a, b);
-}
-
-#endif  // __ARM_FEATURE_SVE2_AES
-
-// ------------------------------ Lt128
-
-namespace detail {
-#define HWY_SVE_DUP(BASE, CHAR, BITS, HALF, NAME, OP)                        \
-  template <size_t N, int kPow2>                                             \
-  HWY_API svbool_t NAME(HWY_SVE_D(BASE, BITS, N, kPow2) /*d*/, svbool_t m) { \
-    return sv##OP##_b##BITS(m, m);                                           \
-  }
-
-HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupEvenB, trn1)  // actually for bool
-HWY_SVE_FOREACH_U(HWY_SVE_DUP, DupOddB, trn2)   // actually for bool
-#undef HWY_SVE_DUP
-
-#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
-template <class D>
-HWY_INLINE svuint64_t Lt128Vec(D d, const svuint64_t a, const svuint64_t b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const svbool_t eqHx = Eq(a, b);  // only odd lanes used
-  // Convert to vector: more pipelines can execute vector TRN* instructions
-  // than the predicate version.
-  const svuint64_t ltHL = VecFromMask(d, Lt(a, b));
-  // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
-  // Requires an extra IfThenElse because INSR, EXT, TRN2 are unpredicated.
-  const svuint64_t ltHx = IfThenElse(eqHx, DupEven(ltHL), ltHL);
-  // Duplicate upper lane into lower.
-  return DupOdd(ltHx);
-}
-#endif
-}  // namespace detail
-
-template <class D>
-HWY_INLINE svbool_t Lt128(D d, const svuint64_t a, const svuint64_t b) {
-#if HWY_TARGET == HWY_SVE_256
-  return MaskFromVec(detail::Lt128Vec(d, a, b));
-#else
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const svbool_t eqHx = Eq(a, b);  // only odd lanes used
-  const svbool_t ltHL = Lt(a, b);
-  // Move into upper lane: ltL if the upper half is equal, otherwise ltH.
-  const svbool_t ltHx = svsel_b(eqHx, detail::DupEvenB(d, ltHL), ltHL);
-  // Duplicate upper lane into lower.
-  return detail::DupOddB(d, ltHx);
-#endif  // HWY_TARGET != HWY_SVE_256
-}
-
-// ------------------------------ Lt128Upper
-
-template <class D>
-HWY_INLINE svbool_t Lt128Upper(D d, svuint64_t a, svuint64_t b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const svbool_t ltHL = Lt(a, b);
-  return detail::DupOddB(d, ltHL);
-}
-
-// ------------------------------ Eq128
-
-#if HWY_TARGET == HWY_SVE_256 || HWY_IDE
-namespace detail {
-template <class D>
-HWY_INLINE svuint64_t Eq128Vec(D d, const svuint64_t a, const svuint64_t b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  // Convert to vector: more pipelines can execute vector TRN* instructions
-  // than the predicate version.
-  const svuint64_t eqHL = VecFromMask(d, Eq(a, b));
-  // Duplicate upper and lower.
-  const svuint64_t eqHH = DupOdd(eqHL);
-  const svuint64_t eqLL = DupEven(eqHL);
-  return And(eqLL, eqHH);
-}
-}  // namespace detail
-#endif
-
-template <class D>
-HWY_INLINE svbool_t Eq128(D d, const svuint64_t a, const svuint64_t b) {
-#if HWY_TARGET == HWY_SVE_256
-  return MaskFromVec(detail::Eq128Vec(d, a, b));
-#else
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const svbool_t eqHL = Eq(a, b);
-  const svbool_t eqHH = detail::DupOddB(d, eqHL);
-  const svbool_t eqLL = detail::DupEvenB(d, eqHL);
-  return And(eqLL, eqHH);
-#endif  // HWY_TARGET != HWY_SVE_256
-}
-
-// ------------------------------ Eq128Upper
-
-template <class D>
-HWY_INLINE svbool_t Eq128Upper(D d, svuint64_t a, svuint64_t b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const svbool_t eqHL = Eq(a, b);
-  return detail::DupOddB(d, eqHL);
-}
-
-// ------------------------------ Min128, Max128 (Lt128)
-
-template <class D>
-HWY_INLINE svuint64_t Min128(D d, const svuint64_t a, const svuint64_t b) {
-#if HWY_TARGET == HWY_SVE_256
-  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
-#else
-  return IfThenElse(Lt128(d, a, b), a, b);
-#endif
-}
-
-template <class D>
-HWY_INLINE svuint64_t Max128(D d, const svuint64_t a, const svuint64_t b) {
-#if HWY_TARGET == HWY_SVE_256
-  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
-#else
-  return IfThenElse(Lt128(d, b, a), a, b);
-#endif
-}
-
-template <class D>
-HWY_INLINE svuint64_t Min128Upper(D d, const svuint64_t a, const svuint64_t b) {
-  return IfThenElse(Lt128Upper(d, a, b), a, b);
-}
-
-template <class D>
-HWY_INLINE svuint64_t Max128Upper(D d, const svuint64_t a, const svuint64_t b) {
-  return IfThenElse(Lt128Upper(d, b, a), a, b);
-}
-
-// ================================================== END MACROS
-namespace detail {  // for code folding
-#undef HWY_IF_FLOAT_V
-#undef HWY_IF_LANE_SIZE_V
-#undef HWY_SVE_ALL_PTRUE
-#undef HWY_SVE_D
-#undef HWY_SVE_FOREACH
-#undef HWY_SVE_FOREACH_F
-#undef HWY_SVE_FOREACH_F16
-#undef HWY_SVE_FOREACH_F32
-#undef HWY_SVE_FOREACH_F64
-#undef HWY_SVE_FOREACH_I
-#undef HWY_SVE_FOREACH_I08
-#undef HWY_SVE_FOREACH_I16
-#undef HWY_SVE_FOREACH_I32
-#undef HWY_SVE_FOREACH_I64
-#undef HWY_SVE_FOREACH_IF
-#undef HWY_SVE_FOREACH_U
-#undef HWY_SVE_FOREACH_U08
-#undef HWY_SVE_FOREACH_U16
-#undef HWY_SVE_FOREACH_U32
-#undef HWY_SVE_FOREACH_U64
-#undef HWY_SVE_FOREACH_UI
-#undef HWY_SVE_FOREACH_UI08
-#undef HWY_SVE_FOREACH_UI16
-#undef HWY_SVE_FOREACH_UI32
-#undef HWY_SVE_FOREACH_UI64
-#undef HWY_SVE_FOREACH_UIF3264
-#undef HWY_SVE_PTRUE
-#undef HWY_SVE_RETV_ARGPV
-#undef HWY_SVE_RETV_ARGPVN
-#undef HWY_SVE_RETV_ARGPVV
-#undef HWY_SVE_RETV_ARGV
-#undef HWY_SVE_RETV_ARGVN
-#undef HWY_SVE_RETV_ARGVV
-#undef HWY_SVE_T
-#undef HWY_SVE_UNDEFINED
-#undef HWY_SVE_V
-
-}  // namespace detail
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/emu128-inl.h b/third_party/highway/hwy/ops/emu128-inl.h
deleted file mode 100644 (file)
index e61d6d8..0000000
+++ /dev/null
@@ -1,2448 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Single-element vectors and operations.
-// External include guard in highway.h - see comment there.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/base.h"
-#include "hwy/ops/shared-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <typename T>
-using Full128 = Simd<T, 16 / sizeof(T), 0>;
-
-// (Wrapper class required for overloading comparison operators.)
-template <typename T, size_t N = 16 / sizeof(T)>
-struct Vec128 {
-  HWY_INLINE Vec128() = default;
-  Vec128(const Vec128&) = default;
-  Vec128& operator=(const Vec128&) = default;
-
-  HWY_INLINE Vec128& operator*=(const Vec128 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec128& operator/=(const Vec128 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec128& operator+=(const Vec128 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec128& operator-=(const Vec128 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec128& operator&=(const Vec128 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec128& operator|=(const Vec128 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec128& operator^=(const Vec128 other) {
-    return *this = (*this ^ other);
-  }
-
-  // Behave like wasm128 (vectors can always hold 128 bits). generic_ops-inl.h
-  // relies on this for LoadInterleaved*. CAVEAT: this method of padding
-  // prevents using range for, especially in SumOfLanes, where it would be
-  // incorrect. Moving padding to another field would require handling the case
-  // where N = 16 / sizeof(T) (i.e. there is no padding), which is also awkward.
-  T raw[16 / sizeof(T)] = {};
-};
-
-// 0 or FF..FF, same size as Vec128.
-template <typename T, size_t N = 16 / sizeof(T)>
-struct Mask128 {
-  using Raw = hwy::MakeUnsigned<T>;
-  static HWY_INLINE Raw FromBool(bool b) {
-    return b ? static_cast<Raw>(~Raw{0}) : 0;
-  }
-
-  // Must match the size of Vec128.
-  Raw bits[16 / sizeof(T)] = {};
-};
-
-namespace detail {
-
-// Deduce Simd<T, N, 0> from Vec128<T, N>
-struct Deduce128 {
-  template <typename T, size_t N>
-  Simd<T, N, 0> operator()(Vec128<T, N>) const {
-    return Simd<T, N, 0>();
-  }
-};
-
-}  // namespace detail
-
-template <class V>
-using DFromV = decltype(detail::Deduce128()(V()));
-
-template <class V>
-using TFromV = TFromD<DFromV<V>>;
-
-// ------------------------------ BitCast
-
-template <typename T, size_t N, typename FromT, size_t FromN>
-HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> /* tag */, Vec128<FromT, FromN> v) {
-  Vec128<T, N> to;
-  CopySameSize(&v, &to);
-  return to;
-}
-
-// ------------------------------ Set
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
-  Vec128<T, N> v;
-  ZeroBytes<sizeof(T) * N>(v.raw);
-  return v;
-}
-
-template <class D>
-using VFromD = decltype(Zero(D()));
-
-template <typename T, size_t N, typename T2>
-HWY_API Vec128<T, N> Set(Simd<T, N, 0> /* tag */, const T2 t) {
-  Vec128<T, N> v;
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(t);
-  }
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
-  return Zero(d);
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE constexpr T IncrementWithWraparound(hwy::FloatTag /*tag*/, T t) {
-  return t + T{1};
-}
-
-template <typename T>
-HWY_INLINE constexpr T IncrementWithWraparound(hwy::NonFloatTag /*tag*/, T t) {
-  using TU = MakeUnsigned<T>;
-  return static_cast<T>(static_cast<TU>(static_cast<TU>(t) + TU{1}) &
-                        hwy::LimitsMax<TU>());
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, typename T2>
-HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> /* tag */, T2 first) {
-  Vec128<T, N> v;
-  T counter = static_cast<T>(first);
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = counter;
-    counter = detail::IncrementWithWraparound(hwy::IsFloatTag<T>(), counter);
-  }
-  return v;
-}
-
-// ================================================== LOGICAL
-
-// ------------------------------ Not
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  VFromD<decltype(du)> vu = BitCast(du, v);
-  for (size_t i = 0; i < N; ++i) {
-    vu.raw[i] = static_cast<TU>(~vu.raw[i]);
-  }
-  return BitCast(d, vu);
-}
-
-// ------------------------------ And
-template <typename T, size_t N>
-HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  auto au = BitCast(du, a);
-  auto bu = BitCast(du, b);
-  for (size_t i = 0; i < N; ++i) {
-    au.raw[i] &= bu.raw[i];
-  }
-  return BitCast(d, au);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return And(a, b);
-}
-
-// ------------------------------ AndNot
-template <typename T, size_t N>
-HWY_API Vec128<T, N> AndNot(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return And(Not(a), b);
-}
-
-// ------------------------------ Or
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  auto au = BitCast(du, a);
-  auto bu = BitCast(du, b);
-  for (size_t i = 0; i < N; ++i) {
-    au.raw[i] |= bu.raw[i];
-  }
-  return BitCast(d, au);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Or(a, b);
-}
-
-// ------------------------------ Xor
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  auto au = BitCast(du, a);
-  auto bu = BitCast(du, b);
-  for (size_t i = 0; i < N; ++i) {
-    au.raw[i] ^= bu.raw[i];
-  }
-  return BitCast(d, au);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ Or3
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
-  return Or(o1, Or(o2, o3));
-}
-
-// ------------------------------ OrAnd
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OrAnd(const Vec128<T, N> o, const Vec128<T, N> a1,
-                           const Vec128<T, N> a2) {
-  return Or(o, And(a1, a2));
-}
-
-// ------------------------------ IfVecThenElse
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return Or(And(mask, yes), AndNot(mask, no));
-}
-
-// ------------------------------ CopySign
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
-                              const Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  const auto msb = SignBit(Simd<T, N, 0>());
-  return Or(AndNot(msb, magn), And(msb, sign));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
-                                   const Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  return Or(abs, And(SignBit(Simd<T, N, 0>()), sign));
-}
-
-// ------------------------------ BroadcastSignBit
-template <typename T, size_t N>
-HWY_API Vec128<T, N> BroadcastSignBit(Vec128<T, N> v) {
-  // This is used inside ShiftRight, so we cannot implement in terms of it.
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = v.raw[i] < 0 ? T(-1) : T(0);
-  }
-  return v;
-}
-
-// ------------------------------ Mask
-
-template <typename TFrom, typename TTo, size_t N>
-HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
-                                   Mask128<TFrom, N> mask) {
-  Mask128<TTo, N> to;
-  CopySameSize(&mask, &to);
-  return to;
-}
-
-// v must be 0 or FF..FF.
-template <typename T, size_t N>
-HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
-  Mask128<T, N> mask;
-  CopySameSize(&v, &mask);
-  return mask;
-}
-
-template <typename T, size_t N>
-Vec128<T, N> VecFromMask(const Mask128<T, N> mask) {
-  Vec128<T, N> v;
-  CopySameSize(&mask, &v);
-  return v;
-}
-
-template <typename T, size_t N>
-Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  return VecFromMask(mask);
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> FirstN(Simd<T, N, 0> /*tag*/, size_t n) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    m.bits[i] = Mask128<T, N>::FromBool(i < n);
-  }
-  return m;
-}
-
-// Returns mask ? yes : no.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElse(const Mask128<T, N> mask,
-                                const Vec128<T, N> yes, const Vec128<T, N> no) {
-  return IfVecThenElse(VecFromMask(mask), yes, no);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElseZero(const Mask128<T, N> mask,
-                                    const Vec128<T, N> yes) {
-  return IfVecThenElse(VecFromMask(mask), yes, Zero(Simd<T, N, 0>()));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenZeroElse(const Mask128<T, N> mask,
-                                    const Vec128<T, N> no) {
-  return IfVecThenElse(VecFromMask(mask), Zero(Simd<T, N, 0>()), no);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
-                                        Vec128<T, N> no) {
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = v.raw[i] < 0 ? yes.raw[i] : no.raw[i];
-  }
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(const Vec128<T, N> v) {
-  return IfNegativeThenElse(v, Zero(Simd<T, N, 0>()), v);
-}
-
-// ------------------------------ Mask logical
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
-  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-// ================================================== SHIFTS
-
-// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
-
-template <int kBits, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeft(Vec128<T, N> v) {
-  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-  for (size_t i = 0; i < N; ++i) {
-    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << kBits;
-    v.raw[i] = static_cast<T>(shifted);
-  }
-  return v;
-}
-
-template <int kBits, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRight(Vec128<T, N> v) {
-  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> kBits);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift =
-          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {  // T is unsigned
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> kBits);
-    }
-  }
-#endif
-  return v;
-}
-
-// ------------------------------ RotateRight (ShiftRight)
-
-namespace detail {
-
-// For partial specialization: kBits == 0 results in an invalid shift count
-template <int kBits>
-struct RotateRight {
-  template <typename T, size_t N>
-  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
-    return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
-  }
-};
-
-template <>
-struct RotateRight<0> {
-  template <typename T, size_t N>
-  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) const {
-    return v;
-  }
-};
-
-}  // namespace detail
-
-template <int kBits, typename T, size_t N>
-HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
-  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-  return detail::RotateRight<kBits>()(v);
-}
-
-// ------------------------------ ShiftLeftSame
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftSame(Vec128<T, N> v, int bits) {
-  for (size_t i = 0; i < N; ++i) {
-    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i]) << bits;
-    v.raw[i] = static_cast<T>(shifted);
-  }
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightSame(Vec128<T, N> v, int bits) {
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> bits);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted = static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift =
-          static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> bits);  // unsigned, logical shift
-    }
-  }
-#endif
-  return v;
-}
-
-// ------------------------------ Shl
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
-  for (size_t i = 0; i < N; ++i) {
-    const auto shifted = static_cast<hwy::MakeUnsigned<T>>(v.raw[i])
-                         << bits.raw[i];
-    v.raw[i] = static_cast<T>(shifted);
-  }
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
-  }
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    for (size_t i = 0; i < N; ++i) {
-      const TU shifted =
-          static_cast<TU>(static_cast<TU>(v.raw[i]) >> bits.raw[i]);
-      const TU sign = v.raw[i] < 0 ? static_cast<TU>(~TU{0}) : 0;
-      const size_t sign_shift = static_cast<size_t>(
-          static_cast<int>(sizeof(TU)) * 8 - 1 - bits.raw[i]);
-      const TU upper = static_cast<TU>(sign << sign_shift);
-      v.raw[i] = static_cast<T>(shifted | upper);
-    }
-  } else {  // T is unsigned
-    for (size_t i = 0; i < N; ++i) {
-      v.raw[i] = static_cast<T>(v.raw[i] >> bits.raw[i]);
-    }
-  }
-#endif
-  return v;
-}
-
-// ================================================== ARITHMETIC
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Add(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
-                            Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
-    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
-    a.raw[i] = static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0)));
-  }
-  return a;
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Sub(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
-                            Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    const uint64_t a64 = static_cast<uint64_t>(a.raw[i]);
-    const uint64_t b64 = static_cast<uint64_t>(b.raw[i]);
-    a.raw[i] = static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0)));
-  }
-  return a;
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Add(hwy::FloatTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] += b.raw[i];
-  }
-  return a;
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Sub(hwy::FloatTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] -= b.raw[i];
-  }
-  return a;
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator-(Vec128<T, N> a, const Vec128<T, N> b) {
-  return detail::Sub(hwy::IsFloatTag<T>(), a, b);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator+(Vec128<T, N> a, const Vec128<T, N> b) {
-  return detail::Add(hwy::IsFloatTag<T>(), a, b);
-}
-
-// ------------------------------ SumsOf8
-
-template <size_t N>
-HWY_API Vec128<uint64_t, (N + 7) / 8> SumsOf8(const Vec128<uint8_t, N> v) {
-  Vec128<uint64_t, (N + 7) / 8> sums;
-  for (size_t i = 0; i < N; ++i) {
-    sums.raw[i / 8] += v.raw[i];
-  }
-  return sums;
-}
-
-// ------------------------------ SaturatedAdd
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SaturatedAdd(Vec128<T, N> a, const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<T>(
-        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] + b.raw[i]),
-                hwy::HighestValue<T>()));
-  }
-  return a;
-}
-
-// ------------------------------ SaturatedSub
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SaturatedSub(Vec128<T, N> a, const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<T>(
-        HWY_MIN(HWY_MAX(hwy::LowestValue<T>(), a.raw[i] - b.raw[i]),
-                hwy::HighestValue<T>()));
-  }
-  return a;
-}
-
-// ------------------------------ AverageRound
-template <typename T, size_t N>
-HWY_API Vec128<T, N> AverageRound(Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(!IsSigned<T>(), "Only for unsigned");
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<T>((a.raw[i] + b.raw[i] + 1) / 2);
-  }
-  return a;
-}
-
-// ------------------------------ Abs
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Abs(SignedTag /*tag*/, Vec128<T, N> a) {
-  for (size_t i = 0; i < N; ++i) {
-    const T s = a.raw[i];
-    const T min = hwy::LimitsMin<T>();
-    a.raw[i] = static_cast<T>((s >= 0 || s == min) ? a.raw[i] : -s);
-  }
-  return a;
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Abs(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = std::abs(v.raw[i]);
-  }
-  return v;
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Abs(Vec128<T, N> a) {
-  return detail::Abs(hwy::TypeTag<T>(), a);
-}
-
-// ------------------------------ Min/Max
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Min(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
-  }
-  return a;
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Max(hwy::NonFloatTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
-  }
-  return a;
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Min(hwy::FloatTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    if (std::isnan(a.raw[i])) {
-      a.raw[i] = b.raw[i];
-    } else if (std::isnan(b.raw[i])) {
-      // no change
-    } else {
-      a.raw[i] = HWY_MIN(a.raw[i], b.raw[i]);
-    }
-  }
-  return a;
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Max(hwy::FloatTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    if (std::isnan(a.raw[i])) {
-      a.raw[i] = b.raw[i];
-    } else if (std::isnan(b.raw[i])) {
-      // no change
-    } else {
-      a.raw[i] = HWY_MAX(a.raw[i], b.raw[i]);
-    }
-  }
-  return a;
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Min(Vec128<T, N> a, const Vec128<T, N> b) {
-  return detail::Min(hwy::IsFloatTag<T>(), a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Max(Vec128<T, N> a, const Vec128<T, N> b) {
-  return detail::Max(hwy::IsFloatTag<T>(), a, b);
-}
-
-// ------------------------------ Neg
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, Vec128<T, N> v) {
-  return Zero(Simd<T, N, 0>()) - v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Neg(hwy::FloatTag /*tag*/, Vec128<T, N> v) {
-  return Xor(v, SignBit(Simd<T, N, 0>()));
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Neg(Vec128<T, N> v) {
-  return detail::Neg(hwy::IsFloatTag<T>(), v);
-}
-
-// ------------------------------ Mul/Div
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Mul(hwy::FloatTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] *= b.raw[i];
-  }
-  return a;
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Mul(SignedTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<T>(static_cast<int64_t>(a.raw[i]) * b.raw[i]);
-  }
-  return a;
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Mul(UnsignedTag /*tag*/, Vec128<T, N> a,
-                            const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<T>(static_cast<uint64_t>(a.raw[i]) * b.raw[i]);
-  }
-  return a;
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator*(Vec128<T, N> a, const Vec128<T, N> b) {
-  return detail::Mul(hwy::TypeTag<T>(), a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator/(Vec128<T, N> a, const Vec128<T, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] /= b.raw[i];
-  }
-  return a;
-}
-
-// Returns the upper 16 bits of a * b in each lane.
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a,
-                                   const Vec128<int16_t, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<int16_t>((int32_t{a.raw[i]} * b.raw[i]) >> 16);
-  }
-  return a;
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
-                                    const Vec128<uint16_t, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    // Cast to uint32_t first to prevent overflow. Otherwise the result of
-    // uint16_t * uint16_t is in "int" which may overflow. In practice the
-    // result is the same but this way it is also defined.
-    a.raw[i] = static_cast<uint16_t>(
-        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
-        16);
-  }
-  return a;
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
-                                           Vec128<int16_t, N> b) {
-  for (size_t i = 0; i < N; ++i) {
-    a.raw[i] = static_cast<int16_t>((2 * a.raw[i] * b.raw[i] + 32768) >> 16);
-  }
-  return a;
-}
-
-// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
-template <size_t N>
-HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
-                                             const Vec128<int32_t, N> b) {
-  Vec128<int64_t, (N + 1) / 2> mul;
-  for (size_t i = 0; i < N; i += 2) {
-    const int64_t a64 = a.raw[i];
-    mul.raw[i / 2] = a64 * b.raw[i];
-  }
-  return mul;
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
-                                              const Vec128<uint32_t, N> b) {
-  Vec128<uint64_t, (N + 1) / 2> mul;
-  for (size_t i = 0; i < N; i += 2) {
-    const uint64_t a64 = a.raw[i];
-    mul.raw[i / 2] = a64 * b.raw[i];
-  }
-  return mul;
-}
-
-template <size_t N>
-HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(const Vec128<int32_t, N> a,
-                                            const Vec128<int32_t, N> b) {
-  Vec128<int64_t, (N + 1) / 2> mul;
-  for (size_t i = 0; i < N; i += 2) {
-    const int64_t a64 = a.raw[i + 1];
-    mul.raw[i / 2] = a64 * b.raw[i + 1];
-  }
-  return mul;
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
-                                             const Vec128<uint32_t, N> b) {
-  Vec128<uint64_t, (N + 1) / 2> mul;
-  for (size_t i = 0; i < N; i += 2) {
-    const uint64_t a64 = a.raw[i + 1];
-    mul.raw[i / 2] = a64 * b.raw[i + 1];
-  }
-  return mul;
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocal(Vec128<float, N> v) {
-  for (size_t i = 0; i < N; ++i) {
-    // Zero inputs are allowed, but callers are responsible for replacing the
-    // return value with something else (typically using IfThenElse). This check
-    // avoids a ubsan error. The result is arbitrary.
-    v.raw[i] = (std::abs(v.raw[i]) == 0.0f) ? 0.0f : 1.0f / v.raw[i];
-  }
-  return v;
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> AbsDiff(Vec128<float, N> a, const Vec128<float, N> b) {
-  return Abs(a - b);
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, const Vec128<T, N> x,
-                            const Vec128<T, N> add) {
-  return mul * x + add;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, const Vec128<T, N> x,
-                               const Vec128<T, N> add) {
-  return add - mul * x;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, const Vec128<T, N> x,
-                            const Vec128<T, N> sub) {
-  return mul * x - sub;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, const Vec128<T, N> x,
-                               const Vec128<T, N> sub) {
-  return Neg(mul) * x - sub;
-}
-
-// ------------------------------ Floating-point square root
-
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocalSqrt(Vec128<float, N> v) {
-  for (size_t i = 0; i < N; ++i) {
-    const float half = v.raw[i] * 0.5f;
-    uint32_t bits;
-    CopySameSize(&v.raw[i], &bits);
-    // Initial guess based on log2(f)
-    bits = 0x5F3759DF - (bits >> 1);
-    CopySameSize(&bits, &v.raw[i]);
-    // One Newton-Raphson iteration
-    v.raw[i] = v.raw[i] * (1.5f - (half * v.raw[i] * v.raw[i]));
-  }
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Sqrt(Vec128<T, N> v) {
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = std::sqrt(v.raw[i]);
-  }
-  return v;
-}
-
-// ------------------------------ Floating-point rounding
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Round(Vec128<T, N> v) {
-  using TI = MakeSigned<T>;
-  const Vec128<T, N> a = Abs(v);
-  for (size_t i = 0; i < N; ++i) {
-    if (!(a.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
-      continue;
-    }
-    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
-    const TI rounded = static_cast<TI>(v.raw[i] + bias);
-    if (rounded == 0) {
-      v.raw[i] = v.raw[i] < 0 ? T{-0} : T{0};
-      continue;
-    }
-    const T rounded_f = static_cast<T>(rounded);
-    // Round to even
-    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
-      v.raw[i] = static_cast<T>(rounded - (v.raw[i] < T(0) ? -1 : 1));
-      continue;
-    }
-    v.raw[i] = rounded_f;
-  }
-  return v;
-}
-
-// Round-to-nearest even.
-template <size_t N>
-HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
-  using T = float;
-  using TI = int32_t;
-
-  const Vec128<float, N> abs = Abs(v);
-  Vec128<int32_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    const bool signbit = std::signbit(v.raw[i]);
-
-    if (!(abs.raw[i] < MantissaEnd<T>())) {  // Huge or NaN
-      // Check if too large to cast or NaN
-      if (!(abs.raw[i] <= static_cast<T>(LimitsMax<TI>()))) {
-        ret.raw[i] = signbit ? LimitsMin<TI>() : LimitsMax<TI>();
-        continue;
-      }
-      ret.raw[i] = static_cast<TI>(v.raw[i]);
-      continue;
-    }
-    const T bias = v.raw[i] < T(0.0) ? T(-0.5) : T(0.5);
-    const TI rounded = static_cast<TI>(v.raw[i] + bias);
-    if (rounded == 0) {
-      ret.raw[i] = 0;
-      continue;
-    }
-    const T rounded_f = static_cast<T>(rounded);
-    // Round to even
-    if ((rounded & 1) && std::abs(rounded_f - v.raw[i]) == T(0.5)) {
-      ret.raw[i] = rounded - (signbit ? -1 : 1);
-      continue;
-    }
-    ret.raw[i] = rounded;
-  }
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Trunc(Vec128<T, N> v) {
-  using TI = MakeSigned<T>;
-  const Vec128<T, N> abs = Abs(v);
-  for (size_t i = 0; i < N; ++i) {
-    if (!(abs.raw[i] <= MantissaEnd<T>())) {  // Huge or NaN
-      continue;
-    }
-    const TI truncated = static_cast<TI>(v.raw[i]);
-    if (truncated == 0) {
-      v.raw[i] = v.raw[i] < 0 ? -T{0} : T{0};
-      continue;
-    }
-    v.raw[i] = static_cast<T>(truncated);
-  }
-  return v;
-}
-
-// Toward +infinity, aka ceiling
-template <typename Float, size_t N>
-Vec128<Float, N> Ceil(Vec128<Float, N> v) {
-  constexpr int kMantissaBits = MantissaBits<Float>();
-  using Bits = MakeUnsigned<Float>;
-  const Bits kExponentMask = MaxExponentField<Float>();
-  const Bits kMantissaMask = MantissaMask<Float>();
-  const Bits kBias = kExponentMask / 2;
-
-  for (size_t i = 0; i < N; ++i) {
-    const bool positive = v.raw[i] > Float(0.0);
-
-    Bits bits;
-    CopySameSize(&v.raw[i], &bits);
-
-    const int exponent =
-        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
-    // Already an integer.
-    if (exponent >= kMantissaBits) continue;
-    // |v| <= 1 => 0 or 1.
-    if (exponent < 0) {
-      v.raw[i] = positive ? Float{1} : Float{-0.0};
-      continue;
-    }
-
-    const Bits mantissa_mask = kMantissaMask >> exponent;
-    // Already an integer
-    if ((bits & mantissa_mask) == 0) continue;
-
-    // Clear fractional bits and round up
-    if (positive) bits += (kMantissaMask + 1) >> exponent;
-    bits &= ~mantissa_mask;
-
-    CopySameSize(&bits, &v.raw[i]);
-  }
-  return v;
-}
-
-// Toward -infinity, aka floor
-template <typename Float, size_t N>
-Vec128<Float, N> Floor(Vec128<Float, N> v) {
-  constexpr int kMantissaBits = MantissaBits<Float>();
-  using Bits = MakeUnsigned<Float>;
-  const Bits kExponentMask = MaxExponentField<Float>();
-  const Bits kMantissaMask = MantissaMask<Float>();
-  const Bits kBias = kExponentMask / 2;
-
-  for (size_t i = 0; i < N; ++i) {
-    const bool negative = v.raw[i] < Float(0.0);
-
-    Bits bits;
-    CopySameSize(&v.raw[i], &bits);
-
-    const int exponent =
-        static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
-    // Already an integer.
-    if (exponent >= kMantissaBits) continue;
-    // |v| <= 1 => -1 or 0.
-    if (exponent < 0) {
-      v.raw[i] = negative ? Float(-1.0) : Float(0.0);
-      continue;
-    }
-
-    const Bits mantissa_mask = kMantissaMask >> exponent;
-    // Already an integer
-    if ((bits & mantissa_mask) == 0) continue;
-
-    // Clear fractional bits and round down
-    if (negative) bits += (kMantissaMask + 1) >> exponent;
-    bits &= ~mantissa_mask;
-
-    CopySameSize(&bits, &v.raw[i]);
-  }
-  return v;
-}
-
-// ------------------------------ Floating-point classification
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
-  Mask128<T, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
-    MakeUnsigned<T> bits;
-    CopySameSize(&v.raw[i], &bits);
-    bits += bits;
-    bits >>= 1;  // clear sign bit
-    // NaN if all exponent bits are set and the mantissa is not zero.
-    ret.bits[i] = Mask128<T, N>::FromBool(bits > ExponentMask<T>());
-  }
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Simd<T, N, 0> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  using VI = VFromD<decltype(di)>;
-  using VU = VFromD<decltype(du)>;
-  const VU vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VI exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
-// ================================================== COMPARE
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] == b.raw[i]);
-  }
-  return m;
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] != b.raw[i]);
-  }
-  return m;
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return (v & bit) == bit;
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] < b.raw[i]);
-  }
-  return m;
-}
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator>(const Vec128<T, N> a, const Vec128<T, N> b) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] > b.raw[i]);
-  }
-  return m;
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator<=(const Vec128<T, N> a, const Vec128<T, N> b) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] <= b.raw[i]);
-  }
-  return m;
-}
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator>=(const Vec128<T, N> a, const Vec128<T, N> b) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    m.bits[i] = Mask128<T, N>::FromBool(a.raw[i] >= b.raw[i]);
-  }
-  return m;
-}
-
-// ------------------------------ Lt128
-
-// Only makes sense for full vectors of u64.
-HWY_API Mask128<uint64_t> Lt128(Simd<uint64_t, 2, 0> /* tag */,
-                                Vec128<uint64_t> a, const Vec128<uint64_t> b) {
-  const bool lt =
-      (a.raw[1] < b.raw[1]) || (a.raw[1] == b.raw[1] && a.raw[0] < b.raw[0]);
-  Mask128<uint64_t> ret;
-  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
-  return ret;
-}
-
-HWY_API Mask128<uint64_t> Lt128Upper(Simd<uint64_t, 2, 0> /* tag */,
-                                     Vec128<uint64_t> a,
-                                     const Vec128<uint64_t> b) {
-  const bool lt = a.raw[1] < b.raw[1];
-  Mask128<uint64_t> ret;
-  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(lt);
-  return ret;
-}
-
-// ------------------------------ Eq128
-
-// Only makes sense for full vectors of u64.
-HWY_API Mask128<uint64_t> Eq128(Simd<uint64_t, 2, 0> /* tag */,
-                                Vec128<uint64_t> a, const Vec128<uint64_t> b) {
-  const bool eq = a.raw[1] == b.raw[1] && a.raw[0] == b.raw[0];
-  Mask128<uint64_t> ret;
-  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
-  return ret;
-}
-
-HWY_API Mask128<uint64_t> Eq128Upper(Simd<uint64_t, 2, 0> /* tag */,
-                                     Vec128<uint64_t> a,
-                                     const Vec128<uint64_t> b) {
-  const bool eq = a.raw[1] == b.raw[1];
-  Mask128<uint64_t> ret;
-  ret.bits[0] = ret.bits[1] = Mask128<uint64_t>::FromBool(eq);
-  return ret;
-}
-
-// ------------------------------ Min128, Max128 (Lt128)
-
-template <class D, class V = VFromD<D>>
-HWY_API V Min128(D d, const V a, const V b) {
-  return IfThenElse(Lt128(d, a, b), a, b);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API V Max128(D d, const V a, const V b) {
-  return IfThenElse(Lt128(d, b, a), a, b);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API V Min128Upper(D d, const V a, const V b) {
-  return IfThenElse(Lt128Upper(d, a, b), a, b);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API V Max128Upper(D d, const V a, const V b) {
-  return IfThenElse(Lt128Upper(d, b, a), a, b);
-}
-
-// ================================================== MEMORY
-
-// ------------------------------ Load
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */,
-                          const T* HWY_RESTRICT aligned) {
-  Vec128<T, N> v;
-  CopyBytes<sizeof(T) * N>(aligned, v.raw);  // copy from array
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
-                                const T* HWY_RESTRICT aligned) {
-  return IfThenElseZero(m, Load(d, aligned));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// In some use cases, "load single lane" is sufficient; otherwise avoid this.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d,
-                                const T* HWY_RESTRICT aligned) {
-  return Load(d, aligned);
-}
-
-// ------------------------------ Store
-
-template <typename T, size_t N>
-HWY_API void Store(const Vec128<T, N> v, Simd<T, N, 0> /* tag */,
-                   T* HWY_RESTRICT aligned) {
-  CopyBytes<sizeof(T) * N>(v.raw, aligned);  // copy to array
-}
-
-template <typename T, size_t N>
-HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
-  Store(v, d, p);
-}
-
-template <typename T, size_t N>
-HWY_API void BlendedStore(const Vec128<T, N> v, Mask128<T, N> m,
-                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  for (size_t i = 0; i < N; ++i) {
-    if (m.bits[i]) p[i] = v.raw[i];
-  }
-}
-
-// ------------------------------ LoadInterleaved2/3/4
-
-// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
-// We implement those here because scalar code is likely faster than emulation
-// via shuffles.
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#else
-#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#endif
-
-template <typename T, size_t N>
-HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              Vec128<T, N>& v0, Vec128<T, N>& v1) {
-  alignas(16) T buf0[N];
-  alignas(16) T buf1[N];
-  for (size_t i = 0; i < N; ++i) {
-    buf0[i] = *unaligned++;
-    buf1[i] = *unaligned++;
-  }
-  v0 = Load(d, buf0);
-  v1 = Load(d, buf1);
-}
-
-template <typename T, size_t N>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              Vec128<T, N>& v0, Vec128<T, N>& v1,
-                              Vec128<T, N>& v2) {
-  alignas(16) T buf0[N];
-  alignas(16) T buf1[N];
-  alignas(16) T buf2[N];
-  for (size_t i = 0; i < N; ++i) {
-    buf0[i] = *unaligned++;
-    buf1[i] = *unaligned++;
-    buf2[i] = *unaligned++;
-  }
-  v0 = Load(d, buf0);
-  v1 = Load(d, buf1);
-  v2 = Load(d, buf2);
-}
-
-template <typename T, size_t N>
-HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              Vec128<T, N>& v0, Vec128<T, N>& v1,
-                              Vec128<T, N>& v2, Vec128<T, N>& v3) {
-  alignas(16) T buf0[N];
-  alignas(16) T buf1[N];
-  alignas(16) T buf2[N];
-  alignas(16) T buf3[N];
-  for (size_t i = 0; i < N; ++i) {
-    buf0[i] = *unaligned++;
-    buf1[i] = *unaligned++;
-    buf2[i] = *unaligned++;
-    buf3[i] = *unaligned++;
-  }
-  v0 = Load(d, buf0);
-  v1 = Load(d, buf1);
-  v2 = Load(d, buf2);
-  v3 = Load(d, buf3);
-}
-
-// ------------------------------ StoreInterleaved2/3/4
-
-template <typename T, size_t N>
-HWY_API void StoreInterleaved2(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               Simd<T, N, 0> /* tag */,
-                               T* HWY_RESTRICT unaligned) {
-  for (size_t i = 0; i < N; ++i) {
-    *unaligned++ = v0.raw[i];
-    *unaligned++ = v1.raw[i];
-  }
-}
-
-template <typename T, size_t N>
-HWY_API void StoreInterleaved3(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               const Vec128<T, N> v2, Simd<T, N, 0> /* tag */,
-                               T* HWY_RESTRICT unaligned) {
-  for (size_t i = 0; i < N; ++i) {
-    *unaligned++ = v0.raw[i];
-    *unaligned++ = v1.raw[i];
-    *unaligned++ = v2.raw[i];
-  }
-}
-
-template <typename T, size_t N>
-HWY_API void StoreInterleaved4(const Vec128<T, N> v0, const Vec128<T, N> v1,
-                               const Vec128<T, N> v2, const Vec128<T, N> v3,
-                               Simd<T, N, 0> /* tag */,
-                               T* HWY_RESTRICT unaligned) {
-  for (size_t i = 0; i < N; ++i) {
-    *unaligned++ = v0.raw[i];
-    *unaligned++ = v1.raw[i];
-    *unaligned++ = v2.raw[i];
-    *unaligned++ = v3.raw[i];
-  }
-}
-
-// ------------------------------ Stream
-
-template <typename T, size_t N>
-HWY_API void Stream(const Vec128<T, N> v, Simd<T, N, 0> d,
-                    T* HWY_RESTRICT aligned) {
-  Store(v, d, aligned);
-}
-
-// ------------------------------ Scatter
-
-template <typename T, size_t N, typename Offset>
-HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* base,
-                           const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  for (size_t i = 0; i < N; ++i) {
-    uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw[i];
-    CopyBytes<sizeof(T)>(&v.raw[i], base8);  // copy to bytes
-  }
-}
-
-template <typename T, size_t N, typename Index>
-HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
-                          T* HWY_RESTRICT base, const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  for (size_t i = 0; i < N; ++i) {
-    base[index.raw[i]] = v.raw[i];
-  }
-}
-
-// ------------------------------ Gather
-
-template <typename T, size_t N, typename Offset>
-HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> /* tag */, const T* base,
-                                  const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  Vec128<T, N> v;
-  for (size_t i = 0; i < N; ++i) {
-    const uint8_t* base8 =
-        reinterpret_cast<const uint8_t*>(base) + offset.raw[i];
-    CopyBytes<sizeof(T)>(base8, &v.raw[i]);  // copy from bytes
-  }
-  return v;
-}
-
-template <typename T, size_t N, typename Index>
-HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> /* tag */,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  Vec128<T, N> v;
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = base[index.raw[i]];
-  }
-  return v;
-}
-
-// ================================================== CONVERT
-
-// ConvertTo and DemoteTo with floating-point input and integer output truncate
-// (rounding toward zero).
-
-template <typename FromT, typename ToT, size_t N>
-HWY_API Vec128<ToT, N> PromoteTo(Simd<ToT, N, 0> /* tag */,
-                                 Vec128<FromT, N> from) {
-  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
-  Vec128<ToT, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    // For bits Y > X, floatX->floatY and intX->intY are always representable.
-    ret.raw[i] = static_cast<ToT>(from.raw[i]);
-  }
-  return ret;
-}
-
-// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
-// so we overload for FromT=double and ToT={float,int32_t}.
-template <size_t N>
-HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
-                                  Vec128<double, N> from) {
-  Vec128<float, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    // Prevent ubsan errors when converting float to narrower integer/float
-    if (std::isinf(from.raw[i]) ||
-        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<float>())) {
-      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<float>()
-                                             : HighestValue<float>();
-      continue;
-    }
-    ret.raw[i] = static_cast<float>(from.raw[i]);
-  }
-  return ret;
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
-                                    Vec128<double, N> from) {
-  Vec128<int32_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
-    if (std::isinf(from.raw[i]) ||
-        std::fabs(from.raw[i]) > static_cast<double>(HighestValue<int32_t>())) {
-      ret.raw[i] = std::signbit(from.raw[i]) ? LowestValue<int32_t>()
-                                             : HighestValue<int32_t>();
-      continue;
-    }
-    ret.raw[i] = static_cast<int32_t>(from.raw[i]);
-  }
-  return ret;
-}
-
-template <typename FromT, typename ToT, size_t N>
-HWY_API Vec128<ToT, N> DemoteTo(Simd<ToT, N, 0> /* tag */,
-                                Vec128<FromT, N> from) {
-  static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
-  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
-
-  Vec128<ToT, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    // Int to int: choose closest value in ToT to `from` (avoids UB)
-    from.raw[i] =
-        HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw[i]), LimitsMax<ToT>());
-    ret.raw[i] = static_cast<ToT>(from.raw[i]);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
-    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec128<uint32_t, N> b_in_lower = ShiftRight<16>(BitCast(du32, b));
-  // Avoid OddEven - we want the upper half of `a` even on big-endian systems.
-  const Vec128<uint32_t, N> a_mask = Set(du32, 0xFFFF0000);
-  return BitCast(dbf16, IfVecThenElse(a_mask, BitCast(du32, a), b_in_lower));
-}
-
-namespace detail {
-
-HWY_INLINE void StoreU16ToF16(const uint16_t val,
-                              hwy::float16_t* HWY_RESTRICT to) {
-  CopySameSize(&val, to);
-}
-
-HWY_INLINE uint16_t U16FromF16(const hwy::float16_t* HWY_RESTRICT from) {
-  uint16_t bits16;
-  CopySameSize(from, &bits16);
-  return bits16;
-}
-
-}  // namespace detail
-
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<float16_t, N> v) {
-  Vec128<float, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    const uint16_t bits16 = detail::U16FromF16(&v.raw[i]);
-    const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
-    const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
-    const uint32_t mantissa = bits16 & 0x3FF;
-
-    // Subnormal or zero
-    if (biased_exp == 0) {
-      const float subnormal =
-          (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
-      ret.raw[i] = sign ? -subnormal : subnormal;
-      continue;
-    }
-
-    // Normalized: convert the representation directly (faster than
-    // ldexp/tables).
-    const uint32_t biased_exp32 = biased_exp + (127 - 15);
-    const uint32_t mantissa32 = mantissa << (23 - 10);
-    const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
-    CopySameSize(&bits32, &ret.raw[i]);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<bfloat16_t, N> v) {
-  Vec128<float, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = F32FromBF16(v.raw[i]);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> /* tag */,
-                                      const Vec128<float, N> v) {
-  Vec128<float16_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    uint32_t bits32;
-    CopySameSize(&v.raw[i], &bits32);
-    const uint32_t sign = bits32 >> 31;
-    const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
-    const uint32_t mantissa32 = bits32 & 0x7FFFFF;
-
-    const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
-
-    // Tiny or zero => zero.
-    if (exp < -24) {
-      ZeroBytes<sizeof(uint16_t)>(&ret.raw[i]);
-      continue;
-    }
-
-    uint32_t biased_exp16, mantissa16;
-
-    // exp = [-24, -15] => subnormal
-    if (exp < -14) {
-      biased_exp16 = 0;
-      const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
-      HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
-      mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
-                                         (mantissa32 >> (13 + sub_exp)));
-    } else {
-      // exp = [-14, 15]
-      biased_exp16 = static_cast<uint32_t>(exp + 15);
-      HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
-      mantissa16 = mantissa32 >> 13;
-    }
-
-    HWY_DASSERT(mantissa16 < 1024);
-    const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
-    HWY_DASSERT(bits16 < 0x10000);
-    const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
-    detail::StoreU16ToF16(narrowed, &ret.raw[i]);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> /* tag */,
-                                       const Vec128<float, N> v) {
-  Vec128<bfloat16_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = BF16FromF32(v.raw[i]);
-  }
-  return ret;
-}
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename FromT, typename ToT, size_t N>
-HWY_API Vec128<ToT, N> ConvertTo(hwy::FloatTag /*tag*/,
-                                 Simd<ToT, N, 0> /* tag */,
-                                 Vec128<FromT, N> from) {
-  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
-  Vec128<ToT, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    // float## -> int##: return closest representable value. We cannot exactly
-    // represent LimitsMax<ToT> in FromT, so use double.
-    const double f = static_cast<double>(from.raw[i]);
-    if (std::isinf(from.raw[i]) ||
-        std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
-      ret.raw[i] =
-          std::signbit(from.raw[i]) ? LimitsMin<ToT>() : LimitsMax<ToT>();
-      continue;
-    }
-    ret.raw[i] = static_cast<ToT>(from.raw[i]);
-  }
-  return ret;
-}
-
-template <typename FromT, typename ToT, size_t N>
-HWY_API Vec128<ToT, N> ConvertTo(hwy::NonFloatTag /*tag*/,
-                                 Simd<ToT, N, 0> /* tag */,
-                                 Vec128<FromT, N> from) {
-  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
-  Vec128<ToT, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    // int## -> float##: no check needed
-    ret.raw[i] = static_cast<ToT>(from.raw[i]);
-  }
-  return ret;
-}
-
-}  // namespace detail
-
-template <typename FromT, typename ToT, size_t N>
-HWY_API Vec128<ToT, N> ConvertTo(Simd<ToT, N, 0> d, Vec128<FromT, N> from) {
-  return detail::ConvertTo(hwy::IsFloatTag<FromT>(), d, from);
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
-  return DemoteTo(Simd<uint8_t, N, 0>(), v);
-}
-
-// ------------------------------ Truncations
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint64_t, N> v) {
-  Vec128<uint8_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
-                                       const Vec128<uint64_t, N> v) {
-  Vec128<uint16_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<uint32_t, N> TruncateTo(Simd<uint32_t, N, 0> /* tag */,
-                                       const Vec128<uint64_t, N> v) {
-  Vec128<uint32_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = static_cast<uint32_t>(v.raw[i] & 0xFFFFFFFFu);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint32_t, N> v) {
-  Vec128<uint8_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
-                                       const Vec128<uint32_t, N> v) {
-  Vec128<uint16_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = static_cast<uint16_t>(v.raw[i] & 0xFFFF);
-  }
-  return ret;
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint16_t, N> v) {
-  Vec128<uint8_t, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = static_cast<uint8_t>(v.raw[i] & 0xFF);
-  }
-  return ret;
-}
-
-// ================================================== COMBINE
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
-  Vec128<T, N / 2> ret;
-  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
-                                   Vec128<T, N> v) {
-  return LowerHalf(v);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> UpperHalf(Simd<T, N / 2, 0> /* tag */,
-                                   Vec128<T, N> v) {
-  Vec128<T, N / 2> ret;
-  CopyBytes<N / 2 * sizeof(T)>(&v.raw[N / 2], ret.raw);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> /* tag */,
-                                      Vec128<T, N / 2> v) {
-  Vec128<T, N> ret;
-  CopyBytes<N / 2 * sizeof(T)>(v.raw, ret.raw);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Combine(Simd<T, N, 0> /* tag */, Vec128<T, N / 2> hi_half,
-                             Vec128<T, N / 2> lo_half) {
-  Vec128<T, N> ret;
-  CopyBytes<N / 2 * sizeof(T)>(lo_half.raw, &ret.raw[0]);
-  CopyBytes<N / 2 * sizeof(T)>(hi_half.raw, &ret.raw[N / 2]);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  Vec128<T, N> ret;
-  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
-  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  Vec128<T, N> ret;
-  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
-  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> /* tag */,
-                                      const Vec128<T, N> hi,
-                                      const Vec128<T, N> lo) {
-  Vec128<T, N> ret;
-  CopyBytes<N / 2 * sizeof(T)>(&lo.raw[N / 2], &ret.raw[0]);
-  CopyBytes<N / 2 * sizeof(T)>(hi.raw, &ret.raw[N / 2]);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  Vec128<T, N> ret;
-  CopyBytes<N / 2 * sizeof(T)>(lo.raw, &ret.raw[0]);
-  CopyBytes<N / 2 * sizeof(T)>(&hi.raw[N / 2], &ret.raw[N / 2]);
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatEven(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
-                                Vec128<T, N> lo) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N / 2; ++i) {
-    ret.raw[i] = lo.raw[2 * i];
-  }
-  for (size_t i = 0; i < N / 2; ++i) {
-    ret.raw[N / 2 + i] = hi.raw[2 * i];
-  }
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatOdd(Simd<T, N, 0> /* tag */, Vec128<T, N> hi,
-                               Vec128<T, N> lo) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N / 2; ++i) {
-    ret.raw[i] = lo.raw[2 * i + 1];
-  }
-  for (size_t i = 0; i < N / 2; ++i) {
-    ret.raw[N / 2 + i] = hi.raw[2 * i + 1];
-  }
-  return ret;
-}
-
-// ------------------------------ CombineShiftRightBytes
-
-template <int kBytes, typename T, size_t N, class V = Vec128<T, N>>
-HWY_API V CombineShiftRightBytes(Simd<T, N, 0> /* tag */, V hi, V lo) {
-  V ret;
-  const uint8_t* HWY_RESTRICT lo8 =
-      reinterpret_cast<const uint8_t * HWY_RESTRICT>(lo.raw);
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  CopyBytes<sizeof(T) * N - kBytes>(lo8 + kBytes, ret8);
-  CopyBytes<kBytes>(hi.raw, ret8 + sizeof(T) * N - kBytes);
-  return ret;
-}
-
-// ------------------------------ ShiftLeftBytes
-
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  Vec128<T, N> ret;
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  ZeroBytes<kBytes>(ret8);
-  CopyBytes<sizeof(T) * N - kBytes>(v.raw, ret8 + kBytes);
-  return ret;
-}
-
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
-  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
-  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
-}
-
-// ------------------------------ ShiftRightBytes
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  Vec128<T, N> ret;
-  const uint8_t* HWY_RESTRICT v8 =
-      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
-  uint8_t* HWY_RESTRICT ret8 =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  CopyBytes<sizeof(T) * N - kBytes>(v8 + kBytes, ret8);
-  ZeroBytes<kBytes>(ret8 + sizeof(T) * N - kBytes);
-  return ret;
-}
-
-// ------------------------------ ShiftRightLanes
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
-}
-
-// ================================================== SWIZZLE
-
-template <typename T, size_t N>
-HWY_API T GetLane(const Vec128<T, N> v) {
-  return v.raw[0];
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> InsertLane(Vec128<T, N> v, size_t i, T t) {
-  v.raw[i] = t;
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
-  return v.raw[i];
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
-  for (size_t i = 0; i < N; i += 2) {
-    v.raw[i + 1] = v.raw[i];
-  }
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
-  for (size_t i = 0; i < N; i += 2) {
-    v.raw[i] = v.raw[i + 1];
-  }
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OddEven(Vec128<T, N> odd, Vec128<T, N> even) {
-  for (size_t i = 0; i < N; i += 2) {
-    odd.raw[i] = even.raw[i];
-  }
-  return odd;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
-  return even;
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
-  return v;
-}
-
-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T, size_t N>
-struct Indices128 {
-  MakeSigned<T> raw[N];
-};
-
-template <typename T, size_t N, typename TI>
-HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0>, Vec128<TI, N> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
-  Indices128<T, N> ret;
-  CopyBytes<N * sizeof(T)>(vec.raw, ret.raw);
-  return ret;
-}
-
-template <typename T, size_t N, typename TI>
-HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
-  return IndicesFromVec(d, LoadU(Simd<TI, N, 0>(), idx));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> TableLookupLanes(const Vec128<T, N> v,
-                                      const Indices128<T, N> idx) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = v.raw[idx.raw[i]];
-  }
-  return ret;
-}
-
-// ------------------------------ ReverseBlocks
-
-// Single block: no change
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ReverseBlocks(Simd<T, N, 0> /* tag */,
-                                   const Vec128<T, N> v) {
-  return v;
-}
-
-// ------------------------------ Reverse
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    ret.raw[i] = v.raw[N - 1 - i];
-  }
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N; i += 2) {
-    ret.raw[i + 0] = v.raw[i + 1];
-    ret.raw[i + 1] = v.raw[i + 0];
-  }
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N; i += 4) {
-    ret.raw[i + 0] = v.raw[i + 3];
-    ret.raw[i + 1] = v.raw[i + 2];
-    ret.raw[i + 2] = v.raw[i + 1];
-    ret.raw[i + 3] = v.raw[i + 0];
-  }
-  return ret;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N; i += 8) {
-    ret.raw[i + 0] = v.raw[i + 7];
-    ret.raw[i + 1] = v.raw[i + 6];
-    ret.raw[i + 2] = v.raw[i + 5];
-    ret.raw[i + 3] = v.raw[i + 4];
-    ret.raw[i + 4] = v.raw[i + 3];
-    ret.raw[i + 5] = v.raw[i + 2];
-    ret.raw[i + 6] = v.raw[i + 1];
-    ret.raw[i + 7] = v.raw[i + 0];
-  }
-  return ret;
-}
-
-// ================================================== BLOCKWISE
-
-// ------------------------------ Shuffle*
-
-// Swap 32-bit halves in 64-bit halves.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit");
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Reverse2(DFromV<decltype(v)>(), v);
-}
-
-// Swap 64-bit halves
-template <typename T>
-HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit");
-  Vec128<T> ret;
-  ret.raw[3] = v.raw[1];
-  ret.raw[2] = v.raw[0];
-  ret.raw[1] = v.raw[3];
-  ret.raw[0] = v.raw[2];
-  return ret;
-}
-template <typename T>
-HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
-  static_assert(sizeof(T) == 8, "Only for 64-bit");
-  return Reverse2(DFromV<decltype(v)>(), v);
-}
-
-// Rotate right 32 bits
-template <typename T>
-HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
-  Vec128<T> ret;
-  ret.raw[3] = v.raw[0];
-  ret.raw[2] = v.raw[3];
-  ret.raw[1] = v.raw[2];
-  ret.raw[0] = v.raw[1];
-  return ret;
-}
-
-// Rotate left 32 bits
-template <typename T>
-HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
-  Vec128<T> ret;
-  ret.raw[3] = v.raw[2];
-  ret.raw[2] = v.raw[1];
-  ret.raw[1] = v.raw[0];
-  ret.raw[0] = v.raw[3];
-  return ret;
-}
-
-template <typename T>
-HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
-  return Reverse4(DFromV<decltype(v)>(), v);
-}
-
-// ------------------------------ Broadcast/splat any lane
-
-template <int kLane, typename T, size_t N>
-HWY_API Vec128<T, N> Broadcast(Vec128<T, N> v) {
-  for (size_t i = 0; i < N; ++i) {
-    v.raw[i] = v.raw[kLane];
-  }
-  return v;
-}
-
-// ------------------------------ TableLookupBytes, TableLookupBytesOr0
-
-template <typename T, size_t N, typename TI, size_t NI>
-HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> v,
-                                        const Vec128<TI, NI> indices) {
-  const uint8_t* HWY_RESTRICT v_bytes =
-      reinterpret_cast<const uint8_t * HWY_RESTRICT>(v.raw);
-  const uint8_t* HWY_RESTRICT idx_bytes =
-      reinterpret_cast<const uint8_t*>(indices.raw);
-  Vec128<TI, NI> ret;
-  uint8_t* HWY_RESTRICT ret_bytes =
-      reinterpret_cast<uint8_t * HWY_RESTRICT>(ret.raw);
-  for (size_t i = 0; i < NI * sizeof(TI); ++i) {
-    const size_t idx = idx_bytes[i];
-    // Avoid out of bounds reads.
-    ret_bytes[i] = idx < sizeof(T) * N ? v_bytes[idx] : 0;
-  }
-  return ret;
-}
-
-template <typename T, size_t N, typename TI, size_t NI>
-HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> v,
-                                           const Vec128<TI, NI> indices) {
-  // Same as TableLookupBytes, which already returns 0 if out of bounds.
-  return TableLookupBytes(v, indices);
-}
-
-// ------------------------------ InterleaveLower/InterleaveUpper
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> InterleaveLower(const Vec128<T, N> a,
-                                     const Vec128<T, N> b) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N / 2; ++i) {
-    ret.raw[2 * i + 0] = a.raw[i];
-    ret.raw[2 * i + 1] = b.raw[i];
-  }
-  return ret;
-}
-
-// Additional overload for the optional tag (also for 256/512).
-template <class V>
-HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
-  return InterleaveLower(a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> InterleaveUpper(Simd<T, N, 0> /* tag */,
-                                     const Vec128<T, N> a,
-                                     const Vec128<T, N> b) {
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N / 2; ++i) {
-    ret.raw[2 * i + 0] = a.raw[N / 2 + i];
-    ret.raw[2 * i + 1] = b.raw[N / 2 + i];
-  }
-  return ret;
-}
-
-// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
-
-// Same as Interleave*, except that the return lanes are double-width integers;
-// this is necessary because the single-lane scalar cannot return two values.
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipLower(V a, V b) {
-  return BitCast(DW(), InterleaveLower(a, b));
-}
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveLower(D(), a, b));
-}
-
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveUpper(D(), a, b));
-}
-
-// ================================================== MASK
-
-template <typename T, size_t N>
-HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  typename Mask128<T, N>::Raw or_sum = 0;
-  for (size_t i = 0; i < N; ++i) {
-    or_sum |= mask.bits[i];
-  }
-  return or_sum == 0;
-}
-
-template <typename T, size_t N>
-HWY_API bool AllTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  using Bits = typename Mask128<T, N>::Raw;
-  constexpr Bits kAll = static_cast<Bits>(~Bits{0});
-  Bits and_sum = kAll;
-  for (size_t i = 0; i < N; ++i) {
-    and_sum &= mask.bits[i];
-  }
-  return and_sum == kAll;
-}
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T, size_t N>
-HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
-                                   const uint8_t* HWY_RESTRICT bits) {
-  Mask128<T, N> m;
-  for (size_t i = 0; i < N; ++i) {
-    const size_t bit = size_t{1} << (i & 7);
-    const size_t idx_byte = i >> 3;
-    m.bits[i] = Mask128<T, N>::FromBool((bits[idx_byte] & bit) != 0);
-  }
-  return m;
-}
-
-// `p` points to at least 8 writable bytes.
-template <typename T, size_t N>
-HWY_API size_t StoreMaskBits(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask,
-                             uint8_t* bits) {
-  bits[0] = 0;
-  if (N > 8) bits[1] = 0;  // N <= 16, so max two bytes
-  for (size_t i = 0; i < N; ++i) {
-    const size_t bit = size_t{1} << (i & 7);
-    const size_t idx_byte = i >> 3;
-    if (mask.bits[i]) {
-      bits[idx_byte] = static_cast<uint8_t>(bits[idx_byte] | bit);
-    }
-  }
-  return N > 8 ? 2 : 1;
-}
-
-template <typename T, size_t N>
-HWY_API size_t CountTrue(Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  size_t count = 0;
-  for (size_t i = 0; i < N; ++i) {
-    count += mask.bits[i] != 0;
-  }
-  return count;
-}
-
-template <typename T, size_t N>
-HWY_API intptr_t FindFirstTrue(Simd<T, N, 0> /* tag */,
-                               const Mask128<T, N> mask) {
-  for (size_t i = 0; i < N; ++i) {
-    if (mask.bits[i] != 0) return static_cast<intptr_t>(i);
-  }
-  return intptr_t{-1};
-}
-
-// ------------------------------ Compress
-
-template <typename T>
-struct CompressIsPartition {
-  enum { value = 1 };
-};
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, const Mask128<T, N> mask) {
-  size_t count = 0;
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    if (mask.bits[i]) {
-      ret.raw[count++] = v.raw[i];
-    }
-  }
-  for (size_t i = 0; i < N; ++i) {
-    if (!mask.bits[i]) {
-      ret.raw[count++] = v.raw[i];
-    }
-  }
-  HWY_DASSERT(count == N);
-  return ret;
-}
-
-// ------------------------------ CompressNot
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, const Mask128<T, N> mask) {
-  size_t count = 0;
-  Vec128<T, N> ret;
-  for (size_t i = 0; i < N; ++i) {
-    if (!mask.bits[i]) {
-      ret.raw[count++] = v.raw[i];
-    }
-  }
-  for (size_t i = 0; i < N; ++i) {
-    if (mask.bits[i]) {
-      ret.raw[count++] = v.raw[i];
-    }
-  }
-  HWY_DASSERT(count == N);
-  return ret;
-}
-
-// ------------------------------ CompressBlocksNot
-HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
-                                           Mask128<uint64_t> /* m */) {
-  return v;
-}
-
-// ------------------------------ CompressBits
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
-                                  const uint8_t* HWY_RESTRICT bits) {
-  return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
-}
-
-// ------------------------------ CompressStore
-template <typename T, size_t N>
-HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
-                             Simd<T, N, 0> /* tag */,
-                             T* HWY_RESTRICT unaligned) {
-  size_t count = 0;
-  for (size_t i = 0; i < N; ++i) {
-    if (mask.bits[i]) {
-      unaligned[count++] = v.raw[i];
-    }
-  }
-  return count;
-}
-
-// ------------------------------ CompressBlendedStore
-template <typename T, size_t N>
-HWY_API size_t CompressBlendedStore(Vec128<T, N> v, const Mask128<T, N> mask,
-                                    Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  return CompressStore(v, mask, d, unaligned);
-}
-
-// ------------------------------ CompressBitsStore
-template <typename T, size_t N>
-HWY_API size_t CompressBitsStore(Vec128<T, N> v,
-                                 const uint8_t* HWY_RESTRICT bits,
-                                 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const Mask128<T, N> mask = LoadMaskBits(d, bits);
-  StoreU(Compress(v, mask), d, unaligned);
-  return CountTrue(d, mask);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-template <size_t N>
-HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
-                                                   Vec128<bfloat16_t, 2 * N> a,
-                                                   Vec128<bfloat16_t, 2 * N> b,
-                                                   const Vec128<float, N> sum0,
-                                                   Vec128<float, N>& sum1) {
-  const Rebind<bfloat16_t, decltype(df32)> dbf16;
-  // Avoid ZipLower/Upper so this also works on big-endian systems.
-  const Vec128<float, N> a0 = PromoteTo(df32, LowerHalf(dbf16, a));
-  const Vec128<float, N> a1 = PromoteTo(df32, UpperHalf(dbf16, a));
-  const Vec128<float, N> b0 = PromoteTo(df32, LowerHalf(dbf16, b));
-  const Vec128<float, N> b1 = PromoteTo(df32, UpperHalf(dbf16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ================================================== REDUCTIONS
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  T sum = T{0};
-  for (size_t i = 0; i < N; ++i) {
-    sum += v.raw[i];
-  }
-  return Set(d, sum);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  T min = HighestValue<T>();
-  for (size_t i = 0; i < N; ++i) {
-    min = HWY_MIN(min, v.raw[i]);
-  }
-  return Set(d, min);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  T max = LowestValue<T>();
-  for (size_t i = 0; i < N; ++i) {
-    max = HWY_MAX(max, v.raw[i]);
-  }
-  return Set(d, max);
-}
-
-// ================================================== OPS WITH DEPENDENCIES
-
-// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
-
-HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
-                                    const Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
-}
-
-HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
-                                   const Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  const Half<Full128<uint64_t>> d2;
-  mul[0] =
-      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/generic_ops-inl.h b/third_party/highway/hwy/ops/generic_ops-inl.h
deleted file mode 100644 (file)
index b01c5de..0000000
+++ /dev/null
@@ -1,1357 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Target-independent types/functions defined after target-specific ops.
-
-// Relies on the external include guard in highway.h.
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
-template <class V>
-using LaneType = decltype(GetLane(V()));
-
-// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
-// type of functions that do not take a vector argument, or as an argument type
-// if the function only has a template argument for D, or for explicit type
-// names instead of auto. This may be a built-in type.
-template <class D>
-using Vec = decltype(Zero(D()));
-
-// Mask type. Useful as the return type of functions that do not take a mask
-// argument, or as an argument type if the function only has a template argument
-// for D, or for explicit type names instead of auto.
-template <class D>
-using Mask = decltype(MaskFromVec(Zero(D())));
-
-// Returns the closest value to v within [lo, hi].
-template <class V>
-HWY_API V Clamp(const V v, const V lo, const V hi) {
-  return Min(Max(lo, v), hi);
-}
-
-// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
-// and RVV has its own implementation of -Lanes.
-#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
-
-template <size_t kLanes, class D, class V = VFromD<D>>
-HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) {
-  constexpr size_t kBytes = kLanes * sizeof(LaneType<V>);
-  static_assert(kBytes < 16, "Shift count is per-block");
-  return CombineShiftRightBytes<kBytes>(d, hi, lo);
-}
-
-#endif
-
-// Returns lanes with the most significant bit set and all other bits zero.
-template <class D>
-HWY_API Vec<D> SignBit(D d) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
-}
-
-// Returns quiet NaN.
-template <class D>
-HWY_API Vec<D> NaN(D d) {
-  const RebindToSigned<D> di;
-  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
-  // mantissa MSB (to indicate quiet) would be sufficient.
-  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
-}
-
-// Returns positive infinity.
-template <class D>
-HWY_API Vec<D> Inf(D d) {
-  const RebindToUnsigned<D> du;
-  using T = TFromD<D>;
-  using TU = TFromD<decltype(du)>;
-  const TU max_x2 = static_cast<TU>(MaxExponentTimes2<T>());
-  return BitCast(d, Set(du, max_x2 >> 1));
-}
-
-// ------------------------------ SafeFillN
-
-template <class D, typename T = TFromD<D>>
-HWY_API void SafeFillN(const size_t num, const T value, D d,
-                       T* HWY_RESTRICT to) {
-#if HWY_MEM_OPS_MIGHT_FAULT
-  (void)d;
-  for (size_t i = 0; i < num; ++i) {
-    to[i] = value;
-  }
-#else
-  BlendedStore(Set(d, value), FirstN(d, num), d, to);
-#endif
-}
-
-// ------------------------------ SafeCopyN
-
-template <class D, typename T = TFromD<D>>
-HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from,
-                       T* HWY_RESTRICT to) {
-#if HWY_MEM_OPS_MIGHT_FAULT
-  (void)d;
-  for (size_t i = 0; i < num; ++i) {
-    to[i] = from[i];
-  }
-#else
-  const Mask<D> mask = FirstN(d, num);
-  BlendedStore(MaskedLoad(mask, d, from), mask, d, to);
-#endif
-}
-
-// "Include guard": skip if native instructions are available. The generic
-// implementation is currently shared between x86_* and wasm_*, and is too large
-// to duplicate.
-
-#if (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE))
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#else
-#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#endif
-
-// ------------------------------ LoadInterleaved2
-
-template <typename T, size_t N, class V>
-HWY_API void LoadInterleaved2(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1) {
-  const V A = LoadU(d, unaligned + 0 * N);  // v1[1] v0[1] v1[0] v0[0]
-  const V B = LoadU(d, unaligned + 1 * N);
-  v0 = ConcatEven(d, B, A);
-  v1 = ConcatOdd(d, B, A);
-}
-
-template <typename T, class V>
-HWY_API void LoadInterleaved2(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1) {
-  v0 = LoadU(d, unaligned + 0);
-  v1 = LoadU(d, unaligned + 1);
-}
-
-// ------------------------------ LoadInterleaved3 (CombineShiftRightBytes)
-
-namespace detail {
-
-// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
-template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
-HWY_API void LoadTransposedBlocks3(Simd<T, N, 0> d,
-                                   const T* HWY_RESTRICT unaligned, V& A, V& B,
-                                   V& C) {
-  A = LoadU(d, unaligned + 0 * N);
-  B = LoadU(d, unaligned + 1 * N);
-  C = LoadU(d, unaligned + 2 * N);
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2) {
-  const RebindToUnsigned<decltype(d)> du;
-  // Compact notation so these fit on one line: 12 := v1[2].
-  V A;  // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00
-  V B;  // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15
-  V C;  // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a
-  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
-  // Compress all lanes belonging to v0 into consecutive lanes.
-  constexpr uint8_t Z = 0x80;
-  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, 9, 12, 15, Z, Z,
-                                                Z, Z, Z, Z, Z,  Z,  Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z,  Z,  Z, Z, Z, 2, 5,
-                                                8, 11, 14, Z, Z, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, Z,  Z,
-                                                Z, Z, Z, 1, 4, 7, 10, 13};
-  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, 10, 13, Z, Z, Z,
-                                                Z, Z, Z, Z,  Z,  Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z,  Z,  Z, Z, 0, 3, 6,
-                                                9, 12, 15, Z, Z, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, Z, Z,  Z,
-                                                Z, Z, Z, 2, 5, 8, 11, 14};
-  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, 8, 11, 14, Z, Z, Z,
-                                                Z, Z, Z, Z,  Z,  Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z,  Z,  Z, Z, Z, 1, 4, 7,
-                                                10, 13, Z, Z, Z, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, Z, Z,  Z,
-                                                Z, Z, 0, 3, 6, 9, 12, 15};
-  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
-  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
-  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
-  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
-  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
-  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
-  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
-  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
-  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
-  v0 = Or3(v0L, v0M, v0U);
-  v1 = Or3(v1L, v1M, v1U);
-  v2 = Or3(v2L, v2M, v2U);
-}
-
-// 8-bit lanes x8
-template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
-          HWY_IF_LANES_PER_BLOCK(T, N, 8)>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2) {
-  const RebindToUnsigned<decltype(d)> du;
-  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
-  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
-  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
-  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
-  // Compress all lanes belonging to v0 into consecutive lanes.
-  constexpr uint8_t Z = 0x80;
-  alignas(16) constexpr uint8_t kIdx_v0A[16] = {0, 3, 6, Z, Z, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v0B[16] = {Z, Z, Z, 1, 4, 7, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v0C[16] = {Z, Z, Z, Z, Z, Z, 2, 5};
-  alignas(16) constexpr uint8_t kIdx_v1A[16] = {1, 4, 7, Z, Z, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v1B[16] = {Z, Z, Z, 2, 5, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v1C[16] = {Z, Z, Z, Z, Z, 0, 3, 6};
-  alignas(16) constexpr uint8_t kIdx_v2A[16] = {2, 5, Z, Z, Z, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v2B[16] = {Z, Z, 0, 3, 6, Z, Z, Z};
-  alignas(16) constexpr uint8_t kIdx_v2C[16] = {Z, Z, Z, Z, Z, 1, 4, 7};
-  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
-  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
-  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
-  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
-  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
-  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
-  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
-  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
-  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
-  v0 = Or3(v0L, v0M, v0U);
-  v1 = Or3(v1L, v1M, v1U);
-  v2 = Or3(v2L, v2M, v2U);
-}
-
-// 16-bit lanes x8
-template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
-          HWY_IF_LANES_PER_BLOCK(T, N, 8)>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2) {
-  const RebindToUnsigned<decltype(d)> du;
-  V A;  // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0]
-  V B;  // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2]
-  V C;  // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5]
-  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
-  // Compress all lanes belonging to v0 into consecutive lanes. Same as above,
-  // but each element of the array contains two byte indices for a lane.
-  constexpr uint16_t Z = 0x8080;
-  alignas(16) constexpr uint16_t kIdx_v0A[8] = {0x0100, 0x0706, 0x0D0C, Z,
-                                                Z,      Z,      Z,      Z};
-  alignas(16) constexpr uint16_t kIdx_v0B[8] = {Z,      Z,      Z, 0x0302,
-                                                0x0908, 0x0F0E, Z, Z};
-  alignas(16) constexpr uint16_t kIdx_v0C[8] = {Z, Z, Z,      Z,
-                                                Z, Z, 0x0504, 0x0B0A};
-  alignas(16) constexpr uint16_t kIdx_v1A[8] = {0x0302, 0x0908, 0x0F0E, Z,
-                                                Z,      Z,      Z,      Z};
-  alignas(16) constexpr uint16_t kIdx_v1B[8] = {Z,      Z, Z, 0x0504,
-                                                0x0B0A, Z, Z, Z};
-  alignas(16) constexpr uint16_t kIdx_v1C[8] = {Z, Z,      Z,      Z,
-                                                Z, 0x0100, 0x0706, 0x0D0C};
-  alignas(16) constexpr uint16_t kIdx_v2A[8] = {0x0504, 0x0B0A, Z, Z,
-                                                Z,      Z,      Z, Z};
-  alignas(16) constexpr uint16_t kIdx_v2B[8] = {Z,      Z, 0x0100, 0x0706,
-                                                0x0D0C, Z, Z,      Z};
-  alignas(16) constexpr uint16_t kIdx_v2C[8] = {Z, Z,      Z,      Z,
-                                                Z, 0x0302, 0x0908, 0x0F0E};
-  const V v0L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v0A)));
-  const V v0M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v0B)));
-  const V v0U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v0C)));
-  const V v1L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v1A)));
-  const V v1M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v1B)));
-  const V v1U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v1C)));
-  const V v2L = BitCast(d, TableLookupBytesOr0(A, LoadDup128(du, kIdx_v2A)));
-  const V v2M = BitCast(d, TableLookupBytesOr0(B, LoadDup128(du, kIdx_v2B)));
-  const V v2U = BitCast(d, TableLookupBytesOr0(C, LoadDup128(du, kIdx_v2C)));
-  v0 = Or3(v0L, v0M, v0U);
-  v1 = Or3(v1L, v1M, v1U);
-  v2 = Or3(v2L, v2M, v2U);
-}
-
-template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2) {
-  V A;  // v0[1] v2[0] v1[0] v0[0]
-  V B;  // v1[2] v0[2] v2[1] v1[1]
-  V C;  // v2[3] v1[3] v0[3] v2[2]
-  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
-
-  const V vxx_02_03_xx = OddEven(C, B);
-  v0 = detail::Shuffle1230(A, vxx_02_03_xx);
-
-  // Shuffle2301 takes the upper/lower halves of the output from one input, so
-  // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use
-  // OddEven because it may have higher throughput than Shuffle.
-  const V vxx_xx_10_11 = OddEven(A, B);
-  const V v12_13_xx_xx = OddEven(B, C);
-  v1 = detail::Shuffle2301(vxx_xx_10_11, v12_13_xx_xx);
-
-  const V vxx_20_21_xx = OddEven(B, A);
-  v2 = detail::Shuffle3012(vxx_20_21_xx, C);
-}
-
-template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
-HWY_API void LoadInterleaved3(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2) {
-  V A;  // v1[0] v0[0]
-  V B;  // v0[1] v2[0]
-  V C;  // v2[1] v1[1]
-  detail::LoadTransposedBlocks3(d, unaligned, A, B, C);
-  v0 = OddEven(B, A);
-  v1 = CombineShiftRightBytes<sizeof(T)>(d, C, A);
-  v2 = OddEven(C, B);
-}
-
-template <typename T, class V>
-HWY_API void LoadInterleaved3(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2) {
-  v0 = LoadU(d, unaligned + 0);
-  v1 = LoadU(d, unaligned + 1);
-  v2 = LoadU(d, unaligned + 2);
-}
-
-// ------------------------------ LoadInterleaved4
-
-namespace detail {
-
-// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
-template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
-HWY_API void LoadTransposedBlocks4(Simd<T, N, 0> d,
-                                   const T* HWY_RESTRICT unaligned, V& A, V& B,
-                                   V& C, V& D) {
-  A = LoadU(d, unaligned + 0 * N);
-  B = LoadU(d, unaligned + 1 * N);
-  C = LoadU(d, unaligned + 2 * N);
-  D = LoadU(d, unaligned + 3 * N);
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 16)>
-HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2, V& v3) {
-  const Repartition<uint64_t, decltype(d)> d64;
-  using V64 = VFromD<decltype(d64)>;
-  // 16 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
-  // Here int[i] means the four interleaved values of the i-th 4-tuple and
-  // int[3..0] indicates four consecutive 4-tuples (0 = least-significant).
-  V A;  // int[13..10] int[3..0]
-  V B;  // int[17..14] int[7..4]
-  V C;  // int[1b..18] int[b..8]
-  V D;  // int[1f..1c] int[f..c]
-  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
-
-  // For brevity, the comments only list the lower block (upper = lower + 0x10)
-  const V v5140 = InterleaveLower(d, A, B);  // int[5,1,4,0]
-  const V vd9c8 = InterleaveLower(d, C, D);  // int[d,9,c,8]
-  const V v7362 = InterleaveUpper(d, A, B);  // int[7,3,6,2]
-  const V vfbea = InterleaveUpper(d, C, D);  // int[f,b,e,a]
-
-  const V v6420 = InterleaveLower(d, v5140, v7362);  // int[6,4,2,0]
-  const V veca8 = InterleaveLower(d, vd9c8, vfbea);  // int[e,c,a,8]
-  const V v7531 = InterleaveUpper(d, v5140, v7362);  // int[7,5,3,1]
-  const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea);  // int[f,d,b,9]
-
-  const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531));  // v10[7..0]
-  const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9));  // v10[f..8]
-  const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531));  // v32[7..0]
-  const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9));  // v32[f..8]
-
-  v0 = BitCast(d, InterleaveLower(d64, v10L, v10U));
-  v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U));
-  v2 = BitCast(d, InterleaveLower(d64, v32L, v32U));
-  v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U));
-}
-
-template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 8)>
-HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2, V& v3) {
-  // In the last step, we interleave by half of the block size, which is usually
-  // 8 bytes but half that for 8-bit x8 vectors.
-  using TW = hwy::UnsignedFromSize<sizeof(T) * N == 8 ? 4 : 8>;
-  const Repartition<TW, decltype(d)> dw;
-  using VW = VFromD<decltype(dw)>;
-
-  // (Comments are for 256-bit vectors.)
-  // 8 lanes per block; the lowest four blocks are at the bottom of A,B,C,D.
-  V A;  // v3210[9]v3210[8] v3210[1]v3210[0]
-  V B;  // v3210[b]v3210[a] v3210[3]v3210[2]
-  V C;  // v3210[d]v3210[c] v3210[5]v3210[4]
-  V D;  // v3210[f]v3210[e] v3210[7]v3210[6]
-  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
-
-  const V va820 = InterleaveLower(d, A, B);  // v3210[a,8] v3210[2,0]
-  const V vec64 = InterleaveLower(d, C, D);  // v3210[e,c] v3210[6,4]
-  const V vb931 = InterleaveUpper(d, A, B);  // v3210[b,9] v3210[3,1]
-  const V vfd75 = InterleaveUpper(d, C, D);  // v3210[f,d] v3210[7,5]
-
-  const VW v10_b830 =  // v10[b..8] v10[3..0]
-      BitCast(dw, InterleaveLower(d, va820, vb931));
-  const VW v10_fc74 =  // v10[f..c] v10[7..4]
-      BitCast(dw, InterleaveLower(d, vec64, vfd75));
-  const VW v32_b830 =  // v32[b..8] v32[3..0]
-      BitCast(dw, InterleaveUpper(d, va820, vb931));
-  const VW v32_fc74 =  // v32[f..c] v32[7..4]
-      BitCast(dw, InterleaveUpper(d, vec64, vfd75));
-
-  v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74));
-  v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74));
-  v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74));
-  v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74));
-}
-
-template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 4)>
-HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2, V& v3) {
-  V A;  // v3210[4] v3210[0]
-  V B;  // v3210[5] v3210[1]
-  V C;  // v3210[6] v3210[2]
-  V D;  // v3210[7] v3210[3]
-  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
-  const V v10_ev = InterleaveLower(d, A, C);  // v1[6,4] v0[6,4] v1[2,0] v0[2,0]
-  const V v10_od = InterleaveLower(d, B, D);  // v1[7,5] v0[7,5] v1[3,1] v0[3,1]
-  const V v32_ev = InterleaveUpper(d, A, C);  // v3[6,4] v2[6,4] v3[2,0] v2[2,0]
-  const V v32_od = InterleaveUpper(d, B, D);  // v3[7,5] v2[7,5] v3[3,1] v2[3,1]
-
-  v0 = InterleaveLower(d, v10_ev, v10_od);
-  v1 = InterleaveUpper(d, v10_ev, v10_od);
-  v2 = InterleaveLower(d, v32_ev, v32_od);
-  v3 = InterleaveUpper(d, v32_ev, v32_od);
-}
-
-template <typename T, size_t N, class V, HWY_IF_LANES_PER_BLOCK(T, N, 2)>
-HWY_API void LoadInterleaved4(Simd<T, N, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2, V& v3) {
-  V A, B, C, D;
-  detail::LoadTransposedBlocks4(d, unaligned, A, B, C, D);
-  v0 = InterleaveLower(d, A, C);
-  v1 = InterleaveUpper(d, A, C);
-  v2 = InterleaveLower(d, B, D);
-  v3 = InterleaveUpper(d, B, D);
-}
-
-// Any T x1
-template <typename T, class V>
-HWY_API void LoadInterleaved4(Simd<T, 1, 0> d, const T* HWY_RESTRICT unaligned,
-                              V& v0, V& v1, V& v2, V& v3) {
-  v0 = LoadU(d, unaligned + 0);
-  v1 = LoadU(d, unaligned + 1);
-  v2 = LoadU(d, unaligned + 2);
-  v3 = LoadU(d, unaligned + 3);
-}
-
-// ------------------------------ StoreInterleaved2
-
-namespace detail {
-
-// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
-template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
-HWY_API void StoreTransposedBlocks2(const V A, const V B, Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  StoreU(A, d, unaligned + 0 * N);
-  StoreU(B, d, unaligned + 1 * N);
-}
-
-}  // namespace detail
-
-// >= 128 bit vector
-template <typename T, size_t N, class V, HWY_IF_GE128(T, N)>
-HWY_API void StoreInterleaved2(const V v0, const V v1, Simd<T, N, 0> d,
-                               T* HWY_RESTRICT unaligned) {
-  const auto v10L = InterleaveLower(d, v0, v1);  // .. v1[0] v0[0]
-  const auto v10U = InterleaveUpper(d, v0, v1);  // .. v1[N/2] v0[N/2]
-  detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned);
-}
-
-// 64 bits
-template <typename T>
-HWY_API void StoreInterleaved2(const Vec64<T> part0, const Vec64<T> part1,
-                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
-  // Use full vectors to reduce the number of stores.
-  const Full128<T> d_full;
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const auto v10 = InterleaveLower(d_full, v0, v1);
-  StoreU(v10, d_full, unaligned);
-}
-
-// <= 32 bits
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void StoreInterleaved2(const Vec128<T, N> part0,
-                               const Vec128<T, N> part1, Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  // Use full vectors to reduce the number of stores.
-  const Full128<T> d_full;
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const auto v10 = InterleaveLower(d_full, v0, v1);
-  alignas(16) T buf[16 / sizeof(T)];
-  StoreU(v10, d_full, buf);
-  CopyBytes<2 * N * sizeof(T)>(buf, unaligned);
-}
-
-// ------------------------------ StoreInterleaved3 (CombineShiftRightBytes,
-// TableLookupBytes)
-
-namespace detail {
-
-// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
-template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
-HWY_API void StoreTransposedBlocks3(const V A, const V B, const V C,
-                                    Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  StoreU(A, d, unaligned + 0 * N);
-  StoreU(B, d, unaligned + 1 * N);
-  StoreU(C, d, unaligned + 2 * N);
-}
-
-}  // namespace detail
-
-// >= 128-bit vector, 8-bit lanes
-template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 1),
-          HWY_IF_GE128(T, N)>
-HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
-                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const RebindToUnsigned<decltype(d)> du;
-  const auto k5 = Set(du, 5);
-  const auto k6 = Set(du, 6);
-
-  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
-  // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes
-  // to their place, with 0x80 so lanes to be filled from other vectors are 0
-  // to enable blending by ORing together.
-  alignas(16) static constexpr uint8_t tbl_v0[16] = {
-      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
-      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
-  alignas(16) static constexpr uint8_t tbl_v1[16] = {
-      0x80, 0, 0x80, 0x80, 1, 0x80,  //
-      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
-  // The interleaved vectors will be named A, B, C; temporaries with suffix
-  // 0..2 indicate which input vector's lanes they hold.
-  const auto shuf_A0 = LoadDup128(du, tbl_v0);
-  const auto shuf_A1 = LoadDup128(du, tbl_v1);  // cannot reuse shuf_A0 (has 5)
-  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
-  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
-  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
-  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
-  const V A = BitCast(d, A0 | A1 | A2);
-
-  // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5]
-  const auto shuf_B0 = shuf_A2 + k6;  // .A..9..8..7..6..
-  const auto shuf_B1 = shuf_A0 + k5;  // A..9..8..7..6..5
-  const auto shuf_B2 = shuf_A1 + k5;  // ..9..8..7..6..5.
-  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
-  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
-  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
-  const V B = BitCast(d, B0 | B1 | B2);
-
-  // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10]
-  const auto shuf_C0 = shuf_B2 + k6;  // ..F..E..D..C..B.
-  const auto shuf_C1 = shuf_B0 + k5;  // .F..E..D..C..B..
-  const auto shuf_C2 = shuf_B1 + k5;  // F..E..D..C..B..A
-  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
-  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
-  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
-  const V C = BitCast(d, C0 | C1 | C2);
-
-  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
-}
-
-// >= 128-bit vector, 16-bit lanes
-template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 2),
-          HWY_IF_GE128(T, N)>
-HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
-                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const Repartition<uint8_t, decltype(d)> du8;
-  const auto k2 = Set(du8, 2 * sizeof(T));
-  const auto k3 = Set(du8, 3 * sizeof(T));
-
-  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
-  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
-  // filled from other vectors are 0 for blending. Note that these are byte
-  // indices for 16-bit lanes.
-  alignas(16) static constexpr uint8_t tbl_v1[16] = {
-      0x80, 0x80, 0,    1,    0x80, 0x80, 0x80, 0x80,
-      2,    3,    0x80, 0x80, 0x80, 0x80, 4,    5};
-  alignas(16) static constexpr uint8_t tbl_v2[16] = {
-      0x80, 0x80, 0x80, 0x80, 0,    1,    0x80, 0x80,
-      0x80, 0x80, 2,    3,    0x80, 0x80, 0x80, 0x80};
-
-  // The interleaved vectors will be named A, B, C; temporaries with suffix
-  // 0..2 indicate which input vector's lanes they hold.
-  const auto shuf_A1 = LoadDup128(du8, tbl_v1);  // 2..1..0.
-                                                 // .2..1..0
-  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
-  const auto shuf_A2 = LoadDup128(du8, tbl_v2);  // ..1..0..
-
-  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
-  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
-  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
-  const V A = BitCast(d, A0 | A1 | A2);
-
-  // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2]
-  const auto shuf_B0 = shuf_A1 + k3;  // 5..4..3.
-  const auto shuf_B1 = shuf_A2 + k3;  // ..4..3..
-  const auto shuf_B2 = shuf_A0 + k2;  // .4..3..2
-  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
-  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
-  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
-  const V B = BitCast(d, B0 | B1 | B2);
-
-  // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
-  const auto shuf_C0 = shuf_B1 + k3;  // ..7..6..
-  const auto shuf_C1 = shuf_B2 + k3;  // .7..6..5
-  const auto shuf_C2 = shuf_B0 + k2;  // 7..6..5.
-  const auto C0 = TableLookupBytesOr0(v0, shuf_C0);
-  const auto C1 = TableLookupBytesOr0(v1, shuf_C1);
-  const auto C2 = TableLookupBytesOr0(v2, shuf_C2);
-  const V C = BitCast(d, C0 | C1 | C2);
-
-  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
-}
-
-// >= 128-bit vector, 32-bit lanes
-template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 4),
-          HWY_IF_GE128(T, N)>
-HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
-                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const RepartitionToWide<decltype(d)> dw;
-
-  const V v10_v00 = InterleaveLower(d, v0, v1);
-  const V v01_v20 = OddEven(v0, v2);
-  // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0)
-  const V A = BitCast(
-      d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20)));
-
-  const V v1_321 = ShiftRightLanes<1>(d, v1);
-  const V v0_32 = ShiftRightLanes<2>(d, v0);
-  const V v21_v11 = OddEven(v2, v1_321);
-  const V v12_v02 = OddEven(v1_321, v0_32);
-  // B: v1[2],v0[2], v2[1],v1[1]
-  const V B = BitCast(
-      d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02)));
-
-  // Notation refers to the upper 2 lanes of the vector for InterleaveUpper.
-  const V v23_v13 = OddEven(v2, v1_321);
-  const V v03_v22 = OddEven(v0, v2);
-  // C: v2[3],v1[3],v0[3], v2[2]
-  const V C = BitCast(
-      d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13)));
-
-  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
-}
-
-// >= 128-bit vector, 64-bit lanes
-template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
-          HWY_IF_GE128(T, N)>
-HWY_API void StoreInterleaved3(const V v0, const V v1, const V v2,
-                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const V A = InterleaveLower(d, v0, v1);
-  const V B = OddEven(v0, v2);
-  const V C = InterleaveUpper(d, v1, v2);
-  detail::StoreTransposedBlocks3(A, B, C, d, unaligned);
-}
-
-// 64-bit vector, 8-bit lanes
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
-                               const Vec64<T> part2, Full64<T> d,
-                               T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 16 / sizeof(T);
-  // Use full vectors for the shuffles and first result.
-  const Full128<uint8_t> du;
-  const Full128<T> d_full;
-  const auto k5 = Set(du, 5);
-  const auto k6 = Set(du, 6);
-
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const Vec128<T> v2{part2.raw};
-
-  // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right):
-  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be
-  // filled from other vectors are 0 for blending.
-  alignas(16) static constexpr uint8_t tbl_v0[16] = {
-      0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80,  //
-      3, 0x80, 0x80, 4, 0x80, 0x80, 5};
-  alignas(16) static constexpr uint8_t tbl_v1[16] = {
-      0x80, 0, 0x80, 0x80, 1, 0x80,  //
-      0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80};
-  // The interleaved vectors will be named A, B, C; temporaries with suffix
-  // 0..2 indicate which input vector's lanes they hold.
-  const auto shuf_A0 = Load(du, tbl_v0);
-  const auto shuf_A1 = Load(du, tbl_v1);  // cannot reuse shuf_A0 (5 in MSB)
-  const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1);
-  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // 5..4..3..2..1..0
-  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // ..4..3..2..1..0.
-  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // .4..3..2..1..0..
-  const auto A = BitCast(d_full, A0 | A1 | A2);
-  StoreU(A, d_full, unaligned + 0 * N);
-
-  // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5]
-  const auto shuf_B0 = shuf_A2 + k6;  // ..7..6..
-  const auto shuf_B1 = shuf_A0 + k5;  // .7..6..5
-  const auto shuf_B2 = shuf_A1 + k5;  // 7..6..5.
-  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
-  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
-  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
-  const Vec64<T> B{(B0 | B1 | B2).raw};
-  StoreU(B, d, unaligned + 1 * N);
-}
-
-// 64-bit vector, 16-bit lanes
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API void StoreInterleaved3(const Vec64<T> part0, const Vec64<T> part1,
-                               const Vec64<T> part2, Full64<T> dh,
-                               T* HWY_RESTRICT unaligned) {
-  const Full128<T> d;
-  const Full128<uint8_t> du8;
-  constexpr size_t N = 16 / sizeof(T);
-  const auto k2 = Set(du8, 2 * sizeof(T));
-  const auto k3 = Set(du8, 3 * sizeof(T));
-
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const Vec128<T> v2{part2.raw};
-
-  // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right):
-  // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes
-  // to their place, with 0x80 so lanes to be filled from other vectors are 0
-  // to enable blending by ORing together.
-  alignas(16) static constexpr uint8_t tbl_v1[16] = {
-      0x80, 0x80, 0,    1,    0x80, 0x80, 0x80, 0x80,
-      2,    3,    0x80, 0x80, 0x80, 0x80, 4,    5};
-  alignas(16) static constexpr uint8_t tbl_v2[16] = {
-      0x80, 0x80, 0x80, 0x80, 0,    1,    0x80, 0x80,
-      0x80, 0x80, 2,    3,    0x80, 0x80, 0x80, 0x80};
-
-  // The interleaved vectors will be named A, B; temporaries with suffix
-  // 0..2 indicate which input vector's lanes they hold.
-  const auto shuf_A1 = Load(du8, tbl_v1);  // 2..1..0.
-                                           // .2..1..0
-  const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1);
-  const auto shuf_A2 = Load(du8, tbl_v2);  // ..1..0..
-
-  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);
-  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);
-  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);
-  const Vec128<T> A = BitCast(d, A0 | A1 | A2);
-  StoreU(A, d, unaligned + 0 * N);
-
-  // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2]
-  const auto shuf_B0 = shuf_A1 + k3;  // ..3.
-  const auto shuf_B1 = shuf_A2 + k3;  // .3..
-  const auto shuf_B2 = shuf_A0 + k2;  // 3..2
-  const auto B0 = TableLookupBytesOr0(v0, shuf_B0);
-  const auto B1 = TableLookupBytesOr0(v1, shuf_B1);
-  const auto B2 = TableLookupBytesOr0(v2, shuf_B2);
-  const Vec128<T> B = BitCast(d, B0 | B1 | B2);
-  StoreU(Vec64<T>{B.raw}, dh, unaligned + 1 * N);
-}
-
-// 64-bit vector, 32-bit lanes
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API void StoreInterleaved3(const Vec64<T> v0, const Vec64<T> v1,
-                               const Vec64<T> v2, Full64<T> d,
-                               T* HWY_RESTRICT unaligned) {
-  // (same code as 128-bit vector, 64-bit lanes)
-  constexpr size_t N = 2;
-  const Vec64<T> v10_v00 = InterleaveLower(d, v0, v1);
-  const Vec64<T> v01_v20 = OddEven(v0, v2);
-  const Vec64<T> v21_v11 = InterleaveUpper(d, v1, v2);
-  StoreU(v10_v00, d, unaligned + 0 * N);
-  StoreU(v01_v20, d, unaligned + 1 * N);
-  StoreU(v21_v11, d, unaligned + 2 * N);
-}
-
-// 64-bit lanes are handled by the N=1 case below.
-
-// <= 32-bit vector, 8-bit lanes
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1), HWY_IF_LE32(T, N)>
-HWY_API void StoreInterleaved3(const Vec128<T, N> part0,
-                               const Vec128<T, N> part1,
-                               const Vec128<T, N> part2, Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  // Use full vectors for the shuffles and result.
-  const Full128<uint8_t> du;
-  const Full128<T> d_full;
-
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const Vec128<T> v2{part2.raw};
-
-  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
-  // so lanes to be filled from other vectors are 0 to enable blending by ORing
-  // together.
-  alignas(16) static constexpr uint8_t tbl_v0[16] = {
-      0,    0x80, 0x80, 1,    0x80, 0x80, 2,    0x80,
-      0x80, 3,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
-  // The interleaved vector will be named A; temporaries with suffix
-  // 0..2 indicate which input vector's lanes they hold.
-  const auto shuf_A0 = Load(du, tbl_v0);
-  const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0);
-  const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0);
-  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ......3..2..1..0
-  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .....3..2..1..0.
-  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // ....3..2..1..0..
-  const Vec128<T> A = BitCast(d_full, A0 | A1 | A2);
-  alignas(16) T buf[16 / sizeof(T)];
-  StoreU(A, d_full, buf);
-  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
-}
-
-// 32-bit vector, 16-bit lanes
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API void StoreInterleaved3(const Vec128<T, 2> part0,
-                               const Vec128<T, 2> part1,
-                               const Vec128<T, 2> part2, Simd<T, 2, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 4 / sizeof(T);
-  // Use full vectors for the shuffles and result.
-  const Full128<uint8_t> du8;
-  const Full128<T> d_full;
-
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const Vec128<T> v2{part2.raw};
-
-  // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80
-  // so lanes to be filled from other vectors are 0 to enable blending by ORing
-  // together.
-  alignas(16) static constexpr uint8_t tbl_v2[16] = {
-      0x80, 0x80, 0x80, 0x80, 0,    1,    0x80, 0x80,
-      0x80, 0x80, 2,    3,    0x80, 0x80, 0x80, 0x80};
-  // The interleaved vector will be named A; temporaries with suffix
-  // 0..2 indicate which input vector's lanes they hold.
-  const auto shuf_A2 =  // ..1..0..
-      Load(du8, tbl_v2);
-  const auto shuf_A1 =  // ...1..0.
-      CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2);
-  const auto shuf_A0 =  // ....1..0
-      CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2);
-  const auto A0 = TableLookupBytesOr0(v0, shuf_A0);  // ..1..0
-  const auto A1 = TableLookupBytesOr0(v1, shuf_A1);  // .1..0.
-  const auto A2 = TableLookupBytesOr0(v2, shuf_A2);  // 1..0..
-  const auto A = BitCast(d_full, A0 | A1 | A2);
-  alignas(16) T buf[16 / sizeof(T)];
-  StoreU(A, d_full, buf);
-  CopyBytes<N * 3 * sizeof(T)>(buf, unaligned);
-}
-
-// Single-element vector, any lane size: just store directly
-template <typename T>
-HWY_API void StoreInterleaved3(const Vec128<T, 1> v0, const Vec128<T, 1> v1,
-                               const Vec128<T, 1> v2, Simd<T, 1, 0> d,
-                               T* HWY_RESTRICT unaligned) {
-  StoreU(v0, d, unaligned + 0);
-  StoreU(v1, d, unaligned + 1);
-  StoreU(v2, d, unaligned + 2);
-}
-
-// ------------------------------ StoreInterleaved4
-
-namespace detail {
-
-// Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload.
-template <typename T, size_t N, class V, HWY_IF_LE128(T, N)>
-HWY_API void StoreTransposedBlocks4(const V A, const V B, const V C, const V D,
-                                    Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  StoreU(A, d, unaligned + 0 * N);
-  StoreU(B, d, unaligned + 1 * N);
-  StoreU(C, d, unaligned + 2 * N);
-  StoreU(D, d, unaligned + 3 * N);
-}
-
-}  // namespace detail
-
-// >= 128-bit vector, 8..32-bit lanes
-template <typename T, size_t N, class V, HWY_IF_NOT_LANE_SIZE(T, 8),
-          HWY_IF_GE128(T, N)>
-HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
-                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const RepartitionToWide<decltype(d)> dw;
-  const auto v10L = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
-  const auto v32L = ZipLower(dw, v2, v3);
-  const auto v10U = ZipUpper(dw, v0, v1);
-  const auto v32U = ZipUpper(dw, v2, v3);
-  // The interleaved vectors are A, B, C, D.
-  const auto A = BitCast(d, InterleaveLower(dw, v10L, v32L));  // 3210
-  const auto B = BitCast(d, InterleaveUpper(dw, v10L, v32L));
-  const auto C = BitCast(d, InterleaveLower(dw, v10U, v32U));
-  const auto D = BitCast(d, InterleaveUpper(dw, v10U, v32U));
-  detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
-}
-
-// >= 128-bit vector, 64-bit lanes
-template <typename T, size_t N, class V, HWY_IF_LANE_SIZE(T, 8),
-          HWY_IF_GE128(T, N)>
-HWY_API void StoreInterleaved4(const V v0, const V v1, const V v2, const V v3,
-                               Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  // The interleaved vectors are A, B, C, D.
-  const auto A = InterleaveLower(d, v0, v1);  // v1[0] v0[0]
-  const auto B = InterleaveLower(d, v2, v3);
-  const auto C = InterleaveUpper(d, v0, v1);
-  const auto D = InterleaveUpper(d, v2, v3);
-  detail::StoreTransposedBlocks4(A, B, C, D, d, unaligned);
-}
-
-// 64-bit vector, 8..32-bit lanes
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
-                               const Vec64<T> part2, const Vec64<T> part3,
-                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 16 / sizeof(T);
-  // Use full vectors to reduce the number of stores.
-  const Full128<T> d_full;
-  const RepartitionToWide<decltype(d_full)> dw;
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const Vec128<T> v2{part2.raw};
-  const Vec128<T> v3{part3.raw};
-  const auto v10 = ZipLower(dw, v0, v1);  // v1[0] v0[0]
-  const auto v32 = ZipLower(dw, v2, v3);
-  const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32));
-  const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32));
-  StoreU(A, d_full, unaligned + 0 * N);
-  StoreU(B, d_full, unaligned + 1 * N);
-}
-
-// 64-bit vector, 64-bit lane
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void StoreInterleaved4(const Vec64<T> part0, const Vec64<T> part1,
-                               const Vec64<T> part2, const Vec64<T> part3,
-                               Full64<T> /*tag*/, T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 16 / sizeof(T);
-  // Use full vectors to reduce the number of stores.
-  const Full128<T> d_full;
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const Vec128<T> v2{part2.raw};
-  const Vec128<T> v3{part3.raw};
-  const auto A = InterleaveLower(d_full, v0, v1);  // v1[0] v0[0]
-  const auto B = InterleaveLower(d_full, v2, v3);
-  StoreU(A, d_full, unaligned + 0 * N);
-  StoreU(B, d_full, unaligned + 1 * N);
-}
-
-// <= 32-bit vectors
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void StoreInterleaved4(const Vec128<T, N> part0,
-                               const Vec128<T, N> part1,
-                               const Vec128<T, N> part2,
-                               const Vec128<T, N> part3, Simd<T, N, 0> /*tag*/,
-                               T* HWY_RESTRICT unaligned) {
-  // Use full vectors to reduce the number of stores.
-  const Full128<T> d_full;
-  const RepartitionToWide<decltype(d_full)> dw;
-  const Vec128<T> v0{part0.raw};
-  const Vec128<T> v1{part1.raw};
-  const Vec128<T> v2{part2.raw};
-  const Vec128<T> v3{part3.raw};
-  const auto v10 = ZipLower(dw, v0, v1);  // .. v1[0] v0[0]
-  const auto v32 = ZipLower(dw, v2, v3);
-  const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32));
-  alignas(16) T buf[16 / sizeof(T)];
-  StoreU(v3210, d_full, buf);
-  CopyBytes<4 * N * sizeof(T)>(buf, unaligned);
-}
-
-#endif  // HWY_NATIVE_LOAD_STORE_INTERLEAVED
-
-// ------------------------------ AESRound
-
-// Cannot implement on scalar: need at least 16 bytes for TableLookupBytes.
-#if HWY_TARGET != HWY_SCALAR
-
-// Define for white-box testing, even if native instructions are available.
-namespace detail {
-
-// Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with
-// Vector Permute Instructions" and the accompanying assembly language
-// implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan:
-// https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html .
-//
-// A brute-force 256 byte table lookup can also be made constant-time, and
-// possibly competitive on NEON, but this is more performance-portable
-// especially for x86 and large vectors.
-template <class V>  // u8
-HWY_INLINE V SubBytes(V state) {
-  const DFromV<V> du;
-  const auto mask = Set(du, 0xF);
-
-  // Change polynomial basis to GF(2^4)
-  {
-    alignas(16) static constexpr uint8_t basisL[16] = {
-        0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
-        0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
-    alignas(16) static constexpr uint8_t basisU[16] = {
-        0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
-        0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
-    const auto sL = And(state, mask);
-    const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
-    const auto gf4L = TableLookupBytes(LoadDup128(du, basisL), sL);
-    const auto gf4U = TableLookupBytes(LoadDup128(du, basisU), sU);
-    state = Xor(gf4L, gf4U);
-  }
-
-  // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and
-  // cause TableLookupBytesOr0 to return 0.
-  alignas(16) static constexpr uint8_t kZetaInv[16] = {
-      0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
-  alignas(16) static constexpr uint8_t kInv[16] = {
-      0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
-  const auto tbl = LoadDup128(du, kInv);
-  const auto sL = And(state, mask);      // L=low nibble, U=upper
-  const auto sU = ShiftRight<4>(state);  // byte shift => upper bits are zero
-  const auto sX = Xor(sU, sL);
-  const auto invL = TableLookupBytes(LoadDup128(du, kZetaInv), sL);
-  const auto invU = TableLookupBytes(tbl, sU);
-  const auto invX = TableLookupBytes(tbl, sX);
-  const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU)));
-  const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX)));
-
-  // Linear skew (cannot bake 0x63 bias into the table because out* indices
-  // may have the infinity flag set).
-  alignas(16) static constexpr uint8_t kAffineL[16] = {
-      0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
-      0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
-  alignas(16) static constexpr uint8_t kAffineU[16] = {
-      0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
-      0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
-  const auto affL = TableLookupBytesOr0(LoadDup128(du, kAffineL), outL);
-  const auto affU = TableLookupBytesOr0(LoadDup128(du, kAffineU), outU);
-  return Xor(Xor(affL, affU), Set(du, 0x63));
-}
-
-}  // namespace detail
-
-#endif  // HWY_TARGET != HWY_SCALAR
-
-// "Include guard": skip if native AES instructions are available.
-#if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
-#ifdef HWY_NATIVE_AES
-#undef HWY_NATIVE_AES
-#else
-#define HWY_NATIVE_AES
-#endif
-
-// (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar)
-#if HWY_TARGET != HWY_SCALAR
-
-namespace detail {
-
-template <class V>  // u8
-HWY_API V ShiftRows(const V state) {
-  const DFromV<V> du;
-  alignas(16) static constexpr uint8_t kShiftRow[16] = {
-      0,  5,  10, 15,  // transposed: state is column major
-      4,  9,  14, 3,   //
-      8,  13, 2,  7,   //
-      12, 1,  6,  11};
-  const auto shift_row = LoadDup128(du, kShiftRow);
-  return TableLookupBytes(state, shift_row);
-}
-
-template <class V>  // u8
-HWY_API V MixColumns(const V state) {
-  const DFromV<V> du;
-  // For each column, the rows are the sum of GF(2^8) matrix multiplication by:
-  // 2 3 1 1  // Let s := state*1, d := state*2, t := state*3.
-  // 1 2 3 1  // d are on diagonal, no permutation needed.
-  // 1 1 2 3  // t1230 indicates column indices of threes for the 4 rows.
-  // 3 1 1 2  // We also need to compute s2301 and s3012 (=1230 o 2301).
-  alignas(16) static constexpr uint8_t k2301[16] = {
-      2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
-  alignas(16) static constexpr uint8_t k1230[16] = {
-      1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
-  const RebindToSigned<decltype(du)> di;  // can only do signed comparisons
-  const auto msb = Lt(BitCast(di, state), Zero(di));
-  const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, 0x1B)));
-  const auto d = Xor(Add(state, state), overflow);  // = state*2 in GF(2^8).
-  const auto s2301 = TableLookupBytes(state, LoadDup128(du, k2301));
-  const auto d_s2301 = Xor(d, s2301);
-  const auto t_s2301 = Xor(state, d_s2301);  // t(s*3) = XOR-sum {s, d(s*2)}
-  const auto t1230_s3012 = TableLookupBytes(t_s2301, LoadDup128(du, k1230));
-  return Xor(d_s2301, t1230_s3012);  // XOR-sum of 4 terms
-}
-
-}  // namespace detail
-
-template <class V>  // u8
-HWY_API V AESRound(V state, const V round_key) {
-  // Intel docs swap the first two steps, but it does not matter because
-  // ShiftRows is a permutation and SubBytes is independent of lane index.
-  state = detail::SubBytes(state);
-  state = detail::ShiftRows(state);
-  state = detail::MixColumns(state);
-  state = Xor(state, round_key);  // AddRoundKey
-  return state;
-}
-
-template <class V>  // u8
-HWY_API V AESLastRound(V state, const V round_key) {
-  // LIke AESRound, but without MixColumns.
-  state = detail::SubBytes(state);
-  state = detail::ShiftRows(state);
-  state = Xor(state, round_key);  // AddRoundKey
-  return state;
-}
-
-// Constant-time implementation inspired by
-// https://www.bearssl.org/constanttime.html, but about half the cost because we
-// use 64x64 multiplies and 128-bit XORs.
-template <class V>
-HWY_API V CLMulLower(V a, V b) {
-  const DFromV<V> d;
-  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
-  const auto k1 = Set(d, 0x1111111111111111ULL);
-  const auto k2 = Set(d, 0x2222222222222222ULL);
-  const auto k4 = Set(d, 0x4444444444444444ULL);
-  const auto k8 = Set(d, 0x8888888888888888ULL);
-  const auto a0 = And(a, k1);
-  const auto a1 = And(a, k2);
-  const auto a2 = And(a, k4);
-  const auto a3 = And(a, k8);
-  const auto b0 = And(b, k1);
-  const auto b1 = And(b, k2);
-  const auto b2 = And(b, k4);
-  const auto b3 = And(b, k8);
-
-  auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3));
-  auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0));
-  auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1));
-  auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2));
-  m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1)));
-  m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2)));
-  m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3)));
-  m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0)));
-  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
-}
-
-template <class V>
-HWY_API V CLMulUpper(V a, V b) {
-  const DFromV<V> d;
-  static_assert(IsSame<TFromD<decltype(d)>, uint64_t>(), "V must be u64");
-  const auto k1 = Set(d, 0x1111111111111111ULL);
-  const auto k2 = Set(d, 0x2222222222222222ULL);
-  const auto k4 = Set(d, 0x4444444444444444ULL);
-  const auto k8 = Set(d, 0x8888888888888888ULL);
-  const auto a0 = And(a, k1);
-  const auto a1 = And(a, k2);
-  const auto a2 = And(a, k4);
-  const auto a3 = And(a, k8);
-  const auto b0 = And(b, k1);
-  const auto b1 = And(b, k2);
-  const auto b2 = And(b, k4);
-  const auto b3 = And(b, k8);
-
-  auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3));
-  auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0));
-  auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1));
-  auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2));
-  m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1)));
-  m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2)));
-  m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3)));
-  m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0)));
-  return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8)));
-}
-
-#endif  // HWY_NATIVE_AES
-#endif  // HWY_TARGET != HWY_SCALAR
-
-// "Include guard": skip if native POPCNT-related instructions are available.
-#if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
-#ifdef HWY_NATIVE_POPCNT
-#undef HWY_NATIVE_POPCNT
-#else
-#define HWY_NATIVE_POPCNT
-#endif
-
-#undef HWY_MIN_POW2_FOR_128
-#if HWY_TARGET == HWY_RVV
-#define HWY_MIN_POW2_FOR_128 1
-#else
-// All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
-// guarantee 128 bits anyway.
-#define HWY_MIN_POW2_FOR_128 0
-#endif
-
-// This algorithm requires vectors to be at least 16 bytes, which is the case
-// for LMUL >= 2. If not, use the fallback below.
-template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
-          HWY_IF_GE128_D(D), HWY_IF_POW2_GE(D, HWY_MIN_POW2_FOR_128)>
-HWY_API V PopulationCount(V v) {
-  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
-  const D d;
-  HWY_ALIGN constexpr uint8_t kLookup[16] = {
-      0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
-  };
-  const auto lo = And(v, Set(d, 0xF));
-  const auto hi = ShiftRight<4>(v);
-  const auto lookup = LoadDup128(d, kLookup);
-  return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo));
-}
-
-// RVV has a specialization that avoids the Set().
-#if HWY_TARGET != HWY_RVV
-// Slower fallback for capped vectors.
-template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
-          HWY_IF_LT128_D(D)>
-HWY_API V PopulationCount(V v) {
-  static_assert(IsSame<TFromD<D>, uint8_t>(), "V must be u8");
-  const D d;
-  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
-  v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55)));
-  v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33)));
-  return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F));
-}
-#endif  // HWY_TARGET != HWY_RVV
-
-template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 2)>
-HWY_API V PopulationCount(V v) {
-  static_assert(IsSame<TFromD<D>, uint16_t>(), "V must be u16");
-  const D d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  const auto vals = BitCast(d, PopulationCount(BitCast(d8, v)));
-  return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF)));
-}
-
-template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 4)>
-HWY_API V PopulationCount(V v) {
-  static_assert(IsSame<TFromD<D>, uint32_t>(), "V must be u32");
-  const D d;
-  Repartition<uint16_t, decltype(d)> d16;
-  auto vals = BitCast(d, PopulationCount(BitCast(d16, v)));
-  return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF)));
-}
-
-#if HWY_HAVE_INTEGER64
-template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_API V PopulationCount(V v) {
-  static_assert(IsSame<TFromD<D>, uint64_t>(), "V must be u64");
-  const D d;
-  Repartition<uint32_t, decltype(d)> d32;
-  auto vals = BitCast(d, PopulationCount(BitCast(d32, v)));
-  return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFF)));
-}
-#endif
-
-#endif  // HWY_NATIVE_POPCNT
-
-template <class V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 8),
-          HWY_IF_LT128_D(D)>
-HWY_API V operator*(V x, V y) {
-  return Set(D(), GetLane(x) * GetLane(y));
-}
-
-// "Include guard": skip if native 64-bit mul instructions are available.
-#if (defined(HWY_NATIVE_I64MULLO) == defined(HWY_TARGET_TOGGLE))
-#ifdef HWY_NATIVE_I64MULLO
-#undef HWY_NATIVE_I64MULLO
-#else
-#define HWY_NATIVE_I64MULLO
-#endif
-
-template <class V, class D64 = DFromV<V>, typename T = LaneType<V>,
-          HWY_IF_LANE_SIZE(T, 8), HWY_IF_UNSIGNED(T), HWY_IF_GE128_D(D64)>
-HWY_API V operator*(V x, V y) {
-  RepartitionToNarrow<D64> d32;
-  auto x32 = BitCast(d32, x);
-  auto y32 = BitCast(d32, y);
-  auto lolo = BitCast(d32, MulEven(x32, y32));
-  auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y))));
-  auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32));
-  auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo)));
-  return BitCast(D64{}, lolo + hi);
-}
-template <class V, class DI64 = DFromV<V>, typename T = LaneType<V>,
-          HWY_IF_LANE_SIZE(T, 8), HWY_IF_SIGNED(T), HWY_IF_GE128_D(DI64)>
-HWY_API V operator*(V x, V y) {
-  RebindToUnsigned<DI64> du64;
-  return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y));
-}
-
-#endif  // HWY_NATIVE_I64MULLO
-
-// ================================================== Operator wrapper
-
-// These targets currently cannot define operators and have already defined
-// (only) the corresponding functions such as Add.
-#if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE &&      \
-    HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \
-    HWY_TARGET != HWY_SVE2_128
-
-template <class V>
-HWY_API V Add(V a, V b) {
-  return a + b;
-}
-template <class V>
-HWY_API V Sub(V a, V b) {
-  return a - b;
-}
-
-template <class V>
-HWY_API V Mul(V a, V b) {
-  return a * b;
-}
-template <class V>
-HWY_API V Div(V a, V b) {
-  return a / b;
-}
-
-template <class V>
-V Shl(V a, V b) {
-  return a << b;
-}
-template <class V>
-V Shr(V a, V b) {
-  return a >> b;
-}
-
-template <class V>
-HWY_API auto Eq(V a, V b) -> decltype(a == b) {
-  return a == b;
-}
-template <class V>
-HWY_API auto Ne(V a, V b) -> decltype(a == b) {
-  return a != b;
-}
-template <class V>
-HWY_API auto Lt(V a, V b) -> decltype(a == b) {
-  return a < b;
-}
-
-template <class V>
-HWY_API auto Gt(V a, V b) -> decltype(a == b) {
-  return a > b;
-}
-template <class V>
-HWY_API auto Ge(V a, V b) -> decltype(a == b) {
-  return a >= b;
-}
-
-template <class V>
-HWY_API auto Le(V a, V b) -> decltype(a == b) {
-  return a <= b;
-}
-
-#endif  // HWY_TARGET for operators
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/rvv-inl.h b/third_party/highway/hwy/ops/rvv-inl.h
deleted file mode 100644 (file)
index 112bc94..0000000
+++ /dev/null
@@ -1,3292 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// RISC-V V vectors (length not known at compile time).
-// External include guard in highway.h - see comment there.
-
-#include <riscv_vector.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/base.h"
-#include "hwy/ops/shared-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <class V>
-struct DFromV_t {};  // specialized in macros
-template <class V>
-using DFromV = typename DFromV_t<RemoveConst<V>>::type;
-
-template <class V>
-using TFromV = TFromD<DFromV<V>>;
-
-// Enables the overload if Pow2 is in [min, max].
-#define HWY_RVV_IF_POW2_IN(D, min, max) \
-  hwy::EnableIf<(min) <= Pow2(D()) && Pow2(D()) <= (max)>* = nullptr
-
-template <typename T, size_t N, int kPow2>
-constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
-  // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
-  // argument enables fractional LMUL < 1. Limit to 64 because that is the
-  // largest value for which vbool##_t are defined.
-  return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
-}
-
-// ================================================== MACROS
-
-// Generate specializations and function definitions using X macros. Although
-// harder to read and debug, writing everything manually is too bulky.
-
-namespace detail {  // for code folding
-
-// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
-// The first two arguments are SEW and SHIFT such that SEW >> SHIFT = MLEN.
-#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
-  X_MACRO(64, 0, 64, NAME, OP)               \
-  X_MACRO(32, 0, 32, NAME, OP)               \
-  X_MACRO(16, 0, 16, NAME, OP)               \
-  X_MACRO(8, 0, 8, NAME, OP)                 \
-  X_MACRO(8, 1, 4, NAME, OP)                 \
-  X_MACRO(8, 2, 2, NAME, OP)                 \
-  X_MACRO(8, 3, 1, NAME, OP)
-
-// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
-// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
-// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
-//
-// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
-// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
-// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
-// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP
-
-// LMULS = _TRUNC: truncatable (not the smallest LMUL)
-#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)            \
-  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
-  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
-  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)      \
-  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)      \
-  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)           \
-  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)   \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)     \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)     \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)          \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)   \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP)         \
-  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
-  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
-  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
-
-// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
-#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
-  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
-  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
-  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)      \
-  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)      \
-  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
-  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
-  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)    \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)      \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)      \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)           \
-  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)   \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)    \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)     \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)         \
-  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
-  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
-
-// LMULS = _LE2: <= 2
-#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
-  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
-  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP)     \
-  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
-  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
-  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP)  \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP)    \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)              \
-  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP)   \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
-  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
-  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)
-
-// LMULS = _EXT: not the largest LMUL
-#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)
-
-// LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
-#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)       \
-  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
-
-// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
-// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
-// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
-// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
-// one less than should be supported, with all other parameters (vector type
-// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
-// returns half of what it usually would.
-//
-// Notice that we can only add overloads whenever there is a D argument: those
-// are unique with respect to non-virtual-LMUL overloads because their kPow2
-// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
-// defining another overload with the same LMUL would be an error. Thus we have
-// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
-// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
-// functions that take a D.
-
-#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)
-
-// ALL + VIRT
-#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-// LE2 + VIRT
-#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-// EXT + VIRT
-#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-// DEMOTE + VIRT
-#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
-  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)            \
-  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
-
-// SEW for unsigned:
-#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
-#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
-#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
-#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)
-
-// SEW for signed:
-#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
-#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
-#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
-#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)
-
-// SEW for float:
-#if HWY_HAVE_FLOAT16
-#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
-#else
-#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
-#endif
-#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
-#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
-  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)
-
-// Commonly used type/SEW groups:
-#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)        \
-  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)        \
-  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)        \
-  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)        \
-  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS)         \
-  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)           \
-  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)           \
-  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)           \
-  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)           \
-  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)        \
-  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS)         \
-  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
-
-// For all combinations of SEW:
-#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
-
-// Commonly used type categories:
-#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)        \
-  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
-
-#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
-  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)     \
-  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
-
-// Assemble types for use in x-macros
-#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
-#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
-#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
-#define HWY_RVV_M(MLEN) vbool##MLEN##_t
-
-}  // namespace detail
-
-// Until we have full intrinsic support for fractional LMUL, mixed-precision
-// code can use LMUL 1..8 (adequate unless they need many registers).
-#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <>                                                                  \
-  struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> {                                \
-    using Lane = HWY_RVV_T(BASE, SEW);                                         \
-    using type = ScalableTag<Lane, SHIFT>;                                     \
-  };
-
-HWY_RVV_FOREACH(HWY_SPECIALIZE, _, _, _ALL)
-#undef HWY_SPECIALIZE
-
-// ------------------------------ Lanes
-
-// WARNING: we want to query VLMAX/sizeof(T), but this actually changes VL!
-// vlenb is not exposed through intrinsics and vreadvl is not VLMAX.
-#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  template <size_t N>                                                         \
-  HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) {                     \
-    size_t actual = v##OP##SEW##LMUL();                                       \
-    /* Common case of full vectors: avoid any extra instructions. */          \
-    /* actual includes LMUL, so do not shift again. */                        \
-    if (detail::IsFull(d)) return actual;                                     \
-    /* Check for virtual LMUL, e.g. "uint16mf8_t" (not provided by */         \
-    /* intrinsics). In this case the actual LMUL is 1/4, so divide by */      \
-    /* another factor of two. */                                              \
-    if (detail::ScaleByPower(128 / SEW, SHIFT) == 1) actual >>= 1;            \
-    return HWY_MIN(actual, N);                                                \
-  }
-
-HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL_VIRT)
-#undef HWY_RVV_LANES
-
-template <size_t N, int kPow2>
-HWY_API size_t Lanes(Simd<bfloat16_t, N, kPow2> /* tag*/) {
-  return Lanes(Simd<uint16_t, N, kPow2>());
-}
-
-// ------------------------------ Common x-macros
-
-// Last argument to most intrinsics. Use when the op has no d arg of its own,
-// which means there is no user-specified cap.
-#define HWY_RVV_AVL(SEW, SHIFT) \
-  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())
-
-// vector = f(vector), e.g. Not
-#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                          SHIFT, MLEN, NAME, OP)                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {  \
-    return v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT));        \
-  }
-
-// vector = f(vector, scalar), e.g. detail::AddS
-#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                           SHIFT, MLEN, NAME, OP)                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {          \
-    return v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT));        \
-  }
-
-// vector = f(vector, vector), e.g. Add
-#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                           SHIFT, MLEN, NAME, OP)                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {    \
-    return v##OP##_vv_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT));     \
-  }
-
-// mask = f(mask)
-#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
-  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) {   \
-    return vm##OP##_m_b##MLEN(m, ~0ull);              \
-  }
-
-// ================================================== INIT
-
-// ------------------------------ Set
-
-#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                    MLEN, NAME, OP)                                         \
-  template <size_t N>                                                       \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
-      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) {    \
-    return v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d));                        \
-  }
-
-HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
-HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
-#undef HWY_RVV_SET
-
-// Treat bfloat16_t as uint16_t (using the previously defined Set overloads);
-// required for Zero and VFromD.
-template <size_t N, int kPow2>
-decltype(Set(Simd<uint16_t, N, kPow2>(), 0)) Set(Simd<bfloat16_t, N, kPow2> d,
-                                                 bfloat16_t arg) {
-  return Set(RebindToUnsigned<decltype(d)>(), arg.bits);
-}
-
-template <class D>
-using VFromD = decltype(Set(D(), TFromD<D>()));
-
-// ------------------------------ Zero
-
-template <typename T, size_t N, int kPow2>
-HWY_API VFromD<Simd<T, N, kPow2>> Zero(Simd<T, N, kPow2> d) {
-  return Set(d, T(0));
-}
-
-// ------------------------------ Undefined
-
-// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
-// by it gives unpredictable results. It should only be used for maskoff, so
-// keep it internal. For the Highway op, just use Zero (single instruction).
-namespace detail {
-#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                          SHIFT, MLEN, NAME, OP)                           \
-  template <size_t N>                                                      \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                       \
-      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) {                     \
-    return v##OP##_##CHAR##SEW##LMUL(); /* no AVL */                       \
-  }
-
-HWY_RVV_FOREACH(HWY_RVV_UNDEFINED, Undefined, undefined, _ALL)
-#undef HWY_RVV_UNDEFINED
-}  // namespace detail
-
-template <class D>
-HWY_API VFromD<D> Undefined(D d) {
-  return Zero(d);
-}
-
-// ------------------------------ BitCast
-
-namespace detail {
-
-// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
-#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {    \
-    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH(v); /* no AVL */  \
-  }
-HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
-#undef HWY_RVV_TRUNC
-
-// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
-#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
-                    MLEN, NAME, OP)                                          \
-  template <size_t N>                                                        \
-  HWY_API HWY_RVV_V(BASE, SEW, LMULD)                                        \
-      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */,                      \
-           HWY_RVV_V(BASE, SEW, LMUL) v) {                                   \
-    return v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD(v); /* no AVL */ \
-  }
-HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
-#undef HWY_RVV_EXT
-
-// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
-// the same as the actual input type.
-#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                         SHIFT, MLEN, NAME, OP)                           \
-  template <size_t N>                                                     \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                      \
-      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */,                   \
-           HWY_RVV_V(BASE, SEW, LMUL) v) {                                \
-    return v;                                                             \
-  }
-HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
-#undef HWY_RVV_EXT_VIRT
-
-// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
-// by _ALL_VIRT.
-
-// There is no reinterpret from u8 <-> u8, so just return.
-#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                        SHIFT, MLEN, NAME, OP)                           \
-  template <typename T, size_t N>                                        \
-  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
-                                         vuint8##LMUL##_t v) {           \
-    return v;                                                            \
-  }                                                                      \
-  template <size_t N>                                                    \
-  HWY_API vuint8##LMUL##_t BitCastFromByte(                              \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
-    return v;                                                            \
-  }
-
-// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
-#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                        SHIFT, MLEN, NAME, OP)                           \
-  template <typename T, size_t N>                                        \
-  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
-                                         vint8##LMUL##_t v) {            \
-    return vreinterpret_v_i8##LMUL##_u8##LMUL(v);                        \
-  }                                                                      \
-  template <size_t N>                                                    \
-  HWY_API vint8##LMUL##_t BitCastFromByte(                               \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
-    return vreinterpret_v_u8##LMUL##_i8##LMUL(v);                        \
-  }
-
-// Separate u/i because clang only provides signed <-> unsigned reinterpret for
-// the same SEW.
-#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <typename T, size_t N>                                              \
-  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,            \
-                                         HWY_RVV_V(BASE, SEW, LMUL) v) {       \
-    return v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v);                          \
-  }                                                                            \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                          \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {            \
-    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v);                          \
-  }
-
-// Signed/Float: first cast to/from unsigned
-#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                        SHIFT, MLEN, NAME, OP)                           \
-  template <typename T, size_t N>                                        \
-  HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,      \
-                                         HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return v##OP##_v_u##SEW##LMUL##_u8##LMUL(                            \
-        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v));                  \
-  }                                                                      \
-  template <size_t N>                                                    \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                    \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) {      \
-    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL(                   \
-        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v));                           \
-  }
-
-// Additional versions for virtual LMUL using LMULH for byte vectors.
-#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                            SHIFT, MLEN, NAME, OP)                           \
-  template <typename T, size_t N>                                            \
-  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,         \
-                                          HWY_RVV_V(BASE, SEW, LMUL) v) {    \
-    return detail::Trunc(v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v));         \
-  }                                                                          \
-  template <size_t N>                                                        \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                        \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) {         \
-    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2;                                     \
-    const vuint8##LMUL##_t v2 = detail::Ext(d2, v);                          \
-    return v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2);                       \
-  }
-
-// Signed/Float: first cast to/from unsigned
-#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                             SHIFT, MLEN, NAME, OP)                           \
-  template <typename T, size_t N>                                             \
-  HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */,          \
-                                          HWY_RVV_V(BASE, SEW, LMUL) v) {     \
-    return detail::Trunc(v##OP##_v_u##SEW##LMUL##_u8##LMUL(                   \
-        v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)));                      \
-  }                                                                           \
-  template <size_t N>                                                         \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte(                         \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) {          \
-    HWY_RVV_D(uint, 8, N, SHIFT + 1) d2;                                      \
-    const vuint8##LMUL##_t v2 = detail::Ext(d2, v);                           \
-    return v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL(                        \
-        v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2));                               \
-  }
-
-HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
-HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
-HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
-HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
-HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
-HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
-HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
-
-#undef HWY_RVV_CAST_U8
-#undef HWY_RVV_CAST_I8
-#undef HWY_RVV_CAST_U
-#undef HWY_RVV_CAST_IF
-#undef HWY_RVV_CAST_VIRT_U
-#undef HWY_RVV_CAST_VIRT_IF
-
-template <size_t N, int kPow2>
-HWY_INLINE VFromD<Simd<uint16_t, N, kPow2>> BitCastFromByte(
-    Simd<bfloat16_t, N, kPow2> /* d */, VFromD<Simd<uint8_t, N, kPow2>> v) {
-  return BitCastFromByte(Simd<uint16_t, N, kPow2>(), v);
-}
-
-}  // namespace detail
-
-template <class D, class FromV>
-HWY_API VFromD<D> BitCast(D d, FromV v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
-}
-
-namespace detail {
-
-template <class V, class DU = RebindToUnsigned<DFromV<V>>>
-HWY_INLINE VFromD<DU> BitCastToUnsigned(V v) {
-  return BitCast(DU(), v);
-}
-
-}  // namespace detail
-
-// ------------------------------ Iota
-
-namespace detail {
-
-#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,  \
-                     MLEN, NAME, OP)                                          \
-  template <size_t N>                                                         \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
-    return v##OP##_##CHAR##SEW##LMUL(Lanes(d));                               \
-  }
-
-HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
-#undef HWY_RVV_IOTA
-
-template <class D, class DU = RebindToUnsigned<D>>
-HWY_INLINE VFromD<DU> Iota0(const D /*d*/) {
-  return BitCastToUnsigned(Iota0(DU()));
-}
-
-}  // namespace detail
-
-// ================================================== LOGICAL
-
-// ------------------------------ Not
-
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGV, Not, not, _ALL)
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V Not(const V v) {
-  using DF = DFromV<V>;
-  using DU = RebindToUnsigned<DF>;
-  return BitCast(DF(), Not(BitCast(DU(), v)));
-}
-
-// ------------------------------ And
-
-// Non-vector version (ideally immediate) for use with Iota0
-namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
-}  // namespace detail
-
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, And, and, _ALL)
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V And(const V a, const V b) {
-  using DF = DFromV<V>;
-  using DU = RebindToUnsigned<DF>;
-  return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
-}
-
-// ------------------------------ Or
-
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Or, or, _ALL)
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V Or(const V a, const V b) {
-  using DF = DFromV<V>;
-  using DU = RebindToUnsigned<DF>;
-  return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
-}
-
-// ------------------------------ Xor
-
-// Non-vector version (ideally immediate) for use with Iota0
-namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
-}  // namespace detail
-
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Xor, xor, _ALL)
-
-template <class V, HWY_IF_FLOAT_V(V)>
-HWY_API V Xor(const V a, const V b) {
-  using DF = DFromV<V>;
-  using DU = RebindToUnsigned<DF>;
-  return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
-}
-
-// ------------------------------ AndNot
-
-template <class V>
-HWY_API V AndNot(const V not_a, const V b) {
-  return And(Not(not_a), b);
-}
-
-// ------------------------------ Or3
-
-template <class V>
-HWY_API V Or3(V o1, V o2, V o3) {
-  return Or(o1, Or(o2, o3));
-}
-
-// ------------------------------ OrAnd
-
-template <class V>
-HWY_API V OrAnd(const V o, const V a1, const V a2) {
-  return Or(o, And(a1, a2));
-}
-
-// ------------------------------ CopySign
-
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, CopySign, fsgnj, _ALL)
-
-template <class V>
-HWY_API V CopySignToAbs(const V abs, const V sign) {
-  // RVV can also handle abs < 0, so no extra action needed.
-  return CopySign(abs, sign);
-}
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Add
-
-namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
-}  // namespace detail
-
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Add, add, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Add, fadd, _ALL)
-
-// ------------------------------ Sub
-HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVV, Sub, sub, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Sub, fsub, _ALL)
-
-// ------------------------------ SaturatedAdd
-
-HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
-HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedAdd, saddu, _ALL)
-
-HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
-HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedAdd, sadd, _ALL)
-
-// ------------------------------ SaturatedSub
-
-HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
-HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssubu, _ALL)
-
-HWY_RVV_FOREACH_I08(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
-HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, SaturatedSub, ssub, _ALL)
-
-// ------------------------------ AverageRound
-
-// TODO(janwas): check vxrm rounding mode
-HWY_RVV_FOREACH_U08(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
-HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, AverageRound, aaddu, _ALL)
-
-// ------------------------------ ShiftLeft[Same]
-
-// Intrinsics do not define .vi forms, so use .vx instead.
-#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  template <int kBits>                                                        \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {     \
-    return v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, HWY_RVV_AVL(SEW, SHIFT));   \
-  }                                                                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
-      NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) {                    \
-    return v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits),        \
-                                        HWY_RVV_AVL(SEW, SHIFT));             \
-  }
-
-HWY_RVV_FOREACH_UI(HWY_RVV_SHIFT, ShiftLeft, sll, _ALL)
-
-// ------------------------------ ShiftRight[Same]
-
-HWY_RVV_FOREACH_U(HWY_RVV_SHIFT, ShiftRight, srl, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_SHIFT, ShiftRight, sra, _ALL)
-
-#undef HWY_RVV_SHIFT
-
-// ------------------------------ SumsOf8 (ShiftRight, Add)
-template <class VU8>
-HWY_API VFromD<Repartition<uint64_t, DFromV<VU8>>> SumsOf8(const VU8 v) {
-  const DFromV<VU8> du8;
-  const RepartitionToWide<decltype(du8)> du16;
-  const RepartitionToWide<decltype(du16)> du32;
-  const RepartitionToWide<decltype(du32)> du64;
-  using VU16 = VFromD<decltype(du16)>;
-
-  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
-  const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
-  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
-
-  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
-      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
-  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
-      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
-  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
-      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
-  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
-      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
-  return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
-}
-
-// ------------------------------ RotateRight
-template <int kBits, class V>
-HWY_API V RotateRight(const V v) {
-  constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
-}
-
-// ------------------------------ Shl
-#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
-                         SHIFT, MLEN, NAME, OP)                             \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
-    return v##OP##_vv_##CHAR##SEW##LMUL(v, bits, HWY_RVV_AVL(SEW, SHIFT));  \
-  }
-
-HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shl, sll, _ALL)
-
-#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,   \
-                         SHIFT, MLEN, NAME, OP)                             \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
-    return v##OP##_vv_##CHAR##SEW##LMUL(v, detail::BitCastToUnsigned(bits), \
-                                        HWY_RVV_AVL(SEW, SHIFT));           \
-  }
-
-HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shl, sll, _ALL)
-
-// ------------------------------ Shr
-
-HWY_RVV_FOREACH_U(HWY_RVV_SHIFT_VV, Shr, srl, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_SHIFT_II, Shr, sra, _ALL)
-
-#undef HWY_RVV_SHIFT_II
-#undef HWY_RVV_SHIFT_VV
-
-// ------------------------------ Min
-
-HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Min, minu, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Min, min, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Min, fmin, _ALL)
-
-// ------------------------------ Max
-
-namespace detail {
-
-HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)
-
-}  // namespace detail
-
-HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVV, Max, maxu, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVV, Max, max, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Max, fmax, _ALL)
-
-// ------------------------------ Mul
-
-HWY_RVV_FOREACH_UI163264(HWY_RVV_RETV_ARGVV, Mul, mul, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Mul, fmul, _ALL)
-
-// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
-#ifdef HWY_NATIVE_I64MULLO
-#undef HWY_NATIVE_I64MULLO
-#else
-#define HWY_NATIVE_I64MULLO
-#endif
-
-// ------------------------------ MulHigh
-
-// Only for internal use (Highway only promises MulHigh for 16-bit inputs).
-// Used by MulEven; vwmul does not work for m8.
-namespace detail {
-HWY_RVV_FOREACH_I32(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
-HWY_RVV_FOREACH_U32(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
-HWY_RVV_FOREACH_U64(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
-}  // namespace detail
-
-HWY_RVV_FOREACH_U16(HWY_RVV_RETV_ARGVV, MulHigh, mulhu, _ALL)
-HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulHigh, mulh, _ALL)
-
-// ------------------------------ MulFixedPoint15
-HWY_RVV_FOREACH_I16(HWY_RVV_RETV_ARGVV, MulFixedPoint15, smul, _ALL)
-
-// ------------------------------ Div
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVV, Div, fdiv, _ALL)
-
-// ------------------------------ ApproximateReciprocal
-HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocal, frec7, _ALL)
-
-// ------------------------------ Sqrt
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV, Sqrt, fsqrt, _ALL)
-
-// ------------------------------ ApproximateReciprocalSqrt
-HWY_RVV_FOREACH_F32(HWY_RVV_RETV_ARGV, ApproximateReciprocalSqrt, frsqrt7, _ALL)
-
-// ------------------------------ MulAdd
-// Note: op is still named vv, not vvv.
-#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT,    \
-                    MLEN, NAME, OP)                                            \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x,       \
-           HWY_RVV_V(BASE, SEW, LMUL) add) {                                   \
-    return v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, HWY_RVV_AVL(SEW, SHIFT)); \
-  }
-
-HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulAdd, fmacc, _ALL)
-
-// ------------------------------ NegMulAdd
-HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulAdd, fnmsac, _ALL)
-
-// ------------------------------ MulSub
-HWY_RVV_FOREACH_F(HWY_RVV_FMA, MulSub, fmsac, _ALL)
-
-// ------------------------------ NegMulSub
-HWY_RVV_FOREACH_F(HWY_RVV_FMA, NegMulSub, fnmacc, _ALL)
-
-#undef HWY_RVV_FMA
-
-// ================================================== COMPARE
-
-// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
-// vboolXX_t is a power of two divisor for vector bits. SLEN 8 / LMUL 1 = 1/8th
-// of all bits; SLEN 8 / LMUL 4 = half of all bits.
-
-// mask = f(vector, vector)
-#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                           SHIFT, MLEN, NAME, OP)                           \
-  HWY_API HWY_RVV_M(MLEN)                                                   \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) {    \
-    return v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN(a, b,                     \
-                                                  HWY_RVV_AVL(SEW, SHIFT)); \
-  }
-
-// mask = f(vector, scalar)
-#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
-                           SHIFT, MLEN, NAME, OP)                              \
-  HWY_API HWY_RVV_M(MLEN)                                                      \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) {             \
-    return v##OP##_##CHAR##SEW##LMUL##_b##MLEN(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
-  }
-
-// ------------------------------ Eq
-HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Eq, mseq, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Eq, mfeq, _ALL)
-
-namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
-}  // namespace detail
-
-// ------------------------------ Ne
-HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVV, Ne, msne, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Ne, mfne, _ALL)
-
-namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
-}  // namespace detail
-
-// ------------------------------ Lt
-HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVV, Lt, msltu, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVV, Lt, mslt, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Lt, mflt, _ALL)
-
-namespace detail {
-HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
-HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
-}  // namespace detail
-
-// ------------------------------ Le
-HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVV, Le, mfle, _ALL)
-
-#undef HWY_RVV_RETM_ARGVV
-#undef HWY_RVV_RETM_ARGVS
-
-// ------------------------------ Gt/Ge
-
-template <class V>
-HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
-  return Le(b, a);
-}
-
-template <class V>
-HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
-  return Lt(b, a);
-}
-
-// ------------------------------ TestBit
-template <class V>
-HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
-  return detail::NeS(And(a, bit), 0);
-}
-
-// ------------------------------ Not
-// NOLINTNEXTLINE
-HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, Not, not )
-
-// ------------------------------ And
-
-// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
-#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)                 \
-  HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
-    return vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT));         \
-  }
-
-HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, And, and)
-
-// ------------------------------ AndNot
-HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, AndNot, andn)
-
-// ------------------------------ Or
-HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Or, or)
-
-// ------------------------------ Xor
-HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGMM, Xor, xor)
-
-#undef HWY_RVV_RETM_ARGMM
-
-// ------------------------------ IfThenElse
-#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,  \
-                             SHIFT, MLEN, NAME, OP)                            \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
-      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes,                  \
-           HWY_RVV_V(BASE, SEW, LMUL) no) {                                    \
-    return v##OP##_vvm_##CHAR##SEW##LMUL(m, no, yes, HWY_RVV_AVL(SEW, SHIFT)); \
-  }
-
-HWY_RVV_FOREACH(HWY_RVV_IF_THEN_ELSE, IfThenElse, merge, _ALL)
-
-#undef HWY_RVV_IF_THEN_ELSE
-
-// ------------------------------ IfThenElseZero
-template <class M, class V>
-HWY_API V IfThenElseZero(const M mask, const V yes) {
-  return IfThenElse(mask, yes, Zero(DFromV<V>()));
-}
-
-// ------------------------------ IfThenZeroElse
-
-#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
-                                  LMULH, SHIFT, MLEN, NAME, OP)             \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                        \
-      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) {              \
-    return v##OP##_##CHAR##SEW##LMUL(m, no, 0, HWY_RVV_AVL(SEW, SHIFT));    \
-  }
-
-HWY_RVV_FOREACH_UI(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, merge_vxm, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_IF_THEN_ZERO_ELSE, IfThenZeroElse, fmerge_vfm, _ALL)
-
-#undef HWY_RVV_IF_THEN_ZERO_ELSE
-
-// ------------------------------ MaskFromVec
-
-template <class V>
-HWY_API auto MaskFromVec(const V v) -> decltype(Eq(v, v)) {
-  return detail::NeS(v, 0);
-}
-
-template <class D>
-using MFromD = decltype(MaskFromVec(Zero(D())));
-
-template <class D, typename MFrom>
-HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
-  // No need to check lane size/LMUL are the same: if not, casting MFrom to
-  // MFromD<D> would fail.
-  return mask;
-}
-
-// ------------------------------ VecFromMask
-
-namespace detail {
-#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                              SHIFT, MLEN, NAME, OP)                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_M(MLEN) m) {                 \
-    return v##OP##_##CHAR##SEW##LMUL##_m(m, v0, v0, 1,                         \
-                                         HWY_RVV_AVL(SEW, SHIFT));             \
-  }
-
-HWY_RVV_FOREACH_UI(HWY_RVV_VEC_FROM_MASK, SubS, sub_vx, _ALL)
-#undef HWY_RVV_VEC_FROM_MASK
-}  // namespace detail
-
-template <class D, HWY_IF_NOT_FLOAT_D(D)>
-HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
-  return detail::SubS(Zero(d), mask);
-}
-
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> VecFromMask(const D d, MFromD<D> mask) {
-  return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
-}
-
-// ------------------------------ IfVecThenElse (MaskFromVec)
-
-template <class V>
-HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
-  return IfThenElse(MaskFromVec(mask), yes, no);
-}
-
-// ------------------------------ ZeroIfNegative
-template <class V>
-HWY_API V ZeroIfNegative(const V v) {
-  return IfThenZeroElse(detail::LtS(v, 0), v);
-}
-
-// ------------------------------ BroadcastSignBit
-template <class V>
-HWY_API V BroadcastSignBit(const V v) {
-  return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
-}
-
-// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
-template <class V>
-HWY_API V IfNegativeThenElse(V v, V yes, V no) {
-  static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
-  const DFromV<V> d;
-  const RebindToSigned<decltype(d)> di;
-
-  MFromD<decltype(d)> m =
-      MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
-  return IfThenElse(m, yes, no);
-}
-
-// ------------------------------ FindFirstTrue
-
-#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
-  template <class D>                                        \
-  HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) {  \
-    static_assert(MLenFromD(d) == MLEN, "Type mismatch");   \
-    return vfirst_m_b##MLEN(m, Lanes(d));                   \
-  }
-
-HWY_RVV_FOREACH_B(HWY_RVV_FIND_FIRST_TRUE, _, _)
-#undef HWY_RVV_FIND_FIRST_TRUE
-
-// ------------------------------ AllFalse
-template <class D>
-HWY_API bool AllFalse(D d, MFromD<D> m) {
-  return FindFirstTrue(d, m) < 0;
-}
-
-// ------------------------------ AllTrue
-
-#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP)      \
-  template <class D>                                      \
-  HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) {          \
-    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
-    return AllFalse(d, vmnot_m_b##MLEN(m, Lanes(d)));     \
-  }
-
-HWY_RVV_FOREACH_B(HWY_RVV_ALL_TRUE, _, _)
-#undef HWY_RVV_ALL_TRUE
-
-// ------------------------------ CountTrue
-
-#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP)    \
-  template <class D>                                      \
-  HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) {      \
-    static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
-    return vcpop_m_b##MLEN(m, Lanes(d));                  \
-  }
-
-HWY_RVV_FOREACH_B(HWY_RVV_COUNT_TRUE, _, _)
-#undef HWY_RVV_COUNT_TRUE
-
-// ================================================== MEMORY
-
-// ------------------------------ Load
-
-#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                     MLEN, NAME, OP)                                         \
-  template <size_t N>                                                        \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
-      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                 \
-           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                    \
-    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, Lanes(d));                    \
-  }
-HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
-#undef HWY_RVV_LOAD
-
-// There is no native BF16, treat as uint16_t.
-template <size_t N, int kPow2>
-HWY_API VFromD<Simd<uint16_t, N, kPow2>> Load(
-    Simd<bfloat16_t, N, kPow2> d, const bfloat16_t* HWY_RESTRICT p) {
-  return Load(RebindToUnsigned<decltype(d)>(),
-              reinterpret_cast<const uint16_t * HWY_RESTRICT>(p));
-}
-
-template <size_t N, int kPow2>
-HWY_API void Store(VFromD<Simd<uint16_t, N, kPow2>> v,
-                   Simd<bfloat16_t, N, kPow2> d, bfloat16_t* HWY_RESTRICT p) {
-  Store(v, RebindToUnsigned<decltype(d)>(),
-        reinterpret_cast<uint16_t * HWY_RESTRICT>(p));
-}
-
-// ------------------------------ LoadU
-
-// RVV only requires lane alignment, not natural alignment of the entire vector.
-template <class D>
-HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// ------------------------------ MaskedLoad
-
-#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                            SHIFT, MLEN, NAME, OP)                           \
-  template <size_t N>                                                        \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                         \
-      NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d,              \
-           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                    \
-    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, Zero(d), p, Lanes(d));    \
-  }
-HWY_RVV_FOREACH(HWY_RVV_MASKED_LOAD, MaskedLoad, le, _ALL_VIRT)
-#undef HWY_RVV_MASKED_LOAD
-
-// ------------------------------ Store
-
-#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  template <size_t N>                                                         \
-  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                             \
-                    HWY_RVV_D(BASE, SEW, N, SHIFT) d,                         \
-                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                  \
-    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, Lanes(d));                  \
-  }
-HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
-#undef HWY_RVV_STORE
-
-// ------------------------------ BlendedStore
-
-#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                              SHIFT, MLEN, NAME, OP)                           \
-  template <size_t N>                                                          \
-  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m,           \
-                    HWY_RVV_D(BASE, SEW, N, SHIFT) d,                          \
-                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                   \
-    return v##OP##SEW##_v_##CHAR##SEW##LMUL##_m(m, p, v, Lanes(d));            \
-  }
-HWY_RVV_FOREACH(HWY_RVV_BLENDED_STORE, BlendedStore, se, _ALL_VIRT)
-#undef HWY_RVV_BLENDED_STORE
-
-namespace detail {
-
-#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <size_t N>                                                          \
-  HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v,                \
-                    HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */,                    \
-                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) {                   \
-    return v##OP##SEW##_v_##CHAR##SEW##LMUL(p, v, count);                      \
-  }
-HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
-#undef HWY_RVV_STOREN
-
-}  // namespace detail
-
-// ------------------------------ StoreU
-
-// RVV only requires lane alignment, not natural alignment of the entire vector.
-template <class V, class D>
-HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
-  Store(v, d, p);
-}
-
-// ------------------------------ Stream
-template <class V, class D, typename T>
-HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
-  Store(v, d, aligned);
-}
-
-// ------------------------------ ScatterOffset
-
-#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                        SHIFT, MLEN, NAME, OP)                           \
-  template <size_t N>                                                    \
-  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v,                        \
-                    HWY_RVV_D(BASE, SEW, N, SHIFT) d,                    \
-                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,            \
-                    HWY_RVV_V(int, SEW, LMUL) offset) {                  \
-    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                         \
-        base, detail::BitCastToUnsigned(offset), v, Lanes(d));           \
-  }
-HWY_RVV_FOREACH(HWY_RVV_SCATTER, ScatterOffset, sux, _ALL_VIRT)
-#undef HWY_RVV_SCATTER
-
-// ------------------------------ ScatterIndex
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
-HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
-                          const VFromD<RebindToSigned<D>> index) {
-  return ScatterOffset(v, d, base, ShiftLeft<2>(index));
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
-                          const VFromD<RebindToSigned<D>> index) {
-  return ScatterOffset(v, d, base, ShiftLeft<3>(index));
-}
-
-// ------------------------------ GatherOffset
-
-#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
-      NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                   \
-           const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base,                     \
-           HWY_RVV_V(int, SEW, LMUL) offset) {                                 \
-    return v##OP##ei##SEW##_v_##CHAR##SEW##LMUL(                               \
-        base, detail::BitCastToUnsigned(offset), Lanes(d));                    \
-  }
-HWY_RVV_FOREACH(HWY_RVV_GATHER, GatherOffset, lux, _ALL_VIRT)
-#undef HWY_RVV_GATHER
-
-// ------------------------------ GatherIndex
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 4)>
-HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
-                              const VFromD<RebindToSigned<D>> index) {
-  return GatherOffset(d, base, ShiftLeft<2>(index));
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
-                              const VFromD<RebindToSigned<D>> index) {
-  return GatherOffset(d, base, ShiftLeft<3>(index));
-}
-
-// ------------------------------ LoadInterleaved2
-
-// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#else
-#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#endif
-
-#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  template <size_t N>                                                         \
-  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                         \
-                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned,      \
-                    HWY_RVV_V(BASE, SEW, LMUL) & v0,                          \
-                    HWY_RVV_V(BASE, SEW, LMUL) & v1) {                        \
-    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, unaligned, Lanes(d));       \
-  }
-// Segments are limited to 8 registers, so we can only go up to LMUL=2.
-HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT)
-#undef HWY_RVV_LOAD2
-
-// ------------------------------ LoadInterleaved3
-
-#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  template <size_t N>                                                         \
-  HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d,                         \
-                    const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned,      \
-                    HWY_RVV_V(BASE, SEW, LMUL) & v0,                          \
-                    HWY_RVV_V(BASE, SEW, LMUL) & v1,                          \
-                    HWY_RVV_V(BASE, SEW, LMUL) & v2) {                        \
-    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, unaligned, Lanes(d));  \
-  }
-// Segments are limited to 8 registers, so we can only go up to LMUL=2.
-HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT)
-#undef HWY_RVV_LOAD3
-
-// ------------------------------ LoadInterleaved4
-
-#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  template <size_t N>                                                         \
-  HWY_API void NAME(                                                          \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                       \
-      const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned,                      \
-      HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1,       \
-      HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) {     \
-    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(&v0, &v1, &v2, &v3, aligned,          \
-                                        Lanes(d));                            \
-  }
-// Segments are limited to 8 registers, so we can only go up to LMUL=2.
-HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT)
-#undef HWY_RVV_LOAD4
-
-// ------------------------------ StoreInterleaved2
-
-#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <size_t N>                                                          \
-  HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0,                             \
-                    HWY_RVV_V(BASE, SEW, LMUL) v1,                             \
-                    HWY_RVV_D(BASE, SEW, N, SHIFT) d,                          \
-                    HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {           \
-    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, Lanes(d));          \
-  }
-// Segments are limited to 8 registers, so we can only go up to LMUL=2.
-HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT)
-#undef HWY_RVV_STORE2
-
-// ------------------------------ StoreInterleaved3
-
-#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <size_t N>                                                          \
-  HWY_API void NAME(                                                           \
-      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,            \
-      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d,         \
-      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) {                         \
-    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(unaligned, v0, v1, v2, Lanes(d));      \
-  }
-// Segments are limited to 8 registers, so we can only go up to LMUL=2.
-HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT)
-#undef HWY_RVV_STORE3
-
-// ------------------------------ StoreInterleaved4
-
-#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <size_t N>                                                          \
-  HWY_API void NAME(                                                           \
-      HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1,            \
-      HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3,            \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) d,                                        \
-      HWY_RVV_T(BASE, SEW) * HWY_RESTRICT aligned) {                           \
-    v##OP##e##SEW##_v_##CHAR##SEW##LMUL(aligned, v0, v1, v2, v3, Lanes(d));    \
-  }
-// Segments are limited to 8 registers, so we can only go up to LMUL=2.
-HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)
-#undef HWY_RVV_STORE4
-
-// ================================================== CONVERT
-
-// ------------------------------ PromoteTo
-
-// SEW is for the input so we can use F16 (no-op if not supported).
-#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,     \
-                        SHIFT, MLEN, NAME, OP)                               \
-  template <size_t N>                                                        \
-  HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME(                                 \
-      HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return OP##CHAR##SEWD##LMULD(v, Lanes(d));                               \
-  }
-
-HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
-HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
-HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, vzext_vf2_, _EXT_VIRT)
-HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
-HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
-HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, vsext_vf2_, _EXT_VIRT)
-HWY_RVV_FOREACH_F16(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
-HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, vfwcvt_f_f_v_, _EXT_VIRT)
-#undef HWY_RVV_PROMOTE
-
-// The above X-macro cannot handle 4x promotion nor type switching.
-// TODO(janwas): use BASE2 arg to allow the latter.
-#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
-                        SHIFT, ADD)                                            \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(BASE, BITS, LMUL)                                          \
-      PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d,                       \
-                HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) {                      \
-    return OP##CHAR##BITS##LMUL(v, Lanes(d));                                  \
-  }
-
-#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)        \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1)   \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1)   \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
-
-#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)         \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2)  \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2)  \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2)    \
-  HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
-
-HWY_RVV_PROMOTE_X4(vzext_vf4_, uint, u, 32, uint, 8)
-HWY_RVV_PROMOTE_X4(vsext_vf4_, int, i, 32, int, 8)
-
-// i32 to f64
-HWY_RVV_PROMOTE_X2(vfwcvt_f_x_v_, float, f, 64, int, 32)
-
-#undef HWY_RVV_PROMOTE_X4
-#undef HWY_RVV_PROMOTE_X2
-#undef HWY_RVV_PROMOTE
-
-// Unsigned to signed: cast for unsigned promote.
-template <size_t N, int kPow2>
-HWY_API auto PromoteTo(Simd<int16_t, N, kPow2> d,
-                       VFromD<Rebind<uint8_t, decltype(d)>> v)
-    -> VFromD<decltype(d)> {
-  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
-}
-
-template <size_t N, int kPow2>
-HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
-                       VFromD<Rebind<uint8_t, decltype(d)>> v)
-    -> VFromD<decltype(d)> {
-  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
-}
-
-template <size_t N, int kPow2>
-HWY_API auto PromoteTo(Simd<int32_t, N, kPow2> d,
-                       VFromD<Rebind<uint16_t, decltype(d)>> v)
-    -> VFromD<decltype(d)> {
-  return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
-}
-
-template <size_t N, int kPow2>
-HWY_API auto PromoteTo(Simd<float32_t, N, kPow2> d,
-                       VFromD<Rebind<bfloat16_t, decltype(d)>> v)
-    -> VFromD<decltype(d)> {
-  const RebindToSigned<decltype(d)> di32;
-  const Rebind<uint16_t, decltype(d)> du16;
-  return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
-}
-
-// ------------------------------ DemoteTo U
-
-// SEW is for the source so we can use _DEMOTE.
-#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME(                                   \
-      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) {   \
-    return OP##CHAR##SEWH##LMULH(v, 0, Lanes(d));                              \
-  }                                                                            \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME##Shr16(                            \
-      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) {   \
-    return OP##CHAR##SEWH##LMULH(v, 16, Lanes(d));                             \
-  }
-
-// Unsigned -> unsigned (also used for bf16)
-namespace detail {
-HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
-HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, vnclipu_wx_, _DEMOTE_VIRT)
-}  // namespace detail
-
-// SEW is for the source so we can use _DEMOTE.
-#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                              SHIFT, MLEN, NAME, OP)                           \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME(                                   \
-      HWY_RVV_D(uint, SEWH, N, SHIFT - 1) d, HWY_RVV_V(int, SEW, LMUL) v) {    \
-    /* First clamp negative numbers to zero to match x86 packus. */            \
-    return detail::DemoteTo(d, detail::BitCastToUnsigned(detail::MaxS(v, 0))); \
-  }
-HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
-HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE_I_TO_U, DemoteTo, _, _DEMOTE_VIRT)
-#undef HWY_RVV_DEMOTE_I_TO_U
-
-template <size_t N>
-HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
-  return vnclipu_wx_u8mf8(DemoteTo(Simd<uint16_t, N, -2>(), v), 0, Lanes(d));
-}
-template <size_t N>
-HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
-  return vnclipu_wx_u8mf4(DemoteTo(Simd<uint16_t, N, -1>(), v), 0, Lanes(d));
-}
-template <size_t N>
-HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
-  return vnclipu_wx_u8mf2(DemoteTo(Simd<uint16_t, N, 0>(), v), 0, Lanes(d));
-}
-template <size_t N>
-HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
-  return vnclipu_wx_u8m1(DemoteTo(Simd<uint16_t, N, 1>(), v), 0, Lanes(d));
-}
-template <size_t N>
-HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
-  return vnclipu_wx_u8m2(DemoteTo(Simd<uint16_t, N, 2>(), v), 0, Lanes(d));
-}
-
-HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
-  const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
-  return vnclipu_wx_u8mf8(vnclipu_wx_u16mf4(v, 0, avl), 0, avl);
-}
-HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
-  const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
-  return vnclipu_wx_u8mf4(vnclipu_wx_u16mf2(v, 0, avl), 0, avl);
-}
-HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
-  const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
-  return vnclipu_wx_u8mf2(vnclipu_wx_u16m1(v, 0, avl), 0, avl);
-}
-HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
-  const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
-  return vnclipu_wx_u8m1(vnclipu_wx_u16m2(v, 0, avl), 0, avl);
-}
-HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
-  const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
-  return vnclipu_wx_u8m2(vnclipu_wx_u16m4(v, 0, avl), 0, avl);
-}
-
-// ------------------------------ Truncations
-
-template <size_t N>
-HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
-                               const VFromD<Simd<uint64_t, N, 0>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m1_t v1 = vand(v, 0xFF, avl);
-  const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
-  const vuint16mf4_t v3 = vnclipu_wx_u16mf4(v2, 0, avl);
-  return vnclipu_wx_u8mf8(v3, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
-                               const VFromD<Simd<uint64_t, N, 1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m2_t v1 = vand(v, 0xFF, avl);
-  const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
-  const vuint16mf2_t v3 = vnclipu_wx_u16mf2(v2, 0, avl);
-  return vnclipu_wx_u8mf4(v3, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
-                               const VFromD<Simd<uint64_t, N, 2>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m4_t v1 = vand(v, 0xFF, avl);
-  const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
-  const vuint16m1_t v3 = vnclipu_wx_u16m1(v2, 0, avl);
-  return vnclipu_wx_u8mf2(v3, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
-                              const VFromD<Simd<uint64_t, N, 3>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m8_t v1 = vand(v, 0xFF, avl);
-  const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
-  const vuint16m2_t v3 = vnclipu_wx_u16m2(v2, 0, avl);
-  return vnclipu_wx_u8m1(v3, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
-                                const VFromD<Simd<uint64_t, N, 0>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m1_t v1 = vand(v, 0xFFFF, avl);
-  const vuint32mf2_t v2 = vnclipu_wx_u32mf2(v1, 0, avl);
-  return vnclipu_wx_u16mf4(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
-                                const VFromD<Simd<uint64_t, N, 1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m2_t v1 = vand(v, 0xFFFF, avl);
-  const vuint32m1_t v2 = vnclipu_wx_u32m1(v1, 0, avl);
-  return vnclipu_wx_u16mf2(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
-                               const VFromD<Simd<uint64_t, N, 2>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m4_t v1 = vand(v, 0xFFFF, avl);
-  const vuint32m2_t v2 = vnclipu_wx_u32m2(v1, 0, avl);
-  return vnclipu_wx_u16m1(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
-                               const VFromD<Simd<uint64_t, N, 3>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m8_t v1 = vand(v, 0xFFFF, avl);
-  const vuint32m4_t v2 = vnclipu_wx_u32m4(v1, 0, avl);
-  return vnclipu_wx_u16m2(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint32mf2_t TruncateTo(Simd<uint32_t, N, -1> d,
-                                const VFromD<Simd<uint64_t, N, 0>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m1_t v1 = vand(v, 0xFFFFFFFFu, avl);
-  return vnclipu_wx_u32mf2(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint32m1_t TruncateTo(Simd<uint32_t, N, 0> d,
-                               const VFromD<Simd<uint64_t, N, 1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m2_t v1 = vand(v, 0xFFFFFFFFu, avl);
-  return vnclipu_wx_u32m1(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint32m2_t TruncateTo(Simd<uint32_t, N, 1> d,
-                               const VFromD<Simd<uint64_t, N, 2>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m4_t v1 = vand(v, 0xFFFFFFFFu, avl);
-  return vnclipu_wx_u32m2(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint32m4_t TruncateTo(Simd<uint32_t, N, 2> d,
-                               const VFromD<Simd<uint64_t, N, 3>> v) {
-  const size_t avl = Lanes(d);
-  const vuint64m8_t v1 = vand(v, 0xFFFFFFFFu, avl);
-  return vnclipu_wx_u32m4(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
-                               const VFromD<Simd<uint32_t, N, -1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32mf2_t v1 = vand(v, 0xFF, avl);
-  const vuint16mf4_t v2 = vnclipu_wx_u16mf4(v1, 0, avl);
-  return vnclipu_wx_u8mf8(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
-                               const VFromD<Simd<uint32_t, N, 0>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m1_t v1 = vand(v, 0xFF, avl);
-  const vuint16mf2_t v2 = vnclipu_wx_u16mf2(v1, 0, avl);
-  return vnclipu_wx_u8mf4(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
-                               const VFromD<Simd<uint32_t, N, 1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m2_t v1 = vand(v, 0xFF, avl);
-  const vuint16m1_t v2 = vnclipu_wx_u16m1(v1, 0, avl);
-  return vnclipu_wx_u8mf2(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
-                              const VFromD<Simd<uint32_t, N, 2>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m4_t v1 = vand(v, 0xFF, avl);
-  const vuint16m2_t v2 = vnclipu_wx_u16m2(v1, 0, avl);
-  return vnclipu_wx_u8m1(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
-                              const VFromD<Simd<uint32_t, N, 3>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m8_t v1 = vand(v, 0xFF, avl);
-  const vuint16m4_t v2 = vnclipu_wx_u16m4(v1, 0, avl);
-  return vnclipu_wx_u8m2(v2, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16mf4_t TruncateTo(Simd<uint16_t, N, -2> d,
-                                const VFromD<Simd<uint32_t, N, -1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32mf2_t v1 = vand(v, 0xFFFF, avl);
-  return vnclipu_wx_u16mf4(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16mf2_t TruncateTo(Simd<uint16_t, N, -1> d,
-                                const VFromD<Simd<uint32_t, N, 0>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m1_t v1 = vand(v, 0xFFFF, avl);
-  return vnclipu_wx_u16mf2(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16m1_t TruncateTo(Simd<uint16_t, N, 0> d,
-                               const VFromD<Simd<uint32_t, N, 1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m2_t v1 = vand(v, 0xFFFF, avl);
-  return vnclipu_wx_u16m1(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16m2_t TruncateTo(Simd<uint16_t, N, 1> d,
-                               const VFromD<Simd<uint32_t, N, 2>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m4_t v1 = vand(v, 0xFFFF, avl);
-  return vnclipu_wx_u16m2(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint16m4_t TruncateTo(Simd<uint16_t, N, 2> d,
-                               const VFromD<Simd<uint32_t, N, 3>> v) {
-  const size_t avl = Lanes(d);
-  const vuint32m8_t v1 = vand(v, 0xFFFF, avl);
-  return vnclipu_wx_u16m4(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf8_t TruncateTo(Simd<uint8_t, N, -3> d,
-                               const VFromD<Simd<uint16_t, N, -2>> v) {
-  const size_t avl = Lanes(d);
-  const vuint16mf4_t v1 = vand(v, 0xFF, avl);
-  return vnclipu_wx_u8mf8(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf4_t TruncateTo(Simd<uint8_t, N, -2> d,
-                               const VFromD<Simd<uint16_t, N, -1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint16mf2_t v1 = vand(v, 0xFF, avl);
-  return vnclipu_wx_u8mf4(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8mf2_t TruncateTo(Simd<uint8_t, N, -1> d,
-                               const VFromD<Simd<uint16_t, N, 0>> v) {
-  const size_t avl = Lanes(d);
-  const vuint16m1_t v1 = vand(v, 0xFF, avl);
-  return vnclipu_wx_u8mf2(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8m1_t TruncateTo(Simd<uint8_t, N, 0> d,
-                              const VFromD<Simd<uint16_t, N, 1>> v) {
-  const size_t avl = Lanes(d);
-  const vuint16m2_t v1 = vand(v, 0xFF, avl);
-  return vnclipu_wx_u8m1(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8m2_t TruncateTo(Simd<uint8_t, N, 1> d,
-                              const VFromD<Simd<uint16_t, N, 2>> v) {
-  const size_t avl = Lanes(d);
-  const vuint16m4_t v1 = vand(v, 0xFF, avl);
-  return vnclipu_wx_u8m2(v1, 0, avl);
-}
-
-template <size_t N>
-HWY_API vuint8m4_t TruncateTo(Simd<uint8_t, N, 2> d,
-                              const VFromD<Simd<uint16_t, N, 3>> v) {
-  const size_t avl = Lanes(d);
-  const vuint16m8_t v1 = vand(v, 0xFF, avl);
-  return vnclipu_wx_u8m4(v1, 0, avl);
-}
-
-// ------------------------------ DemoteTo I
-
-HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
-HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, vnclip_wx_, _DEMOTE_VIRT)
-
-template <size_t N>
-HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) {
-  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v));
-}
-template <size_t N>
-HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) {
-  return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v));
-}
-template <size_t N>
-HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) {
-  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v));
-}
-template <size_t N>
-HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) {
-  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v));
-}
-template <size_t N>
-HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
-  return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
-}
-
-#undef HWY_RVV_DEMOTE
-
-// ------------------------------ DemoteTo F
-
-// SEW is for the source so we can use _DEMOTE.
-#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,    \
-                         SHIFT, MLEN, NAME, OP)                              \
-  template <size_t N>                                                        \
-  HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME(                                 \
-      HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return OP##SEWH##LMULH(v, Lanes(d));                                     \
-  }
-
-#if HWY_HAVE_FLOAT16
-HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
-                    _DEMOTE_VIRT)
-#endif
-HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, vfncvt_rod_f_f_w_f,
-                    _DEMOTE_VIRT)
-#undef HWY_RVV_DEMOTE_F
-
-// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
-template <size_t N>
-HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) {
-  return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
-}
-template <size_t N>
-HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) {
-  return vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
-}
-template <size_t N>
-HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) {
-  return vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
-}
-template <size_t N>
-HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) {
-  return vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
-}
-template <size_t N>
-HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) {
-  return vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
-}
-
-template <size_t N, int kPow2>
-HWY_API VFromD<Simd<uint16_t, N, kPow2>> DemoteTo(
-    Simd<bfloat16_t, N, kPow2> d, VFromD<Simd<float, N, kPow2 + 1>> v) {
-  const RebindToUnsigned<decltype(d)> du16;
-  const Rebind<uint32_t, decltype(d)> du32;
-  return detail::DemoteToShr16(du16, BitCast(du32, v));
-}
-
-// ------------------------------ ConvertTo F
-
-#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,       \
-                        SHIFT, MLEN, NAME, OP)                                 \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo(                                \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) {         \
-    return vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d));                              \
-  }                                                                            \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo(                                \
-      HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) {\
-    return vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d));                             \
-  }                                                                            \
-  /* Truncates (rounds toward zero). */                                        \
-  template <size_t N>                                                          \
-  HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
-                                              HWY_RVV_V(BASE, SEW, LMUL) v) {  \
-    return vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d));                          \
-  }                                                                            \
-// API only requires f32 but we provide f64 for internal use.
-HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
-#undef HWY_RVV_CONVERT
-
-// Uses default rounding mode. Must be separate because there is no D arg.
-#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,       \
-                        SHIFT, MLEN, NAME, OP)                                 \
-  HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
-    return vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT));               \
-  }
-HWY_RVV_FOREACH_F(HWY_RVV_NEAREST, _, _, _ALL)
-#undef HWY_RVV_NEAREST
-
-// ================================================== COMBINE
-
-namespace detail {
-
-// For x86-compatible behaviour mandated by Highway API: TableLookupBytes
-// offsets are implicitly relative to the start of their 128-bit block.
-template <typename T, size_t N, int kPow2>
-size_t LanesPerBlock(Simd<T, N, kPow2> d) {
-  size_t lpb = 16 / sizeof(T);
-  if (IsFull(d)) return lpb;
-  // Also honor the user-specified (constexpr) N limit.
-  lpb = HWY_MIN(lpb, N);
-  // No fraction, we're done.
-  if (kPow2 >= 0) return lpb;
-  // Fractional LMUL: Lanes(d) may be smaller than lpb, so honor that.
-  return HWY_MIN(lpb, Lanes(d));
-}
-
-template <class D, class V>
-HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
-  using T = MakeUnsigned<TFromD<D>>;
-  return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
-}
-
-template <size_t kLanes, class D>
-HWY_INLINE MFromD<D> FirstNPerBlock(D /* tag */) {
-  const RebindToUnsigned<D> du;
-  const RebindToSigned<D> di;
-  using TU = TFromD<decltype(du)>;
-  const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1));
-  return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
-}
-
-// vector = f(vector, vector, size_t)
-#define HWY_RVV_SLIDE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src,    \
-           size_t lanes) {                                                    \
-    return v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes,                      \
-                                        HWY_RVV_AVL(SEW, SHIFT));             \
-  }
-
-HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideUp, slideup, _ALL)
-HWY_RVV_FOREACH(HWY_RVV_SLIDE, SlideDown, slidedown, _ALL)
-
-#undef HWY_RVV_SLIDE
-
-}  // namespace detail
-
-// ------------------------------ ConcatUpperLower
-template <class D, class V>
-HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
-  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
-}
-
-// ------------------------------ ConcatLowerLower
-template <class D, class V>
-HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
-  return detail::SlideUp(lo, hi, Lanes(d) / 2);
-}
-
-// ------------------------------ ConcatUpperUpper
-template <class D, class V>
-HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
-  // Move upper half into lower
-  const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
-  return ConcatUpperLower(d, hi, lo_down);
-}
-
-// ------------------------------ ConcatLowerUpper
-template <class D, class V>
-HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
-  // Move half of both inputs to the other half
-  const auto hi_up = detail::SlideUp(hi, hi, Lanes(d) / 2);
-  const auto lo_down = detail::SlideDown(lo, lo, Lanes(d) / 2);
-  return ConcatUpperLower(d, hi_up, lo_down);
-}
-
-// ------------------------------ Combine
-template <class D2, class V>
-HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
-  return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
-                         Lanes(d2) / 2);
-}
-
-// ------------------------------ ZeroExtendVector
-
-template <class D2, class V>
-HWY_API VFromD<D2> ZeroExtendVector(D2 d2, const V lo) {
-  return Combine(d2, Xor(lo, lo), lo);
-}
-
-// ------------------------------ Lower/UpperHalf
-
-namespace detail {
-
-// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
-// that SEW = sizeof(T)*8 and LMUL = 1 << Pow2().
-template <class D>
-constexpr bool IsSupportedLMUL(D d) {
-  return (size_t{1} << (Pow2(d) + 3)) >= sizeof(TFromD<D>);
-}
-
-}  // namespace detail
-
-// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL.
-template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
-HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
-  return detail::Trunc(v);
-}
-
-// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
-// the hardware may set "vill" if we attempt such an LMUL. However, the V
-// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
-// still makes sense to have half of an SEW=64 vector. We instead just return
-// the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
-template <class DH, class V,
-          hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
-HWY_API V LowerHalf(const DH /* tag */, const V v) {
-  return v;
-}
-
-// Same, but without D arg
-template <class V>
-HWY_API VFromD<Half<DFromV<V>>> LowerHalf(const V v) {
-  return LowerHalf(Half<DFromV<V>>(), v);
-}
-
-template <class DH>
-HWY_API VFromD<DH> UpperHalf(const DH d2, const VFromD<Twice<DH>> v) {
-  return LowerHalf(d2, detail::SlideDown(v, v, Lanes(d2)));
-}
-
-// ================================================== SWIZZLE
-
-namespace detail {
-// Special instruction for 1 lane is presumably faster?
-#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {      \
-    return v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT));           \
-  }
-
-HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
-HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
-HWY_RVV_FOREACH_UI3264(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
-HWY_RVV_FOREACH_F3264(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
-#undef HWY_RVV_SLIDE1
-}  // namespace detail
-
-// ------------------------------ GetLane
-
-#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                         SHIFT, MLEN, NAME, OP)                           \
-  HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {       \
-    return v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */     \
-  }
-
-HWY_RVV_FOREACH_UI(HWY_RVV_GET_LANE, GetLane, mv_x, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_GET_LANE, GetLane, fmv_f, _ALL)
-#undef HWY_RVV_GET_LANE
-
-// ------------------------------ ExtractLane
-template <class V>
-HWY_API TFromV<V> ExtractLane(const V v, size_t i) {
-  return GetLane(detail::SlideDown(v, v, i));
-}
-
-// ------------------------------ InsertLane
-
-template <class V, HWY_IF_NOT_LANE_SIZE_V(V, 1)>
-HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
-  const DFromV<V> d;
-  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
-  using TU = TFromD<decltype(du)>;
-  const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
-  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
-}
-
-namespace detail {
-HWY_RVV_FOREACH_B(HWY_RVV_RETM_ARGM, SetOnlyFirst, sof)
-}  // namespace detail
-
-// For 8-bit lanes, Iota0 might overflow.
-template <class V, HWY_IF_LANE_SIZE_V(V, 1)>
-HWY_API V InsertLane(const V v, size_t i, TFromV<V> t) {
-  const DFromV<V> d;
-  const auto zero = Zero(d);
-  const auto one = Set(d, 1);
-  const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
-  const auto is_i = detail::SetOnlyFirst(ge_i);
-  return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
-}
-
-// ------------------------------ OddEven
-template <class V>
-HWY_API V OddEven(const V a, const V b) {
-  const RebindToUnsigned<DFromV<V>> du;  // Iota0 is unsigned only
-  const auto is_even = detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
-  return IfThenElse(is_even, b, a);
-}
-
-// ------------------------------ DupEven (OddEven)
-template <class V>
-HWY_API V DupEven(const V v) {
-  const V up = detail::Slide1Up(v);
-  return OddEven(up, v);
-}
-
-// ------------------------------ DupOdd (OddEven)
-template <class V>
-HWY_API V DupOdd(const V v) {
-  const V down = detail::Slide1Down(v);
-  return OddEven(v, down);
-}
-
-// ------------------------------ OddEvenBlocks
-template <class V>
-HWY_API V OddEvenBlocks(const V a, const V b) {
-  const RebindToUnsigned<DFromV<V>> du;  // Iota0 is unsigned only
-  constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
-  const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
-  const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
-  return IfThenElse(is_even, b, a);
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <class V>
-HWY_API V SwapAdjacentBlocks(const V v) {
-  const DFromV<V> d;
-  const size_t lpb = detail::LanesPerBlock(d);
-  const V down = detail::SlideDown(v, v, lpb);
-  const V up = detail::SlideUp(v, v, lpb);
-  return OddEvenBlocks(up, down);
-}
-
-// ------------------------------ TableLookupLanes
-
-template <class D, class VI>
-HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
-  static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
-  const RebindToUnsigned<decltype(d)> du;  // instead of <D>: avoids unused d.
-  const auto indices = BitCast(du, vec);
-#if HWY_IS_DEBUG_BUILD
-  HWY_DASSERT(AllTrue(du, detail::LtS(indices, Lanes(d))));
-#endif
-  return indices;
-}
-
-template <class D, typename TI>
-HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
-  static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
-  return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
-}
-
-// <32bit are not part of Highway API, but used in Broadcast. This limits VLMAX
-// to 2048! We could instead use vrgatherei16.
-#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                      MLEN, NAME, OP)                                         \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) {    \
-    return v##OP##_vv_##CHAR##SEW##LMUL(v, idx, HWY_RVV_AVL(SEW, SHIFT));     \
-  }
-
-HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
-#undef HWY_RVV_TABLE
-
-// ------------------------------ ConcatOdd (TableLookupLanes)
-template <class D, class V>
-HWY_API V ConcatOdd(D d, const V hi, const V lo) {
-  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
-  const auto iota = detail::Iota0(du);
-  const auto idx = detail::AddS(Add(iota, iota), 1);
-  const auto lo_odd = TableLookupLanes(lo, idx);
-  const auto hi_odd = TableLookupLanes(hi, idx);
-  return detail::SlideUp(lo_odd, hi_odd, Lanes(d) / 2);
-}
-
-// ------------------------------ ConcatEven (TableLookupLanes)
-template <class D, class V>
-HWY_API V ConcatEven(D d, const V hi, const V lo) {
-  const RebindToUnsigned<decltype(d)> du;  // Iota0 is unsigned only
-  const auto iota = detail::Iota0(du);
-  const auto idx = Add(iota, iota);
-  const auto lo_even = TableLookupLanes(lo, idx);
-  const auto hi_even = TableLookupLanes(hi, idx);
-  return detail::SlideUp(lo_even, hi_even, Lanes(d) / 2);
-}
-
-// ------------------------------ Reverse (TableLookupLanes)
-template <class D>
-HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
-  const RebindToUnsigned<D> du;
-  using TU = TFromD<decltype(du)>;
-  const size_t N = Lanes(du);
-  const auto idx =
-      detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
-  return TableLookupLanes(v, idx);
-}
-
-// ------------------------------ Reverse2 (RotateRight, OddEven)
-
-// Shifting and adding requires fewer instructions than blending, but casting to
-// u32 only works for LMUL in [1/2, 8].
-template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -1, 3)>
-HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
-  const Repartition<uint32_t, D> du32;
-  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
-}
-// For LMUL < 1/2, we can extend and then truncate.
-template <class D, HWY_IF_LANE_SIZE_D(D, 2), HWY_RVV_IF_POW2_IN(D, -3, -2)>
-HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
-  const Twice<decltype(d)> d2;
-  const Twice<decltype(d2)> d4;
-  const Repartition<uint32_t, decltype(d4)> du32;
-  const auto vx = detail::Ext(d4, detail::Ext(d2, v));
-  const auto rx = BitCast(d4, RotateRight<16>(BitCast(du32, vx)));
-  return detail::Trunc(detail::Trunc(rx));
-}
-
-// Shifting and adding requires fewer instructions than blending, but casting to
-// u64 does not work for LMUL < 1.
-template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, 0, 3)>
-HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
-  const Repartition<uint64_t, decltype(d)> du64;
-  return BitCast(d, RotateRight<32>(BitCast(du64, v)));
-}
-
-// For fractions, we can extend and then truncate.
-template <class D, HWY_IF_LANE_SIZE_D(D, 4), HWY_RVV_IF_POW2_IN(D, -2, -1)>
-HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
-  const Twice<decltype(d)> d2;
-  const Twice<decltype(d2)> d4;
-  const Repartition<uint64_t, decltype(d4)> du64;
-  const auto vx = detail::Ext(d4, detail::Ext(d2, v));
-  const auto rx = BitCast(d4, RotateRight<32>(BitCast(du64, vx)));
-  return detail::Trunc(detail::Trunc(rx));
-}
-
-template <class D, class V = VFromD<D>, HWY_IF_LANE_SIZE_D(D, 8)>
-HWY_API V Reverse2(D /* tag */, const V v) {
-  const V up = detail::Slide1Up(v);
-  const V down = detail::Slide1Down(v);
-  return OddEven(up, down);
-}
-
-// ------------------------------ Reverse4 (TableLookupLanes)
-
-template <class D>
-HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
-  const RebindToUnsigned<D> du;
-  const auto idx = detail::XorS(detail::Iota0(du), 3);
-  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
-}
-
-// ------------------------------ Reverse8 (TableLookupLanes)
-
-template <class D>
-HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
-  const RebindToUnsigned<D> du;
-  const auto idx = detail::XorS(detail::Iota0(du), 7);
-  return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
-}
-
-// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
-template <class D, class V = VFromD<D>>
-HWY_API V ReverseBlocks(D d, V v) {
-  const Repartition<uint64_t, D> du64;
-  const size_t N = Lanes(du64);
-  const auto rev =
-      detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
-  // Swap lo/hi u64 within each block
-  const auto idx = detail::XorS(rev, 1);
-  return BitCast(d, TableLookupLanes(BitCast(du64, v), idx));
-}
-
-// ------------------------------ Compress
-
-template <typename T>
-struct CompressIsPartition {
-  enum { value = 0 };
-};
-
-#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH,     \
-                         SHIFT, MLEN, NAME, OP)                               \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                          \
-      NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) {              \
-    return v##OP##_vm_##CHAR##SEW##LMUL(mask, v, v, HWY_RVV_AVL(SEW, SHIFT)); \
-  }
-
-HWY_RVV_FOREACH_UI163264(HWY_RVV_COMPRESS, Compress, compress, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_COMPRESS, Compress, compress, _ALL)
-#undef HWY_RVV_COMPRESS
-
-// ------------------------------ CompressNot
-template <class V, class M>
-HWY_API V CompressNot(V v, const M mask) {
-  return Compress(v, Not(mask));
-}
-
-// ------------------------------ CompressBlocksNot
-template <class V, class M>
-HWY_API V CompressBlocksNot(V v, const M mask) {
-  return CompressNot(v, mask);
-}
-
-// ------------------------------ CompressStore
-template <class V, class M, class D>
-HWY_API size_t CompressStore(const V v, const M mask, const D d,
-                             TFromD<D>* HWY_RESTRICT unaligned) {
-  StoreU(Compress(v, mask), d, unaligned);
-  return CountTrue(d, mask);
-}
-
-// ------------------------------ CompressBlendedStore
-template <class V, class M, class D>
-HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
-                                    TFromD<D>* HWY_RESTRICT unaligned) {
-  const size_t count = CountTrue(d, mask);
-  detail::StoreN(count, Compress(v, mask), d, unaligned);
-  return count;
-}
-
-// ================================================== BLOCKWISE
-
-// ------------------------------ CombineShiftRightBytes
-template <size_t kBytes, class D, class V = VFromD<D>>
-HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  const auto hi8 = BitCast(d8, hi);
-  const auto lo8 = BitCast(d8, lo);
-  const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
-  const auto lo_down = detail::SlideDown(lo8, lo8, kBytes);
-  const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
-  return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
-}
-
-// ------------------------------ CombineShiftRightLanes
-template <size_t kLanes, class D, class V = VFromD<D>>
-HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
-  constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
-  const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
-  const auto lo_down = detail::SlideDown(lo, lo, kLanes);
-  const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
-  return IfThenElse(is_lo, lo_down, hi_up);
-}
-
-// ------------------------------ Shuffle2301 (ShiftLeft)
-template <class V>
-HWY_API V Shuffle2301(const V v) {
-  const DFromV<V> d;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  const Repartition<uint64_t, decltype(d)> du64;
-  const auto v64 = BitCast(du64, v);
-  return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
-}
-
-// ------------------------------ Shuffle2103
-template <class V>
-HWY_API V Shuffle2103(const V v) {
-  const DFromV<V> d;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  return CombineShiftRightLanes<3>(d, v, v);
-}
-
-// ------------------------------ Shuffle0321
-template <class V>
-HWY_API V Shuffle0321(const V v) {
-  const DFromV<V> d;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  return CombineShiftRightLanes<1>(d, v, v);
-}
-
-// ------------------------------ Shuffle1032
-template <class V>
-HWY_API V Shuffle1032(const V v) {
-  const DFromV<V> d;
-  static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
-  return CombineShiftRightLanes<2>(d, v, v);
-}
-
-// ------------------------------ Shuffle01
-template <class V>
-HWY_API V Shuffle01(const V v) {
-  const DFromV<V> d;
-  static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
-  return CombineShiftRightLanes<1>(d, v, v);
-}
-
-// ------------------------------ Shuffle0123
-template <class V>
-HWY_API V Shuffle0123(const V v) {
-  return Shuffle2301(Shuffle1032(v));
-}
-
-// ------------------------------ TableLookupBytes
-
-// Extends or truncates a vector to match the given d.
-namespace detail {
-
-template <typename T, size_t N, int kPow2>
-HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 3>> v)
-    -> VFromD<decltype(d)> {
-  const Simd<T, N, kPow2 - 1> dh;
-  const Simd<T, N, kPow2 - 2> dhh;
-  return Ext(d, Ext(dh, Ext(dhh, v)));
-}
-template <typename T, size_t N, int kPow2>
-HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 2>> v)
-    -> VFromD<decltype(d)> {
-  const Simd<T, N, kPow2 - 1> dh;
-  return Ext(d, Ext(dh, v));
-}
-template <typename T, size_t N, int kPow2>
-HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 - 1>> v)
-    -> VFromD<decltype(d)> {
-  return Ext(d, v);
-}
-
-template <typename T, size_t N, int kPow2>
-HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<decltype(d)> v)
-    -> VFromD<decltype(d)> {
-  return v;
-}
-
-template <typename T, size_t N, int kPow2>
-HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 1>> v)
-    -> VFromD<decltype(d)> {
-  return Trunc(v);
-}
-template <typename T, size_t N, int kPow2>
-HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 2>> v)
-    -> VFromD<decltype(d)> {
-  return Trunc(Trunc(v));
-}
-template <typename T, size_t N, int kPow2>
-HWY_INLINE auto ChangeLMUL(Simd<T, N, kPow2> d, VFromD<Simd<T, N, kPow2 + 3>> v)
-    -> VFromD<decltype(d)> {
-  return Trunc(Trunc(Trunc(v)));
-}
-
-}  // namespace detail
-
-template <class VT, class VI>
-HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
-  const DFromV<VT> dt;  // T=table, I=index.
-  const DFromV<VI> di;
-  const Repartition<uint8_t, decltype(dt)> dt8;
-  const Repartition<uint8_t, decltype(di)> di8;
-  // Required for producing half-vectors with table lookups from a full vector.
-  // If we instead run at the LMUL of the index vector, lookups into the table
-  // would be truncated. Thus we run at the larger of the two LMULs and truncate
-  // the result vector to the original index LMUL.
-  constexpr int kPow2T = Pow2(dt8);
-  constexpr int kPow2I = Pow2(di8);
-  const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8;  // m=max
-  const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt));
-  const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi));
-  auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
-  // If the table is shorter, wrap around offsets so they do not reference
-  // undefined lanes in the newly extended vmt.
-  if (kPow2T < kPow2I) {
-    offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1));
-  }
-  const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
-  return BitCast(di, detail::ChangeLMUL(di8, out));
-}
-
-template <class VT, class VI>
-HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
-  const DFromV<VI> di;
-  const Repartition<int8_t, decltype(di)> di8;
-  const auto idx8 = BitCast(di8, idx);
-  const auto lookup = TableLookupBytes(vt, idx8);
-  return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup));
-}
-
-// ------------------------------ Broadcast
-template <int kLane, class V>
-HWY_API V Broadcast(const V v) {
-  const DFromV<V> d;
-  HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
-  auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(d));
-  if (kLane != 0) {
-    idx = detail::AddS(idx, kLane);
-  }
-  return TableLookupLanes(v, idx);
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <size_t kLanes, class D, class V = VFromD<D>>
-HWY_API V ShiftLeftLanes(const D d, const V v) {
-  const RebindToSigned<decltype(d)> di;
-  using TI = TFromD<decltype(di)>;
-  const auto shifted = detail::SlideUp(v, v, kLanes);
-  // Match x86 semantics by zeroing lower lanes in 128-bit blocks
-  const auto idx_mod = detail::AndS(
-      detail::Iota0(di), static_cast<TI>(detail::LanesPerBlock(di) - 1));
-  const auto clear = detail::LtS(BitCast(di, idx_mod), static_cast<TI>(kLanes));
-  return IfThenZeroElse(clear, shifted);
-}
-
-template <size_t kLanes, class V>
-HWY_API V ShiftLeftLanes(const V v) {
-  return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
-}
-
-// ------------------------------ ShiftLeftBytes
-
-template <int kBytes, class D>
-HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
-}
-
-template <int kBytes, class V>
-HWY_API V ShiftLeftBytes(const V v) {
-  return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
-}
-
-// ------------------------------ ShiftRightLanes
-template <size_t kLanes, typename T, size_t N, int kPow2,
-          class V = VFromD<Simd<T, N, kPow2>>>
-HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
-  const RebindToSigned<decltype(d)> di;
-  using TI = TFromD<decltype(di)>;
-  // For partial vectors, clear upper lanes so we shift in zeros.
-  if (N <= 16 / sizeof(T)) {
-    v = IfThenElseZero(FirstN(d, N), v);
-  }
-
-  const auto shifted = detail::SlideDown(v, v, kLanes);
-  // Match x86 semantics by zeroing upper lanes in 128-bit blocks
-  const size_t lpb = detail::LanesPerBlock(di);
-  const auto idx_mod =
-      detail::AndS(detail::Iota0(di), static_cast<TI>(lpb - 1));
-  const auto keep =
-      detail::LtS(BitCast(di, idx_mod), static_cast<TI>(lpb - kLanes));
-  return IfThenElseZero(keep, shifted);
-}
-
-// ------------------------------ ShiftRightBytes
-template <int kBytes, class D, class V = VFromD<D>>
-HWY_API V ShiftRightBytes(const D d, const V v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
-}
-
-// ------------------------------ InterleaveLower
-
-template <class D, class V>
-HWY_API V InterleaveLower(D d, const V a, const V b) {
-  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  const auto i = detail::Iota0(du);
-  const auto idx_mod = ShiftRight<1>(
-      detail::AndS(i, static_cast<TU>(detail::LanesPerBlock(du) - 1)));
-  const auto idx = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
-  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
-  return IfThenElse(is_even, TableLookupLanes(a, idx),
-                    TableLookupLanes(b, idx));
-}
-
-template <class V>
-HWY_API V InterleaveLower(const V a, const V b) {
-  return InterleaveLower(DFromV<V>(), a, b);
-}
-
-// ------------------------------ InterleaveUpper
-
-template <class D, class V>
-HWY_API V InterleaveUpper(const D d, const V a, const V b) {
-  static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  const size_t lpb = detail::LanesPerBlock(du);
-  const auto i = detail::Iota0(du);
-  const auto idx_mod = ShiftRight<1>(detail::AndS(i, static_cast<TU>(lpb - 1)));
-  const auto idx_lower = Add(idx_mod, detail::OffsetsOf128BitBlocks(d, i));
-  const auto idx = detail::AddS(idx_lower, static_cast<TU>(lpb / 2));
-  const auto is_even = detail::EqS(detail::AndS(i, 1), 0u);
-  return IfThenElse(is_even, TableLookupLanes(a, idx),
-                    TableLookupLanes(b, idx));
-}
-
-// ------------------------------ ZipLower
-
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
-  const RepartitionToNarrow<DW> dn;
-  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
-  return BitCast(dw, InterleaveLower(dn, a, b));
-}
-
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipLower(V a, V b) {
-  return BitCast(DW(), InterleaveLower(a, b));
-}
-
-// ------------------------------ ZipUpper
-template <class DW, class V>
-HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
-  const RepartitionToNarrow<DW> dn;
-  static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
-  return BitCast(dw, InterleaveUpper(dn, a, b));
-}
-
-// ================================================== REDUCE
-
-// vector = f(vector, zero_m1)
-#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
-                       MLEN, NAME, OP)                                         \
-  template <class D>                                                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL)                                           \
-      NAME(D d, HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, m1) v0) {   \
-    return Set(d, GetLane(v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1(      \
-                      v0, v, v0, Lanes(d))));                                  \
-  }
-
-// ------------------------------ SumOfLanes
-
-namespace detail {
-HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL)
-}  // namespace detail
-
-template <class D>
-HWY_API VFromD<D> SumOfLanes(D d, const VFromD<D> v) {
-  const auto v0 = Zero(ScalableTag<TFromD<D>>());  // always m1
-  return detail::RedSum(d, v, v0);
-}
-
-// ------------------------------ MinOfLanes
-namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL)
-}  // namespace detail
-
-template <class D>
-HWY_API VFromD<D> MinOfLanes(D d, const VFromD<D> v) {
-  using T = TFromD<D>;
-  const ScalableTag<T> d1;  // always m1
-  const auto neutral = Set(d1, HighestValue<T>());
-  return detail::RedMin(d, v, neutral);
-}
-
-// ------------------------------ MaxOfLanes
-namespace detail {
-HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL)
-HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL)
-HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL)
-}  // namespace detail
-
-template <class D>
-HWY_API VFromD<D> MaxOfLanes(D d, const VFromD<D> v) {
-  using T = TFromD<D>;
-  const ScalableTag<T> d1;  // always m1
-  const auto neutral = Set(d1, LowestValue<T>());
-  return detail::RedMax(d, v, neutral);
-}
-
-#undef HWY_RVV_REDUCE
-
-// ================================================== Ops with dependencies
-
-// ------------------------------ PopulationCount (ShiftRight)
-
-// Handles LMUL >= 2 or capped vectors, which generic_ops-inl cannot.
-template <typename V, class D = DFromV<V>, HWY_IF_LANE_SIZE_D(D, 1),
-          hwy::EnableIf<Pow2(D()) < 1 || MaxLanes(D()) < 16>* = nullptr>
-HWY_API V PopulationCount(V v) {
-  // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
-  v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55));
-  v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33));
-  return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F);
-}
-
-// ------------------------------ LoadDup128
-
-template <class D, typename T = TFromD<D>>
-HWY_API VFromD<D> LoadDup128(D d, const T* const HWY_RESTRICT p) {
-  const auto loaded = Load(d, p);
-  // Broadcast the first block
-  const auto idx = detail::AndS(detail::Iota0(d),
-                                static_cast<T>(detail::LanesPerBlock(d) - 1));
-  return TableLookupLanes(loaded, idx);
-}
-
-// ------------------------------ LoadMaskBits
-
-// Support all combinations of T and SHIFT(LMUL) without explicit overloads for
-// each. First overload for MLEN=1..64.
-namespace detail {
-
-// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN
-// increases with lane size and decreases for increasing LMUL. Cap at 64, the
-// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL
-// e.g. vuint16mf8_t: (8*2 << 3) == 128.
-template <class D>
-using MaskTag = hwy::SizeTag<HWY_MIN(
-    64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -Pow2(D())))>;
-
-#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)                \
-  HWY_INLINE HWY_RVV_M(MLEN)                                              \
-      NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \
-    return OP##_v_b##MLEN(bits, N);                                       \
-  }
-HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, vlm)
-#undef HWY_RVV_LOAD_MASK_BITS
-}  // namespace detail
-
-template <class D, class MT = detail::MaskTag<D>>
-HWY_API auto LoadMaskBits(D d, const uint8_t* bits)
-    -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) {
-  return detail::LoadMaskBits(MT(), bits, Lanes(d));
-}
-
-// ------------------------------ StoreMaskBits
-#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)               \
-  template <class D>                                                      \
-  HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) {            \
-    const size_t N = Lanes(d);                                            \
-    OP##_v_b##MLEN(bits, m, N);                                           \
-    /* Non-full byte, need to clear the undefined upper bits. */          \
-    /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \
-    constexpr bool kLessThan8 =                                           \
-        detail::ScaleByPower(16 / sizeof(TFromD<D>), Pow2(d)) < 8;        \
-    if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) {                       \
-      const int mask = (1 << N) - 1;                                      \
-      bits[0] = static_cast<uint8_t>(bits[0] & mask);                     \
-    }                                                                     \
-    return (N + 7) / 8;                                                   \
-  }
-HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, vsm)
-#undef HWY_RVV_STORE_MASK_BITS
-
-// ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
-
-template <class V>
-HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
-  return Compress(v, LoadMaskBits(DFromV<V>(), bits));
-}
-
-template <class D>
-HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
-                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
-  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
-}
-
-// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
-
-// Disallow for 8-bit because Iota is likely to overflow.
-template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 1)>
-HWY_API MFromD<D> FirstN(const D d, const size_t n) {
-  const RebindToSigned<D> di;
-  using TI = TFromD<decltype(di)>;
-  return RebindMask(
-      d, detail::LtS(BitCast(di, detail::Iota0(d)), static_cast<TI>(n)));
-}
-
-template <class D, HWY_IF_LANE_SIZE_D(D, 1)>
-HWY_API MFromD<D> FirstN(const D d, const size_t n) {
-  const auto zero = Zero(d);
-  const auto one = Set(d, 1);
-  return Eq(detail::SlideUp(one, zero, n), one);
-}
-
-// ------------------------------ Neg (Sub)
-
-template <class V, HWY_IF_SIGNED_V(V)>
-HWY_API V Neg(const V v) {
-  return detail::ReverseSubS(v, 0);
-}
-
-// vector = f(vector), but argument is repeated
-#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
-                           SHIFT, MLEN, NAME, OP)                           \
-  HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) {   \
-    return v##OP##_vv_##CHAR##SEW##LMUL(v, v, HWY_RVV_AVL(SEW, SHIFT));     \
-  }
-
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
-
-// ------------------------------ Abs (Max, Neg)
-
-template <class V, HWY_IF_SIGNED_V(V)>
-HWY_API V Abs(const V v) {
-  return Max(v, Neg(v));
-}
-
-HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)
-
-#undef HWY_RVV_RETV_ARGV2
-
-// ------------------------------ AbsDiff (Abs, Sub)
-template <class V>
-HWY_API V AbsDiff(const V a, const V b) {
-  return Abs(Sub(a, b));
-}
-
-// ------------------------------ Round  (NearestInt, ConvertTo, CopySign)
-
-// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
-// a dedicated instruction for that. Rounding to integer and converting back to
-// float is correct except when the input magnitude is large, in which case the
-// input was already an integer (because mantissa >> exponent is zero).
-
-namespace detail {
-enum RoundingModes { kNear, kTrunc, kDown, kUp };
-
-template <class V>
-HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
-  return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
-}
-
-}  // namespace detail
-
-template <class V>
-HWY_API V Round(const V v) {
-  const DFromV<V> df;
-
-  const auto integer = NearestInt(v);  // round using current mode
-  const auto int_f = ConvertTo(df, integer);
-
-  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
-}
-
-// ------------------------------ Trunc (ConvertTo)
-template <class V>
-HWY_API V Trunc(const V v) {
-  const DFromV<V> df;
-  const RebindToSigned<decltype(df)> di;
-
-  const auto integer = ConvertTo(di, v);  // round toward 0
-  const auto int_f = ConvertTo(df, integer);
-
-  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
-}
-
-// ------------------------------ Ceil
-template <class V>
-HWY_API V Ceil(const V v) {
-  asm volatile("fsrm %0" ::"r"(detail::kUp));
-  const auto ret = Round(v);
-  asm volatile("fsrm %0" ::"r"(detail::kNear));
-  return ret;
-}
-
-// ------------------------------ Floor
-template <class V>
-HWY_API V Floor(const V v) {
-  asm volatile("fsrm %0" ::"r"(detail::kDown));
-  const auto ret = Round(v);
-  asm volatile("fsrm %0" ::"r"(detail::kNear));
-  return ret;
-}
-
-// ------------------------------ Floating-point classification (Ne)
-
-// vfclass does not help because it would require 3 instructions (to AND and
-// then compare the bits), whereas these are just 1-3 integer instructions.
-
-template <class V>
-HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
-  return Ne(v, v);
-}
-
-template <class V, class D = DFromV<V>>
-HWY_API MFromD<D> IsInf(const V v) {
-  const D d;
-  const RebindToSigned<decltype(d)> di;
-  using T = TFromD<D>;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
-}
-
-// Returns whether normal/subnormal/zero.
-template <class V, class D = DFromV<V>>
-HWY_API MFromD<D> IsFinite(const V v) {
-  const D d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  using T = TFromD<D>;
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
-}
-
-// ------------------------------ Iota (ConvertTo)
-
-template <class D, HWY_IF_UNSIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
-  return detail::AddS(detail::Iota0(d), first);
-}
-
-template <class D, HWY_IF_SIGNED_D(D)>
-HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
-  const RebindToUnsigned<D> du;
-  return detail::AddS(BitCast(d, detail::Iota0(du)), first);
-}
-
-template <class D, HWY_IF_FLOAT_D(D)>
-HWY_API VFromD<D> Iota(const D d, TFromD<D> first) {
-  const RebindToUnsigned<D> du;
-  const RebindToSigned<D> di;
-  return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))), first);
-}
-
-// ------------------------------ MulEven/Odd (Mul, OddEven)
-
-template <class V, HWY_IF_LANE_SIZE_V(V, 4), class D = DFromV<V>,
-          class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> MulEven(const V a, const V b) {
-  const auto lo = Mul(a, b);
-  const auto hi = detail::MulHigh(a, b);
-  return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
-}
-
-// There is no 64x64 vwmul.
-template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
-HWY_INLINE V MulEven(const V a, const V b) {
-  const auto lo = Mul(a, b);
-  const auto hi = detail::MulHigh(a, b);
-  return OddEven(detail::Slide1Up(hi), lo);
-}
-
-template <class V, HWY_IF_LANE_SIZE_V(V, 8)>
-HWY_INLINE V MulOdd(const V a, const V b) {
-  const auto lo = Mul(a, b);
-  const auto hi = detail::MulHigh(a, b);
-  return OddEven(hi, detail::Slide1Down(lo));
-}
-
-// ------------------------------ ReorderDemote2To (OddEven)
-
-template <size_t N, int kPow2>
-HWY_API VFromD<Simd<uint16_t, N, kPow2>> ReorderDemote2To(
-    Simd<bfloat16_t, N, kPow2> dbf16,
-    VFromD<RepartitionToWide<decltype(dbf16)>> a,
-    VFromD<RepartitionToWide<decltype(dbf16)>> b) {
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const RebindToUnsigned<DFromV<decltype(a)>> du32;
-  const VFromD<decltype(du32)> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-template <class DF>
-using DU16FromDF = RepartitionToNarrow<RebindToUnsigned<DF>>;
-
-template <size_t N, int kPow2>
-HWY_API auto ReorderWidenMulAccumulate(Simd<float, N, kPow2> df32,
-                                       VFromD<DU16FromDF<decltype(df32)>> a,
-                                       VFromD<DU16FromDF<decltype(df32)>> b,
-                                       const VFromD<decltype(df32)> sum0,
-                                       VFromD<decltype(df32)>& sum1)
-    -> VFromD<decltype(df32)> {
-  const DU16FromDF<decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  using VU32 = VFromD<decltype(du32)>;
-  const VFromD<decltype(du16)> zero = Zero(du16);
-  const VU32 a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const VU32 a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const VU32 b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const VU32 b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ------------------------------ Lt128
-template <class D>
-HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  // Truth table of Eq and Compare for Hi and Lo u64.
-  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
-  // =H =L cH cL  | out = cH | (=H & cL)
-  //  0  0  0  0  |  0
-  //  0  0  0  1  |  0
-  //  0  0  1  0  |  1
-  //  0  0  1  1  |  1
-  //  0  1  0  0  |  0
-  //  0  1  0  1  |  0
-  //  0  1  1  0  |  1
-  //  1  0  0  0  |  0
-  //  1  0  0  1  |  1
-  //  1  1  0  0  |  0
-  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
-  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
-  // Shift leftward so L can influence H.
-  const VFromD<D> ltLx = detail::Slide1Up(ltHL);
-  const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
-  // Replicate H to its neighbor.
-  return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
-}
-
-// ------------------------------ Lt128Upper
-template <class D>
-HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
-  // Replicate H to its neighbor.
-  return MaskFromVec(OddEven(ltHL, detail::Slide1Down(ltHL)));
-}
-
-// ------------------------------ Eq128
-template <class D>
-HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
-  const VFromD<D> eqLH = Reverse2(d, eqHL);
-  return MaskFromVec(And(eqHL, eqLH));
-}
-
-// ------------------------------ Eq128Upper
-template <class D>
-HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
-  // Replicate H to its neighbor.
-  return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
-}
-
-// ------------------------------ Min128, Max128 (Lt128)
-
-template <class D>
-HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
-  const VFromD<D> aXH = detail::Slide1Down(a);
-  const VFromD<D> bXH = detail::Slide1Down(b);
-  const VFromD<D> minHL = Min(a, b);
-  const MFromD<D> ltXH = Lt(aXH, bXH);
-  const MFromD<D> eqXH = Eq(aXH, bXH);
-  // If the upper lane is the decider, take lo from the same reg.
-  const VFromD<D> lo = IfThenElse(ltXH, a, b);
-  // The upper lane is just minHL; if they are equal, we also need to use the
-  // actual min of the lower lanes.
-  return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
-  const VFromD<D> aXH = detail::Slide1Down(a);
-  const VFromD<D> bXH = detail::Slide1Down(b);
-  const VFromD<D> maxHL = Max(a, b);
-  const MFromD<D> ltXH = Lt(aXH, bXH);
-  const MFromD<D> eqXH = Eq(aXH, bXH);
-  // If the upper lane is the decider, take lo from the same reg.
-  const VFromD<D> lo = IfThenElse(ltXH, b, a);
-  // The upper lane is just maxHL; if they are equal, we also need to use the
-  // actual min of the lower lanes.
-  return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
-  return IfThenElse(Lt128Upper(d, a, b), a, b);
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
-  return IfThenElse(Lt128Upper(d, b, a), a, b);
-}
-
-// ================================================== END MACROS
-namespace detail {  // for code folding
-#undef HWY_RVV_AVL
-#undef HWY_RVV_D
-#undef HWY_RVV_FOREACH
-#undef HWY_RVV_FOREACH_08_ALL
-#undef HWY_RVV_FOREACH_08_ALL_VIRT
-#undef HWY_RVV_FOREACH_08_DEMOTE
-#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
-#undef HWY_RVV_FOREACH_08_EXT
-#undef HWY_RVV_FOREACH_08_EXT_VIRT
-#undef HWY_RVV_FOREACH_08_TRUNC
-#undef HWY_RVV_FOREACH_08_VIRT
-#undef HWY_RVV_FOREACH_16_ALL
-#undef HWY_RVV_FOREACH_16_ALL_VIRT
-#undef HWY_RVV_FOREACH_16_DEMOTE
-#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
-#undef HWY_RVV_FOREACH_16_EXT
-#undef HWY_RVV_FOREACH_16_EXT_VIRT
-#undef HWY_RVV_FOREACH_16_TRUNC
-#undef HWY_RVV_FOREACH_16_VIRT
-#undef HWY_RVV_FOREACH_32_ALL
-#undef HWY_RVV_FOREACH_32_ALL_VIRT
-#undef HWY_RVV_FOREACH_32_DEMOTE
-#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
-#undef HWY_RVV_FOREACH_32_EXT
-#undef HWY_RVV_FOREACH_32_EXT_VIRT
-#undef HWY_RVV_FOREACH_32_TRUNC
-#undef HWY_RVV_FOREACH_32_VIRT
-#undef HWY_RVV_FOREACH_64_ALL
-#undef HWY_RVV_FOREACH_64_ALL_VIRT
-#undef HWY_RVV_FOREACH_64_DEMOTE
-#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
-#undef HWY_RVV_FOREACH_64_EXT
-#undef HWY_RVV_FOREACH_64_EXT_VIRT
-#undef HWY_RVV_FOREACH_64_TRUNC
-#undef HWY_RVV_FOREACH_64_VIRT
-#undef HWY_RVV_FOREACH_B
-#undef HWY_RVV_FOREACH_F
-#undef HWY_RVV_FOREACH_F16
-#undef HWY_RVV_FOREACH_F32
-#undef HWY_RVV_FOREACH_F3264
-#undef HWY_RVV_FOREACH_F64
-#undef HWY_RVV_FOREACH_I
-#undef HWY_RVV_FOREACH_I08
-#undef HWY_RVV_FOREACH_I16
-#undef HWY_RVV_FOREACH_I163264
-#undef HWY_RVV_FOREACH_I32
-#undef HWY_RVV_FOREACH_I64
-#undef HWY_RVV_FOREACH_U
-#undef HWY_RVV_FOREACH_U08
-#undef HWY_RVV_FOREACH_U16
-#undef HWY_RVV_FOREACH_U163264
-#undef HWY_RVV_FOREACH_U32
-#undef HWY_RVV_FOREACH_U64
-#undef HWY_RVV_FOREACH_UI
-#undef HWY_RVV_FOREACH_UI08
-#undef HWY_RVV_FOREACH_UI16
-#undef HWY_RVV_FOREACH_UI163264
-#undef HWY_RVV_FOREACH_UI32
-#undef HWY_RVV_FOREACH_UI3264
-#undef HWY_RVV_FOREACH_UI64
-#undef HWY_RVV_M
-#undef HWY_RVV_RETM_ARGM
-#undef HWY_RVV_RETV_ARGV
-#undef HWY_RVV_RETV_ARGVS
-#undef HWY_RVV_RETV_ARGVV
-#undef HWY_RVV_T
-#undef HWY_RVV_V
-}  // namespace detail
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/scalar-inl.h b/third_party/highway/hwy/ops/scalar-inl.h
deleted file mode 100644 (file)
index 5d28d92..0000000
+++ /dev/null
@@ -1,1552 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Single-element vectors and operations.
-// External include guard in highway.h - see comment there.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/base.h"
-#include "hwy/ops/shared-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Single instruction, single data.
-template <typename T>
-using Sisd = Simd<T, 1, 0>;
-
-// (Wrapper class required for overloading comparison operators.)
-template <typename T>
-struct Vec1 {
-  HWY_INLINE Vec1() = default;
-  Vec1(const Vec1&) = default;
-  Vec1& operator=(const Vec1&) = default;
-  HWY_INLINE explicit Vec1(const T t) : raw(t) {}
-
-  HWY_INLINE Vec1& operator*=(const Vec1 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec1& operator/=(const Vec1 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec1& operator+=(const Vec1 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec1& operator-=(const Vec1 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec1& operator&=(const Vec1 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec1& operator|=(const Vec1 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec1& operator^=(const Vec1 other) {
-    return *this = (*this ^ other);
-  }
-
-  T raw;
-};
-
-// 0 or FF..FF, same size as Vec1.
-template <typename T>
-class Mask1 {
-  using Raw = hwy::MakeUnsigned<T>;
-
- public:
-  static HWY_INLINE Mask1<T> FromBool(bool b) {
-    Mask1<T> mask;
-    mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
-    return mask;
-  }
-
-  Raw bits;
-};
-
-namespace detail {
-
-// Deduce Sisd<T> from Vec1<T>
-struct Deduce1 {
-  template <typename T>
-  Sisd<T> operator()(Vec1<T>) const {
-    return Sisd<T>();
-  }
-};
-
-}  // namespace detail
-
-template <class V>
-using DFromV = decltype(detail::Deduce1()(V()));
-
-template <class V>
-using TFromV = TFromD<DFromV<V>>;
-
-// ------------------------------ BitCast
-
-template <typename T, typename FromT>
-HWY_API Vec1<T> BitCast(Sisd<T> /* tag */, Vec1<FromT> v) {
-  static_assert(sizeof(T) <= sizeof(FromT), "Promoting is undefined");
-  T to;
-  CopyBytes<sizeof(FromT)>(&v.raw, &to);  // not same size - ok to shrink
-  return Vec1<T>(to);
-}
-
-// ------------------------------ Set
-
-template <typename T>
-HWY_API Vec1<T> Zero(Sisd<T> /* tag */) {
-  return Vec1<T>(T(0));
-}
-
-template <typename T, typename T2>
-HWY_API Vec1<T> Set(Sisd<T> /* tag */, const T2 t) {
-  return Vec1<T>(static_cast<T>(t));
-}
-
-template <typename T>
-HWY_API Vec1<T> Undefined(Sisd<T> d) {
-  return Zero(d);
-}
-
-template <typename T, typename T2>
-HWY_API Vec1<T> Iota(const Sisd<T> /* tag */, const T2 first) {
-  return Vec1<T>(static_cast<T>(first));
-}
-
-template <class D>
-using VFromD = decltype(Zero(D()));
-
-// ================================================== LOGICAL
-
-// ------------------------------ Not
-
-template <typename T>
-HWY_API Vec1<T> Not(const Vec1<T> v) {
-  using TU = MakeUnsigned<T>;
-  const Sisd<TU> du;
-  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
-}
-
-// ------------------------------ And
-
-template <typename T>
-HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
-  using TU = MakeUnsigned<T>;
-  const Sisd<TU> du;
-  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
-}
-template <typename T>
-HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
-  return And(a, b);
-}
-
-// ------------------------------ AndNot
-
-template <typename T>
-HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
-  using TU = MakeUnsigned<T>;
-  const Sisd<TU> du;
-  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
-                                                     BitCast(du, b).raw)));
-}
-
-// ------------------------------ Or
-
-template <typename T>
-HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
-  using TU = MakeUnsigned<T>;
-  const Sisd<TU> du;
-  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
-}
-template <typename T>
-HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
-  return Or(a, b);
-}
-
-// ------------------------------ Xor
-
-template <typename T>
-HWY_API Vec1<T> Xor(const Vec1<T> a, const Vec1<T> b) {
-  using TU = MakeUnsigned<T>;
-  const Sisd<TU> du;
-  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw ^ BitCast(du, b).raw));
-}
-template <typename T>
-HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ Or3
-
-template <typename T>
-HWY_API Vec1<T> Or3(Vec1<T> o1, Vec1<T> o2, Vec1<T> o3) {
-  return Or(o1, Or(o2, o3));
-}
-
-// ------------------------------ OrAnd
-
-template <typename T>
-HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) {
-  return Or(o, And(a1, a2));
-}
-
-// ------------------------------ IfVecThenElse
-
-template <typename T>
-HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) {
-  return IfThenElse(MaskFromVec(mask), yes, no);
-}
-
-// ------------------------------ CopySign
-
-template <typename T>
-HWY_API Vec1<T> CopySign(const Vec1<T> magn, const Vec1<T> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  const auto msb = SignBit(Sisd<T>());
-  return Or(AndNot(msb, magn), And(msb, sign));
-}
-
-template <typename T>
-HWY_API Vec1<T> CopySignToAbs(const Vec1<T> abs, const Vec1<T> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  return Or(abs, And(SignBit(Sisd<T>()), sign));
-}
-
-// ------------------------------ BroadcastSignBit
-
-template <typename T>
-HWY_API Vec1<T> BroadcastSignBit(const Vec1<T> v) {
-  // This is used inside ShiftRight, so we cannot implement in terms of it.
-  return v.raw < 0 ? Vec1<T>(T(-1)) : Vec1<T>(0);
-}
-
-// ------------------------------ PopulationCount
-
-#ifdef HWY_NATIVE_POPCNT
-#undef HWY_NATIVE_POPCNT
-#else
-#define HWY_NATIVE_POPCNT
-#endif
-
-template <typename T>
-HWY_API Vec1<T> PopulationCount(Vec1<T> v) {
-  return Vec1<T>(static_cast<T>(PopCount(v.raw)));
-}
-
-// ------------------------------ Mask
-
-template <typename TFrom, typename TTo>
-HWY_API Mask1<TTo> RebindMask(Sisd<TTo> /*tag*/, Mask1<TFrom> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask1<TTo>{m.bits};
-}
-
-// v must be 0 or FF..FF.
-template <typename T>
-HWY_API Mask1<T> MaskFromVec(const Vec1<T> v) {
-  Mask1<T> mask;
-  CopySameSize(&v, &mask);
-  return mask;
-}
-
-template <typename T>
-Vec1<T> VecFromMask(const Mask1<T> mask) {
-  Vec1<T> v;
-  CopySameSize(&mask, &v);
-  return v;
-}
-
-template <typename T>
-Vec1<T> VecFromMask(Sisd<T> /* tag */, const Mask1<T> mask) {
-  Vec1<T> v;
-  CopySameSize(&mask, &v);
-  return v;
-}
-
-template <typename T>
-HWY_API Mask1<T> FirstN(Sisd<T> /*tag*/, size_t n) {
-  return Mask1<T>::FromBool(n != 0);
-}
-
-// Returns mask ? yes : no.
-template <typename T>
-HWY_API Vec1<T> IfThenElse(const Mask1<T> mask, const Vec1<T> yes,
-                           const Vec1<T> no) {
-  return mask.bits ? yes : no;
-}
-
-template <typename T>
-HWY_API Vec1<T> IfThenElseZero(const Mask1<T> mask, const Vec1<T> yes) {
-  return mask.bits ? yes : Vec1<T>(0);
-}
-
-template <typename T>
-HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) {
-  return mask.bits ? Vec1<T>(0) : no;
-}
-
-template <typename T>
-HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) {
-  return v.raw < 0 ? yes : no;
-}
-
-template <typename T>
-HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) {
-  return v.raw < 0 ? Vec1<T>(0) : v;
-}
-
-// ------------------------------ Mask logical
-
-template <typename T>
-HWY_API Mask1<T> Not(const Mask1<T> m) {
-  return MaskFromVec(Not(VecFromMask(Sisd<T>(), m)));
-}
-
-template <typename T>
-HWY_API Mask1<T> And(const Mask1<T> a, Mask1<T> b) {
-  const Sisd<T> d;
-  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask1<T> AndNot(const Mask1<T> a, Mask1<T> b) {
-  const Sisd<T> d;
-  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask1<T> Or(const Mask1<T> a, Mask1<T> b) {
-  const Sisd<T> d;
-  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask1<T> Xor(const Mask1<T> a, Mask1<T> b) {
-  const Sisd<T> d;
-  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-// ================================================== SHIFTS
-
-// ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit)
-
-template <int kBits, typename T>
-HWY_API Vec1<T> ShiftLeft(const Vec1<T> v) {
-  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-  return Vec1<T>(
-      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << kBits));
-}
-
-template <int kBits, typename T>
-HWY_API Vec1<T> ShiftRight(const Vec1<T> v) {
-  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
-  return Vec1<T>(static_cast<T>(v.raw >> kBits));
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    const Sisd<TU> du;
-    const TU shifted = BitCast(du, v).raw >> kBits;
-    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
-    const size_t sign_shift =
-        static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - kBits);
-    const TU upper = static_cast<TU>(sign << sign_shift);
-    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
-  } else {  // T is unsigned
-    return Vec1<T>(static_cast<T>(v.raw >> kBits));
-  }
-#endif
-}
-
-// ------------------------------ RotateRight (ShiftRight)
-
-namespace detail {
-
-// For partial specialization: kBits == 0 results in an invalid shift count
-template <int kBits>
-struct RotateRight {
-  template <typename T>
-  HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
-    return Or(ShiftRight<kBits>(v), ShiftLeft<sizeof(T) * 8 - kBits>(v));
-  }
-};
-
-template <>
-struct RotateRight<0> {
-  template <typename T>
-  HWY_INLINE Vec1<T> operator()(const Vec1<T> v) const {
-    return v;
-  }
-};
-
-}  // namespace detail
-
-template <int kBits, typename T>
-HWY_API Vec1<T> RotateRight(const Vec1<T> v) {
-  static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift");
-  return detail::RotateRight<kBits>()(v);
-}
-
-// ------------------------------ ShiftLeftSame (BroadcastSignBit)
-
-template <typename T>
-HWY_API Vec1<T> ShiftLeftSame(const Vec1<T> v, int bits) {
-  return Vec1<T>(
-      static_cast<T>(static_cast<hwy::MakeUnsigned<T>>(v.raw) << bits));
-}
-
-template <typename T>
-HWY_API Vec1<T> ShiftRightSame(const Vec1<T> v, int bits) {
-#if __cplusplus >= 202002L
-  // Signed right shift is now guaranteed to be arithmetic (rounding toward
-  // negative infinity, i.e. shifting in the sign bit).
-  return Vec1<T>(static_cast<T>(v.raw >> bits));
-#else
-  if (IsSigned<T>()) {
-    // Emulate arithmetic shift using only logical (unsigned) shifts, because
-    // signed shifts are still implementation-defined.
-    using TU = hwy::MakeUnsigned<T>;
-    const Sisd<TU> du;
-    const TU shifted = BitCast(du, v).raw >> bits;
-    const TU sign = BitCast(du, BroadcastSignBit(v)).raw;
-    const size_t sign_shift =
-        static_cast<size_t>(static_cast<int>(sizeof(TU)) * 8 - 1 - bits);
-    const TU upper = static_cast<TU>(sign << sign_shift);
-    return BitCast(Sisd<T>(), Vec1<TU>(shifted | upper));
-  } else {  // T is unsigned
-    return Vec1<T>(static_cast<T>(v.raw >> bits));
-  }
-#endif
-}
-
-// ------------------------------ Shl
-
-// Single-lane => same as ShiftLeftSame except for the argument type.
-template <typename T>
-HWY_API Vec1<T> operator<<(const Vec1<T> v, const Vec1<T> bits) {
-  return ShiftLeftSame(v, static_cast<int>(bits.raw));
-}
-
-template <typename T>
-HWY_API Vec1<T> operator>>(const Vec1<T> v, const Vec1<T> bits) {
-  return ShiftRightSame(v, static_cast<int>(bits.raw));
-}
-
-// ================================================== ARITHMETIC
-
-template <typename T>
-HWY_API Vec1<T> operator+(Vec1<T> a, Vec1<T> b) {
-  const uint64_t a64 = static_cast<uint64_t>(a.raw);
-  const uint64_t b64 = static_cast<uint64_t>(b.raw);
-  return Vec1<T>(static_cast<T>((a64 + b64) & static_cast<uint64_t>(~T(0))));
-}
-HWY_API Vec1<float> operator+(const Vec1<float> a, const Vec1<float> b) {
-  return Vec1<float>(a.raw + b.raw);
-}
-HWY_API Vec1<double> operator+(const Vec1<double> a, const Vec1<double> b) {
-  return Vec1<double>(a.raw + b.raw);
-}
-
-template <typename T>
-HWY_API Vec1<T> operator-(Vec1<T> a, Vec1<T> b) {
-  const uint64_t a64 = static_cast<uint64_t>(a.raw);
-  const uint64_t b64 = static_cast<uint64_t>(b.raw);
-  return Vec1<T>(static_cast<T>((a64 - b64) & static_cast<uint64_t>(~T(0))));
-}
-HWY_API Vec1<float> operator-(const Vec1<float> a, const Vec1<float> b) {
-  return Vec1<float>(a.raw - b.raw);
-}
-HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) {
-  return Vec1<double>(a.raw - b.raw);
-}
-
-// ------------------------------ SumsOf8
-
-HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) {
-  return Vec1<uint64_t>(v.raw);
-}
-
-// ------------------------------ SaturatedAdd
-
-// Returns a + b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec1<uint8_t> SaturatedAdd(const Vec1<uint8_t> a,
-                                   const Vec1<uint8_t> b) {
-  return Vec1<uint8_t>(
-      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255)));
-}
-HWY_API Vec1<uint16_t> SaturatedAdd(const Vec1<uint16_t> a,
-                                    const Vec1<uint16_t> b) {
-  return Vec1<uint16_t>(
-      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 65535)));
-}
-
-// Signed
-HWY_API Vec1<int8_t> SaturatedAdd(const Vec1<int8_t> a, const Vec1<int8_t> b) {
-  return Vec1<int8_t>(
-      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127)));
-}
-HWY_API Vec1<int16_t> SaturatedAdd(const Vec1<int16_t> a,
-                                   const Vec1<int16_t> b) {
-  return Vec1<int16_t>(
-      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw + b.raw), 32767)));
-}
-
-// ------------------------------ Saturating subtraction
-
-// Returns a - b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec1<uint8_t> SaturatedSub(const Vec1<uint8_t> a,
-                                   const Vec1<uint8_t> b) {
-  return Vec1<uint8_t>(
-      static_cast<uint8_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255)));
-}
-HWY_API Vec1<uint16_t> SaturatedSub(const Vec1<uint16_t> a,
-                                    const Vec1<uint16_t> b) {
-  return Vec1<uint16_t>(
-      static_cast<uint16_t>(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 65535)));
-}
-
-// Signed
-HWY_API Vec1<int8_t> SaturatedSub(const Vec1<int8_t> a, const Vec1<int8_t> b) {
-  return Vec1<int8_t>(
-      static_cast<int8_t>(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127)));
-}
-HWY_API Vec1<int16_t> SaturatedSub(const Vec1<int16_t> a,
-                                   const Vec1<int16_t> b) {
-  return Vec1<int16_t>(
-      static_cast<int16_t>(HWY_MIN(HWY_MAX(-32768, a.raw - b.raw), 32767)));
-}
-
-// ------------------------------ Average
-
-// Returns (a + b + 1) / 2
-
-HWY_API Vec1<uint8_t> AverageRound(const Vec1<uint8_t> a,
-                                   const Vec1<uint8_t> b) {
-  return Vec1<uint8_t>(static_cast<uint8_t>((a.raw + b.raw + 1) / 2));
-}
-HWY_API Vec1<uint16_t> AverageRound(const Vec1<uint16_t> a,
-                                    const Vec1<uint16_t> b) {
-  return Vec1<uint16_t>(static_cast<uint16_t>((a.raw + b.raw + 1) / 2));
-}
-
-// ------------------------------ Absolute value
-
-template <typename T>
-HWY_API Vec1<T> Abs(const Vec1<T> a) {
-  const T i = a.raw;
-  return (i >= 0 || i == hwy::LimitsMin<T>()) ? a : Vec1<T>(-i);
-}
-HWY_API Vec1<float> Abs(const Vec1<float> a) {
-  return Vec1<float>(std::abs(a.raw));
-}
-HWY_API Vec1<double> Abs(const Vec1<double> a) {
-  return Vec1<double>(std::abs(a.raw));
-}
-
-// ------------------------------ min/max
-
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
-  return Vec1<T>(HWY_MIN(a.raw, b.raw));
-}
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec1<T> Min(const Vec1<T> a, const Vec1<T> b) {
-  if (std::isnan(a.raw)) return b;
-  if (std::isnan(b.raw)) return a;
-  return Vec1<T>(HWY_MIN(a.raw, b.raw));
-}
-
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
-  return Vec1<T>(HWY_MAX(a.raw, b.raw));
-}
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec1<T> Max(const Vec1<T> a, const Vec1<T> b) {
-  if (std::isnan(a.raw)) return b;
-  if (std::isnan(b.raw)) return a;
-  return Vec1<T>(HWY_MAX(a.raw, b.raw));
-}
-
-// ------------------------------ Floating-point negate
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec1<T> Neg(const Vec1<T> v) {
-  return Xor(v, SignBit(Sisd<T>()));
-}
-
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec1<T> Neg(const Vec1<T> v) {
-  return Zero(Sisd<T>()) - v;
-}
-
-// ------------------------------ mul/div
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
-  return Vec1<T>(static_cast<T>(double(a.raw) * b.raw));
-}
-
-template <typename T, HWY_IF_SIGNED(T)>
-HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
-  return Vec1<T>(static_cast<T>(int64_t(a.raw) * b.raw));
-}
-
-template <typename T, HWY_IF_UNSIGNED(T)>
-HWY_API Vec1<T> operator*(const Vec1<T> a, const Vec1<T> b) {
-  return Vec1<T>(static_cast<T>(uint64_t(a.raw) * b.raw));
-}
-
-template <typename T>
-HWY_API Vec1<T> operator/(const Vec1<T> a, const Vec1<T> b) {
-  return Vec1<T>(a.raw / b.raw);
-}
-
-// Returns the upper 16 bits of a * b in each lane.
-HWY_API Vec1<int16_t> MulHigh(const Vec1<int16_t> a, const Vec1<int16_t> b) {
-  return Vec1<int16_t>(static_cast<int16_t>((a.raw * b.raw) >> 16));
-}
-HWY_API Vec1<uint16_t> MulHigh(const Vec1<uint16_t> a, const Vec1<uint16_t> b) {
-  // Cast to uint32_t first to prevent overflow. Otherwise the result of
-  // uint16_t * uint16_t is in "int" which may overflow. In practice the result
-  // is the same but this way it is also defined.
-  return Vec1<uint16_t>(static_cast<uint16_t>(
-      (static_cast<uint32_t>(a.raw) * static_cast<uint32_t>(b.raw)) >> 16));
-}
-
-HWY_API Vec1<int16_t> MulFixedPoint15(Vec1<int16_t> a, Vec1<int16_t> b) {
-  return Vec1<int16_t>(static_cast<int16_t>((2 * a.raw * b.raw + 32768) >> 16));
-}
-
-// Multiplies even lanes (0, 2 ..) and returns the double-wide result.
-HWY_API Vec1<int64_t> MulEven(const Vec1<int32_t> a, const Vec1<int32_t> b) {
-  const int64_t a64 = a.raw;
-  return Vec1<int64_t>(a64 * b.raw);
-}
-HWY_API Vec1<uint64_t> MulEven(const Vec1<uint32_t> a, const Vec1<uint32_t> b) {
-  const uint64_t a64 = a.raw;
-  return Vec1<uint64_t>(a64 * b.raw);
-}
-
-// Approximate reciprocal
-HWY_API Vec1<float> ApproximateReciprocal(const Vec1<float> v) {
-  // Zero inputs are allowed, but callers are responsible for replacing the
-  // return value with something else (typically using IfThenElse). This check
-  // avoids a ubsan error. The return value is arbitrary.
-  if (v.raw == 0.0f) return Vec1<float>(0.0f);
-  return Vec1<float>(1.0f / v.raw);
-}
-
-// Absolute value of difference.
-HWY_API Vec1<float> AbsDiff(const Vec1<float> a, const Vec1<float> b) {
-  return Abs(a - b);
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-template <typename T>
-HWY_API Vec1<T> MulAdd(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> add) {
-  return mul * x + add;
-}
-
-template <typename T>
-HWY_API Vec1<T> NegMulAdd(const Vec1<T> mul, const Vec1<T> x,
-                          const Vec1<T> add) {
-  return add - mul * x;
-}
-
-template <typename T>
-HWY_API Vec1<T> MulSub(const Vec1<T> mul, const Vec1<T> x, const Vec1<T> sub) {
-  return mul * x - sub;
-}
-
-template <typename T>
-HWY_API Vec1<T> NegMulSub(const Vec1<T> mul, const Vec1<T> x,
-                          const Vec1<T> sub) {
-  return Neg(mul) * x - sub;
-}
-
-// ------------------------------ Floating-point square root
-
-// Approximate reciprocal square root
-HWY_API Vec1<float> ApproximateReciprocalSqrt(const Vec1<float> v) {
-  float f = v.raw;
-  const float half = f * 0.5f;
-  uint32_t bits;
-  CopySameSize(&f, &bits);
-  // Initial guess based on log2(f)
-  bits = 0x5F3759DF - (bits >> 1);
-  CopySameSize(&bits, &f);
-  // One Newton-Raphson iteration
-  return Vec1<float>(f * (1.5f - (half * f * f)));
-}
-
-// Square root
-HWY_API Vec1<float> Sqrt(const Vec1<float> v) {
-  return Vec1<float>(std::sqrt(v.raw));
-}
-HWY_API Vec1<double> Sqrt(const Vec1<double> v) {
-  return Vec1<double>(std::sqrt(v.raw));
-}
-
-// ------------------------------ Floating-point rounding
-
-template <typename T>
-HWY_API Vec1<T> Round(const Vec1<T> v) {
-  using TI = MakeSigned<T>;
-  if (!(Abs(v).raw < MantissaEnd<T>())) {  // Huge or NaN
-    return v;
-  }
-  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
-  const TI rounded = static_cast<TI>(v.raw + bias);
-  if (rounded == 0) return CopySignToAbs(Vec1<T>(0), v);
-  // Round to even
-  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
-    return Vec1<T>(static_cast<T>(rounded - (v.raw < T(0) ? -1 : 1)));
-  }
-  return Vec1<T>(static_cast<T>(rounded));
-}
-
-// Round-to-nearest even.
-HWY_API Vec1<int32_t> NearestInt(const Vec1<float> v) {
-  using T = float;
-  using TI = int32_t;
-
-  const T abs = Abs(v).raw;
-  const bool signbit = std::signbit(v.raw);
-
-  if (!(abs < MantissaEnd<T>())) {  // Huge or NaN
-    // Check if too large to cast or NaN
-    if (!(abs <= static_cast<T>(LimitsMax<TI>()))) {
-      return Vec1<TI>(signbit ? LimitsMin<TI>() : LimitsMax<TI>());
-    }
-    return Vec1<int32_t>(static_cast<TI>(v.raw));
-  }
-  const T bias = v.raw < T(0.0) ? T(-0.5) : T(0.5);
-  const TI rounded = static_cast<TI>(v.raw + bias);
-  if (rounded == 0) return Vec1<int32_t>(0);
-  // Round to even
-  if ((rounded & 1) && std::abs(static_cast<T>(rounded) - v.raw) == T(0.5)) {
-    return Vec1<TI>(rounded - (signbit ? -1 : 1));
-  }
-  return Vec1<TI>(rounded);
-}
-
-template <typename T>
-HWY_API Vec1<T> Trunc(const Vec1<T> v) {
-  using TI = MakeSigned<T>;
-  if (!(Abs(v).raw <= MantissaEnd<T>())) {  // Huge or NaN
-    return v;
-  }
-  const TI truncated = static_cast<TI>(v.raw);
-  if (truncated == 0) return CopySignToAbs(Vec1<T>(0), v);
-  return Vec1<T>(static_cast<T>(truncated));
-}
-
-template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
-          class V>
-V Ceiling(const V v) {
-  const Bits kExponentMask = (1ull << kExponentBits) - 1;
-  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
-  const Bits kBias = kExponentMask / 2;
-
-  Float f = v.raw;
-  const bool positive = f > Float(0.0);
-
-  Bits bits;
-  CopySameSize(&v, &bits);
-
-  const int exponent =
-      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
-  // Already an integer.
-  if (exponent >= kMantissaBits) return v;
-  // |v| <= 1 => 0 or 1.
-  if (exponent < 0) return positive ? V(1) : V(-0.0);
-
-  const Bits mantissa_mask = kMantissaMask >> exponent;
-  // Already an integer
-  if ((bits & mantissa_mask) == 0) return v;
-
-  // Clear fractional bits and round up
-  if (positive) bits += (kMantissaMask + 1) >> exponent;
-  bits &= ~mantissa_mask;
-
-  CopySameSize(&bits, &f);
-  return V(f);
-}
-
-template <typename Float, typename Bits, int kMantissaBits, int kExponentBits,
-          class V>
-V Floor(const V v) {
-  const Bits kExponentMask = (1ull << kExponentBits) - 1;
-  const Bits kMantissaMask = (1ull << kMantissaBits) - 1;
-  const Bits kBias = kExponentMask / 2;
-
-  Float f = v.raw;
-  const bool negative = f < Float(0.0);
-
-  Bits bits;
-  CopySameSize(&v, &bits);
-
-  const int exponent =
-      static_cast<int>(((bits >> kMantissaBits) & kExponentMask) - kBias);
-  // Already an integer.
-  if (exponent >= kMantissaBits) return v;
-  // |v| <= 1 => -1 or 0.
-  if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0));
-
-  const Bits mantissa_mask = kMantissaMask >> exponent;
-  // Already an integer
-  if ((bits & mantissa_mask) == 0) return v;
-
-  // Clear fractional bits and round down
-  if (negative) bits += (kMantissaMask + 1) >> exponent;
-  bits &= ~mantissa_mask;
-
-  CopySameSize(&bits, &f);
-  return V(f);
-}
-
-// Toward +infinity, aka ceiling
-HWY_API Vec1<float> Ceil(const Vec1<float> v) {
-  return Ceiling<float, uint32_t, 23, 8>(v);
-}
-HWY_API Vec1<double> Ceil(const Vec1<double> v) {
-  return Ceiling<double, uint64_t, 52, 11>(v);
-}
-
-// Toward -infinity, aka floor
-HWY_API Vec1<float> Floor(const Vec1<float> v) {
-  return Floor<float, uint32_t, 23, 8>(v);
-}
-HWY_API Vec1<double> Floor(const Vec1<double> v) {
-  return Floor<double, uint64_t, 52, 11>(v);
-}
-
-// ================================================== COMPARE
-
-template <typename T>
-HWY_API Mask1<T> operator==(const Vec1<T> a, const Vec1<T> b) {
-  return Mask1<T>::FromBool(a.raw == b.raw);
-}
-
-template <typename T>
-HWY_API Mask1<T> operator!=(const Vec1<T> a, const Vec1<T> b) {
-  return Mask1<T>::FromBool(a.raw != b.raw);
-}
-
-template <typename T>
-HWY_API Mask1<T> TestBit(const Vec1<T> v, const Vec1<T> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return (v & bit) == bit;
-}
-
-template <typename T>
-HWY_API Mask1<T> operator<(const Vec1<T> a, const Vec1<T> b) {
-  return Mask1<T>::FromBool(a.raw < b.raw);
-}
-template <typename T>
-HWY_API Mask1<T> operator>(const Vec1<T> a, const Vec1<T> b) {
-  return Mask1<T>::FromBool(a.raw > b.raw);
-}
-
-template <typename T>
-HWY_API Mask1<T> operator<=(const Vec1<T> a, const Vec1<T> b) {
-  return Mask1<T>::FromBool(a.raw <= b.raw);
-}
-template <typename T>
-HWY_API Mask1<T> operator>=(const Vec1<T> a, const Vec1<T> b) {
-  return Mask1<T>::FromBool(a.raw >= b.raw);
-}
-
-// ------------------------------ Floating-point classification (==)
-
-template <typename T>
-HWY_API Mask1<T> IsNaN(const Vec1<T> v) {
-  // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY.
-  MakeUnsigned<T> bits;
-  CopySameSize(&v, &bits);
-  bits += bits;
-  bits >>= 1;  // clear sign bit
-  // NaN if all exponent bits are set and the mantissa is not zero.
-  return Mask1<T>::FromBool(bits > ExponentMask<T>());
-}
-
-HWY_API Mask1<float> IsInf(const Vec1<float> v) {
-  const Sisd<float> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const Vec1<uint32_t> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u));
-}
-HWY_API Mask1<double> IsInf(const Vec1<double> v) {
-  const Sisd<double> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const Vec1<uint64_t> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull));
-}
-
-HWY_API Mask1<float> IsFinite(const Vec1<float> v) {
-  const Vec1<uint32_t> vu = BitCast(Sisd<uint32_t>(), v);
-  // Shift left to clear the sign bit, check whether exponent != max value.
-  return Mask1<float>::FromBool((vu.raw << 1) < 0xFF000000u);
-}
-HWY_API Mask1<double> IsFinite(const Vec1<double> v) {
-  const Vec1<uint64_t> vu = BitCast(Sisd<uint64_t>(), v);
-  // Shift left to clear the sign bit, check whether exponent != max value.
-  return Mask1<double>::FromBool((vu.raw << 1) < 0xFFE0000000000000ull);
-}
-
-// ================================================== MEMORY
-
-// ------------------------------ Load
-
-template <typename T>
-HWY_API Vec1<T> Load(Sisd<T> /* tag */, const T* HWY_RESTRICT aligned) {
-  T t;
-  CopySameSize(aligned, &t);
-  return Vec1<T>(t);
-}
-
-template <typename T>
-HWY_API Vec1<T> MaskedLoad(Mask1<T> m, Sisd<T> d,
-                           const T* HWY_RESTRICT aligned) {
-  return IfThenElseZero(m, Load(d, aligned));
-}
-
-template <typename T>
-HWY_API Vec1<T> LoadU(Sisd<T> d, const T* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// In some use cases, "load single lane" is sufficient; otherwise avoid this.
-template <typename T>
-HWY_API Vec1<T> LoadDup128(Sisd<T> d, const T* HWY_RESTRICT aligned) {
-  return Load(d, aligned);
-}
-
-// ------------------------------ Store
-
-template <typename T>
-HWY_API void Store(const Vec1<T> v, Sisd<T> /* tag */,
-                   T* HWY_RESTRICT aligned) {
-  CopySameSize(&v.raw, aligned);
-}
-
-template <typename T>
-HWY_API void StoreU(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT p) {
-  return Store(v, d, p);
-}
-
-template <typename T>
-HWY_API void BlendedStore(const Vec1<T> v, Mask1<T> m, Sisd<T> d,
-                          T* HWY_RESTRICT p) {
-  if (!m.bits) return;
-  StoreU(v, d, p);
-}
-
-// ------------------------------ LoadInterleaved2/3/4
-
-// Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2.
-#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#else
-#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
-#endif
-
-template <typename T>
-HWY_API void LoadInterleaved2(Sisd<T> d, const T* HWY_RESTRICT unaligned,
-                              Vec1<T>& v0, Vec1<T>& v1) {
-  v0 = LoadU(d, unaligned + 0);
-  v1 = LoadU(d, unaligned + 1);
-}
-
-template <typename T>
-HWY_API void LoadInterleaved3(Sisd<T> d, const T* HWY_RESTRICT unaligned,
-                              Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2) {
-  v0 = LoadU(d, unaligned + 0);
-  v1 = LoadU(d, unaligned + 1);
-  v2 = LoadU(d, unaligned + 2);
-}
-
-template <typename T>
-HWY_API void LoadInterleaved4(Sisd<T> d, const T* HWY_RESTRICT unaligned,
-                              Vec1<T>& v0, Vec1<T>& v1, Vec1<T>& v2,
-                              Vec1<T>& v3) {
-  v0 = LoadU(d, unaligned + 0);
-  v1 = LoadU(d, unaligned + 1);
-  v2 = LoadU(d, unaligned + 2);
-  v3 = LoadU(d, unaligned + 3);
-}
-
-// ------------------------------ StoreInterleaved2/3/4
-
-template <typename T>
-HWY_API void StoreInterleaved2(const Vec1<T> v0, const Vec1<T> v1, Sisd<T> d,
-                               T* HWY_RESTRICT unaligned) {
-  StoreU(v0, d, unaligned + 0);
-  StoreU(v1, d, unaligned + 1);
-}
-
-template <typename T>
-HWY_API void StoreInterleaved3(const Vec1<T> v0, const Vec1<T> v1,
-                               const Vec1<T> v2, Sisd<T> d,
-                               T* HWY_RESTRICT unaligned) {
-  StoreU(v0, d, unaligned + 0);
-  StoreU(v1, d, unaligned + 1);
-  StoreU(v2, d, unaligned + 2);
-}
-
-template <typename T>
-HWY_API void StoreInterleaved4(const Vec1<T> v0, const Vec1<T> v1,
-                               const Vec1<T> v2, const Vec1<T> v3, Sisd<T> d,
-                               T* HWY_RESTRICT unaligned) {
-  StoreU(v0, d, unaligned + 0);
-  StoreU(v1, d, unaligned + 1);
-  StoreU(v2, d, unaligned + 2);
-  StoreU(v3, d, unaligned + 3);
-}
-
-// ------------------------------ Stream
-
-template <typename T>
-HWY_API void Stream(const Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT aligned) {
-  return Store(v, d, aligned);
-}
-
-// ------------------------------ Scatter
-
-template <typename T, typename Offset>
-HWY_API void ScatterOffset(Vec1<T> v, Sisd<T> d, T* base,
-                           const Vec1<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  uint8_t* const base8 = reinterpret_cast<uint8_t*>(base) + offset.raw;
-  return Store(v, d, reinterpret_cast<T*>(base8));
-}
-
-template <typename T, typename Index>
-HWY_API void ScatterIndex(Vec1<T> v, Sisd<T> d, T* HWY_RESTRICT base,
-                          const Vec1<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  return Store(v, d, base + index.raw);
-}
-
-// ------------------------------ Gather
-
-template <typename T, typename Offset>
-HWY_API Vec1<T> GatherOffset(Sisd<T> d, const T* base,
-                             const Vec1<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  const intptr_t addr =
-      reinterpret_cast<intptr_t>(base) + static_cast<intptr_t>(offset.raw);
-  return Load(d, reinterpret_cast<const T*>(addr));
-}
-
-template <typename T, typename Index>
-HWY_API Vec1<T> GatherIndex(Sisd<T> d, const T* HWY_RESTRICT base,
-                            const Vec1<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  return Load(d, base + index.raw);
-}
-
-// ================================================== CONVERT
-
-// ConvertTo and DemoteTo with floating-point input and integer output truncate
-// (rounding toward zero).
-
-template <typename FromT, typename ToT>
-HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
-  static_assert(sizeof(ToT) > sizeof(FromT), "Not promoting");
-  // For bits Y > X, floatX->floatY and intX->intY are always representable.
-  return Vec1<ToT>(static_cast<ToT>(from.raw));
-}
-
-// MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
-// so we overload for FromT=double and ToT={float,int32_t}.
-HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) {
-  // Prevent ubsan errors when converting float to narrower integer/float
-  if (std::isinf(from.raw) ||
-      std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) {
-    return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>()
-                                              : HighestValue<float>());
-  }
-  return Vec1<float>(static_cast<float>(from.raw));
-}
-HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) {
-  // Prevent ubsan errors when converting int32_t to narrower integer/int32_t
-  if (std::isinf(from.raw) ||
-      std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) {
-    return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>()
-                                                : HighestValue<int32_t>());
-  }
-  return Vec1<int32_t>(static_cast<int32_t>(from.raw));
-}
-
-template <typename FromT, typename ToT>
-HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
-  static_assert(!IsFloat<FromT>(), "FromT=double are handled above");
-  static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting");
-
-  // Int to int: choose closest value in ToT to `from` (avoids UB)
-  from.raw = HWY_MIN(HWY_MAX(LimitsMin<ToT>(), from.raw), LimitsMax<ToT>());
-  return Vec1<ToT>(static_cast<ToT>(from.raw));
-}
-
-HWY_API Vec1<float> PromoteTo(Sisd<float> /* tag */, const Vec1<float16_t> v) {
-  uint16_t bits16;
-  CopySameSize(&v.raw, &bits16);
-  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
-  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
-  const uint32_t mantissa = bits16 & 0x3FF;
-
-  // Subnormal or zero
-  if (biased_exp == 0) {
-    const float subnormal =
-        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
-    return Vec1<float>(sign ? -subnormal : subnormal);
-  }
-
-  // Normalized: convert the representation directly (faster than ldexp/tables).
-  const uint32_t biased_exp32 = biased_exp + (127 - 15);
-  const uint32_t mantissa32 = mantissa << (23 - 10);
-  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;
-  float out;
-  CopySameSize(&bits32, &out);
-  return Vec1<float>(out);
-}
-
-HWY_API Vec1<float> PromoteTo(Sisd<float> d, const Vec1<bfloat16_t> v) {
-  return Set(d, F32FromBF16(v.raw));
-}
-
-HWY_API Vec1<float16_t> DemoteTo(Sisd<float16_t> /* tag */,
-                                 const Vec1<float> v) {
-  uint32_t bits32;
-  CopySameSize(&v.raw, &bits32);
-  const uint32_t sign = bits32 >> 31;
-  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
-  const uint32_t mantissa32 = bits32 & 0x7FFFFF;
-
-  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);
-
-  // Tiny or zero => zero.
-  Vec1<float16_t> out;
-  if (exp < -24) {
-    const uint16_t zero = 0;
-    CopySameSize(&zero, &out.raw);
-    return out;
-  }
-
-  uint32_t biased_exp16, mantissa16;
-
-  // exp = [-24, -15] => subnormal
-  if (exp < -14) {
-    biased_exp16 = 0;
-    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
-    HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
-    mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
-                                       (mantissa32 >> (13 + sub_exp)));
-  } else {
-    // exp = [-14, 15]
-    biased_exp16 = static_cast<uint32_t>(exp + 15);
-    HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
-    mantissa16 = mantissa32 >> 13;
-  }
-
-  HWY_DASSERT(mantissa16 < 1024);
-  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
-  HWY_DASSERT(bits16 < 0x10000);
-  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
-  CopySameSize(&narrowed, &out.raw);
-  return out;
-}
-
-HWY_API Vec1<bfloat16_t> DemoteTo(Sisd<bfloat16_t> d, const Vec1<float> v) {
-  return Set(d, BF16FromF32(v.raw));
-}
-
-template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)>
-HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
-  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
-  // float## -> int##: return closest representable value. We cannot exactly
-  // represent LimitsMax<ToT> in FromT, so use double.
-  const double f = static_cast<double>(from.raw);
-  if (std::isinf(from.raw) ||
-      std::fabs(f) > static_cast<double>(LimitsMax<ToT>())) {
-    return Vec1<ToT>(std::signbit(from.raw) ? LimitsMin<ToT>()
-                                            : LimitsMax<ToT>());
-  }
-  return Vec1<ToT>(static_cast<ToT>(from.raw));
-}
-
-template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)>
-HWY_API Vec1<ToT> ConvertTo(Sisd<ToT> /* tag */, Vec1<FromT> from) {
-  static_assert(sizeof(ToT) == sizeof(FromT), "Should have same size");
-  // int## -> float##: no check needed
-  return Vec1<ToT>(static_cast<ToT>(from.raw));
-}
-
-HWY_API Vec1<uint8_t> U8FromU32(const Vec1<uint32_t> v) {
-  return DemoteTo(Sisd<uint8_t>(), v);
-}
-
-// ------------------------------ Truncations
-
-HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
-                                 const Vec1<uint64_t> v) {
-  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
-}
-
-HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
-                                  const Vec1<uint64_t> v) {
-  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
-}
-
-HWY_API Vec1<uint32_t> TruncateTo(Sisd<uint32_t> /* tag */,
-                                  const Vec1<uint64_t> v) {
-  return Vec1<uint32_t>{static_cast<uint32_t>(v.raw & 0xFFFFFFFFu)};
-}
-
-HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
-                                 const Vec1<uint32_t> v) {
-  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
-}
-
-HWY_API Vec1<uint16_t> TruncateTo(Sisd<uint16_t> /* tag */,
-                                  const Vec1<uint32_t> v) {
-  return Vec1<uint16_t>{static_cast<uint16_t>(v.raw & 0xFFFF)};
-}
-
-HWY_API Vec1<uint8_t> TruncateTo(Sisd<uint8_t> /* tag */,
-                                 const Vec1<uint16_t> v) {
-  return Vec1<uint8_t>{static_cast<uint8_t>(v.raw & 0xFF)};
-}
-
-// ================================================== COMBINE
-// UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported.
-
-template <typename T>
-HWY_API Vec1<T> LowerHalf(Vec1<T> v) {
-  return v;
-}
-
-template <typename T>
-HWY_API Vec1<T> LowerHalf(Sisd<T> /* tag */, Vec1<T> v) {
-  return v;
-}
-
-// ================================================== SWIZZLE
-
-template <typename T>
-HWY_API T GetLane(const Vec1<T> v) {
-  return v.raw;
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec1<T> v, size_t i) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  return v.raw;
-}
-
-template <typename T>
-HWY_API Vec1<T> InsertLane(Vec1<T> v, size_t i, T t) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  v.raw = t;
-  return v;
-}
-
-template <typename T>
-HWY_API Vec1<T> DupEven(Vec1<T> v) {
-  return v;
-}
-// DupOdd is unsupported.
-
-template <typename T>
-HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) {
-  return even;
-}
-
-template <typename T>
-HWY_API Vec1<T> OddEvenBlocks(Vec1<T> /* odd */, Vec1<T> even) {
-  return even;
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T>
-HWY_API Vec1<T> SwapAdjacentBlocks(Vec1<T> v) {
-  return v;
-}
-
-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
-struct Indices1 {
-  MakeSigned<T> raw;
-};
-
-template <typename T, typename TI>
-HWY_API Indices1<T> IndicesFromVec(Sisd<T>, Vec1<TI> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size");
-  HWY_DASSERT(vec.raw == 0);
-  return Indices1<T>{vec.raw};
-}
-
-template <typename T, typename TI>
-HWY_API Indices1<T> SetTableIndices(Sisd<T> d, const TI* idx) {
-  return IndicesFromVec(d, LoadU(Sisd<TI>(), idx));
-}
-
-template <typename T>
-HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) {
-  return v;
-}
-
-// ------------------------------ ReverseBlocks
-
-// Single block: no change
-template <typename T>
-HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-
-// ------------------------------ Reverse
-
-template <typename T>
-HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-
-// Must not be called:
-template <typename T>
-HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-
-template <typename T>
-HWY_API Vec1<T> Reverse4(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-
-template <typename T>
-HWY_API Vec1<T> Reverse8(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-
-// ================================================== BLOCKWISE
-// Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
-
-// ------------------------------ Broadcast/splat any lane
-
-template <int kLane, typename T>
-HWY_API Vec1<T> Broadcast(const Vec1<T> v) {
-  static_assert(kLane == 0, "Scalar only has one lane");
-  return v;
-}
-
-// ------------------------------ TableLookupBytes, TableLookupBytesOr0
-
-template <typename T, typename TI>
-HWY_API Vec1<TI> TableLookupBytes(const Vec1<T> in, const Vec1<TI> indices) {
-  uint8_t in_bytes[sizeof(T)];
-  uint8_t idx_bytes[sizeof(T)];
-  uint8_t out_bytes[sizeof(T)];
-  CopyBytes<sizeof(T)>(&in, &in_bytes);  // copy to bytes
-  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
-  for (size_t i = 0; i < sizeof(T); ++i) {
-    out_bytes[i] = in_bytes[idx_bytes[i]];
-  }
-  TI out;
-  CopyBytes<sizeof(TI)>(&out_bytes, &out);
-  return Vec1<TI>{out};
-}
-
-template <typename T, typename TI>
-HWY_API Vec1<TI> TableLookupBytesOr0(const Vec1<T> in, const Vec1<TI> indices) {
-  uint8_t in_bytes[sizeof(T)];
-  uint8_t idx_bytes[sizeof(T)];
-  uint8_t out_bytes[sizeof(T)];
-  CopyBytes<sizeof(T)>(&in, &in_bytes);  // copy to bytes
-  CopyBytes<sizeof(T)>(&indices, &idx_bytes);
-  for (size_t i = 0; i < sizeof(T); ++i) {
-    out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]];
-  }
-  TI out;
-  CopyBytes<sizeof(TI)>(&out_bytes, &out);
-  return Vec1<TI>{out};
-}
-
-// ------------------------------ ZipLower
-
-HWY_API Vec1<uint16_t> ZipLower(const Vec1<uint8_t> a, const Vec1<uint8_t> b) {
-  return Vec1<uint16_t>(static_cast<uint16_t>((uint32_t(b.raw) << 8) + a.raw));
-}
-HWY_API Vec1<uint32_t> ZipLower(const Vec1<uint16_t> a,
-                                const Vec1<uint16_t> b) {
-  return Vec1<uint32_t>((uint32_t(b.raw) << 16) + a.raw);
-}
-HWY_API Vec1<uint64_t> ZipLower(const Vec1<uint32_t> a,
-                                const Vec1<uint32_t> b) {
-  return Vec1<uint64_t>((uint64_t(b.raw) << 32) + a.raw);
-}
-HWY_API Vec1<int16_t> ZipLower(const Vec1<int8_t> a, const Vec1<int8_t> b) {
-  return Vec1<int16_t>(static_cast<int16_t>((int32_t(b.raw) << 8) + a.raw));
-}
-HWY_API Vec1<int32_t> ZipLower(const Vec1<int16_t> a, const Vec1<int16_t> b) {
-  return Vec1<int32_t>((int32_t(b.raw) << 16) + a.raw);
-}
-HWY_API Vec1<int64_t> ZipLower(const Vec1<int32_t> a, const Vec1<int32_t> b) {
-  return Vec1<int64_t>((int64_t(b.raw) << 32) + a.raw);
-}
-
-template <typename T, typename TW = MakeWide<T>, class VW = Vec1<TW>>
-HWY_API VW ZipLower(Sisd<TW> /* tag */, Vec1<T> a, Vec1<T> b) {
-  return VW(static_cast<TW>((TW{b.raw} << (sizeof(T) * 8)) + a.raw));
-}
-
-// ================================================== MASK
-
-template <typename T>
-HWY_API bool AllFalse(Sisd<T> /* tag */, const Mask1<T> mask) {
-  return mask.bits == 0;
-}
-
-template <typename T>
-HWY_API bool AllTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
-  return mask.bits != 0;
-}
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T>
-HWY_API Mask1<T> LoadMaskBits(Sisd<T> /* tag */,
-                              const uint8_t* HWY_RESTRICT bits) {
-  return Mask1<T>::FromBool((bits[0] & 1) != 0);
-}
-
-// `p` points to at least 8 writable bytes.
-template <typename T>
-HWY_API size_t StoreMaskBits(Sisd<T> d, const Mask1<T> mask, uint8_t* bits) {
-  *bits = AllTrue(d, mask);
-  return 1;
-}
-
-template <typename T>
-HWY_API size_t CountTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
-  return mask.bits == 0 ? 0 : 1;
-}
-
-template <typename T>
-HWY_API intptr_t FindFirstTrue(Sisd<T> /* tag */, const Mask1<T> mask) {
-  return mask.bits == 0 ? -1 : 0;
-}
-
-// ------------------------------ Compress, CompressBits
-
-template <typename T>
-struct CompressIsPartition {
-  enum { value = 1 };
-};
-
-template <typename T>
-HWY_API Vec1<T> Compress(Vec1<T> v, const Mask1<T> /* mask */) {
-  // A single lane is already partitioned by definition.
-  return v;
-}
-
-template <typename T>
-HWY_API Vec1<T> CompressNot(Vec1<T> v, const Mask1<T> /* mask */) {
-  // A single lane is already partitioned by definition.
-  return v;
-}
-
-// ------------------------------ CompressStore
-template <typename T>
-HWY_API size_t CompressStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
-                             T* HWY_RESTRICT unaligned) {
-  StoreU(Compress(v, mask), d, unaligned);
-  return CountTrue(d, mask);
-}
-
-// ------------------------------ CompressBlendedStore
-template <typename T>
-HWY_API size_t CompressBlendedStore(Vec1<T> v, const Mask1<T> mask, Sisd<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  if (!mask.bits) return 0;
-  StoreU(v, d, unaligned);
-  return 1;
-}
-
-// ------------------------------ CompressBits
-template <typename T>
-HWY_API Vec1<T> CompressBits(Vec1<T> v, const uint8_t* HWY_RESTRICT /*bits*/) {
-  return v;
-}
-
-// ------------------------------ CompressBitsStore
-template <typename T>
-HWY_API size_t CompressBitsStore(Vec1<T> v, const uint8_t* HWY_RESTRICT bits,
-                                 Sisd<T> d, T* HWY_RESTRICT unaligned) {
-  const Mask1<T> mask = LoadMaskBits(d, bits);
-  StoreU(Compress(v, mask), d, unaligned);
-  return CountTrue(d, mask);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-HWY_API Vec1<float> ReorderWidenMulAccumulate(Sisd<float> /* tag */,
-                                              Vec1<bfloat16_t> a,
-                                              Vec1<bfloat16_t> b,
-                                              const Vec1<float> sum0,
-                                              Vec1<float>& /* sum1 */) {
-  return MulAdd(Vec1<float>(F32FromBF16(a.raw)),
-                Vec1<float>(F32FromBF16(b.raw)), sum0);
-}
-
-// ================================================== REDUCTIONS
-
-// Sum of all lanes, i.e. the only one.
-template <typename T>
-HWY_API Vec1<T> SumOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-template <typename T>
-HWY_API Vec1<T> MinOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-template <typename T>
-HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) {
-  return v;
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/set_macros-inl.h b/third_party/highway/hwy/ops/set_macros-inl.h
deleted file mode 100644 (file)
index c118960..0000000
+++ /dev/null
@@ -1,444 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Sets macros based on HWY_TARGET.
-
-// This include guard is toggled by foreach_target, so avoid the usual _H_
-// suffix to prevent copybara from renaming it.
-#if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
-#ifdef HWY_SET_MACROS_PER_TARGET
-#undef HWY_SET_MACROS_PER_TARGET
-#else
-#define HWY_SET_MACROS_PER_TARGET
-#endif
-
-#endif  // HWY_SET_MACROS_PER_TARGET
-
-#include "hwy/detect_targets.h"
-
-#undef HWY_NAMESPACE
-#undef HWY_ALIGN
-#undef HWY_MAX_BYTES
-#undef HWY_LANES
-
-#undef HWY_HAVE_SCALABLE
-#undef HWY_HAVE_INTEGER64
-#undef HWY_HAVE_FLOAT16
-#undef HWY_HAVE_FLOAT64
-#undef HWY_MEM_OPS_MIGHT_FAULT
-#undef HWY_NATIVE_FMA
-#undef HWY_CAP_GE256
-#undef HWY_CAP_GE512
-
-#undef HWY_TARGET_STR
-
-#if defined(HWY_DISABLE_PCLMUL_AES)
-#define HWY_TARGET_STR_PCLMUL_AES ""
-#else
-#define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
-#endif
-
-#if defined(HWY_DISABLE_BMI2_FMA)
-#define HWY_TARGET_STR_BMI2_FMA ""
-#else
-#define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
-#endif
-
-#if defined(HWY_DISABLE_F16C)
-#define HWY_TARGET_STR_F16C ""
-#else
-#define HWY_TARGET_STR_F16C ",f16c"
-#endif
-
-#define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
-
-#define HWY_TARGET_STR_SSE4 \
-  HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
-// Include previous targets, which are the half-vectors of the next target.
-#define HWY_TARGET_STR_AVX2 \
-  HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
-#define HWY_TARGET_STR_AVX3 \
-  HWY_TARGET_STR_AVX2 ",avx512f,avx512vl,avx512dq,avx512bw"
-
-// Before include guard so we redefine HWY_TARGET_STR on each include,
-// governed by the current HWY_TARGET.
-
-//-----------------------------------------------------------------------------
-// SSSE3
-#if HWY_TARGET == HWY_SSSE3
-
-#define HWY_NAMESPACE N_SSSE3
-#define HWY_ALIGN alignas(16)
-#define HWY_MAX_BYTES 16
-#define HWY_LANES(T) (16 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-#define HWY_NATIVE_FMA 0
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
-
-//-----------------------------------------------------------------------------
-// SSE4
-#elif HWY_TARGET == HWY_SSE4
-
-#define HWY_NAMESPACE N_SSE4
-#define HWY_ALIGN alignas(16)
-#define HWY_MAX_BYTES 16
-#define HWY_LANES(T) (16 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-#define HWY_NATIVE_FMA 0
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_TARGET_STR HWY_TARGET_STR_SSE4
-
-//-----------------------------------------------------------------------------
-// AVX2
-#elif HWY_TARGET == HWY_AVX2
-
-#define HWY_NAMESPACE N_AVX2
-#define HWY_ALIGN alignas(32)
-#define HWY_MAX_BYTES 32
-#define HWY_LANES(T) (32 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-
-#ifdef HWY_DISABLE_BMI2_FMA
-#define HWY_NATIVE_FMA 0
-#else
-#define HWY_NATIVE_FMA 1
-#endif
-
-#define HWY_CAP_GE256 1
-#define HWY_CAP_GE512 0
-
-#define HWY_TARGET_STR HWY_TARGET_STR_AVX2
-
-//-----------------------------------------------------------------------------
-// AVX3[_DL]
-#elif HWY_TARGET == HWY_AVX3 || HWY_TARGET == HWY_AVX3_DL
-
-#define HWY_ALIGN alignas(64)
-#define HWY_MAX_BYTES 64
-#define HWY_LANES(T) (64 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 0
-#define HWY_NATIVE_FMA 1
-#define HWY_CAP_GE256 1
-#define HWY_CAP_GE512 1
-
-#if HWY_TARGET == HWY_AVX3
-
-#define HWY_NAMESPACE N_AVX3
-#define HWY_TARGET_STR HWY_TARGET_STR_AVX3
-
-#elif HWY_TARGET == HWY_AVX3_DL
-
-#define HWY_NAMESPACE N_AVX3_DL
-#define HWY_TARGET_STR                                            \
-  HWY_TARGET_STR_AVX3                                             \
-  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avxvnni,avx512bitalg," \
-  "avx512vpopcntdq"
-
-#else
-#error "Logic error"
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-//-----------------------------------------------------------------------------
-// PPC8
-#elif HWY_TARGET == HWY_PPC8
-
-#define HWY_ALIGN alignas(16)
-#define HWY_MAX_BYTES 16
-#define HWY_LANES(T) (16 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 0
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-#define HWY_NATIVE_FMA 1
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_NAMESPACE N_PPC8
-
-#define HWY_TARGET_STR "altivec,vsx"
-
-//-----------------------------------------------------------------------------
-// NEON
-#elif HWY_TARGET == HWY_NEON
-
-#define HWY_ALIGN alignas(16)
-#define HWY_MAX_BYTES 16
-#define HWY_LANES(T) (16 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-
-#if HWY_ARCH_ARM_A64
-#define HWY_HAVE_FLOAT64 1
-#else
-#define HWY_HAVE_FLOAT64 0
-#endif
-
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-
-#if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
-#define HWY_NATIVE_FMA 1
-#else
-#define HWY_NATIVE_FMA 0
-#endif
-
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_NAMESPACE N_NEON
-
-// Can use pragmas instead of -march compiler flag
-#if HWY_HAVE_RUNTIME_DISPATCH
-#if HWY_ARCH_ARM_V7
-#define HWY_TARGET_STR "+neon-vfpv4"
-#else
-#define HWY_TARGET_STR "+crypto"
-#endif  // HWY_ARCH_ARM_V7
-#else
-// HWY_TARGET_STR remains undefined
-#endif
-
-//-----------------------------------------------------------------------------
-// SVE[2]
-#elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE || \
-    HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128
-
-// SVE only requires lane alignment, not natural alignment of the entire vector.
-#define HWY_ALIGN alignas(8)
-
-// Value ensures MaxLanes() is the tightest possible upper bound to reduce
-// overallocation.
-#define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 1
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 0
-#define HWY_NATIVE_FMA 1
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#if HWY_TARGET == HWY_SVE2
-#define HWY_NAMESPACE N_SVE2
-#define HWY_MAX_BYTES 256
-#elif HWY_TARGET == HWY_SVE_256
-#define HWY_NAMESPACE N_SVE_256
-#define HWY_MAX_BYTES 32
-#elif HWY_TARGET == HWY_SVE2_128
-#define HWY_NAMESPACE N_SVE2_128
-#define HWY_MAX_BYTES 16
-#else
-#define HWY_NAMESPACE N_SVE
-#define HWY_MAX_BYTES 256
-#endif
-
-// Can use pragmas instead of -march compiler flag
-#if HWY_HAVE_RUNTIME_DISPATCH
-#if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
-#define HWY_TARGET_STR "+sve2-aes"
-#else
-#define HWY_TARGET_STR "+sve"
-#endif
-#else
-// HWY_TARGET_STR remains undefined
-#endif
-
-//-----------------------------------------------------------------------------
-// WASM
-#elif HWY_TARGET == HWY_WASM
-
-#define HWY_ALIGN alignas(16)
-#define HWY_MAX_BYTES 16
-#define HWY_LANES(T) (16 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 0
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-#define HWY_NATIVE_FMA 0
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_NAMESPACE N_WASM
-
-#define HWY_TARGET_STR "simd128"
-
-//-----------------------------------------------------------------------------
-// WASM_EMU256
-#elif HWY_TARGET == HWY_WASM_EMU256
-
-#define HWY_ALIGN alignas(32)
-#define HWY_MAX_BYTES 32
-#define HWY_LANES(T) (32 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 0
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-#define HWY_NATIVE_FMA 0
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_NAMESPACE N_WASM_EMU256
-
-#define HWY_TARGET_STR "simd128"
-
-//-----------------------------------------------------------------------------
-// RVV
-#elif HWY_TARGET == HWY_RVV
-
-// RVV only requires lane alignment, not natural alignment of the entire vector,
-// and the compiler already aligns builtin types, so nothing to do here.
-#define HWY_ALIGN
-
-// The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
-#define HWY_MAX_BYTES 65536
-
-// = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
-// LMUL. This is the tightest possible upper bound.
-#define HWY_LANES(T) (8192 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 1
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 0
-#define HWY_NATIVE_FMA 1
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#if defined(__riscv_zvfh)
-#define HWY_HAVE_FLOAT16 1
-#else
-#define HWY_HAVE_FLOAT16 0
-#endif
-
-#define HWY_NAMESPACE N_RVV
-
-// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
-// (rv64gcv is not a valid target)
-
-//-----------------------------------------------------------------------------
-// EMU128
-#elif HWY_TARGET == HWY_EMU128
-
-#define HWY_ALIGN alignas(16)
-#define HWY_MAX_BYTES 16
-#define HWY_LANES(T) (16 / sizeof(T))
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-#define HWY_NATIVE_FMA 0
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_NAMESPACE N_EMU128
-
-// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
-
-//-----------------------------------------------------------------------------
-// SCALAR
-#elif HWY_TARGET == HWY_SCALAR
-
-#define HWY_ALIGN
-#define HWY_MAX_BYTES 8
-#define HWY_LANES(T) 1
-
-#define HWY_HAVE_SCALABLE 0
-#define HWY_HAVE_INTEGER64 1
-#define HWY_HAVE_FLOAT16 1
-#define HWY_HAVE_FLOAT64 1
-#define HWY_MEM_OPS_MIGHT_FAULT 0
-#define HWY_NATIVE_FMA 0
-#define HWY_CAP_GE256 0
-#define HWY_CAP_GE512 0
-
-#define HWY_NAMESPACE N_SCALAR
-
-// HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
-
-#else
-#pragma message("HWY_TARGET does not match any known target")
-#endif  // HWY_TARGET
-
-// Override this to 1 in asan/msan builds, which will still fault.
-#if HWY_IS_ASAN || HWY_IS_MSAN
-#undef HWY_MEM_OPS_MIGHT_FAULT
-#define HWY_MEM_OPS_MIGHT_FAULT 1
-#endif
-
-// Clang <9 requires this be invoked at file scope, before any namespace.
-#undef HWY_BEFORE_NAMESPACE
-#if defined(HWY_TARGET_STR)
-#define HWY_BEFORE_NAMESPACE()        \
-  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
-  static_assert(true, "For requiring trailing semicolon")
-#else
-// avoids compiler warning if no HWY_TARGET_STR
-#define HWY_BEFORE_NAMESPACE() \
-  static_assert(true, "For requiring trailing semicolon")
-#endif
-
-// Clang <9 requires any namespaces be closed before this macro.
-#undef HWY_AFTER_NAMESPACE
-#if defined(HWY_TARGET_STR)
-#define HWY_AFTER_NAMESPACE() \
-  HWY_POP_ATTRIBUTES          \
-  static_assert(true, "For requiring trailing semicolon")
-#else
-// avoids compiler warning if no HWY_TARGET_STR
-#define HWY_AFTER_NAMESPACE() \
-  static_assert(true, "For requiring trailing semicolon")
-#endif
-
-#undef HWY_ATTR
-#if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
-#define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
-#else
-#define HWY_ATTR
-#endif
diff --git a/third_party/highway/hwy/ops/shared-inl.h b/third_party/highway/hwy/ops/shared-inl.h
deleted file mode 100644 (file)
index 29c4303..0000000
+++ /dev/null
@@ -1,311 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Per-target definitions shared by ops/*.h and user code.
-
-#include <cmath>
-
-#include "hwy/base.h"
-
-// Separate header because foreach_target.h re-enables its include guard.
-#include "hwy/ops/set_macros-inl.h"
-
-// Relies on the external include guard in highway.h.
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Highway operations are implemented as overloaded functions selected using an
-// internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
-// shift count applied to scalable vectors. Instead of referring to Simd<>
-// directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
-// full vector, or fractions/groups if the argument is negative/positive),
-// CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
-// Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
-// cap. For constexpr-size vectors, N is the actual number of lanes. This
-// ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
-template <typename Lane, size_t N, int kPow2>
-struct Simd {
-  constexpr Simd() = default;
-  using T = Lane;
-  static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two");
-
-  // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
-  // warns when using enums and non-enums in the same expression. Cannot be
-  // static constexpr function (another MSVC limitation).
-  static constexpr size_t kPrivateN = N;
-  static constexpr int kPrivatePow2 = kPow2;
-
-  template <typename NewT>
-  static constexpr size_t NewN() {
-    // Round up to correctly handle scalars with N=1.
-    return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT);
-  }
-
-#if HWY_HAVE_SCALABLE
-  template <typename NewT>
-  static constexpr int Pow2Ratio() {
-    return (sizeof(NewT) > sizeof(T))
-               ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T)))
-               : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT)));
-  }
-#endif
-
-  // Widening/narrowing ops change the number of lanes and/or their type.
-  // To initialize such vectors, we need the corresponding tag types:
-
-// PromoteTo/DemoteTo() with another lane type, but same number of lanes.
-#if HWY_HAVE_SCALABLE
-  template <typename NewT>
-  using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>;
-#else
-  template <typename NewT>
-  using Rebind = Simd<NewT, N, kPow2>;
-#endif
-
-  // Change lane type while keeping the same vector size, e.g. for MulEven.
-  template <typename NewT>
-  using Repartition = Simd<NewT, NewN<NewT>(), kPow2>;
-
-// Half the lanes while keeping the same lane type, e.g. for LowerHalf.
-// Round up to correctly handle scalars with N=1.
-#if HWY_HAVE_SCALABLE
-  // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
-  // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
-  using Half = Simd<T, (N + 1) / 2, kPow2 - 1>;
-#else
-  using Half = Simd<T, (N + 1) / 2, kPow2>;
-#endif
-
-// Twice the lanes while keeping the same lane type, e.g. for Combine.
-#if HWY_HAVE_SCALABLE
-  using Twice = Simd<T, 2 * N, kPow2 + 1>;
-#else
-  using Twice = Simd<T, 2 * N, kPow2>;
-#endif
-};
-
-namespace detail {
-
-template <typename T, size_t N, int kPow2>
-constexpr bool IsFull(Simd<T, N, kPow2> /* d */) {
-  return N == HWY_LANES(T) && kPow2 == 0;
-}
-
-// Returns the number of lanes (possibly zero) after applying a shift:
-// - 0: no change;
-// - [1,3]: a group of 2,4,8 [fractional] vectors;
-// - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
-constexpr size_t ScaleByPower(size_t N, int pow2) {
-#if HWY_TARGET == HWY_RVV
-  return pow2 >= 0 ? (N << pow2) : (N >> (-pow2));
-#else
-  return pow2 >= 0 ? N : (N >> (-pow2));
-#endif
-}
-
-// Struct wrappers enable validation of arguments via static_assert.
-template <typename T, int kPow2>
-struct ScalableTagChecker {
-  static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8");
-#if HWY_TARGET == HWY_RVV
-  // Only RVV supports register groups.
-  using type = Simd<T, HWY_LANES(T), kPow2>;
-#elif HWY_HAVE_SCALABLE
-  // For SVE[2], only allow full or fractions.
-  using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>;
-#elif HWY_TARGET == HWY_SCALAR
-  using type = Simd<T, /*N=*/1, 0>;
-#else
-  // Only allow full or fractions.
-  using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>;
-#endif
-};
-
-template <typename T, size_t kLimit>
-struct CappedTagChecker {
-  static_assert(kLimit != 0, "Does not make sense to have zero lanes");
-  // Safely handle non-power-of-two inputs by rounding down, which is allowed by
-  // CappedTag. Otherwise, Simd<T, 3, 0> would static_assert.
-  static constexpr size_t kLimitPow2 = size_t{1} << hwy::FloorLog2(kLimit);
-  using type = Simd<T, HWY_MIN(kLimitPow2, HWY_LANES(T)), 0>;
-};
-
-template <typename T, size_t kNumLanes>
-struct FixedTagChecker {
-  static_assert(kNumLanes != 0, "Does not make sense to have zero lanes");
-  static_assert(kNumLanes <= HWY_LANES(T), "Too many lanes");
-  using type = Simd<T, kNumLanes, 0>;
-};
-
-}  // namespace detail
-
-// Alias for a tag describing a full vector (kPow2 == 0: the most common usage,
-// e.g. 1D loops where the application does not care about the vector size) or a
-// fraction/multiple of one. Multiples are the same as full vectors for all
-// targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return
-// value of type promotion and demotion.
-template <typename T, int kPow2 = 0>
-using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type;
-
-// Alias for a tag describing a vector with *up to* kLimit active lanes, even on
-// targets with scalable vectors and HWY_SCALAR. The runtime lane count
-// `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is
-// typically used for 1D loops with a relatively low application-defined upper
-// bound, e.g. for 8x8 DCTs. However, it is better if data structures are
-// designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
-// chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
-// this would enable vector-length-agnostic loops using ScalableTag).
-template <typename T, size_t kLimit>
-using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type;
-
-// Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
-// even on targets with scalable vectors. Requires `kNumLanes` to be a power of
-// two not exceeding `HWY_LANES(T)`.
-//
-// NOTE: if the application does not need to support HWY_SCALAR (+), use this
-// instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
-// This is useful for data structures that rely on exactly 128-bit SIMD, but
-// these are discouraged because they cannot benefit from wider vectors.
-// Instead, applications would ideally define a larger problem size and loop
-// over it with the (unknown size) vectors from ScalableTag.
-//
-// + e.g. if the baseline is known to support SIMD, or the application requires
-//   ops such as TableLookupBytes not supported by HWY_SCALAR.
-template <typename T, size_t kNumLanes>
-using FixedTag = typename detail::FixedTagChecker<T, kNumLanes>::type;
-
-template <class D>
-using TFromD = typename D::T;
-
-// Tag for the same number of lanes as D, but with the LaneType T.
-template <class T, class D>
-using Rebind = typename D::template Rebind<T>;
-
-template <class D>
-using RebindToSigned = Rebind<MakeSigned<TFromD<D>>, D>;
-template <class D>
-using RebindToUnsigned = Rebind<MakeUnsigned<TFromD<D>>, D>;
-template <class D>
-using RebindToFloat = Rebind<MakeFloat<TFromD<D>>, D>;
-
-// Tag for the same total size as D, but with the LaneType T.
-template <class T, class D>
-using Repartition = typename D::template Repartition<T>;
-
-template <class D>
-using RepartitionToWide = Repartition<MakeWide<TFromD<D>>, D>;
-template <class D>
-using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>;
-
-// Tag for the same lane type as D, but half the lanes.
-template <class D>
-using Half = typename D::Half;
-
-// Tag for the same lane type as D, but twice the lanes.
-template <class D>
-using Twice = typename D::Twice;
-
-template <typename T>
-using Full32 = Simd<T, 4 / sizeof(T), 0>;
-
-template <typename T>
-using Full64 = Simd<T, 8 / sizeof(T), 0>;
-
-template <typename T>
-using Full128 = Simd<T, 16 / sizeof(T), 0>;
-
-// Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
-#define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>)
-#define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>)
-#define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>)
-#define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD<D>)
-#define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes)
-#define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes)
-
-// MSVC workaround: use PrivateN directly instead of MaxLanes.
-#define HWY_IF_LT128_D(D) \
-  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr
-#define HWY_IF_GE128_D(D) \
-  hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr
-
-// Same, but with a vector argument. ops/*-inl.h define their own TFromV.
-#define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>)
-#define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>)
-#define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV<V>)
-#define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV<V>, bytes)
-#define HWY_IF_NOT_LANE_SIZE_V(V, bytes) HWY_IF_NOT_LANE_SIZE(TFromV<V>, bytes)
-
-template <class D>
-HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) {
-  return D::kPrivatePow2;
-}
-
-// MSVC requires the explicit <D>.
-#define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr
-
-#if HWY_HAVE_SCALABLE
-
-// Upper bound on the number of lanes. Intended for template arguments and
-// reducing code size (e.g. for SSE4, we know at compile-time that vectors will
-// not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
-// actual size for allocating storage. WARNING: MSVC might not be able to deduce
-// arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
-template <class D>
-HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
-  return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)),
-                              D::kPrivatePow2);
-}
-
-#else
-// Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
-// is not an option, nor does a member function work.
-template <class D>
-HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) {
-  return D::kPrivateN;
-}
-
-// (Potentially) non-constant actual size of the vector at runtime, subject to
-// the limit imposed by the Simd. Useful for advancing loop counters.
-// Targets with scalable vectors define this themselves.
-template <typename T, size_t N, int kPow2>
-HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) {
-  return N;
-}
-
-#endif  // !HWY_HAVE_SCALABLE
-
-// NOTE: GCC generates incorrect code for vector arguments to non-inlined
-// functions in two situations:
-// - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
-//   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
-// - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
-//   all) tests to fail.
-//
-// We therefore pass by const& only on GCC and (Windows or ARM64). This alias
-// must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
-// and possibly also other functions that are not inlined.
-#if HWY_COMPILER_GCC_ACTUAL && (HWY_OS_WIN || HWY_ARCH_ARM_A64)
-template <class V>
-using VecArg = const V&;
-#else
-template <class V>
-using VecArg = V;
-#endif
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/wasm_128-inl.h b/third_party/highway/hwy/ops/wasm_128-inl.h
deleted file mode 100644 (file)
index ab38985..0000000
+++ /dev/null
@@ -1,4500 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// 128-bit WASM vectors and operations.
-// External include guard in highway.h - see comment there.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <wasm_simd128.h>
-
-#include "hwy/base.h"
-#include "hwy/ops/shared-inl.h"
-
-#ifdef HWY_WASM_OLD_NAMES
-#define wasm_i8x16_shuffle wasm_v8x16_shuffle
-#define wasm_i16x8_shuffle wasm_v16x8_shuffle
-#define wasm_i32x4_shuffle wasm_v32x4_shuffle
-#define wasm_i64x2_shuffle wasm_v64x2_shuffle
-#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
-#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
-#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
-#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
-#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
-#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
-#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
-#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
-#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
-#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
-#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
-#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
-#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
-#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
-#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-namespace detail {
-
-template <typename T>
-struct Raw128 {
-  using type = __v128_u;
-};
-template <>
-struct Raw128<float> {
-  using type = __f32x4;
-};
-
-}  // namespace detail
-
-template <typename T, size_t N = 16 / sizeof(T)>
-class Vec128 {
-  using Raw = typename detail::Raw128<T>::type;
-
- public:
-  // Compound assignment. Only usable if there is a corresponding non-member
-  // binary operator overload. For example, only f32 and f64 support division.
-  HWY_INLINE Vec128& operator*=(const Vec128 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec128& operator/=(const Vec128 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec128& operator+=(const Vec128 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec128& operator-=(const Vec128 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec128& operator&=(const Vec128 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec128& operator|=(const Vec128 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec128& operator^=(const Vec128 other) {
-    return *this = (*this ^ other);
-  }
-
-  Raw raw;
-};
-
-template <typename T>
-using Vec64 = Vec128<T, 8 / sizeof(T)>;
-
-template <typename T>
-using Vec32 = Vec128<T, 4 / sizeof(T)>;
-
-// FF..FF or 0.
-template <typename T, size_t N = 16 / sizeof(T)>
-struct Mask128 {
-  typename detail::Raw128<T>::type raw;
-};
-
-namespace detail {
-
-// Deduce Simd<T, N, 0> from Vec128<T, N>
-struct DeduceD {
-  template <typename T, size_t N>
-  Simd<T, N, 0> operator()(Vec128<T, N>) const {
-    return Simd<T, N, 0>();
-  }
-};
-
-}  // namespace detail
-
-template <class V>
-using DFromV = decltype(detail::DeduceD()(V()));
-
-template <class V>
-using TFromV = TFromD<DFromV<V>>;
-
-// ------------------------------ BitCast
-
-namespace detail {
-
-HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; }
-HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) {
-  return static_cast<__v128_u>(v);
-}
-HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) {
-  return static_cast<__v128_u>(v);
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
-  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
-}
-
-// Cannot rely on function overloading because return types differ.
-template <typename T>
-struct BitCastFromInteger128 {
-  HWY_INLINE __v128_u operator()(__v128_u v) { return v; }
-};
-template <>
-struct BitCastFromInteger128<float> {
-  HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); }
-};
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
-                                        Vec128<uint8_t, N * sizeof(T)> v) {
-  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, typename FromT>
-HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
-                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
-}
-
-// ------------------------------ Zero
-
-// Returns an all-zero vector/part.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
-  return Vec128<T, N>{wasm_i32x4_splat(0)};
-}
-template <size_t N, HWY_IF_LE128(float, N)>
-HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
-  return Vec128<float, N>{wasm_f32x4_splat(0.0f)};
-}
-
-template <class D>
-using VFromD = decltype(Zero(D()));
-
-// ------------------------------ Set
-
-// Returns a vector/part with all lanes set to "t".
-template <size_t N, HWY_IF_LE128(uint8_t, N)>
-HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
-  return Vec128<uint8_t, N>{wasm_i8x16_splat(static_cast<int8_t>(t))};
-}
-template <size_t N, HWY_IF_LE128(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
-                                const uint16_t t) {
-  return Vec128<uint16_t, N>{wasm_i16x8_splat(static_cast<int16_t>(t))};
-}
-template <size_t N, HWY_IF_LE128(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
-                                const uint32_t t) {
-  return Vec128<uint32_t, N>{wasm_i32x4_splat(static_cast<int32_t>(t))};
-}
-template <size_t N, HWY_IF_LE128(uint64_t, N)>
-HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
-                                const uint64_t t) {
-  return Vec128<uint64_t, N>{wasm_i64x2_splat(static_cast<int64_t>(t))};
-}
-
-template <size_t N, HWY_IF_LE128(int8_t, N)>
-HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
-  return Vec128<int8_t, N>{wasm_i8x16_splat(t)};
-}
-template <size_t N, HWY_IF_LE128(int16_t, N)>
-HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
-  return Vec128<int16_t, N>{wasm_i16x8_splat(t)};
-}
-template <size_t N, HWY_IF_LE128(int32_t, N)>
-HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
-  return Vec128<int32_t, N>{wasm_i32x4_splat(t)};
-}
-template <size_t N, HWY_IF_LE128(int64_t, N)>
-HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
-  return Vec128<int64_t, N>{wasm_i64x2_splat(t)};
-}
-
-template <size_t N, HWY_IF_LE128(float, N)>
-HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
-  return Vec128<float, N>{wasm_f32x4_splat(t)};
-}
-
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
-
-// Returns a vector with uninitialized elements.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> d) {
-  return Zero(d);
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, size_t N, typename T2>
-Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
-  HWY_ALIGN T lanes[16 / sizeof(T)];
-  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
-  }
-  return Load(d, lanes);
-}
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Addition
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
-                                     const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
-                                      const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
-                                      const Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
-                                      const Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
-                                    const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_add(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
-                                     const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_add(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
-                                     const Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{wasm_i32x4_add(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
-                                     const Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{wasm_i64x2_add(a.raw, b.raw)};
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
-                                   const Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_f32x4_add(a.raw, b.raw)};
-}
-
-// ------------------------------ Subtraction
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
-                                     const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
-                                      Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
-                                      const Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
-                                      const Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
-                                    const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_sub(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
-                                     const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_sub(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
-                                     const Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{wasm_i32x4_sub(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
-                                     const Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{wasm_i64x2_sub(a.raw, b.raw)};
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
-                                   const Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_f32x4_sub(a.raw, b.raw)};
-}
-
-// ------------------------------ SaturatedAdd
-
-// Returns a + b clamped to the destination range.
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
-                                        const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_u8x16_add_sat(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
-                                         const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_u16x8_add_sat(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
-                                       const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_add_sat(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
-                                        const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_add_sat(a.raw, b.raw)};
-}
-
-// ------------------------------ SaturatedSub
-
-// Returns a - b clamped to the destination range.
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
-                                        const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_u8x16_sub_sat(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
-                                         const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_u16x8_sub_sat(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
-                                       const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_sub_sat(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
-                                        const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_sub_sat(a.raw, b.raw)};
-}
-
-// ------------------------------ Average
-
-// Returns (a + b + 1) / 2
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
-                                        const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_u8x16_avgr(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
-                                         const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_u16x8_avgr(a.raw, b.raw)};
-}
-
-// ------------------------------ Absolute value
-
-// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
-template <size_t N>
-HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
-  return Vec128<int8_t, N>{wasm_i8x16_abs(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_abs(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_abs(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
-  return Vec128<int64_t, N>{wasm_i64x2_abs(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
-  return Vec128<float, N>{wasm_f32x4_abs(v.raw)};
-}
-
-// ------------------------------ Shift lanes by constant #bits
-
-// Unsigned
-template <int kBits, size_t N>
-HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
-  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
-  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
-  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
-  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
-  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
-  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, kBits)};
-}
-
-// Signed
-template <int kBits, size_t N>
-HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
-  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
-  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, kBits)};
-}
-
-// 8-bit
-template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
-  return kBits == 1
-             ? (v + v)
-             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<uint8_t, N> shifted{
-      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
-  return shifted & Set(d8, 0xFF >> kBits);
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
-  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ------------------------------ RotateRight (ShiftRight, Or)
-template <int kBits, typename T, size_t N>
-HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
-  constexpr size_t kSizeInBits = sizeof(T) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
-}
-
-// ------------------------------ Shift lanes by same variable #bits
-
-// After https://reviews.llvm.org/D108415 shift argument became unsigned.
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
-                                          const int bits) {
-  return Vec128<uint16_t, N>{wasm_i16x8_shl(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
-                                           const int bits) {
-  return Vec128<uint16_t, N>{wasm_u16x8_shr(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
-                                          const int bits) {
-  return Vec128<uint32_t, N>{wasm_i32x4_shl(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
-                                           const int bits) {
-  return Vec128<uint32_t, N>{wasm_u32x4_shr(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
-                                          const int bits) {
-  return Vec128<uint64_t, N>{wasm_i64x2_shl(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
-                                           const int bits) {
-  return Vec128<uint64_t, N>{wasm_u64x2_shr(v.raw, bits)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
-                                         const int bits) {
-  return Vec128<int16_t, N>{wasm_i16x8_shl(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
-                                          const int bits) {
-  return Vec128<int16_t, N>{wasm_i16x8_shr(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
-                                         const int bits) {
-  return Vec128<int32_t, N>{wasm_i32x4_shl(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
-                                          const int bits) {
-  return Vec128<int32_t, N>{wasm_i32x4_shr(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
-                                         const int bits) {
-  return Vec128<int64_t, N>{wasm_i64x2_shl(v.raw, bits)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
-                                          const int bits) {
-  return Vec128<int64_t, N>{wasm_i64x2_shr(v.raw, bits)};
-}
-
-// 8-bit
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<T, N> shifted{
-      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
-  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
-                                          const int bits) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<uint8_t, N> shifted{
-      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
-  return shifted & Set(d8, 0xFF >> bits);
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
-  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ignore Wsign-conversion
-HWY_DIAGNOSTICS(pop)
-
-// ------------------------------ Minimum
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> Min(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_u8x16_min(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> Min(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_u16x8_min(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Min(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{wasm_u32x4_min(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
-  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
-  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
-  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
-  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
-  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
-  alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)};
-  return Vec128<uint64_t, N>{wasm_v128_load(min)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> Min(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_min(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> Min(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_min(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Min(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{wasm_i32x4_min(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
-  alignas(16) int64_t min[4];
-  min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0),
-                   wasm_i64x2_extract_lane(b.raw, 0));
-  min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1),
-                   wasm_i64x2_extract_lane(b.raw, 1));
-  return Vec128<int64_t, N>{wasm_v128_load(min)};
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> Min(Vec128<float, N> a, Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_f32x4_min(a.raw, b.raw)};
-}
-
-// ------------------------------ Maximum
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> Max(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_u8x16_max(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> Max(Vec128<uint16_t, N> a, Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_u16x8_max(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Max(Vec128<uint32_t, N> a, Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{wasm_u32x4_max(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
-  // Avoid wasm_u64x2_extract_lane - not all implementations have it yet.
-  const uint64_t a0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0));
-  const uint64_t b0 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0));
-  const uint64_t a1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1));
-  const uint64_t b1 = static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1));
-  alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)};
-  return Vec128<uint64_t, N>{wasm_v128_load(max)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> Max(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_max(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> Max(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_max(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Max(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{wasm_i32x4_max(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
-  alignas(16) int64_t max[2];
-  max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0),
-                   wasm_i64x2_extract_lane(b.raw, 0));
-  max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1),
-                   wasm_i64x2_extract_lane(b.raw, 1));
-  return Vec128<int64_t, N>{wasm_v128_load(max)};
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> Max(Vec128<float, N> a, Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_f32x4_max(a.raw, b.raw)};
-}
-
-// ------------------------------ Integer multiplication
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
-                                      const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
-                                      const Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
-                                     const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{wasm_i16x8_mul(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
-                                     const Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{wasm_i32x4_mul(a.raw, b.raw)};
-}
-
-// Returns the upper 16 bits of a * b in each lane.
-template <size_t N>
-HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
-                                    const Vec128<uint16_t, N> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
-  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
-  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
-  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
-  const auto l = wasm_i32x4_mul(al, bl);
-  const auto h = wasm_i32x4_mul(ah, bh);
-  // TODO(eustas): shift-right + narrow?
-  return Vec128<uint16_t, N>{
-      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
-                                   const Vec128<int16_t, N> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
-  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
-  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
-  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
-  const auto l = wasm_i32x4_mul(al, bl);
-  const auto h = wasm_i32x4_mul(ah, bh);
-  // TODO(eustas): shift-right + narrow?
-  return Vec128<int16_t, N>{
-      wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
-                                           Vec128<int16_t, N> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-
-  const Vec128<uint16_t, N> lo = BitCast(du, Mul(a, b));
-  const Vec128<int16_t, N> hi = MulHigh(a, b);
-  // We want (lo + 0x4000) >> 15, but that can overflow, and if it does we must
-  // carry that into the result. Instead isolate the top two bits because only
-  // they can influence the result.
-  const Vec128<uint16_t, N> lo_top2 = ShiftRight<14>(lo);
-  // Bits 11: add 2, 10: add 1, 01: add 1, 00: add 0.
-  const Vec128<uint16_t, N> rounding = ShiftRight<1>(Add(lo_top2, Set(du, 1)));
-  return Add(Add(hi, hi), BitCast(d, rounding));
-}
-
-// Multiplies even lanes (0, 2 ..) and returns the double-width result.
-template <size_t N>
-HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
-                                             const Vec128<int32_t, N> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
-  const auto ae = wasm_v128_and(a.raw, kEvenMask);
-  const auto be = wasm_v128_and(b.raw, kEvenMask);
-  return Vec128<int64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
-                                              const Vec128<uint32_t, N> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
-  const auto ae = wasm_v128_and(a.raw, kEvenMask);
-  const auto be = wasm_v128_and(b.raw, kEvenMask);
-  return Vec128<uint64_t, (N + 1) / 2>{wasm_i64x2_mul(ae, be)};
-}
-
-// ------------------------------ Negate
-
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> Neg(const Vec128<T, N> v) {
-  return Xor(v, SignBit(DFromV<decltype(v)>()));
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> Neg(const Vec128<int8_t, N> v) {
-  return Vec128<int8_t, N>{wasm_i8x16_neg(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> Neg(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_neg(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Neg(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_neg(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Neg(const Vec128<int64_t, N> v) {
-  return Vec128<int64_t, N>{wasm_i64x2_neg(v.raw)};
-}
-
-// ------------------------------ Floating-point mul / div
-
-template <size_t N>
-HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_f32x4_mul(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
-                                   const Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_f32x4_div(a.raw, b.raw)};
-}
-
-// Approximate reciprocal
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
-  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
-  return one / v;
-}
-
-// Absolute value of difference.
-template <size_t N>
-HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
-                                 const Vec128<float, N> b) {
-  return Abs(a - b);
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-// Returns mul * x + add
-template <size_t N>
-HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
-                                const Vec128<float, N> x,
-                                const Vec128<float, N> add) {
-  // TODO(eustas): replace, when implemented in WASM.
-  // TODO(eustas): is it wasm_f32x4_qfma?
-  return mul * x + add;
-}
-
-// Returns add - mul * x
-template <size_t N>
-HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> add) {
-  // TODO(eustas): replace, when implemented in WASM.
-  return add - mul * x;
-}
-
-// Returns mul * x - sub
-template <size_t N>
-HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
-                                const Vec128<float, N> x,
-                                const Vec128<float, N> sub) {
-  // TODO(eustas): replace, when implemented in WASM.
-  // TODO(eustas): is it wasm_f32x4_qfms?
-  return mul * x - sub;
-}
-
-// Returns -mul * x - sub
-template <size_t N>
-HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> sub) {
-  // TODO(eustas): replace, when implemented in WASM.
-  return Neg(mul) * x - sub;
-}
-
-// ------------------------------ Floating-point square root
-
-// Full precision square root
-template <size_t N>
-HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
-  return Vec128<float, N>{wasm_f32x4_sqrt(v.raw)};
-}
-
-// Approximate reciprocal square root
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
-  // TODO(eustas): find cheaper a way to calculate this.
-  const Vec128<float, N> one = Vec128<float, N>{wasm_f32x4_splat(1.0f)};
-  return one / Sqrt(v);
-}
-
-// ------------------------------ Floating-point rounding
-
-// Toward nearest integer, ties to even
-template <size_t N>
-HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
-  return Vec128<float, N>{wasm_f32x4_nearest(v.raw)};
-}
-
-// Toward zero, aka truncate
-template <size_t N>
-HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
-  return Vec128<float, N>{wasm_f32x4_trunc(v.raw)};
-}
-
-// Toward +infinity, aka ceiling
-template <size_t N>
-HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
-  return Vec128<float, N>{wasm_f32x4_ceil(v.raw)};
-}
-
-// Toward -infinity, aka floor
-template <size_t N>
-HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
-  return Vec128<float, N>{wasm_f32x4_floor(v.raw)};
-}
-
-// ------------------------------ Floating-point classification
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
-  return v != v;
-}
-
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
-  const Simd<T, N, 0> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
-// ================================================== COMPARE
-
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
-
-template <typename TFrom, typename TTo, size_t N>
-HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
-                                   Mask128<TFrom, N> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask128<TTo, N>{m.raw};
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return (v & bit) == bit;
-}
-
-// ------------------------------ Equality
-
-// Unsigned
-template <size_t N>
-HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
-                                       const Vec128<uint8_t, N> b) {
-  return Mask128<uint8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
-                                        const Vec128<uint16_t, N> b) {
-  return Mask128<uint16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
-                                        const Vec128<uint32_t, N> b) {
-  return Mask128<uint32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
-                                        const Vec128<uint64_t, N> b) {
-  return Mask128<uint64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
-                                      const Vec128<int8_t, N> b) {
-  return Mask128<int8_t, N>{wasm_i8x16_eq(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
-                                       Vec128<int16_t, N> b) {
-  return Mask128<int16_t, N>{wasm_i16x8_eq(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
-                                       const Vec128<int32_t, N> b) {
-  return Mask128<int32_t, N>{wasm_i32x4_eq(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
-                                       const Vec128<int64_t, N> b) {
-  return Mask128<int64_t, N>{wasm_i64x2_eq(a.raw, b.raw)};
-}
-
-// Float
-template <size_t N>
-HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
-                                     const Vec128<float, N> b) {
-  return Mask128<float, N>{wasm_f32x4_eq(a.raw, b.raw)};
-}
-
-// ------------------------------ Inequality
-
-// Unsigned
-template <size_t N>
-HWY_API Mask128<uint8_t, N> operator!=(const Vec128<uint8_t, N> a,
-                                       const Vec128<uint8_t, N> b) {
-  return Mask128<uint8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint16_t, N> operator!=(const Vec128<uint16_t, N> a,
-                                        const Vec128<uint16_t, N> b) {
-  return Mask128<uint16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint32_t, N> operator!=(const Vec128<uint32_t, N> a,
-                                        const Vec128<uint32_t, N> b) {
-  return Mask128<uint32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator!=(const Vec128<uint64_t, N> a,
-                                        const Vec128<uint64_t, N> b) {
-  return Mask128<uint64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Mask128<int8_t, N> operator!=(const Vec128<int8_t, N> a,
-                                      const Vec128<int8_t, N> b) {
-  return Mask128<int8_t, N>{wasm_i8x16_ne(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int16_t, N> operator!=(const Vec128<int16_t, N> a,
-                                       const Vec128<int16_t, N> b) {
-  return Mask128<int16_t, N>{wasm_i16x8_ne(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int32_t, N> operator!=(const Vec128<int32_t, N> a,
-                                       const Vec128<int32_t, N> b) {
-  return Mask128<int32_t, N>{wasm_i32x4_ne(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator!=(const Vec128<int64_t, N> a,
-                                       const Vec128<int64_t, N> b) {
-  return Mask128<int64_t, N>{wasm_i64x2_ne(a.raw, b.raw)};
-}
-
-// Float
-template <size_t N>
-HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
-                                     const Vec128<float, N> b) {
-  return Mask128<float, N>{wasm_f32x4_ne(a.raw, b.raw)};
-}
-
-// ------------------------------ Strict inequality
-
-template <size_t N>
-HWY_API Mask128<int8_t, N> operator>(const Vec128<int8_t, N> a,
-                                     const Vec128<int8_t, N> b) {
-  return Mask128<int8_t, N>{wasm_i8x16_gt(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int16_t, N> operator>(const Vec128<int16_t, N> a,
-                                      const Vec128<int16_t, N> b) {
-  return Mask128<int16_t, N>{wasm_i16x8_gt(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int32_t, N> operator>(const Vec128<int32_t, N> a,
-                                      const Vec128<int32_t, N> b) {
-  return Mask128<int32_t, N>{wasm_i32x4_gt(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator>(const Vec128<int64_t, N> a,
-                                      const Vec128<int64_t, N> b) {
-  return Mask128<int64_t, N>{wasm_i64x2_gt(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Mask128<uint8_t, N> operator>(const Vec128<uint8_t, N> a,
-                                      const Vec128<uint8_t, N> b) {
-  return Mask128<uint8_t, N>{wasm_u8x16_gt(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint16_t, N> operator>(const Vec128<uint16_t, N> a,
-                                       const Vec128<uint16_t, N> b) {
-  return Mask128<uint16_t, N>{wasm_u16x8_gt(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint32_t, N> operator>(const Vec128<uint32_t, N> a,
-                                       const Vec128<uint32_t, N> b) {
-  return Mask128<uint32_t, N>{wasm_u32x4_gt(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator>(const Vec128<uint64_t, N> a,
-                                       const Vec128<uint64_t, N> b) {
-  const DFromV<decltype(a)> d;
-  const Repartition<uint32_t, decltype(d)> d32;
-  const auto a32 = BitCast(d32, a);
-  const auto b32 = BitCast(d32, b);
-  // If the upper halves are not equal, this is the answer.
-  const auto m_gt = a32 > b32;
-
-  // Otherwise, the lower half decides.
-  const auto m_eq = a32 == b32;
-  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2);
-  const auto lo_gt = And(m_eq, MaskFromVec(VFromD<decltype(d32)>{lo_in_hi}));
-
-  const auto gt = Or(lo_gt, m_gt);
-  // Copy result in upper 32 bits to lower 32 bits.
-  return Mask128<uint64_t, N>{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)};
-}
-
-template <size_t N>
-HWY_API Mask128<float, N> operator>(const Vec128<float, N> a,
-                                    const Vec128<float, N> b) {
-  return Mask128<float, N>{wasm_f32x4_gt(a.raw, b.raw)};
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator<(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return operator>(b, a);
-}
-
-// ------------------------------ Weak inequality
-
-// Float <= >=
-template <size_t N>
-HWY_API Mask128<float, N> operator<=(const Vec128<float, N> a,
-                                     const Vec128<float, N> b) {
-  return Mask128<float, N>{wasm_f32x4_le(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
-                                     const Vec128<float, N> b) {
-  return Mask128<float, N>{wasm_f32x4_ge(a.raw, b.raw)};
-}
-
-// ------------------------------ FirstN (Iota, Lt)
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
-  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
-  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
-}
-
-// ================================================== LOGICAL
-
-// ------------------------------ Not
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Not(Vec128<T, N> v) {
-  return Vec128<T, N>{wasm_v128_not(v.raw)};
-}
-
-// ------------------------------ And
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{wasm_v128_and(a.raw, b.raw)};
-}
-
-// ------------------------------ AndNot
-
-// Returns ~not_mask & mask.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
-  return Vec128<T, N>{wasm_v128_andnot(mask.raw, not_mask.raw)};
-}
-
-// ------------------------------ Or
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{wasm_v128_or(a.raw, b.raw)};
-}
-
-// ------------------------------ Xor
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{wasm_v128_xor(a.raw, b.raw)};
-}
-
-// ------------------------------ Or3
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
-  return Or(o1, Or(o2, o3));
-}
-
-// ------------------------------ OrAnd
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
-  return Or(o, And(a1, a2));
-}
-
-// ------------------------------ IfVecThenElse
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return IfThenElse(MaskFromVec(mask), yes, no);
-}
-
-// ------------------------------ Operator overloads (internal-only if float)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return And(a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Or(a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ CopySign
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
-                              const Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  const auto msb = SignBit(DFromV<decltype(magn)>());
-  return Or(AndNot(msb, magn), And(msb, sign));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
-                                   const Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
-}
-
-// ------------------------------ BroadcastSignBit (compare)
-
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
-  return ShiftRight<sizeof(T) * 8 - 1>(v);
-}
-template <size_t N>
-HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
-  const DFromV<decltype(v)> d;
-  return VecFromMask(d, v < Zero(d));
-}
-
-// ------------------------------ Mask
-
-// Mask and Vec are the same (true = FF..FF).
-template <typename T, size_t N>
-HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
-  return Mask128<T, N>{v.raw};
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */, Mask128<T, N> v) {
-  return Vec128<T, N>{v.raw};
-}
-
-// mask ? yes : no
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
-                                Vec128<T, N> no) {
-  return Vec128<T, N>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
-}
-
-// mask ? yes : 0
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
-  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
-}
-
-// mask ? 0 : no
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
-  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
-                                        Vec128<T, N> no) {
-  static_assert(IsSigned<T>(), "Only works for signed/float");
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-
-  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
-  return IfThenElse(MaskFromVec(v), yes, no);
-}
-
-template <typename T, size_t N, HWY_IF_FLOAT(T)>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const auto zero = Zero(d);
-  return IfThenElse(Mask128<T, N>{(v > zero).raw}, v, zero);
-}
-
-// ------------------------------ Mask logical
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
-  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
-
-// The x86 multiply-by-Pow2() trick will not work because WASM saturates
-// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
-// scalar count operand, per-lane shift instructions would require extract_lane
-// for each lane, and hoping that shuffle is correctly mapped to a native
-// instruction. Using non-vector shifts would incur a store-load forwarding
-// stall when loading the result vector. We instead test bits of the shift
-// count to "predicate" a shift of the entire vector by a constant.
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
-  const DFromV<decltype(v)> d;
-  Mask128<T, N> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<12>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftLeft<1>(v), v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
-  const DFromV<decltype(v)> d;
-  Mask128<T, N> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<27>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<16>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftLeft<1>(v), v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, const Vec128<T, N> bits) {
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[2];
-  alignas(16) T bits_lanes[2];
-  Store(v, d, lanes);
-  Store(bits, d, bits_lanes);
-  lanes[0] <<= bits_lanes[0];
-  lanes[1] <<= bits_lanes[1];
-  return Load(d, lanes);
-}
-
-// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
-  const DFromV<decltype(v)> d;
-  Mask128<T, N> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<12>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftRight<1>(v), v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> operator>>(Vec128<T, N> v, const Vec128<T, N> bits) {
-  const DFromV<decltype(v)> d;
-  Mask128<T, N> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<27>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<16>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftRight<1>(v), v);
-}
-
-// ================================================== MEMORY
-
-// ------------------------------ Load
-
-template <typename T>
-HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
-  return Vec128<T>{wasm_v128_load(aligned)};
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
-                                const T* HWY_RESTRICT aligned) {
-  return IfThenElseZero(m, Load(d, aligned));
-}
-
-// Partial load.
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
-  Vec128<T, N> v;
-  CopyBytes<sizeof(T) * N>(p, &v);
-  return v;
-}
-
-// LoadU == Load.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// ------------------------------ Store
-
-template <typename T>
-HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
-  wasm_v128_store(aligned, v.raw);
-}
-
-// Partial store.
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  CopyBytes<sizeof(T) * N>(&v, p);
-}
-
-HWY_API void Store(const Vec128<float, 1> v, Simd<float, 1, 0> /* tag */,
-                   float* HWY_RESTRICT p) {
-  *p = wasm_f32x4_extract_lane(v.raw, 0);
-}
-
-// StoreU == Store.
-template <typename T, size_t N>
-HWY_API void StoreU(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
-  Store(v, d, p);
-}
-
-template <typename T, size_t N>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
-                          T* HWY_RESTRICT p) {
-  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
-}
-
-// ------------------------------ Non-temporal stores
-
-// Same as aligned stores on non-x86.
-
-template <typename T, size_t N>
-HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
-                    T* HWY_RESTRICT aligned) {
-  wasm_v128_store(aligned, v.raw);
-}
-
-// ------------------------------ Scatter (Store)
-
-template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
-HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
-                           T* HWY_RESTRICT base,
-                           const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(16) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(16) Offset offset_lanes[N];
-  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
-
-  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
-  }
-}
-
-template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
-HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
-                          const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(16) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(16) Index index_lanes[N];
-  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
-
-  for (size_t i = 0; i < N; ++i) {
-    base[index_lanes[i]] = lanes[i];
-  }
-}
-
-// ------------------------------ Gather (Load/Store)
-
-template <typename T, size_t N, typename Offset>
-HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
-                                  const T* HWY_RESTRICT base,
-                                  const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(16) Offset offset_lanes[N];
-  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
-
-  alignas(16) T lanes[N];
-  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
-  }
-  return Load(d, lanes);
-}
-
-template <typename T, size_t N, typename Index>
-HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(16) Index index_lanes[N];
-  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
-
-  alignas(16) T lanes[N];
-  for (size_t i = 0; i < N; ++i) {
-    lanes[i] = base[index_lanes[i]];
-  }
-  return Load(d, lanes);
-}
-
-// ================================================== SWIZZLE
-
-// ------------------------------ ExtractLane
-
-namespace detail {
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  return static_cast<T>(wasm_i8x16_extract_lane(v.raw, kLane));
-}
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  return static_cast<T>(wasm_i16x8_extract_lane(v.raw, kLane));
-}
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  return static_cast<T>(wasm_i32x4_extract_lane(v.raw, kLane));
-}
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  return static_cast<T>(wasm_i64x2_extract_lane(v.raw, kLane));
-}
-
-template <size_t kLane, size_t N>
-HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
-  return wasm_f32x4_extract_lane(v.raw, kLane);
-}
-
-}  // namespace detail
-
-// One overload per vector length just in case *_extract_lane raise compile
-// errors if their argument is out of bounds (even if that would never be
-// reached at runtime).
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  return GetLane(v);
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[2];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-      case 2:
-        return detail::ExtractLane<2>(v);
-      case 3:
-        return detail::ExtractLane<3>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[4];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-      case 2:
-        return detail::ExtractLane<2>(v);
-      case 3:
-        return detail::ExtractLane<3>(v);
-      case 4:
-        return detail::ExtractLane<4>(v);
-      case 5:
-        return detail::ExtractLane<5>(v);
-      case 6:
-        return detail::ExtractLane<6>(v);
-      case 7:
-        return detail::ExtractLane<7>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[8];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-      case 2:
-        return detail::ExtractLane<2>(v);
-      case 3:
-        return detail::ExtractLane<3>(v);
-      case 4:
-        return detail::ExtractLane<4>(v);
-      case 5:
-        return detail::ExtractLane<5>(v);
-      case 6:
-        return detail::ExtractLane<6>(v);
-      case 7:
-        return detail::ExtractLane<7>(v);
-      case 8:
-        return detail::ExtractLane<8>(v);
-      case 9:
-        return detail::ExtractLane<9>(v);
-      case 10:
-        return detail::ExtractLane<10>(v);
-      case 11:
-        return detail::ExtractLane<11>(v);
-      case 12:
-        return detail::ExtractLane<12>(v);
-      case 13:
-        return detail::ExtractLane<13>(v);
-      case 14:
-        return detail::ExtractLane<14>(v);
-      case 15:
-        return detail::ExtractLane<15>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[16];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-// ------------------------------ GetLane
-template <typename T, size_t N>
-HWY_API T GetLane(const Vec128<T, N> v) {
-  return detail::ExtractLane<0>(v);
-}
-
-// ------------------------------ InsertLane
-
-namespace detail {
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-  return Vec128<T, N>{
-      wasm_i8x16_replace_lane(v.raw, kLane, static_cast<int8_t>(t))};
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-  return Vec128<T, N>{
-      wasm_i16x8_replace_lane(v.raw, kLane, static_cast<int16_t>(t))};
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-  return Vec128<T, N>{
-      wasm_i32x4_replace_lane(v.raw, kLane, static_cast<int32_t>(t))};
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-  return Vec128<T, N>{
-      wasm_i64x2_replace_lane(v.raw, kLane, static_cast<int64_t>(t))};
-}
-
-template <size_t kLane, size_t N>
-HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-  return Vec128<float, N>{wasm_f32x4_replace_lane(v.raw, kLane, t)};
-}
-
-template <size_t kLane, size_t N>
-HWY_INLINE Vec128<double, N> InsertLane(const Vec128<double, N> v, double t) {
-  static_assert(kLane < 2, "Lane index out of bounds");
-  return Vec128<double, N>{wasm_f64x2_replace_lane(v.raw, kLane, t)};
-}
-
-}  // namespace detail
-
-// Requires one overload per vector length because InsertLane<3> may be a
-// compile error if it calls wasm_f64x2_replace_lane.
-
-template <typename T>
-HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  return Set(DFromV<decltype(v)>(), t);
-}
-
-template <typename T>
-HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[2];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[4];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-      case 4:
-        return detail::InsertLane<4>(v, t);
-      case 5:
-        return detail::InsertLane<5>(v, t);
-      case 6:
-        return detail::InsertLane<6>(v, t);
-      case 7:
-        return detail::InsertLane<7>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[8];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-      case 4:
-        return detail::InsertLane<4>(v, t);
-      case 5:
-        return detail::InsertLane<5>(v, t);
-      case 6:
-        return detail::InsertLane<6>(v, t);
-      case 7:
-        return detail::InsertLane<7>(v, t);
-      case 8:
-        return detail::InsertLane<8>(v, t);
-      case 9:
-        return detail::InsertLane<9>(v, t);
-      case 10:
-        return detail::InsertLane<10>(v, t);
-      case 11:
-        return detail::InsertLane<11>(v, t);
-      case 12:
-        return detail::InsertLane<12>(v, t);
-      case 13:
-        return detail::InsertLane<13>(v, t);
-      case 14:
-        return detail::InsertLane<14>(v, t);
-      case 15:
-        return detail::InsertLane<15>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[16];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-// ------------------------------ LowerHalf
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
-                                   Vec128<T, N> v) {
-  return Vec128<T, N / 2>{v.raw};
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
-  return LowerHalf(Simd<T, N / 2, 0>(), v);
-}
-
-// ------------------------------ ShiftLeftBytes
-
-// 0x01..0F, kBytes = 1 => 0x02..0F00
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  const __i8x16 zero = wasm_i8x16_splat(0);
-  switch (kBytes) {
-    case 0:
-      return v;
-
-    case 1:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5,
-                                             6, 7, 8, 9, 10, 11, 12, 13, 14)};
-
-    case 2:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4,
-                                             5, 6, 7, 8, 9, 10, 11, 12, 13)};
-
-    case 3:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2,
-                                             3, 4, 5, 6, 7, 8, 9, 10, 11, 12)};
-
-    case 4:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1,
-                                             2, 3, 4, 5, 6, 7, 8, 9, 10, 11)};
-
-    case 5:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0,
-                                             1, 2, 3, 4, 5, 6, 7, 8, 9, 10)};
-
-    case 6:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
-
-    case 7:
-      return Vec128<T, N>{wasm_i8x16_shuffle(
-          v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
-
-    case 8:
-      return Vec128<T, N>{wasm_i8x16_shuffle(
-          v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
-
-    case 9:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 16, 16, 16, 0, 1, 2, 3, 4, 5,
-                                             6)};
-
-    case 10:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 16, 16, 16, 16, 0, 1, 2, 3, 4,
-                                             5)};
-
-    case 11:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 16, 16, 16, 16, 16, 0, 1, 2, 3,
-                                             4)};
-
-    case 12:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 16, 16, 16, 16, 16, 16, 0, 1,
-                                             2, 3)};
-
-    case 13:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 16, 16, 16, 16, 16, 16, 16, 0,
-                                             1, 2)};
-
-    case 14:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 16, 16, 16, 16, 16, 16, 16, 16,
-                                             0, 1)};
-
-    case 15:
-      return Vec128<T, N>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16,
-                                             16, 16, 16, 16, 16, 16, 16, 16, 16,
-                                             16, 0)};
-  }
-  return Vec128<T, N>{zero};
-}
-
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
-  return ShiftLeftBytes<kBytes>(Simd<T, N, 0>(), v);
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
-  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
-}
-
-// ------------------------------ ShiftRightBytes
-namespace detail {
-
-// Helper function allows zeroing invalid lanes in caller.
-template <int kBytes, typename T, size_t N>
-HWY_API __i8x16 ShrBytes(const Vec128<T, N> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  const __i8x16 zero = wasm_i8x16_splat(0);
-
-  switch (kBytes) {
-    case 0:
-      return v.raw;
-
-    case 1:
-      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                                12, 13, 14, 15, 16);
-
-    case 2:
-      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                13, 14, 15, 16, 16);
-
-    case 3:
-      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                13, 14, 15, 16, 16, 16);
-
-    case 4:
-      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
-                                14, 15, 16, 16, 16, 16);
-
-    case 5:
-      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-                                15, 16, 16, 16, 16, 16);
-
-    case 6:
-      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                                16, 16, 16, 16, 16, 16);
-
-    case 7:
-      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 8:
-      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 9:
-      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 10:
-      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 11:
-      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 12:
-      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 13:
-      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 14:
-      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 15:
-      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-    case 16:
-      return zero;
-  }
-}
-
-}  // namespace detail
-
-// 0x01..0F, kBytes = 1 => 0x0001..0E
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  // For partial vectors, clear upper lanes so we shift in zeros.
-  if (N != 16 / sizeof(T)) {
-    const Vec128<T> vfull{v.raw};
-    v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
-  }
-  return Vec128<T, N>{detail::ShrBytes<kBytes>(v)};
-}
-
-// ------------------------------ ShiftRightLanes
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
-}
-
-// ------------------------------ UpperHalf (ShiftRightBytes)
-
-// Full input: copy hi into lo (smaller instruction encoding than shifts).
-template <typename T>
-HWY_API Vec64<T> UpperHalf(Full64<T> /* tag */, const Vec128<T> v) {
-  return Vec64<T>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
-}
-HWY_API Vec64<float> UpperHalf(Full64<float> /* tag */, const Vec128<float> v) {
-  return Vec64<float>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
-}
-
-// Partial
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
-                                         Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto vu = BitCast(du, v);
-  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
-  return Vec128<T, (N + 1) / 2>{upper.raw};
-}
-
-// ------------------------------ CombineShiftRightBytes
-
-template <int kBytes, typename T, class V = Vec128<T>>
-HWY_API V CombineShiftRightBytes(Full128<T> /* tag */, V hi, V lo) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  switch (kBytes) {
-    case 0:
-      return lo;
-
-    case 1:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                                  11, 12, 13, 14, 15, 16)};
-
-    case 2:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                                  11, 12, 13, 14, 15, 16, 17)};
-
-    case 3:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                                  12, 13, 14, 15, 16, 17, 18)};
-
-    case 4:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                  13, 14, 15, 16, 17, 18, 19)};
-
-    case 5:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
-                                  14, 15, 16, 17, 18, 19, 20)};
-
-    case 6:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
-                                  14, 15, 16, 17, 18, 19, 20, 21)};
-
-    case 7:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
-                                  15, 16, 17, 18, 19, 20, 21, 22)};
-
-    case 8:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
-                                  16, 17, 18, 19, 20, 21, 22, 23)};
-
-    case 9:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
-                                  17, 18, 19, 20, 21, 22, 23, 24)};
-
-    case 10:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
-                                  17, 18, 19, 20, 21, 22, 23, 24, 25)};
-
-    case 11:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
-                                  18, 19, 20, 21, 22, 23, 24, 25, 26)};
-
-    case 12:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
-                                  19, 20, 21, 22, 23, 24, 25, 26, 27)};
-
-    case 13:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
-                                  20, 21, 22, 23, 24, 25, 26, 27, 28)};
-
-    case 14:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
-                                  21, 22, 23, 24, 25, 26, 27, 28, 29)};
-
-    case 15:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
-                                  22, 23, 24, 25, 26, 27, 28, 29, 30)};
-  }
-  return hi;
-}
-
-template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
-          class V = Vec128<T, N>>
-HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
-  constexpr size_t kSize = N * sizeof(T);
-  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Full128<uint8_t> d_full8;
-  using V8 = VFromD<decltype(d_full8)>;
-  const V8 hi8{BitCast(d8, hi).raw};
-  // Move into most-significant bytes
-  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
-  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
-  return V{BitCast(Full128<T>(), r).raw};
-}
-
-// ------------------------------ Broadcast/splat any lane
-
-template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<T, N>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
-                                         kLane, kLane, kLane, kLane, kLane)};
-}
-
-template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<T, N>{
-      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
-}
-
-template <int kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Broadcast(const Vec128<T, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<T, N>{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)};
-}
-
-// ------------------------------ TableLookupBytes
-
-// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
-// lane indices in [0, 16).
-template <typename T, size_t N, typename TI, size_t NI>
-HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
-                                        const Vec128<TI, NI> from) {
-// Not yet available in all engines, see
-// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
-// V8 implementation of this had a bug, fixed on 2021-04-03:
-// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
-#if 0
-  return Vec128<TI, NI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
-#else
-  alignas(16) uint8_t control[16];
-  alignas(16) uint8_t input[16];
-  alignas(16) uint8_t output[16];
-  wasm_v128_store(control, from.raw);
-  wasm_v128_store(input, bytes.raw);
-  for (size_t i = 0; i < 16; ++i) {
-    output[i] = control[i] < 16 ? input[control[i]] : 0;
-  }
-  return Vec128<TI, NI>{wasm_v128_load(output)};
-#endif
-}
-
-template <typename T, size_t N, typename TI, size_t NI>
-HWY_API Vec128<TI, NI> TableLookupBytesOr0(const Vec128<T, N> bytes,
-                                           const Vec128<TI, NI> from) {
-  const Simd<TI, NI, 0> d;
-  // Mask size must match vector type, so cast everything to this type.
-  Repartition<int8_t, decltype(d)> di8;
-  Repartition<int8_t, Simd<T, N, 0>> d_bytes8;
-  const auto msb = BitCast(di8, from) < Zero(di8);
-  const auto lookup =
-      TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
-  return BitCast(d, IfThenZeroElse(msb, lookup));
-}
-
-// ------------------------------ Hard-coded shuffles
-
-// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
-// Shuffle0321 rotates one lane to the right (the previous least-significant
-// lane is now most-significant). These could also be implemented via
-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
-
-// Swap 32-bit halves in 64-bit halves.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
-}
-
-// These are used by generic_ops-inl to implement LoadInterleaved3.
-namespace detail {
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16,
-                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
-                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8,
-                                         0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16,
-                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
-                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8,
-                                         0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Shuffle1230(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16,
-                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F,
-                                         0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)};
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8,
-                                         0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)};
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Shuffle3012(const Vec128<T, N> a, const Vec128<T, N> b) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)};
-}
-
-}  // namespace detail
-
-// Swap 64-bit halves
-template <typename T>
-HWY_API Vec128<T> Shuffle01(const Vec128<T> v) {
-  static_assert(sizeof(T) == 8, "Only for 64-bit lanes");
-  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
-}
-template <typename T>
-HWY_API Vec128<T> Shuffle1032(const Vec128<T> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
-  return Vec128<T>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
-}
-
-// Rotate right 32 bits
-template <typename T>
-HWY_API Vec128<T> Shuffle0321(const Vec128<T> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
-  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
-}
-
-// Rotate left 32 bits
-template <typename T>
-HWY_API Vec128<T> Shuffle2103(const Vec128<T> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
-  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
-}
-
-// Reverse
-template <typename T>
-HWY_API Vec128<T> Shuffle0123(const Vec128<T> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
-  return Vec128<T>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
-}
-
-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T, size_t N>
-struct Indices128 {
-  __v128_u raw;
-};
-
-template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
-HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-#if HWY_IS_DEBUG_BUILD
-  const Rebind<TI, decltype(d)> di;
-  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
-              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
-#endif
-
-  const Repartition<uint8_t, decltype(d)> d8;
-  using V8 = VFromD<decltype(d8)>;
-  const Repartition<uint16_t, decltype(d)> d16;
-
-  // Broadcast each lane index to all bytes of T and shift to bytes
-  static_assert(sizeof(T) == 4 || sizeof(T) == 8, "");
-  if (sizeof(T) == 4) {
-    alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
-        0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
-    const V8 lane_indices =
-        TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
-    const V8 byte_indices =
-        BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
-    alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
-                                                      0, 1, 2, 3, 0, 1, 2, 3};
-    return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
-  } else {
-    alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
-        0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
-    const V8 lane_indices =
-        TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes));
-    const V8 byte_indices =
-        BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices)));
-    alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7,
-                                                      0, 1, 2, 3, 4, 5, 6, 7};
-    return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
-  }
-}
-
-template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
-HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
-  const Rebind<TI, decltype(d)> di;
-  return IndicesFromVec(d, LoadU(di, idx));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
-  using TI = MakeSigned<T>;
-  const DFromV<decltype(v)> d;
-  const Rebind<TI, decltype(d)> di;
-  return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128<TI, N>{idx.raw}));
-}
-
-// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
-
-// Single lane: no change
-template <typename T>
-HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
-  return v;
-}
-
-// Two lanes: shuffle
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, 2> Reverse(Simd<T, 2, 0> /* tag */, const Vec128<T, 2> v) {
-  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
-  return Shuffle01(v);
-}
-
-// Four lanes: shuffle
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
-  return Shuffle0123(v);
-}
-
-// 16-bit
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
-  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
-}
-
-// ------------------------------ Reverse2
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
-  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return Shuffle2301(v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return Shuffle01(v);
-}
-
-// ------------------------------ Reverse4
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  return BitCast(d, Vec128<uint16_t, N>{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2,
-                                                           1, 0, 7, 6, 5, 4)});
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return Shuffle0123(v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, const Vec128<T, N>) {
-  HWY_ASSERT(0);  // don't have 8 u64 lanes
-}
-
-// ------------------------------ Reverse8
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  return Reverse(d, v);
-}
-
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0>, const Vec128<T, N>) {
-  HWY_ASSERT(0);  // don't have 8 lanes unless 16-bit
-}
-
-// ------------------------------ InterleaveLower
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> InterleaveLower(Vec128<uint8_t, N> a,
-                                           Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(
-      a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> InterleaveLower(Vec128<uint16_t, N> a,
-                                            Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> InterleaveLower(Vec128<uint32_t, N> a,
-                                            Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> InterleaveLower(Vec128<uint64_t, N> a,
-                                            Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> InterleaveLower(Vec128<int8_t, N> a,
-                                          Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_shuffle(
-      a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> InterleaveLower(Vec128<int16_t, N> a,
-                                           Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> InterleaveLower(Vec128<int32_t, N> a,
-                                           Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> InterleaveLower(Vec128<int64_t, N> a,
-                                           Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> InterleaveLower(Vec128<float, N> a,
-                                         Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> InterleaveLower(Vec128<double, N> a,
-                                          Vec128<double, N> b) {
-  return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
-}
-
-// Additional overload for the optional tag.
-template <class V>
-HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
-  return InterleaveLower(a, b);
-}
-
-// ------------------------------ InterleaveUpper (UpperHalf)
-
-// All functions inside detail lack the required D parameter.
-namespace detail {
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> InterleaveUpper(Vec128<uint8_t, N> a,
-                                           Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
-                                               26, 11, 27, 12, 28, 13, 29, 14,
-                                               30, 15, 31)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> InterleaveUpper(Vec128<uint16_t, N> a,
-                                            Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> InterleaveUpper(Vec128<uint32_t, N> a,
-                                            Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> InterleaveUpper(Vec128<uint64_t, N> a,
-                                            Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> InterleaveUpper(Vec128<int8_t, N> a,
-                                          Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10,
-                                              26, 11, 27, 12, 28, 13, 29, 14,
-                                              30, 15, 31)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> InterleaveUpper(Vec128<int16_t, N> a,
-                                           Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> InterleaveUpper(Vec128<int32_t, N> a,
-                                           Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> InterleaveUpper(Vec128<int64_t, N> a,
-                                           Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> InterleaveUpper(Vec128<float, N> a,
-                                         Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> InterleaveUpper(Vec128<double, N> a,
-                                          Vec128<double, N> b) {
-  return Vec128<double, N>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
-}
-
-}  // namespace detail
-
-// Full
-template <typename T, class V = Vec128<T>>
-HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
-  return detail::InterleaveUpper(a, b);
-}
-
-// Partial
-template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
-HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
-  const Half<decltype(d)> d2;
-  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
-}
-
-// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
-
-// Same as Interleave*, except that the return lanes are double-width integers;
-// this is necessary because the single-lane scalar cannot return two values.
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipLower(V a, V b) {
-  return BitCast(DW(), InterleaveLower(a, b));
-}
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveLower(D(), a, b));
-}
-
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveUpper(D(), a, b));
-}
-
-// ================================================== COMBINE
-
-// ------------------------------ Combine (InterleaveLower)
-
-// N = N/2 + N/2 (upper half undefined)
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
-                             Vec128<T, N / 2> lo_half) {
-  const Half<decltype(d)> d2;
-  const RebindToUnsigned<decltype(d2)> du2;
-  // Treat half-width input as one lane, and expand to two lanes.
-  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
-  const VU lo{BitCast(du2, lo_half).raw};
-  const VU hi{BitCast(du2, hi_half).raw};
-  return BitCast(d, InterleaveLower(lo, hi));
-}
-
-// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
-  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
-}
-
-// ------------------------------ ConcatLowerLower
-
-// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
-template <typename T>
-HWY_API Vec128<T> ConcatLowerLower(Full128<T> /* tag */, const Vec128<T> hi,
-                                   const Vec128<T> lo) {
-  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
-                                      const Vec128<T, N> lo) {
-  const Half<decltype(d)> d2;
-  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
-}
-
-// ------------------------------ ConcatUpperUpper
-
-template <typename T>
-HWY_API Vec128<T> ConcatUpperUpper(Full128<T> /* tag */, const Vec128<T> hi,
-                                   const Vec128<T> lo) {
-  return Vec128<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
-                                      const Vec128<T, N> lo) {
-  const Half<decltype(d)> d2;
-  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
-}
-
-// ------------------------------ ConcatLowerUpper
-
-template <typename T>
-HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
-                                   const Vec128<T> lo) {
-  return CombineShiftRightBytes<8>(d, hi, lo);
-}
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
-                                      const Vec128<T, N> lo) {
-  const Half<decltype(d)> d2;
-  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
-}
-
-// ------------------------------ ConcatUpperLower
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, const Vec128<T, N> hi,
-                                      const Vec128<T, N> lo) {
-  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
-}
-
-// ------------------------------ ConcatOdd
-
-// 8-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
-  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15,
-                                      17, 19, 21, 23, 25, 27, 29, 31)};
-}
-
-// 8-bit x8
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 8> ConcatOdd(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
-                               Vec128<T, 8> lo) {
-  // Don't care about upper half.
-  return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21,
-                                         23, 1, 3, 5, 7, 17, 19, 21, 23)};
-}
-
-// 8-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
-                               Vec128<T, 4> lo) {
-  // Don't care about upper 3/4.
-  return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17,
-                                         19, 1, 3, 17, 19, 1, 3, 17, 19)};
-}
-
-// 16-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
-  return Vec128<T>{
-      wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)};
-}
-
-// 16-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, 4> ConcatOdd(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
-                               Vec128<T, 4> lo) {
-  // Don't care about upper half.
-  return Vec128<T, 4>{
-      wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)};
-}
-
-// 32-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> ConcatOdd(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
-  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
-}
-
-// Any T x2
-template <typename T>
-HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
-                               Vec128<T, 2> lo) {
-  return InterleaveUpper(d, lo, hi);
-}
-
-// ------------------------------ ConcatEven (InterleaveLower)
-
-// 8-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
-  return Vec128<T>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14,
-                                      16, 18, 20, 22, 24, 26, 28, 30)};
-}
-
-// 8-bit x8
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 8> ConcatEven(Simd<T, 8, 0> /* tag */, Vec128<T, 8> hi,
-                                Vec128<T, 8> lo) {
-  // Don't care about upper half.
-  return Vec128<T, 8>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20,
-                                         22, 0, 2, 4, 6, 16, 18, 20, 22)};
-}
-
-// 8-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
-                                Vec128<T, 4> lo) {
-  // Don't care about upper 3/4.
-  return Vec128<T, 4>{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16,
-                                         18, 0, 2, 16, 18, 0, 2, 16, 18)};
-}
-
-// 16-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
-  return Vec128<T>{
-      wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)};
-}
-
-// 16-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, 4> ConcatEven(Simd<T, 4, 0> /* tag */, Vec128<T, 4> hi,
-                                Vec128<T, 4> lo) {
-  // Don't care about upper half.
-  return Vec128<T, 4>{
-      wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)};
-}
-
-// 32-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> ConcatEven(Full128<T> /* tag */, Vec128<T> hi, Vec128<T> lo) {
-  return Vec128<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
-}
-
-// Any T x2
-template <typename T>
-HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
-                                Vec128<T, 2> lo) {
-  return InterleaveLower(d, lo, hi);
-}
-
-// ------------------------------ DupEven (InterleaveLower)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
-  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
-  return InterleaveLower(DFromV<decltype(v)>(), v, v);
-}
-
-// ------------------------------ DupOdd (InterleaveUpper)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
-  return Vec128<T, N>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
-  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
-}
-
-// ------------------------------ OddEven
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<1> /* tag */, const Vec128<T, N> a,
-                                const Vec128<T, N> b) {
-  const DFromV<decltype(a)> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
-                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
-  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<2> /* tag */, const Vec128<T, N> a,
-                                const Vec128<T, N> b) {
-  return Vec128<T, N>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<4> /* tag */, const Vec128<T, N> a,
-                                const Vec128<T, N> b) {
-  return Vec128<T, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> OddEven(hwy::SizeTag<8> /* tag */, const Vec128<T, N> a,
-                                const Vec128<T, N> b) {
-  return Vec128<T, N>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-template <size_t N>
-HWY_API Vec128<float, N> OddEven(const Vec128<float, N> a,
-                                 const Vec128<float, N> b) {
-  return Vec128<float, N>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
-}
-
-// ------------------------------ OddEvenBlocks
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
-  return even;
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
-  return v;
-}
-
-// ------------------------------ ReverseBlocks
-
-// Single block: no change
-template <typename T>
-HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
-  return v;
-}
-
-// ================================================== CONVERT
-
-// ------------------------------ Promotions (part w/ narrow lanes -> full)
-
-// Unsigned: zero-extend.
-template <size_t N>
-HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
-                                      const Vec128<uint8_t, N> v) {
-  return Vec128<uint16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
-                                      const Vec128<uint8_t, N> v) {
-  return Vec128<uint32_t, N>{
-      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
-                                     const Vec128<uint8_t, N> v) {
-  return Vec128<int16_t, N>{wasm_u16x8_extend_low_u8x16(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<uint8_t, N> v) {
-  return Vec128<int32_t, N>{
-      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
-                                      const Vec128<uint16_t, N> v) {
-  return Vec128<uint32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
-                                      const Vec128<uint32_t, N> v) {
-  return Vec128<uint64_t, N>{wasm_u64x2_extend_low_u32x4(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<uint16_t, N> v) {
-  return Vec128<int32_t, N>{wasm_u32x4_extend_low_u16x8(v.raw)};
-}
-
-// Signed: replicate sign bit.
-template <size_t N>
-HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
-                                     const Vec128<int8_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_extend_low_i8x16(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<int8_t, N> v) {
-  return Vec128<int32_t, N>{
-      wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<int16_t, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_extend_low_i16x8(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
-                                     const Vec128<int32_t, N> v) {
-  return Vec128<int64_t, N>{wasm_i64x2_extend_low_i32x4(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  return Vec128<double, N>{wasm_f64x2_convert_low_i32x4(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
-                                   const Vec128<float16_t, N> v) {
-  const RebindToSigned<decltype(df32)> di32;
-  const RebindToUnsigned<decltype(df32)> du32;
-  // Expand to u32 so we can shift.
-  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
-  const auto sign = ShiftRight<15>(bits16);
-  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
-  const auto mantissa = bits16 & Set(du32, 0x3FF);
-  const auto subnormal =
-      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
-                        Set(df32, 1.0f / 16384 / 1024));
-
-  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
-  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
-  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
-  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
-  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
-                                   const Vec128<bfloat16_t, N> v) {
-  const Rebind<uint16_t, decltype(df32)> du16;
-  const RebindToSigned<decltype(df32)> di32;
-  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
-}
-
-// ------------------------------ Demotions (full -> part w/ narrow lanes)
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
-                                     const Vec128<int32_t, N> v) {
-  return Vec128<uint16_t, N>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  return Vec128<int16_t, N>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
-  return Vec128<uint8_t, N>{
-      wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
-                                    const Vec128<int16_t, N> v) {
-  return Vec128<uint8_t, N>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
-                                   const Vec128<int32_t, N> v) {
-  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
-  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
-                                   const Vec128<int16_t, N> v) {
-  return Vec128<int8_t, N>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* di */,
-                                    const Vec128<double, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
-                                      const Vec128<float, N> v) {
-  const RebindToUnsigned<decltype(df16)> du16;
-  const Rebind<uint32_t, decltype(du16)> du;
-  const RebindToSigned<decltype(du)> di;
-  const auto bits32 = BitCast(du, v);
-  const auto sign = ShiftRight<31>(bits32);
-  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
-  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
-
-  const auto k15 = Set(di, 15);
-  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
-  const auto is_tiny = exp < Set(di, -24);
-
-  const auto is_subnormal = exp < Set(di, -14);
-  const auto biased_exp16 =
-      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
-  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
-  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
-                     (mantissa32 >> (Set(du, 13) + sub_exp));
-  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
-                                     ShiftRight<13>(mantissa32));  // <1024
-
-  const auto sign16 = ShiftLeft<15>(sign);
-  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
-  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
-  return Vec128<float16_t, N>{DemoteTo(du16, bits16).raw};
-}
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
-                                       const Vec128<float, N> v) {
-  const Rebind<int32_t, decltype(dbf16)> di32;
-  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
-  const Rebind<uint16_t, decltype(dbf16)> du16;
-  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
-  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
-}
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
-    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-// For already range-limited input [0, 255].
-template <size_t N>
-HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
-  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
-  return Vec128<uint8_t, N>{
-      wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
-}
-
-// ------------------------------ Truncations
-
-template <typename From, typename To, HWY_IF_UNSIGNED(From),
-          HWY_IF_UNSIGNED(To),
-          hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
-HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
-                                 const Vec128<From, 1> v) {
-  const Repartition<To, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  return Vec128<To, 1>{v1.raw};
-}
-
-HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
-                                      const Vec128<uint64_t> v) {
-  const Full128<uint8_t> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = ConcatEven(d, v1, v1);
-  const auto v4 = ConcatEven(d, v2, v2);
-  return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4))));
-}
-
-HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
-                                       const Vec128<uint64_t> v) {
-  const Full128<uint16_t> d;
-  const auto v1 = BitCast(d, v);
-  const auto v2 = ConcatEven(d, v1, v1);
-  return LowerHalf(LowerHalf(ConcatEven(d, v2, v2)));
-}
-
-HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
-                                       const Vec128<uint64_t> v) {
-  const Full128<uint32_t> d;
-  const auto v1 = BitCast(d, v);
-  return LowerHalf(ConcatEven(d, v1, v1));
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint32_t, N> v) {
-  const Full128<uint8_t> d;
-  const auto v1 = Vec128<uint8_t>{v.raw};
-  const auto v2 = ConcatEven(d, v1, v1);
-  const auto v3 = ConcatEven(d, v2, v2);
-  return Vec128<uint8_t, N>{v3.raw};
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
-                                       const Vec128<uint32_t, N> v) {
-  const Full128<uint16_t> d;
-  const auto v1 = Vec128<uint16_t>{v.raw};
-  const auto v2 = ConcatEven(d, v1, v1);
-  return Vec128<uint16_t, N>{v2.raw};
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint16_t, N> v) {
-  const Full128<uint8_t> d;
-  const auto v1 = Vec128<uint8_t>{v.raw};
-  const auto v2 = ConcatEven(d, v1, v1);
-  return Vec128<uint8_t, N>{v2.raw};
-}
-
-// ------------------------------ Convert i32 <=> f32 (Round)
-
-template <size_t N>
-HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<int32_t, N> v) {
-  return Vec128<float, N>{wasm_f32x4_convert_i32x4(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<uint32_t, N> v) {
-  return Vec128<float, N>{wasm_f32x4_convert_u32x4(v.raw)};
-}
-// Truncates (rounds toward zero).
-template <size_t N>
-HWY_API Vec128<int32_t, N> ConvertTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<float, N> v) {
-  return Vec128<int32_t, N>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
-  return ConvertTo(Simd<int32_t, N, 0>(), Round(v));
-}
-
-// ================================================== MISC
-
-// ------------------------------ SumsOf8 (ShiftRight, Add)
-template <size_t N>
-HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
-  const DFromV<decltype(v)> du8;
-  const RepartitionToWide<decltype(du8)> du16;
-  const RepartitionToWide<decltype(du16)> du32;
-  const RepartitionToWide<decltype(du32)> du64;
-  using VU16 = VFromD<decltype(du16)>;
-
-  const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
-  const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF));
-  const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
-
-  const VU16 szz_FE_zz_BA_zz_76_zz_32 =
-      BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
-  const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
-      Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
-  const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
-      BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
-  const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
-      Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
-  return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF));
-}
-
-// ------------------------------ LoadMaskBits (TestBit)
-
-namespace detail {
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  // Easier than Set(), which would require an >8-bit type, which would not
-  // compile for T=uint8_t, N=1.
-  const Vec128<T, N> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
-
-  // Replicate bytes 8x such that each byte contains the bit that governs it.
-  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
-                                             1, 1, 1, 1, 1, 1, 1, 1};
-  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
-
-  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
-                                            1, 2, 4, 8, 16, 32, 64, 128};
-  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-  return RebindMask(
-      d, TestBit(Set(du, static_cast<uint16_t>(bits)), Load(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
-  return RebindMask(
-      d, TestBit(Set(du, static_cast<uint32_t>(bits)), Load(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
-  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
-                                   const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
-  return detail::LoadMaskBits(d, mask_bits);
-}
-
-// ------------------------------ Mask
-
-namespace detail {
-
-// Full
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
-                                 const Mask128<T> mask) {
-  alignas(16) uint64_t lanes[2];
-  wasm_v128_store(lanes, mask.raw);
-
-  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
-  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
-  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
-  return (hi + lo);
-}
-
-// 64-bit
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
-                                 const Mask128<T, 8> mask) {
-  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
-  return (static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0)) *
-          kMagic) >>
-         56;
-}
-
-// 32-bit or less: need masking
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  uint64_t bytes = static_cast<uint64_t>(wasm_i64x2_extract_lane(mask.raw, 0));
-  // Clear potentially undefined bytes.
-  bytes &= (1ULL << (N * 8)) - 1;
-  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
-  return (bytes * kMagic) >> 56;
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  // Remove useless lower half of each u16 while preserving the sign bit.
-  const __i16x8 zero = wasm_i16x8_splat(0);
-  const Mask128<uint8_t, N> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
-  return BitsFromMask(hwy::SizeTag<1>(), mask8);
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
-  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
-  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
-  alignas(16) uint32_t lanes[4];
-  wasm_v128_store(lanes, sliced_mask);
-  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  const __i64x2 mask_i = static_cast<__i64x2>(mask.raw);
-  const __i64x2 slice = wasm_i64x2_make(1, 2);
-  const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice);
-  alignas(16) uint64_t lanes[2];
-  wasm_v128_store(lanes, sliced_mask);
-  return lanes[0] | lanes[1];
-}
-
-// Returns the lowest N bits for the BitsFromMask result.
-template <typename T, size_t N>
-constexpr uint64_t OnlyActive(uint64_t bits) {
-  return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1);
-}
-
-// Returns 0xFF for bytes with index >= N, otherwise 0.
-template <size_t N>
-constexpr __i8x16 BytesAbove() {
-  return /**/
-      (N == 0)    ? wasm_i32x4_make(-1, -1, -1, -1)
-      : (N == 4)  ? wasm_i32x4_make(0, -1, -1, -1)
-      : (N == 8)  ? wasm_i32x4_make(0, 0, -1, -1)
-      : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
-      : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
-      : (N == 2)  ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
-      : (N == 6)  ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
-      : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
-      : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
-      : (N == 1)  ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1, -1, -1)
-      : (N == 3)  ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1, -1)
-      : (N == 5)  ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1, -1)
-      : (N == 7)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1)
-      : (N == 9)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
-                                   -1, -1, -1)
-      : (N == 11)
-          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
-      : (N == 13)
-          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
-          : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
-  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
-  return PopCount(BitsFromMask(tag, m));
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
-  return PopCount(BitsFromMask(tag, m));
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
-  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
-  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
-  alignas(16) uint64_t lanes[2];
-  wasm_v128_store(lanes, shifted_bits);
-  return PopCount(lanes[0] | lanes[1]);
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
-  alignas(16) int64_t lanes[2];
-  wasm_v128_store(lanes, m.raw);
-  return static_cast<size_t>(-(lanes[0] + lanes[1]));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 writable bytes.
-template <typename T, size_t N>
-HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
-                             const Mask128<T, N> mask, uint8_t* bits) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  const size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(&mask_bits, bits);
-  return kNumBytes;
-}
-
-template <typename T, size_t N>
-HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
-  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
-}
-
-// Partial vector
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API size_t CountTrue(const Simd<T, N, 0> d, const Mask128<T, N> m) {
-  // Ensure all undefined bytes are 0.
-  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
-  return CountTrue(d, Mask128<T>{AndNot(mask, m).raw});
-}
-
-// Full vector
-template <typename T>
-HWY_API bool AllFalse(const Full128<T> d, const Mask128<T> m) {
-#if 0
-  // Casting followed by wasm_i8x16_any_true results in wasm error:
-  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
-  const auto v8 = BitCast(Full128<int8_t>(), VecFromMask(d, m));
-  return !wasm_i8x16_any_true(v8.raw);
-#else
-  (void)d;
-  return (wasm_i64x2_extract_lane(m.raw, 0) |
-          wasm_i64x2_extract_lane(m.raw, 1)) == 0;
-#endif
-}
-
-// Full vector
-namespace detail {
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
-  return wasm_i8x16_all_true(m.raw);
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
-  return wasm_i16x8_all_true(m.raw);
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
-  return wasm_i32x4_all_true(m.raw);
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128<T> m) {
-  return wasm_i64x2_all_true(m.raw);
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T> m) {
-  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
-}
-
-// Partial vectors
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API bool AllFalse(Simd<T, N, 0> /* tag */, const Mask128<T, N> m) {
-  // Ensure all undefined bytes are 0.
-  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
-  return AllFalse(Full128<T>(), Mask128<T>{AndNot(mask, m).raw});
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API bool AllTrue(const Simd<T, N, 0> /* d */, const Mask128<T, N> m) {
-  // Ensure all undefined bytes are FF.
-  const Mask128<T, N> mask{detail::BytesAbove<N * sizeof(T)>()};
-  return AllTrue(Full128<T>(), Mask128<T>{Or(mask, m).raw});
-}
-
-template <typename T, size_t N>
-HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
-                               const Mask128<T, N> mask) {
-  const uint64_t bits = detail::BitsFromMask(mask);
-  return bits ? static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1;
-}
-
-// ------------------------------ Compress
-
-namespace detail {
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 256);
-  const Simd<T, N, 0> d;
-  const Rebind<uint8_t, decltype(d)> d8;
-  const Simd<uint16_t, N, 0> du;
-
-  // We need byte indices for TableLookupBytes (one vector's worth for each of
-  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
-  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
-  // with the doubling baked into the table. Unpacking nibbles is likely more
-  // costly than the higher cache footprint from storing bytes.
-  alignas(16) constexpr uint8_t table[256 * 8] = {
-      // PrintCompress16x8Tables
-      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
-      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0, 6,  2,  4,  8,  10, 12, 14,  //
-      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0, 2,  6,  4,  8,  10, 12, 14,  //
-      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0, 4,  6,  2,  8,  10, 12, 14,  //
-      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0, 8,  2,  4,  6,  10, 12, 14,  //
-      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0, 2,  8,  4,  6,  10, 12, 14,  //
-      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0, 4,  8,  2,  6,  10, 12, 14,  //
-      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0, 2,  4,  8,  6,  10, 12, 14,  //
-      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0, 6,  8,  2,  4,  10, 12, 14,  //
-      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0, 2,  6,  8,  4,  10, 12, 14,  //
-      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0, 4,  6,  8,  2,  10, 12, 14,  //
-      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0, 10, 2,  4,  6,  8,  12, 14,  //
-      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0, 2,  10, 4,  6,  8,  12, 14,  //
-      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0, 4,  10, 2,  6,  8,  12, 14,  //
-      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0, 2,  4,  10, 6,  8,  12, 14,  //
-      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0, 6,  10, 2,  4,  8,  12, 14,  //
-      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0, 2,  6,  10, 4,  8,  12, 14,  //
-      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0, 4,  6,  10, 2,  8,  12, 14,  //
-      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0, 2,  4,  6,  10, 8,  12, 14,  //
-      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0, 8,  10, 2,  4,  6,  12, 14,  //
-      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0, 2,  8,  10, 4,  6,  12, 14,  //
-      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0, 4,  8,  10, 2,  6,  12, 14,  //
-      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0, 2,  4,  8,  10, 6,  12, 14,  //
-      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0, 6,  8,  10, 2,  4,  12, 14,  //
-      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0, 2,  6,  8,  10, 4,  12, 14,  //
-      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0, 4,  6,  8,  10, 2,  12, 14,  //
-      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0, 12, 2,  4,  6,  8,  10, 14,  //
-      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0, 2,  12, 4,  6,  8,  10, 14,  //
-      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0, 4,  12, 2,  6,  8,  10, 14,  //
-      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0, 2,  4,  12, 6,  8,  10, 14,  //
-      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0, 6,  12, 2,  4,  8,  10, 14,  //
-      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0, 2,  6,  12, 4,  8,  10, 14,  //
-      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0, 4,  6,  12, 2,  8,  10, 14,  //
-      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0, 2,  4,  6,  12, 8,  10, 14,  //
-      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0, 8,  12, 2,  4,  6,  10, 14,  //
-      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0, 2,  8,  12, 4,  6,  10, 14,  //
-      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0, 4,  8,  12, 2,  6,  10, 14,  //
-      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0, 2,  4,  8,  12, 6,  10, 14,  //
-      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0, 6,  8,  12, 2,  4,  10, 14,  //
-      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0, 2,  6,  8,  12, 4,  10, 14,  //
-      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0, 4,  6,  8,  12, 2,  10, 14,  //
-      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0, 2,  4,  6,  8,  12, 10, 14,  //
-      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0, 10, 12, 2,  4,  6,  8,  14,  //
-      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0, 2,  10, 12, 4,  6,  8,  14,  //
-      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0, 4,  10, 12, 2,  6,  8,  14,  //
-      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0, 2,  4,  10, 12, 6,  8,  14,  //
-      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0, 6,  10, 12, 2,  4,  8,  14,  //
-      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0, 2,  6,  10, 12, 4,  8,  14,  //
-      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0, 4,  6,  10, 12, 2,  8,  14,  //
-      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0, 2,  4,  6,  10, 12, 8,  14,  //
-      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0, 8,  10, 12, 2,  4,  6,  14,  //
-      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0, 2,  8,  10, 12, 4,  6,  14,  //
-      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0, 4,  8,  10, 12, 2,  6,  14,  //
-      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0, 2,  4,  8,  10, 12, 6,  14,  //
-      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0, 6,  8,  10, 12, 2,  4,  14,  //
-      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0, 2,  6,  8,  10, 12, 4,  14,  //
-      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0, 4,  6,  8,  10, 12, 2,  14,  //
-      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0, 14, 2,  4,  6,  8,  10, 12,  //
-      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0, 2,  14, 4,  6,  8,  10, 12,  //
-      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0, 4,  14, 2,  6,  8,  10, 12,  //
-      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0, 2,  4,  14, 6,  8,  10, 12,  //
-      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0, 6,  14, 2,  4,  8,  10, 12,  //
-      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0, 2,  6,  14, 4,  8,  10, 12,  //
-      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0, 4,  6,  14, 2,  8,  10, 12,  //
-      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0, 2,  4,  6,  14, 8,  10, 12,  //
-      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0, 8,  14, 2,  4,  6,  10, 12,  //
-      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0, 2,  8,  14, 4,  6,  10, 12,  //
-      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0, 4,  8,  14, 2,  6,  10, 12,  //
-      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0, 2,  4,  8,  14, 6,  10, 12,  //
-      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0, 6,  8,  14, 2,  4,  10, 12,  //
-      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0, 2,  6,  8,  14, 4,  10, 12,  //
-      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0, 4,  6,  8,  14, 2,  10, 12,  //
-      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0, 2,  4,  6,  8,  14, 10, 12,  //
-      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0, 10, 14, 2,  4,  6,  8,  12,  //
-      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0, 2,  10, 14, 4,  6,  8,  12,  //
-      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0, 4,  10, 14, 2,  6,  8,  12,  //
-      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0, 2,  4,  10, 14, 6,  8,  12,  //
-      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0, 6,  10, 14, 2,  4,  8,  12,  //
-      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0, 2,  6,  10, 14, 4,  8,  12,  //
-      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0, 4,  6,  10, 14, 2,  8,  12,  //
-      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0, 2,  4,  6,  10, 14, 8,  12,  //
-      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0, 8,  10, 14, 2,  4,  6,  12,  //
-      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0, 2,  8,  10, 14, 4,  6,  12,  //
-      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0, 4,  8,  10, 14, 2,  6,  12,  //
-      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0, 2,  4,  8,  10, 14, 6,  12,  //
-      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0, 6,  8,  10, 14, 2,  4,  12,  //
-      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0, 2,  6,  8,  10, 14, 4,  12,  //
-      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0, 4,  6,  8,  10, 14, 2,  12,  //
-      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0, 2,  4,  6,  8,  10, 14, 12,  //
-      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0, 12, 14, 2,  4,  6,  8,  10,  //
-      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0, 2,  12, 14, 4,  6,  8,  10,  //
-      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0, 4,  12, 14, 2,  6,  8,  10,  //
-      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0, 2,  4,  12, 14, 6,  8,  10,  //
-      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0, 6,  12, 14, 2,  4,  8,  10,  //
-      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0, 2,  6,  12, 14, 4,  8,  10,  //
-      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0, 4,  6,  12, 14, 2,  8,  10,  //
-      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0, 2,  4,  6,  12, 14, 8,  10,  //
-      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0, 8,  12, 14, 2,  4,  6,  10,  //
-      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0, 2,  8,  12, 14, 4,  6,  10,  //
-      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0, 4,  8,  12, 14, 2,  6,  10,  //
-      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0, 2,  4,  8,  12, 14, 6,  10,  //
-      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0, 6,  8,  12, 14, 2,  4,  10,  //
-      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0, 2,  6,  8,  12, 14, 4,  10,  //
-      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0, 4,  6,  8,  12, 14, 2,  10,  //
-      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0, 2,  4,  6,  8,  12, 14, 10,  //
-      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0, 10, 12, 14, 2,  4,  6,  8,   //
-      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0, 2,  10, 12, 14, 4,  6,  8,   //
-      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0, 4,  10, 12, 14, 2,  6,  8,   //
-      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0, 2,  4,  10, 12, 14, 6,  8,   //
-      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0, 6,  10, 12, 14, 2,  4,  8,   //
-      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0, 2,  6,  10, 12, 14, 4,  8,   //
-      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0, 4,  6,  10, 12, 14, 2,  8,   //
-      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0, 2,  4,  6,  10, 12, 14, 8,   //
-      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0, 8,  10, 12, 14, 2,  4,  6,   //
-      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0, 2,  8,  10, 12, 14, 4,  6,   //
-      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0, 4,  8,  10, 12, 14, 2,  6,   //
-      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0, 2,  4,  8,  10, 12, 14, 6,   //
-      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0, 6,  8,  10, 12, 14, 2,  4,   //
-      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0, 2,  6,  8,  10, 12, 14, 4,   //
-      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0, 4,  6,  8,  10, 12, 14, 2,   //
-      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0, 2,  4,  6,  8,  10, 12, 14};
-
-  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
-  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
-  return BitCast(d, pairs + Set(du, 0x0100));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 256);
-  const Simd<T, N, 0> d;
-  const Rebind<uint8_t, decltype(d)> d8;
-  const Simd<uint16_t, N, 0> du;
-
-  // We need byte indices for TableLookupBytes (one vector's worth for each of
-  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
-  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
-  // with the doubling baked into the table. Unpacking nibbles is likely more
-  // costly than the higher cache footprint from storing bytes.
-  alignas(16) constexpr uint8_t table[256 * 8] = {
-      // PrintCompressNot16x8Tables
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
-      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
-      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
-      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
-      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
-      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
-      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
-      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
-      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
-      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
-      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
-      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
-      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
-      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
-      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
-      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
-      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
-      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
-      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
-      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
-      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
-      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
-      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
-      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
-      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
-      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
-      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
-      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
-      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
-      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
-      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
-      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
-      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
-      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
-      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
-      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
-      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
-      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
-      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
-      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
-      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
-      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
-      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
-      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
-      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
-      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
-      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
-      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
-      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
-      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
-      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
-      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
-      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
-      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
-      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
-      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
-      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
-      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
-      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
-      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
-      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
-      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
-      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
-      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
-      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
-      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
-      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
-      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
-      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
-      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
-      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
-      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
-      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
-      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
-      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
-      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
-      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
-      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
-      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
-      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
-      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
-      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
-      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
-      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
-      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
-      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
-      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
-      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
-      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
-      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
-      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
-      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
-      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
-      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
-      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
-      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
-      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
-      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
-      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
-      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
-      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
-      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
-      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
-      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
-      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
-      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
-      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
-      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
-      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
-      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
-      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
-      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
-      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
-      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
-      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
-      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
-      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
-      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
-      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
-      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
-      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
-
-  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
-  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
-  return BitCast(d, pairs + Set(du, 0x0100));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 16);
-
-  // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
-      // PrintCompress32x4Tables
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
-      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
-      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
-      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
-      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
-      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
-      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 16);
-
-  // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[16 * 16] = {
-      // PrintCompressNot32x4Tables
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
-      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
-      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
-      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
-      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
-      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
-      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
-      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
-      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
-      12, 13, 14, 15};
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Vec128<T, N> IdxFromBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
-      // PrintCompress64x2Tables
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Vec128<T, N> IdxFromNotBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[4 * 16] = {
-      // PrintCompressNot64x2Tables
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Simd<T, N, 0> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-// Helper functions called by both Compress and CompressStore - avoids a
-// redundant BitsFromMask in the latter.
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, const uint64_t mask_bits) {
-  const auto idx = detail::IdxFromBits<T, N>(mask_bits);
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, const uint64_t mask_bits) {
-  const auto idx = detail::IdxFromNotBits<T, N>(mask_bits);
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-  return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-}
-
-}  // namespace detail
-
-template <typename T>
-struct CompressIsPartition {
-  enum { value = 1 };
-};
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-// Two lanes: conditional swap
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
-  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
-  const Full128<T> d;
-  const Vec128<T> m = VecFromMask(d, mask);
-  const Vec128<T> maskL = DupEven(m);
-  const Vec128<T> maskH = DupOdd(m);
-  const Vec128<T> swap = AndNot(maskL, maskH);
-  return IfVecThenElse(swap, Shuffle01(v), v);
-}
-
-// General case
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
-  return detail::Compress(v, detail::BitsFromMask(mask));
-}
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-// Two lanes: conditional swap
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
-  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
-  const Full128<T> d;
-  const Vec128<T> m = VecFromMask(d, mask);
-  const Vec128<T> maskL = DupEven(m);
-  const Vec128<T> maskH = DupOdd(m);
-  const Vec128<T> swap = AndNot(maskH, maskL);
-  return IfVecThenElse(swap, Shuffle01(v), v);
-}
-
-// General case
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
-  // For partial vectors, we cannot pull the Not() into the table because
-  // BitsFromMask clears the upper bits.
-  if (N < 16 / sizeof(T)) {
-    return detail::Compress(v, detail::BitsFromMask(Not(mask)));
-  }
-  return detail::CompressNot(v, detail::BitsFromMask(mask));
-}
-// ------------------------------ CompressBlocksNot
-HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
-                                           Mask128<uint64_t> /* m */) {
-  return v;
-}
-
-// ------------------------------ CompressBits
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
-                                  const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return detail::Compress(v, mask_bits);
-}
-
-// ------------------------------ CompressStore
-template <typename T, size_t N>
-HWY_API size_t CompressStore(Vec128<T, N> v, const Mask128<T, N> mask,
-                             Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  const auto c = detail::Compress(v, mask_bits);
-  StoreU(c, d, unaligned);
-  return PopCount(mask_bits);
-}
-
-// ------------------------------ CompressBlendedStore
-template <typename T, size_t N>
-HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                                    Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
-  using TU = TFromD<decltype(du)>;
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  const size_t count = PopCount(mask_bits);
-  const Vec128<TU, N> compressed = detail::Compress(BitCast(du, v), mask_bits);
-  const Mask128<T, N> store_mask = RebindMask(d, FirstN(du, count));
-  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
-  return count;
-}
-
-// ------------------------------ CompressBitsStore
-
-template <typename T, size_t N>
-HWY_API size_t CompressBitsStore(Vec128<T, N> v,
-                                 const uint8_t* HWY_RESTRICT bits,
-                                 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  const auto c = detail::Compress(v, mask_bits);
-  StoreU(c, d, unaligned);
-  return PopCount(mask_bits);
-}
-
-// ------------------------------ StoreInterleaved2/3/4
-
-// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
-// generic_ops-inl.h.
-
-// ------------------------------ MulEven/Odd (Load)
-
-HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
-                                    const Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  mul[0] =
-      Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
-             static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
-}
-
-HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
-                                   const Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  mul[0] =
-      Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
-             static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-template <size_t N>
-HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
-                                                   Vec128<bfloat16_t, 2 * N> a,
-                                                   Vec128<bfloat16_t, 2 * N> b,
-                                                   const Vec128<float, N> sum0,
-                                                   Vec128<float, N>& sum1) {
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
-  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ------------------------------ Reductions
-
-namespace detail {
-
-// N=1 for any T: no-op
-template <typename T>
-HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-
-// u32/i32/f32:
-
-// N=2
-template <typename T>
-HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return v10 + Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw};
-}
-template <typename T>
-HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return Min(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
-}
-template <typename T>
-HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return Max(v10, Vec128<T, 2>{Shuffle2301(Vec128<T>{v10.raw}).raw});
-}
-
-// N=4 (full)
-template <typename T>
-HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = v3210 + v1032;
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return v20_31_20_31 + v31_20_31_20;
-}
-template <typename T>
-HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Min(v20_31_20_31, v31_20_31_20);
-}
-template <typename T>
-HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Max(v20_31_20_31, v31_20_31_20);
-}
-
-// u64/i64/f64:
-
-// N=2 (full)
-template <typename T>
-HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return v10 + v01;
-}
-template <typename T>
-HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Min(v10, v01);
-}
-template <typename T>
-HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Max(v10, v01);
-}
-
-template <size_t N, HWY_IF_GE32(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                       Vec128<uint16_t, N> v) {
-  const Simd<uint16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <size_t N, HWY_IF_GE32(int16_t, N)>
-HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                      Vec128<int16_t, N> v) {
-  const Simd<int16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-template <size_t N, HWY_IF_GE32(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                       Vec128<uint16_t, N> v) {
-  const Simd<uint16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <size_t N, HWY_IF_GE32(int16_t, N)>
-HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                      Vec128<int16_t, N> v) {
-  const Simd<int16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-}  // namespace detail
-
-// Supported for u/i/f 32/64. Returns the same value in each lane.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-// ------------------------------ Lt128
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Lt128(Simd<T, N, 0> d, Vec128<T, N> a,
-                               Vec128<T, N> b) {
-  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
-  // Truth table of Eq and Lt for Hi and Lo u64.
-  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
-  // =H =L cH cL  | out = cH | (=H & cL)
-  //  0  0  0  0  |  0
-  //  0  0  0  1  |  0
-  //  0  0  1  0  |  1
-  //  0  0  1  1  |  1
-  //  0  1  0  0  |  0
-  //  0  1  0  1  |  0
-  //  0  1  1  0  |  1
-  //  1  0  0  0  |  0
-  //  1  0  0  1  |  1
-  //  1  1  0  0  |  0
-  const Mask128<T, N> eqHL = Eq(a, b);
-  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
-  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
-  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
-  // comparison result leftwards requires only 4. IfThenElse compiles to the
-  // same code as OrAnd().
-  const Vec128<T, N> ltLx = DupEven(ltHL);
-  const Vec128<T, N> outHx = IfThenElse(eqHL, ltLx, ltHL);
-  return MaskFromVec(DupOdd(outHx));
-}
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Lt128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
-                                    Vec128<T, N> b) {
-  const Vec128<T, N> ltHL = VecFromMask(d, Lt(a, b));
-  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
-}
-
-// ------------------------------ Eq128
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Eq128(Simd<T, N, 0> d, Vec128<T, N> a,
-                               Vec128<T, N> b) {
-  static_assert(!IsSigned<T>() && sizeof(T) == 8, "Use u64");
-  const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
-  return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
-}
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_INLINE Mask128<T, N> Eq128Upper(Simd<T, N, 0> d, Vec128<T, N> a,
-                                    Vec128<T, N> b) {
-  const Vec128<T, N> eqHL = VecFromMask(d, Eq(a, b));
-  return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
-}
-
-// ------------------------------ Min128, Max128 (Lt128)
-
-// Without a native OddEven, it seems infeasible to go faster than Lt128.
-template <class D>
-HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128(d, a, b), a, b);
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128(d, b, a), a, b);
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Min128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128Upper(d, a, b), a, b);
-}
-
-template <class D>
-HWY_INLINE VFromD<D> Max128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
-  return IfThenElse(Lt128Upper(d, b, a), a, b);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/wasm_256-inl.h b/third_party/highway/hwy/ops/wasm_256-inl.h
deleted file mode 100644 (file)
index e95d91e..0000000
+++ /dev/null
@@ -1,3028 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// 256-bit WASM vectors and operations. Experimental.
-// External include guard in highway.h - see comment there.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <wasm_simd128.h>
-
-#include "hwy/base.h"
-#include "hwy/ops/shared-inl.h"
-#include "hwy/ops/wasm_128-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <typename T>
-using Full256 = Simd<T, 32 / sizeof(T), 0>;
-
-template <typename T>
-using Full128 = Simd<T, 16 / sizeof(T), 0>;
-
-// TODO(richardwinterton): add this to DeduceD in wasm_128 similar to x86_128.
-template <typename T>
-class Vec256 {
- public:
-  // Compound assignment. Only usable if there is a corresponding non-member
-  // binary operator overload. For example, only f32 and f64 support division.
-  HWY_INLINE Vec256& operator*=(const Vec256 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec256& operator/=(const Vec256 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec256& operator+=(const Vec256 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec256& operator-=(const Vec256 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec256& operator&=(const Vec256 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec256& operator|=(const Vec256 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec256& operator^=(const Vec256 other) {
-    return *this = (*this ^ other);
-  }
-
-  Vec128<T> v0;
-  Vec128<T> v1;
-};
-
-template <typename T>
-struct Mask256 {
-  Mask128<T> m0;
-  Mask128<T> m1;
-};
-
-// ------------------------------ BitCast
-
-template <typename T, typename FromT>
-HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
-  const Half<decltype(d)> dh;
-  Vec256<T> ret;
-  ret.v0 = BitCast(dh, v.v0);
-  ret.v1 = BitCast(dh, v.v1);
-  return ret;
-
-  // TODO(richardwinterton): implement other ops like this
-}
-
-// ------------------------------ Zero
-
-// Returns an all-zero vector/part.
-template <typename T>
-HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
-  return Vec256<T>{wasm_i32x4_splat(0)};
-}
-HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
-  return Vec256<float>{wasm_f32x4_splat(0.0f)};
-}
-
-template <class D>
-using VFromD = decltype(Zero(D()));
-
-// ------------------------------ Set
-
-// Returns a vector/part with all lanes set to "t".
-HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
-  return Vec256<uint8_t>{wasm_i8x16_splat(static_cast<int8_t>(t))};
-}
-HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
-  return Vec256<uint16_t>{wasm_i16x8_splat(static_cast<int16_t>(t))};
-}
-HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
-  return Vec256<uint32_t>{wasm_i32x4_splat(static_cast<int32_t>(t))};
-}
-HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
-  return Vec256<uint64_t>{wasm_i64x2_splat(static_cast<int64_t>(t))};
-}
-
-HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
-  return Vec256<int8_t>{wasm_i8x16_splat(t)};
-}
-HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
-  return Vec256<int16_t>{wasm_i16x8_splat(t)};
-}
-HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
-  return Vec256<int32_t>{wasm_i32x4_splat(t)};
-}
-HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
-  return Vec256<int64_t>{wasm_i64x2_splat(t)};
-}
-
-HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
-  return Vec256<float>{wasm_f32x4_splat(t)};
-}
-
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
-
-// Returns a vector with uninitialized elements.
-template <typename T>
-HWY_API Vec256<T> Undefined(Full256<T> d) {
-  return Zero(d);
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, typename T2>
-Vec256<T> Iota(const Full256<T> d, const T2 first) {
-  HWY_ALIGN T lanes[16 / sizeof(T)];
-  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
-  }
-  return Load(d, lanes);
-}
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Addition
-
-// Unsigned
-HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
-                                  const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_i8x16_add(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
-                                   const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_i16x8_add(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
-                                   const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{wasm_i32x4_add(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
-                                 const Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_add(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
-                                  const Vec256<int16_t> b) {
-  return Vec256<int16_t>{wasm_i16x8_add(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
-                                  const Vec256<int32_t> b) {
-  return Vec256<int32_t>{wasm_i32x4_add(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{wasm_f32x4_add(a.raw, b.raw)};
-}
-
-// ------------------------------ Subtraction
-
-// Unsigned
-HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
-                                  const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_i8x16_sub(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> operator-(Vec256<uint16_t> a, Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_i16x8_sub(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
-                                   const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{wasm_i32x4_sub(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
-                                 const Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_sub(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
-                                  const Vec256<int16_t> b) {
-  return Vec256<int16_t>{wasm_i16x8_sub(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
-                                  const Vec256<int32_t> b) {
-  return Vec256<int32_t>{wasm_i32x4_sub(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{wasm_f32x4_sub(a.raw, b.raw)};
-}
-
-// ------------------------------ SumsOf8
-HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
-  HWY_ABORT("not implemented");
-}
-
-// ------------------------------ SaturatedAdd
-
-// Returns a + b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
-                                     const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_u8x16_add_sat(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
-                                      const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_u16x8_add_sat(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
-                                    const Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_add_sat(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
-                                     const Vec256<int16_t> b) {
-  return Vec256<int16_t>{wasm_i16x8_add_sat(a.raw, b.raw)};
-}
-
-// ------------------------------ SaturatedSub
-
-// Returns a - b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
-                                     const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_u8x16_sub_sat(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
-                                      const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_u16x8_sub_sat(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
-                                    const Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_sub_sat(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
-                                     const Vec256<int16_t> b) {
-  return Vec256<int16_t>{wasm_i16x8_sub_sat(a.raw, b.raw)};
-}
-
-// ------------------------------ Average
-
-// Returns (a + b + 1) / 2
-
-// Unsigned
-HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
-                                     const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_u8x16_avgr(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
-                                      const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_u16x8_avgr(a.raw, b.raw)};
-}
-
-// ------------------------------ Absolute value
-
-// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
-HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
-  return Vec256<int8_t>{wasm_i8x16_abs(v.raw)};
-}
-HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
-  return Vec256<int16_t>{wasm_i16x8_abs(v.raw)};
-}
-HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{wasm_i32x4_abs(v.raw)};
-}
-HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
-  return Vec256<int32_t>{wasm_i62x2_abs(v.raw)};
-}
-
-HWY_API Vec256<float> Abs(const Vec256<float> v) {
-  return Vec256<float>{wasm_f32x4_abs(v.raw)};
-}
-
-// ------------------------------ Shift lanes by constant #bits
-
-// Unsigned
-template <int kBits>
-HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
-  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, kBits)};
-}
-template <int kBits>
-HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
-  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, kBits)};
-}
-template <int kBits>
-HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, kBits)};
-}
-template <int kBits>
-HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, kBits)};
-}
-
-// Signed
-template <int kBits>
-HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
-  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, kBits)};
-}
-template <int kBits>
-HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
-  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, kBits)};
-}
-template <int kBits>
-HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, kBits)};
-}
-template <int kBits>
-HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, kBits)};
-}
-
-// 8-bit
-template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
-  const Full256<T> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec256<T> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
-  return kBits == 1
-             ? (v + v)
-             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
-}
-
-template <int kBits>
-HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
-  const Full256<uint8_t> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
-  return shifted & Set(d8, 0xFF >> kBits);
-}
-
-template <int kBits>
-HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
-  const Full256<int8_t> di;
-  const Full256<uint8_t> du;
-  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
-  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ------------------------------ RotateRight (ShiftRight, Or)
-template <int kBits, typename T>
-HWY_API Vec256<T> RotateRight(const Vec256<T> v) {
-  constexpr size_t kSizeInBits = sizeof(T) * 8;
-  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<kSizeInBits - kBits>(v));
-}
-
-// ------------------------------ Shift lanes by same variable #bits
-
-// Unsigned
-HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
-                                       const int bits) {
-  return Vec256<uint16_t>{wasm_i16x8_shl(v.raw, bits)};
-}
-HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
-                                        const int bits) {
-  return Vec256<uint16_t>{wasm_u16x8_shr(v.raw, bits)};
-}
-HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
-                                       const int bits) {
-  return Vec256<uint32_t>{wasm_i32x4_shl(v.raw, bits)};
-}
-HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
-                                        const int bits) {
-  return Vec256<uint32_t>{wasm_u32x4_shr(v.raw, bits)};
-}
-
-// Signed
-HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
-  return Vec256<int16_t>{wasm_i16x8_shl(v.raw, bits)};
-}
-HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
-                                       const int bits) {
-  return Vec256<int16_t>{wasm_i16x8_shr(v.raw, bits)};
-}
-HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
-  return Vec256<int32_t>{wasm_i32x4_shl(v.raw, bits)};
-}
-HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
-                                       const int bits) {
-  return Vec256<int32_t>{wasm_i32x4_shr(v.raw, bits)};
-}
-
-// 8-bit
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
-  const Full256<T> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec256<T> shifted{ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
-  return shifted & Set(d8, (0xFF << bits) & 0xFF);
-}
-
-HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
-  const Full256<uint8_t> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec256<uint8_t> shifted{
-      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
-  return shifted & Set(d8, 0xFF >> bits);
-}
-
-HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
-  const Full256<int8_t> di;
-  const Full256<uint8_t> du;
-  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
-  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ------------------------------ Minimum
-
-// Unsigned
-HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_u8x16_min(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
-                             const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_u16x8_min(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
-                             const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{wasm_u32x4_min(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
-                             const Vec256<uint64_t> b) {
-  alignas(32) float min[4];
-  min[0] =
-      HWY_MIN(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
-  min[1] =
-      HWY_MIN(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));
-  return Vec256<uint64_t>{wasm_v128_load(min)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_min(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
-  return Vec256<int16_t>{wasm_i16x8_min(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
-  return Vec256<int32_t>{wasm_i32x4_min(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
-  alignas(32) float min[4];
-  min[0] =
-      HWY_MIN(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
-  min[1] =
-      HWY_MIN(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));
-  return Vec256<int64_t>{wasm_v128_load(min)};
-}
-
-// Float
-HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{wasm_f32x4_min(a.raw, b.raw)};
-}
-
-// ------------------------------ Maximum
-
-// Unsigned
-HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_u8x16_max(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
-                             const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_u16x8_max(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
-                             const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{wasm_u32x4_max(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
-                             const Vec256<uint64_t> b) {
-  alignas(32) float max[4];
-  max[0] =
-      HWY_MAX(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0));
-  max[1] =
-      HWY_MAX(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1));
-  return Vec256<int64_t>{wasm_v128_load(max)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_max(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
-  return Vec256<int16_t>{wasm_i16x8_max(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
-  return Vec256<int32_t>{wasm_i32x4_max(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
-  alignas(32) float max[4];
-  max[0] =
-      HWY_MAX(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0));
-  max[1] =
-      HWY_MAX(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1));
-  return Vec256<int64_t>{wasm_v128_load(max)};
-}
-
-// Float
-HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{wasm_f32x4_max(a.raw, b.raw)};
-}
-
-// ------------------------------ Integer multiplication
-
-// Unsigned
-HWY_API Vec256<uint16_t> operator*(const Vec256<uint16_t> a,
-                                   const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{wasm_i16x8_mul(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> operator*(const Vec256<uint32_t> a,
-                                   const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{wasm_i32x4_mul(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int16_t> operator*(const Vec256<int16_t> a,
-                                  const Vec256<int16_t> b) {
-  return Vec256<int16_t>{wasm_i16x8_mul(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> operator*(const Vec256<int32_t> a,
-                                  const Vec256<int32_t> b) {
-  return Vec256<int32_t>{wasm_i32x4_mul(a.raw, b.raw)};
-}
-
-// Returns the upper 16 bits of a * b in each lane.
-HWY_API Vec256<uint16_t> MulHigh(const Vec256<uint16_t> a,
-                                 const Vec256<uint16_t> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto al = wasm_u32x4_extend_low_u16x8(a.raw);
-  const auto ah = wasm_u32x4_extend_high_u16x8(a.raw);
-  const auto bl = wasm_u32x4_extend_low_u16x8(b.raw);
-  const auto bh = wasm_u32x4_extend_high_u16x8(b.raw);
-  const auto l = wasm_i32x4_mul(al, bl);
-  const auto h = wasm_i32x4_mul(ah, bh);
-  // TODO(eustas): shift-right + narrow?
-  return Vec256<uint16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
-}
-HWY_API Vec256<int16_t> MulHigh(const Vec256<int16_t> a,
-                                const Vec256<int16_t> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto al = wasm_i32x4_extend_low_i16x8(a.raw);
-  const auto ah = wasm_i32x4_extend_high_i16x8(a.raw);
-  const auto bl = wasm_i32x4_extend_low_i16x8(b.raw);
-  const auto bh = wasm_i32x4_extend_high_i16x8(b.raw);
-  const auto l = wasm_i32x4_mul(al, bl);
-  const auto h = wasm_i32x4_mul(ah, bh);
-  // TODO(eustas): shift-right + narrow?
-  return Vec256<int16_t>{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)};
-}
-
-HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t>, Vec256<int16_t>) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// Multiplies even lanes (0, 2 ..) and returns the double-width result.
-HWY_API Vec256<int64_t> MulEven(const Vec256<int32_t> a,
-                                const Vec256<int32_t> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
-  const auto ae = wasm_v128_and(a.raw, kEvenMask);
-  const auto be = wasm_v128_and(b.raw, kEvenMask);
-  return Vec256<int64_t>{wasm_i64x2_mul(ae, be)};
-}
-HWY_API Vec256<uint64_t> MulEven(const Vec256<uint32_t> a,
-                                 const Vec256<uint32_t> b) {
-  // TODO(eustas): replace, when implemented in WASM.
-  const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0);
-  const auto ae = wasm_v128_and(a.raw, kEvenMask);
-  const auto be = wasm_v128_and(b.raw, kEvenMask);
-  return Vec256<uint64_t>{wasm_i64x2_mul(ae, be)};
-}
-
-// ------------------------------ Negate
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec256<T> Neg(const Vec256<T> v) {
-  return Xor(v, SignBit(Full256<T>()));
-}
-
-HWY_API Vec256<int8_t> Neg(const Vec256<int8_t> v) {
-  return Vec256<int8_t>{wasm_i8x16_neg(v.raw)};
-}
-HWY_API Vec256<int16_t> Neg(const Vec256<int16_t> v) {
-  return Vec256<int16_t>{wasm_i16x8_neg(v.raw)};
-}
-HWY_API Vec256<int32_t> Neg(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{wasm_i32x4_neg(v.raw)};
-}
-HWY_API Vec256<int64_t> Neg(const Vec256<int64_t> v) {
-  return Vec256<int64_t>{wasm_i64x2_neg(v.raw)};
-}
-
-// ------------------------------ Floating-point mul / div
-
-HWY_API Vec256<float> operator*(Vec256<float> a, Vec256<float> b) {
-  return Vec256<float>{wasm_f32x4_mul(a.raw, b.raw)};
-}
-
-HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{wasm_f32x4_div(a.raw, b.raw)};
-}
-
-// Approximate reciprocal
-HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
-  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
-  return one / v;
-}
-
-// Absolute value of difference.
-HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
-  return Abs(a - b);
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-// Returns mul * x + add
-HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
-                             const Vec256<float> add) {
-  // TODO(eustas): replace, when implemented in WASM.
-  // TODO(eustas): is it wasm_f32x4_qfma?
-  return mul * x + add;
-}
-
-// Returns add - mul * x
-HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
-                                const Vec256<float> add) {
-  // TODO(eustas): replace, when implemented in WASM.
-  return add - mul * x;
-}
-
-// Returns mul * x - sub
-HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
-                             const Vec256<float> sub) {
-  // TODO(eustas): replace, when implemented in WASM.
-  // TODO(eustas): is it wasm_f32x4_qfms?
-  return mul * x - sub;
-}
-
-// Returns -mul * x - sub
-HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
-                                const Vec256<float> sub) {
-  // TODO(eustas): replace, when implemented in WASM.
-  return Neg(mul) * x - sub;
-}
-
-// ------------------------------ Floating-point square root
-
-// Full precision square root
-HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
-  return Vec256<float>{wasm_f32x4_sqrt(v.raw)};
-}
-
-// Approximate reciprocal square root
-HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
-  // TODO(eustas): find cheaper a way to calculate this.
-  const Vec256<float> one = Vec256<float>{wasm_f32x4_splat(1.0f)};
-  return one / Sqrt(v);
-}
-
-// ------------------------------ Floating-point rounding
-
-// Toward nearest integer, ties to even
-HWY_API Vec256<float> Round(const Vec256<float> v) {
-  return Vec256<float>{wasm_f32x4_nearest(v.raw)};
-}
-
-// Toward zero, aka truncate
-HWY_API Vec256<float> Trunc(const Vec256<float> v) {
-  return Vec256<float>{wasm_f32x4_trunc(v.raw)};
-}
-
-// Toward +infinity, aka ceiling
-HWY_API Vec256<float> Ceil(const Vec256<float> v) {
-  return Vec256<float>{wasm_f32x4_ceil(v.raw)};
-}
-
-// Toward -infinity, aka floor
-HWY_API Vec256<float> Floor(const Vec256<float> v) {
-  return Vec256<float>{wasm_f32x4_floor(v.raw)};
-}
-
-// ------------------------------ Floating-point classification
-
-template <typename T>
-HWY_API Mask256<T> IsNaN(const Vec256<T> v) {
-  return v != v;
-}
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Mask256<T> IsInf(const Vec256<T> v) {
-  const Full256<T> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // 'Shift left' to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater).
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
-// ================================================== COMPARE
-
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
-
-template <typename TFrom, typename TTo>
-HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask256<TTo>{m.raw};
-}
-
-template <typename T>
-HWY_API Mask256<T> TestBit(Vec256<T> v, Vec256<T> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return (v & bit) == bit;
-}
-
-// ------------------------------ Equality
-
-// Unsigned
-HWY_API Mask256<uint8_t> operator==(const Vec256<uint8_t> a,
-                                    const Vec256<uint8_t> b) {
-  return Mask256<uint8_t>{wasm_i8x16_eq(a.raw, b.raw)};
-}
-HWY_API Mask256<uint16_t> operator==(const Vec256<uint16_t> a,
-                                     const Vec256<uint16_t> b) {
-  return Mask256<uint16_t>{wasm_i16x8_eq(a.raw, b.raw)};
-}
-HWY_API Mask256<uint32_t> operator==(const Vec256<uint32_t> a,
-                                     const Vec256<uint32_t> b) {
-  return Mask256<uint32_t>{wasm_i32x4_eq(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Mask256<int8_t> operator==(const Vec256<int8_t> a,
-                                   const Vec256<int8_t> b) {
-  return Mask256<int8_t>{wasm_i8x16_eq(a.raw, b.raw)};
-}
-HWY_API Mask256<int16_t> operator==(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Mask256<int16_t>{wasm_i16x8_eq(a.raw, b.raw)};
-}
-HWY_API Mask256<int32_t> operator==(const Vec256<int32_t> a,
-                                    const Vec256<int32_t> b) {
-  return Mask256<int32_t>{wasm_i32x4_eq(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Mask256<float> operator==(const Vec256<float> a,
-                                  const Vec256<float> b) {
-  return Mask256<float>{wasm_f32x4_eq(a.raw, b.raw)};
-}
-
-// ------------------------------ Inequality
-
-// Unsigned
-HWY_API Mask256<uint8_t> operator!=(const Vec256<uint8_t> a,
-                                    const Vec256<uint8_t> b) {
-  return Mask256<uint8_t>{wasm_i8x16_ne(a.raw, b.raw)};
-}
-HWY_API Mask256<uint16_t> operator!=(const Vec256<uint16_t> a,
-                                     const Vec256<uint16_t> b) {
-  return Mask256<uint16_t>{wasm_i16x8_ne(a.raw, b.raw)};
-}
-HWY_API Mask256<uint32_t> operator!=(const Vec256<uint32_t> a,
-                                     const Vec256<uint32_t> b) {
-  return Mask256<uint32_t>{wasm_i32x4_ne(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Mask256<int8_t> operator!=(const Vec256<int8_t> a,
-                                   const Vec256<int8_t> b) {
-  return Mask256<int8_t>{wasm_i8x16_ne(a.raw, b.raw)};
-}
-HWY_API Mask256<int16_t> operator!=(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Mask256<int16_t>{wasm_i16x8_ne(a.raw, b.raw)};
-}
-HWY_API Mask256<int32_t> operator!=(const Vec256<int32_t> a,
-                                    const Vec256<int32_t> b) {
-  return Mask256<int32_t>{wasm_i32x4_ne(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Mask256<float> operator!=(const Vec256<float> a,
-                                  const Vec256<float> b) {
-  return Mask256<float>{wasm_f32x4_ne(a.raw, b.raw)};
-}
-
-// ------------------------------ Strict inequality
-
-HWY_API Mask256<int8_t> operator>(const Vec256<int8_t> a,
-                                  const Vec256<int8_t> b) {
-  return Mask256<int8_t>{wasm_i8x16_gt(a.raw, b.raw)};
-}
-HWY_API Mask256<int16_t> operator>(const Vec256<int16_t> a,
-                                   const Vec256<int16_t> b) {
-  return Mask256<int16_t>{wasm_i16x8_gt(a.raw, b.raw)};
-}
-HWY_API Mask256<int32_t> operator>(const Vec256<int32_t> a,
-                                   const Vec256<int32_t> b) {
-  return Mask256<int32_t>{wasm_i32x4_gt(a.raw, b.raw)};
-}
-HWY_API Mask256<int64_t> operator>(const Vec256<int64_t> a,
-                                   const Vec256<int64_t> b) {
-  const Rebind < int32_t, DFromV<decltype(a)> d32;
-  const auto a32 = BitCast(d32, a);
-  const auto b32 = BitCast(d32, b);
-  // If the upper half is less than or greater, this is the answer.
-  const auto m_gt = a32 < b32;
-
-  // Otherwise, the lower half decides.
-  const auto m_eq = a32 == b32;
-  const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0);
-  const auto lo_gt = And(m_eq, lo_in_hi);
-
-  const auto gt = Or(lo_gt, m_gt);
-  // Copy result in upper 32 bits to lower 32 bits.
-  return Mask256<int64_t>{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)};
-}
-
-template <typename T, HWY_IF_UNSIGNED(T)>
-HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
-  const Full256<T> du;
-  const RebindToSigned<decltype(du)> di;
-  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
-  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
-}
-
-HWY_API Mask256<float> operator>(const Vec256<float> a, const Vec256<float> b) {
-  return Mask256<float>{wasm_f32x4_gt(a.raw, b.raw)};
-}
-
-template <typename T>
-HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
-  return operator>(b, a);
-}
-
-// ------------------------------ Weak inequality
-
-// Float <= >=
-HWY_API Mask256<float> operator<=(const Vec256<float> a,
-                                  const Vec256<float> b) {
-  return Mask256<float>{wasm_f32x4_le(a.raw, b.raw)};
-}
-HWY_API Mask256<float> operator>=(const Vec256<float> a,
-                                  const Vec256<float> b) {
-  return Mask256<float>{wasm_f32x4_ge(a.raw, b.raw)};
-}
-
-// ------------------------------ FirstN (Iota, Lt)
-
-template <typename T>
-HWY_API Mask256<T> FirstN(const Full256<T> d, size_t num) {
-  const RebindToSigned<decltype(d)> di;  // Signed comparisons may be cheaper.
-  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
-}
-
-// ================================================== LOGICAL
-
-// ------------------------------ Not
-
-template <typename T>
-HWY_API Vec256<T> Not(Vec256<T> v) {
-  return Vec256<T>{wasm_v128_not(v.raw)};
-}
-
-// ------------------------------ And
-
-template <typename T>
-HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
-  return Vec256<T>{wasm_v128_and(a.raw, b.raw)};
-}
-
-// ------------------------------ AndNot
-
-// Returns ~not_mask & mask.
-template <typename T>
-HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
-  return Vec256<T>{wasm_v128_andnot(mask.raw, not_mask.raw)};
-}
-
-// ------------------------------ Or
-
-template <typename T>
-HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
-  return Vec256<T>{wasm_v128_or(a.raw, b.raw)};
-}
-
-// ------------------------------ Xor
-
-template <typename T>
-HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
-  return Vec256<T>{wasm_v128_xor(a.raw, b.raw)};
-}
-
-// ------------------------------ Or3
-
-template <typename T>
-HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
-  return Or(o1, Or(o2, o3));
-}
-
-// ------------------------------ OrAnd
-
-template <typename T>
-HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
-  return Or(o, And(a1, a2));
-}
-
-// ------------------------------ IfVecThenElse
-
-template <typename T>
-HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
-  return IfThenElse(MaskFromVec(mask), yes, no);
-}
-
-// ------------------------------ Operator overloads (internal-only if float)
-
-template <typename T>
-HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
-  return And(a, b);
-}
-
-template <typename T>
-HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
-  return Or(a, b);
-}
-
-template <typename T>
-HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ CopySign
-
-template <typename T>
-HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  const auto msb = SignBit(Full256<T>());
-  return Or(AndNot(msb, magn), And(msb, sign));
-}
-
-template <typename T>
-HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-  return Or(abs, And(SignBit(Full256<T>()), sign));
-}
-
-// ------------------------------ BroadcastSignBit (compare)
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> BroadcastSignBit(const Vec256<T> v) {
-  return ShiftRight<sizeof(T) * 8 - 1>(v);
-}
-HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
-  return VecFromMask(Full256<int8_t>(), v < Zero(Full256<int8_t>()));
-}
-
-// ------------------------------ Mask
-
-// Mask and Vec are the same (true = FF..FF).
-template <typename T>
-HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
-  return Mask256<T>{v.raw};
-}
-
-template <typename T>
-HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, Mask256<T> v) {
-  return Vec256<T>{v.raw};
-}
-
-// mask ? yes : no
-template <typename T>
-HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
-  return Vec256<T>{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)};
-}
-
-// mask ? yes : 0
-template <typename T>
-HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
-  return yes & VecFromMask(Full256<T>(), mask);
-}
-
-// mask ? 0 : no
-template <typename T>
-HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
-  return AndNot(VecFromMask(Full256<T>(), mask), no);
-}
-
-template <typename T>
-    HWY_API Vec256 <
-    T IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
-  const Full256<T> d;
-  const auto zero = Zero(d);
-  return IfThenElse(Mask256<T>{(v > zero).raw}, v, zero);
-}
-
-// ------------------------------ Mask logical
-
-template <typename T>
-HWY_API Mask256<T> Not(const Mask256<T> m) {
-  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
-}
-
-template <typename T>
-HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-// ------------------------------ Shl (BroadcastSignBit, IfThenElse)
-
-// The x86 multiply-by-Pow2() trick will not work because WASM saturates
-// float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a
-// scalar count operand, per-lane shift instructions would require extract_lane
-// for each lane, and hoping that shuffle is correctly mapped to a native
-// instruction. Using non-vector shifts would incur a store-load forwarding
-// stall when loading the result vector. We instead test bits of the shift
-// count to "predicate" a shift of the entire vector by a constant.
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
-  const Full256<T> d;
-  Mask256<T> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<12>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftLeft<1>(v), v);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> operator<<(Vec256<T> v, const Vec256<T> bits) {
-  const Full256<T> d;
-  Mask256<T> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<27>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<16>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftLeft<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftLeft<1>(v), v);
-}
-
-// ------------------------------ Shr (BroadcastSignBit, IfThenElse)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
-  const Full256<T> d;
-  Mask256<T> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<12>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftRight<1>(v), v);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> operator>>(Vec256<T> v, const Vec256<T> bits) {
-  const Full256<T> d;
-  Mask256<T> mask;
-  // Need a signed type for BroadcastSignBit.
-  auto test = BitCast(RebindToSigned<decltype(d)>(), bits);
-  // Move the highest valid bit of the shift count into the sign bit.
-  test = ShiftLeft<27>(test);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<16>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<8>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<4>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  test = ShiftLeft<1>(test);  // next bit (descending order)
-  v = IfThenElse(mask, ShiftRight<2>(v), v);
-
-  mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test)));
-  return IfThenElse(mask, ShiftRight<1>(v), v);
-}
-
-// ================================================== MEMORY
-
-// ------------------------------ Load
-
-template <typename T>
-HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
-  return Vec256<T>{wasm_v128_load(aligned)};
-}
-
-template <typename T>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
-                             const T* HWY_RESTRICT aligned) {
-  return IfThenElseZero(m, Load(d, aligned));
-}
-
-// LoadU == Load.
-template <typename T>
-HWY_API Vec256<T> LoadU(Full256<T> d, const T* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
-template <typename T>
-HWY_API Vec256<T> LoadDup128(Full256<T> d, const T* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// ------------------------------ Store
-
-template <typename T>
-HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
-  wasm_v128_store(aligned, v.raw);
-}
-
-// StoreU == Store.
-template <typename T>
-HWY_API void StoreU(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT p) {
-  Store(v, d, p);
-}
-
-template <typename T>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                          T* HWY_RESTRICT p) {
-  StoreU(IfThenElse(m, v, LoadU(d, p)), d, p);
-}
-
-// ------------------------------ Non-temporal stores
-
-// Same as aligned stores on non-x86.
-
-template <typename T>
-HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
-                    T* HWY_RESTRICT aligned) {
-  wasm_v128_store(aligned, v.raw);
-}
-
-// ------------------------------ Scatter (Store)
-
-template <typename T, typename Offset>
-HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
-                           const Vec256<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(32) T lanes[32 / sizeof(T)];
-  Store(v, d, lanes);
-
-  alignas(32) Offset offset_lanes[32 / sizeof(T)];
-  Store(offset, Full256<Offset>(), offset_lanes);
-
-  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
-  }
-}
-
-template <typename T, typename Index>
-HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
-                          const Vec256<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(32) T lanes[32 / sizeof(T)];
-  Store(v, d, lanes);
-
-  alignas(32) Index index_lanes[32 / sizeof(T)];
-  Store(index, Full256<Index>(), index_lanes);
-
-  for (size_t i = 0; i < N; ++i) {
-    base[index_lanes[i]] = lanes[i];
-  }
-}
-
-// ------------------------------ Gather (Load/Store)
-
-template <typename T, typename Offset>
-HWY_API Vec256<T> GatherOffset(const Full256<T> d, const T* HWY_RESTRICT base,
-                               const Vec256<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(32) Offset offset_lanes[32 / sizeof(T)];
-  Store(offset, Full256<Offset>(), offset_lanes);
-
-  alignas(32) T lanes[32 / sizeof(T)];
-  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
-  }
-  return Load(d, lanes);
-}
-
-template <typename T, typename Index>
-HWY_API Vec256<T> GatherIndex(const Full256<T> d, const T* HWY_RESTRICT base,
-                              const Vec256<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(32) Index index_lanes[32 / sizeof(T)];
-  Store(index, Full256<Index>(), index_lanes);
-
-  alignas(32) T lanes[32 / sizeof(T)];
-  for (size_t i = 0; i < N; ++i) {
-    lanes[i] = base[index_lanes[i]];
-  }
-  return Load(d, lanes);
-}
-
-// ================================================== SWIZZLE
-
-// ------------------------------ ExtractLane
-template <typename T, size_t N>
-HWY_API T ExtractLane(const Vec128<T, N> v, size_t i) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ InsertLane
-template <typename T, size_t N>
-HWY_API Vec128<T, N> InsertLane(const Vec128<T, N> v, size_t i, T t) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ GetLane
-// Gets the single value stored in a vector/part.
-HWY_API uint8_t GetLane(const Vec256<uint8_t> v) {
-  return wasm_i8x16_extract_lane(v.raw, 0);
-}
-HWY_API int8_t GetLane(const Vec256<int8_t> v) {
-  return wasm_i8x16_extract_lane(v.raw, 0);
-}
-HWY_API uint16_t GetLane(const Vec256<uint16_t> v) {
-  return wasm_i16x8_extract_lane(v.raw, 0);
-}
-HWY_API int16_t GetLane(const Vec256<int16_t> v) {
-  return wasm_i16x8_extract_lane(v.raw, 0);
-}
-HWY_API uint32_t GetLane(const Vec256<uint32_t> v) {
-  return wasm_i32x4_extract_lane(v.raw, 0);
-}
-HWY_API int32_t GetLane(const Vec256<int32_t> v) {
-  return wasm_i32x4_extract_lane(v.raw, 0);
-}
-HWY_API uint64_t GetLane(const Vec256<uint64_t> v) {
-  return wasm_i64x2_extract_lane(v.raw, 0);
-}
-HWY_API int64_t GetLane(const Vec256<int64_t> v) {
-  return wasm_i64x2_extract_lane(v.raw, 0);
-}
-
-HWY_API float GetLane(const Vec256<float> v) {
-  return wasm_f32x4_extract_lane(v.raw, 0);
-}
-
-// ------------------------------ LowerHalf
-
-template <typename T>
-HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
-  return Vec128<T>{v.raw};
-}
-
-template <typename T>
-HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
-  return LowerHalf(Full128<T>(), v);
-}
-
-// ------------------------------ ShiftLeftBytes
-
-// 0x01..0F, kBytes = 1 => 0x02..0F00
-template <int kBytes, typename T>
-HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, Vec256<T> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  const __i8x16 zero = wasm_i8x16_splat(0);
-  switch (kBytes) {
-    case 0:
-      return v;
-
-    case 1:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6,
-                                          7, 8, 9, 10, 11, 12, 13, 14)};
-
-    case 2:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5,
-                                          6, 7, 8, 9, 10, 11, 12, 13)};
-
-    case 3:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3,
-                                          4, 5, 6, 7, 8, 9, 10, 11, 12)};
-
-    case 4:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2,
-                                          3, 4, 5, 6, 7, 8, 9, 10, 11)};
-
-    case 5:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1,
-                                          2, 3, 4, 5, 6, 7, 8, 9, 10)};
-
-    case 6:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          0, 1, 2, 3, 4, 5, 6, 7, 8, 9)};
-
-    case 7:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 0, 1, 2, 3, 4, 5, 6, 7, 8)};
-
-    case 8:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 0, 1, 2, 3, 4, 5, 6, 7)};
-
-    case 9:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 16, 0, 1, 2, 3, 4, 5, 6)};
-
-    case 10:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 16, 16, 0, 1, 2, 3, 4, 5)};
-
-    case 11:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 16, 16, 16, 0, 1, 2, 3, 4)};
-
-    case 12:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 16, 16, 16, 16, 0, 1, 2, 3)};
-
-    case 13:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 16, 16, 16, 16, 16, 0, 1, 2)};
-
-    case 14:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 16, 16, 16, 16, 16, 16, 0,
-                                          1)};
-
-    case 15:
-      return Vec256<T>{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16,
-                                          16, 16, 16, 16, 16, 16, 16, 16, 16,
-                                          0)};
-  }
-  return Vec256<T>{zero};
-}
-
-template <int kBytes, typename T>
-HWY_API Vec256<T> ShiftLeftBytes(Vec256<T> v) {
-  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <int kLanes, typename T>
-HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-template <int kLanes, typename T>
-HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
-  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
-}
-
-// ------------------------------ ShiftRightBytes
-namespace detail {
-
-// Helper function allows zeroing invalid lanes in caller.
-template <int kBytes, typename T>
-HWY_API __i8x16 ShrBytes(const Vec256<T> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  const __i8x16 zero = wasm_i8x16_splat(0);
-
-  switch (kBytes) {
-    case 0:
-      return v.raw;
-
-    case 1:
-      return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                                12, 13, 14, 15, 16);
-
-    case 2:
-      return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                13, 14, 15, 16, 16);
-
-    case 3:
-      return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                13, 14, 15, 16, 16, 16);
-
-    case 4:
-      return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
-                                14, 15, 16, 16, 16, 16);
-
-    case 5:
-      return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-                                15, 16, 16, 16, 16, 16);
-
-    case 6:
-      return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                                16, 16, 16, 16, 16, 16);
-
-    case 7:
-      return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 8:
-      return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 9:
-      return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 10:
-      return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 11:
-      return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 12:
-      return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 13:
-      return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 14:
-      return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-
-    case 15:
-      return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16,
-                                16, 16, 16, 16, 16, 16, 16);
-    case 16:
-      return zero;
-  }
-}
-
-}  // namespace detail
-
-// 0x01..0F, kBytes = 1 => 0x0001..0E
-template <int kBytes, typename T>
-HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, Vec256<T> v) {
-  return Vec256<T>{detail::ShrBytes<kBytes>(v)};
-}
-
-// ------------------------------ ShiftRightLanes
-template <int kLanes, typename T>
-HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-// ------------------------------ UpperHalf (ShiftRightBytes)
-
-// Full input: copy hi into lo (smaller instruction encoding than shifts).
-template <typename T>
-HWY_API Vec128<T, 8 / sizeof(T)> UpperHalf(Full128<T> /* tag */,
-                                           const Vec256<T> v) {
-  return Vec128<T, 8 / sizeof(T)>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
-}
-HWY_API Vec128<float, 2> UpperHalf(Full128<float> /* tag */,
-                                   const Vec128<float> v) {
-  return Vec128<float, 2>{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)};
-}
-
-// ------------------------------ CombineShiftRightBytes
-
-template <int kBytes, typename T, class V = Vec256<T>>
-HWY_API V CombineShiftRightBytes(Full256<T> /* tag */, V hi, V lo) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  switch (kBytes) {
-    case 0:
-      return lo;
-
-    case 1:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                                  11, 12, 13, 14, 15, 16)};
-
-    case 2:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10,
-                                  11, 12, 13, 14, 15, 16, 17)};
-
-    case 3:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                                  12, 13, 14, 15, 16, 17, 18)};
-
-    case 4:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                  13, 14, 15, 16, 17, 18, 19)};
-
-    case 5:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13,
-                                  14, 15, 16, 17, 18, 19, 20)};
-
-    case 6:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13,
-                                  14, 15, 16, 17, 18, 19, 20, 21)};
-
-    case 7:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14,
-                                  15, 16, 17, 18, 19, 20, 21, 22)};
-
-    case 8:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15,
-                                  16, 17, 18, 19, 20, 21, 22, 23)};
-
-    case 9:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16,
-                                  17, 18, 19, 20, 21, 22, 23, 24)};
-
-    case 10:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16,
-                                  17, 18, 19, 20, 21, 22, 23, 24, 25)};
-
-    case 11:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17,
-                                  18, 19, 20, 21, 22, 23, 24, 25, 26)};
-
-    case 12:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18,
-                                  19, 20, 21, 22, 23, 24, 25, 26, 27)};
-
-    case 13:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19,
-                                  20, 21, 22, 23, 24, 25, 26, 27, 28)};
-
-    case 14:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20,
-                                  21, 22, 23, 24, 25, 26, 27, 28, 29)};
-
-    case 15:
-      return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21,
-                                  22, 23, 24, 25, 26, 27, 28, 29, 30)};
-  }
-  return hi;
-}
-
-// ------------------------------ Broadcast/splat any lane
-
-// Unsigned
-template <int kLane>
-HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec256<uint16_t>{wasm_i16x8_shuffle(
-      v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)};
-}
-template <int kLane>
-HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec256<uint32_t>{
-      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
-}
-
-// Signed
-template <int kLane>
-HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec256<int16_t>{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane,
-                                            kLane, kLane, kLane, kLane, kLane)};
-}
-template <int kLane>
-HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec256<int32_t>{
-      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
-}
-
-// Float
-template <int kLane>
-HWY_API Vec256<float> Broadcast(const Vec256<float> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec256<float>{
-      wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)};
-}
-
-// ------------------------------ TableLookupBytes
-
-// Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e.
-// lane indices in [0, 16).
-template <typename T, typename TI>
-HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
-                                    const Vec256<TI> from) {
-// Not yet available in all engines, see
-// https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md
-// V8 implementation of this had a bug, fixed on 2021-04-03:
-// https://chromium-review.googlesource.com/c/v8/v8/+/2822951
-#if 0
-  return Vec256<TI>{wasm_i8x16_swizzle(bytes.raw, from.raw)};
-#else
-  alignas(32) uint8_t control[16];
-  alignas(32) uint8_t input[16];
-  alignas(32) uint8_t output[16];
-  wasm_v128_store(control, from.raw);
-  wasm_v128_store(input, bytes.raw);
-  for (size_t i = 0; i < 16; ++i) {
-    output[i] = control[i] < 16 ? input[control[i]] : 0;
-  }
-  return Vec256<TI>{wasm_v128_load(output)};
-#endif
-}
-
-template <typename T, typename TI>
-HWY_API Vec256<TI> TableLookupBytesOr0(const Vec256<T> bytes,
-                                       const Vec256<TI> from) {
-  const Full256<TI> d;
-  // Mask size must match vector type, so cast everything to this type.
-  Repartition<int8_t, decltype(d)> di8;
-  Repartition<int8_t, Full256<T>> d_bytes8;
-  const auto msb = BitCast(di8, from) < Zero(di8);
-  const auto lookup =
-      TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from));
-  return BitCast(d, IfThenZeroElse(msb, lookup));
-}
-
-// ------------------------------ Hard-coded shuffles
-
-// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
-// Shuffle0321 rotates one lane to the right (the previous least-significant
-// lane is now most-significant). These could also be implemented via
-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
-
-// Swap 32-bit halves in 64-bit halves.
-HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
-}
-HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
-}
-HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
-  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)};
-}
-
-// Swap 64-bit halves
-HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
-}
-HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
-}
-HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
-  return Vec128<float>{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)};
-}
-
-// Rotate right 32 bits
-HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
-}
-HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
-}
-HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
-  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)};
-}
-// Rotate left 32 bits
-HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
-}
-HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
-}
-HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
-  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)};
-}
-
-// Reverse
-HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
-}
-HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
-}
-HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
-  return Vec128<float>{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)};
-}
-
-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices for use by TableLookupLanes.
-template <typename T>
-struct Indices256 {
-  __v128_u raw;
-};
-
-template <typename T, typename TI>
-HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-  return Indices256<T>{};
-}
-
-template <typename T, typename TI>
-HWY_API Indices256<T> SetTableIndices(Full256<T> d, const TI* idx) {
-  const Rebind<TI, decltype(d)> di;
-  return IndicesFromVec(d, LoadU(di, idx));
-}
-
-template <typename T>
-HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
-  using TI = MakeSigned<T>;
-  const Full256<T> d;
-  const Full256<TI> di;
-  return BitCast(d, TableLookupBytes(BitCast(di, v), Vec256<TI>{idx.raw}));
-}
-
-// ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
-  return Shuffle01(v);
-}
-
-// Four lanes: shuffle
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Reverse(Full256<T> /* tag */, const Vec256<T> v) {
-  return Shuffle0123(v);
-}
-
-// 16-bit
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
-  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
-  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
-}
-
-// ------------------------------ Reverse2
-
-template <typename T>
-HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ Reverse4
-
-template <typename T>
-HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ Reverse8
-
-template <typename T>
-HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ InterleaveLower
-
-HWY_API Vec256<uint8_t> InterleaveLower(Vec256<uint8_t> a, Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18,
-                                            3, 19, 4, 20, 5, 21, 6, 22, 7, 23)};
-}
-HWY_API Vec256<uint16_t> InterleaveLower(Vec256<uint16_t> a,
-                                         Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
-}
-HWY_API Vec256<uint32_t> InterleaveLower(Vec256<uint32_t> a,
-                                         Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
-}
-HWY_API Vec256<uint64_t> InterleaveLower(Vec256<uint64_t> a,
-                                         Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
-}
-
-HWY_API Vec256<int8_t> InterleaveLower(Vec256<int8_t> a, Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3,
-                                           19, 4, 20, 5, 21, 6, 22, 7, 23)};
-}
-HWY_API Vec256<int16_t> InterleaveLower(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Vec256<int16_t>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)};
-}
-HWY_API Vec256<int32_t> InterleaveLower(Vec256<int32_t> a, Vec256<int32_t> b) {
-  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
-}
-HWY_API Vec256<int64_t> InterleaveLower(Vec256<int64_t> a, Vec256<int64_t> b) {
-  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)};
-}
-
-HWY_API Vec256<float> InterleaveLower(Vec256<float> a, Vec256<float> b) {
-  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)};
-}
-
-// Additional overload for the optional tag.
-template <typename T, class V = Vec256<T>>
-HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) {
-  return InterleaveLower(a, b);
-}
-
-// ------------------------------ InterleaveUpper (UpperHalf)
-
-// All functions inside detail lack the required D parameter.
-namespace detail {
-
-HWY_API Vec256<uint8_t> InterleaveUpper(Vec256<uint8_t> a, Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
-                                            11, 27, 12, 28, 13, 29, 14, 30, 15,
-                                            31)};
-}
-HWY_API Vec256<uint16_t> InterleaveUpper(Vec256<uint16_t> a,
-                                         Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
-}
-HWY_API Vec256<uint32_t> InterleaveUpper(Vec256<uint32_t> a,
-                                         Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
-}
-HWY_API Vec256<uint64_t> InterleaveUpper(Vec256<uint64_t> a,
-                                         Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
-}
-
-HWY_API Vec256<int8_t> InterleaveUpper(Vec256<int8_t> a, Vec256<int8_t> b) {
-  return Vec256<int8_t>{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26,
-                                           11, 27, 12, 28, 13, 29, 14, 30, 15,
-                                           31)};
-}
-HWY_API Vec256<int16_t> InterleaveUpper(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Vec256<int16_t>{
-      wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)};
-}
-HWY_API Vec256<int32_t> InterleaveUpper(Vec256<int32_t> a, Vec256<int32_t> b) {
-  return Vec256<int32_t>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
-}
-HWY_API Vec256<int64_t> InterleaveUpper(Vec256<int64_t> a, Vec256<int64_t> b) {
-  return Vec256<int64_t>{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)};
-}
-
-HWY_API Vec256<float> InterleaveUpper(Vec256<float> a, Vec256<float> b) {
-  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)};
-}
-
-}  // namespace detail
-
-template <typename T, class V = Vec256<T>>
-HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
-  return detail::InterleaveUpper(a, b);
-}
-
-// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
-
-// Same as Interleave*, except that the return lanes are double-width integers;
-// this is necessary because the single-lane scalar cannot return two values.
-template <typename T, class DW = RepartitionToWide<Full256<T>>>
-HWY_API VFromD<DW> ZipLower(Vec256<T> a, Vec256<T> b) {
-  return BitCast(DW(), InterleaveLower(a, b));
-}
-template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipLower(DW dw, Vec256<T> a, Vec256<T> b) {
-  return BitCast(dw, InterleaveLower(D(), a, b));
-}
-
-template <typename T, class D = Full256<T>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipUpper(DW dw, Vec256<T> a, Vec256<T> b) {
-  return BitCast(dw, InterleaveUpper(D(), a, b));
-}
-
-// ================================================== COMBINE
-
-// ------------------------------ Combine (InterleaveLower)
-
-// N = N/2 + N/2 (upper half undefined)
-template <typename T>
-HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi_half, Vec128<T> lo_half) {
-  const Half<decltype(d)> d2;
-  const RebindToUnsigned<decltype(d2)> du2;
-  // Treat half-width input as one lane, and expand to two lanes.
-  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
-  const VU lo{BitCast(du2, lo_half).raw};
-  const VU hi{BitCast(du2, hi_half).raw};
-  return BitCast(d, InterleaveLower(lo, hi));
-}
-
-// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
-
-template <typename T>
-HWY_API Vec256<T> ZeroExtendVector(Full256<T> d, Vec128<T> lo) {
-  return IfThenElseZero(FirstN(d, 16 / sizeof(T)), Vec256<T>{lo.raw});
-}
-
-// ------------------------------ ConcatLowerLower
-
-// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
-template <typename T>
-HWY_API Vec256<T> ConcatLowerLower(Full256<T> /* tag */, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)};
-}
-
-// ------------------------------ ConcatUpperUpper
-
-template <typename T>
-HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  return Vec256<T>{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)};
-}
-
-// ------------------------------ ConcatLowerUpper
-
-template <typename T>
-HWY_API Vec256<T> ConcatLowerUpper(Full256<T> d, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  return CombineShiftRightBytes<8>(d, hi, lo);
-}
-
-// ------------------------------ ConcatUpperLower
-template <typename T>
-HWY_API Vec256<T> ConcatUpperLower(Full256<T> d, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
-}
-
-// ------------------------------ ConcatOdd
-
-// 32-bit
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> ConcatOdd(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
-  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)};
-}
-
-// 64-bit full - no partial because we need at least two inputs to have
-// even/odd.
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> ConcatOdd(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
-  return InterleaveUpper(Full256<T>(), lo, hi);
-}
-
-// ------------------------------ ConcatEven (InterleaveLower)
-
-// 32-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
-  return Vec256<T>{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)};
-}
-
-// 64-bit full - no partial because we need at least two inputs to have
-// even/odd.
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> ConcatEven(Full256<T> /* tag */, Vec256<T> hi, Vec256<T> lo) {
-  return InterleaveLower(Full256<T>(), lo, hi);
-}
-
-// ------------------------------ DupEven
-template <typename T>
-HWY_API Vec256<T> DupEven(Vec256<T> v) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ DupOdd
-template <typename T>
-HWY_API Vec256<T> DupOdd(Vec256<T> v) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ OddEven
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  const Full256<T> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
-                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
-  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
-}
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  return Vec256<T>{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  return Vec256<T>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  return Vec256<T>{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
-  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)};
-}
-
-// ------------------------------ OddEvenBlocks
-template <typename T>
-HWY_API Vec256<T> OddEvenBlocks(Vec256<T> /* odd */, Vec256<T> even) {
-  return even;
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T>
-HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
-  return v;
-}
-
-// ------------------------------ ReverseBlocks
-
-template <typename T>
-HWY_API Vec256<T> ReverseBlocks(Full256<T> /* tag */, const Vec256<T> v) {
-  return v;
-}
-
-// ================================================== CONVERT
-
-// ------------------------------ Promotions (part w/ narrow lanes -> full)
-
-// Unsigned: zero-extend.
-HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
-                                   const Vec128<uint8_t> v) {
-  return Vec256<uint16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
-}
-HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
-                                   const Vec128<uint8_t> v) {
-  return Vec256<uint32_t>{
-      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
-}
-HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
-                                  const Vec128<uint8_t> v) {
-  return Vec256<int16_t>{wasm_u16x8_extend_low_u8x16(v.raw)};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  const Vec128<uint8_t> v) {
-  return Vec256<int32_t>{
-      wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))};
-}
-HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
-                                   const Vec128<uint16_t> v) {
-  return Vec256<uint32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  const Vec128<uint16_t> v) {
-  return Vec256<int32_t>{wasm_u32x4_extend_low_u16x8(v.raw)};
-}
-
-// Signed: replicate sign bit.
-HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
-                                  const Vec128<int8_t> v) {
-  return Vec256<int16_t>{wasm_i16x8_extend_low_i8x16(v.raw)};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  const Vec128<int8_t> v) {
-  return Vec256<int32_t>{
-      wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  const Vec128<int16_t> v) {
-  return Vec256<int32_t>{wasm_i32x4_extend_low_i16x8(v.raw)};
-}
-
-HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
-                                 const Vec128<int32_t> v) {
-  return Vec256<double>{wasm_f64x2_convert_low_i32x4(v.raw)};
-}
-
-HWY_API Vec256<float> PromoteTo(Full256<float> /* tag */,
-                                const Vec128<float16_t> v) {
-  const Full256<int32_t> di32;
-  const Full256<uint32_t> du32;
-  const Full256<float> df32;
-  // Expand to u32 so we can shift.
-  const auto bits16 = PromoteTo(du32, Vec256<uint16_t>{v.raw});
-  const auto sign = ShiftRight<15>(bits16);
-  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
-  const auto mantissa = bits16 & Set(du32, 0x3FF);
-  const auto subnormal =
-      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
-                        Set(df32, 1.0f / 16384 / 1024));
-
-  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
-  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
-  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
-  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
-  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
-}
-
-HWY_API Vec256<float> PromoteTo(Full256<float> df32,
-                                const Vec128<bfloat16_t> v) {
-  const Rebind<uint16_t, decltype(df32)> du16;
-  const RebindToSigned<decltype(df32)> di32;
-  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
-}
-
-// ------------------------------ Demotions (full -> part w/ narrow lanes)
-
-HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
-                                  const Vec256<int32_t> v) {
-  return Vec128<uint16_t>{wasm_u16x8_narrow_i32x4(v.raw, v.raw)};
-}
-
-HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
-                                 const Vec256<int32_t> v) {
-  return Vec128<int16_t>{wasm_i16x8_narrow_i32x4(v.raw, v.raw)};
-}
-
-HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
-                                 const Vec256<int32_t> v) {
-  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
-  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
-}
-
-HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
-                                 const Vec256<int16_t> v) {
-  return Vec128<uint8_t>{wasm_u8x16_narrow_i16x8(v.raw, v.raw)};
-}
-
-HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
-                                const Vec256<int32_t> v) {
-  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
-  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(intermediate, intermediate)};
-}
-
-HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
-                                const Vec256<int16_t> v) {
-  return Vec128<int8_t>{wasm_i8x16_narrow_i16x8(v.raw, v.raw)};
-}
-
-HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* di */,
-                                 const Vec256<double> v) {
-  return Vec128<int32_t>{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)};
-}
-
-HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> /* tag */,
-                                   const Vec256<float> v) {
-  const Full256<int32_t> di;
-  const Full256<uint32_t> du;
-  const Full256<uint16_t> du16;
-  const auto bits32 = BitCast(du, v);
-  const auto sign = ShiftRight<31>(bits32);
-  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
-  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
-
-  const auto k15 = Set(di, 15);
-  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
-  const auto is_tiny = exp < Set(di, -24);
-
-  const auto is_subnormal = exp < Set(di, -14);
-  const auto biased_exp16 =
-      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
-  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
-  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
-                     (mantissa32 >> (Set(du, 13) + sub_exp));
-  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
-                                     ShiftRight<13>(mantissa32));  // <1024
-
-  const auto sign16 = ShiftLeft<15>(sign);
-  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
-  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
-  return Vec128<float16_t>{DemoteTo(du16, bits16).raw};
-}
-
-HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
-                                    const Vec256<float> v) {
-  const Rebind<int32_t, decltype(dbf16)> di32;
-  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
-  const Rebind<uint16_t, decltype(dbf16)> du16;
-  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
-  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
-}
-
-HWY_API Vec128<bfloat16_t> ReorderDemote2To(Full128<bfloat16_t> dbf16,
-                                            Vec256<float> a, Vec256<float> b) {
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-// For already range-limited input [0, 255].
-HWY_API Vec256<uint8_t> U8FromU32(const Vec256<uint32_t> v) {
-  const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw);
-  return Vec256<uint8_t>{wasm_u8x16_narrow_i16x8(intermediate, intermediate)};
-}
-
-// ------------------------------ Truncations
-
-HWY_API Vec256<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
-                                      const Vec256<uint64_t> v) {
-  return Vec256<uint8_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 8, 16, 24,
-                                               0, 8, 16, 24, 0, 8, 16, 24, 0, 8,
-                                               16, 24)};
-}
-
-HWY_API Vec256<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
-                                       const Vec256<uint64_t> v) {
-  return Vec256<uint16_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 8, 9,
-                                                16, 17, 24, 25, 0, 1, 8, 9, 16,
-                                                17, 24, 25)};
-}
-
-HWY_API Vec256<uint32_t, 4> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
-                                       const Vec256<uint64_t> v) {
-  return Vec256<uint32_t, 4>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 2, 3,
-                                                8, 9, 10, 11, 16, 17, 18, 19,
-                                                24, 25, 26, 27)};
-}
-
-HWY_API Vec256<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
-                                      const Vec256<uint32_t> v) {
-  return Vec256<uint8_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 4, 8, 12,
-                                               16, 20, 24, 28, 0, 4, 8, 12, 16,
-                                               20, 24, 28)};
-}
-
-HWY_API Vec256<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
-                                       const Vec256<uint32_t> v) {
-  return Vec256<uint16_t, 8>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 1, 4, 5,
-                                                8, 9, 12, 13, 16, 17, 20, 21,
-                                                24, 25, 28, 29)};
-}
-
-HWY_API Vec256<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
-                                       const Vec256<uint16_t> v) {
-  return Vec256<uint8_t, 16>{wasm_i8x16_shuffle(v.v0.raw, v.v1.raw, 0, 2, 4, 6,
-                                                8, 10, 12, 14, 16, 18, 20, 22,
-                                                24, 26, 28, 30)};
-}
-
-// ------------------------------ Convert i32 <=> f32 (Round)
-
-HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
-                                const Vec256<int32_t> v) {
-  return Vec256<float>{wasm_f32x4_convert_i32x4(v.raw)};
-}
-HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
-                                const Vec256<uint32_t> v) {
-  return Vec256<float>{wasm_f32x4_convert_u32x4(v.raw)};
-}
-// Truncates (rounds toward zero).
-HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> /* tag */,
-                                  const Vec256<float> v) {
-  return Vec256<int32_t>{wasm_i32x4_trunc_sat_f32x4(v.raw)};
-}
-
-HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
-  return ConvertTo(Full256<int32_t>(), Round(v));
-}
-
-// ================================================== MISC
-
-// ------------------------------ LoadMaskBits (TestBit)
-
-namespace detail {
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  // Easier than Set(), which would require an >8-bit type, which would not
-  // compile for T=uint8_t, N=1.
-  const Vec256<T> vbits{wasm_i32x4_splat(static_cast<int32_t>(bits))};
-
-  // Replicate bytes 8x such that each byte contains the bit that governs it.
-  alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
-                                             1, 1, 1, 1, 1, 1, 1, 1};
-  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
-
-  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
-                                            1, 2, 4, 8, 16, 32, 64, 128};
-  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
-  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Mask256<T> LoadMaskBits(Full256<T> d, uint64_t bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(32) constexpr uint64_t kBit[8] = {1, 2};
-  return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit)));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T>
-HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
-                                const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  CopyBytes<(N + 7) / 8>(bits, &mask_bits);
-  return detail::LoadMaskBits(d, mask_bits);
-}
-
-// ------------------------------ Mask
-
-namespace detail {
-
-// Full
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
-                                 const Mask128<T> mask) {
-  alignas(32) uint64_t lanes[2];
-  wasm_v128_store(lanes, mask.raw);
-
-  constexpr uint64_t kMagic = 0x103070F1F3F80ULL;
-  const uint64_t lo = ((lanes[0] * kMagic) >> 56);
-  const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00;
-  return (hi + lo);
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
-                                 const Mask256<T> mask) {
-  // Remove useless lower half of each u16 while preserving the sign bit.
-  const __i16x8 zero = wasm_i16x8_splat(0);
-  const Mask256<uint8_t> mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)};
-  return BitsFromMask(hwy::SizeTag<1>(), mask8);
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
-                                 const Mask256<T> mask) {
-  const __i32x4 mask_i = static_cast<__i32x4>(mask.raw);
-  const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8);
-  const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice);
-  alignas(32) uint32_t lanes[4];
-  wasm_v128_store(lanes, sliced_mask);
-  return lanes[0] | lanes[1] | lanes[2] | lanes[3];
-}
-
-// Returns 0xFF for bytes with index >= N, otherwise 0.
-constexpr __i8x16 BytesAbove() {
-  return /**/
-      (N == 0)    ? wasm_i32x4_make(-1, -1, -1, -1)
-      : (N == 4)  ? wasm_i32x4_make(0, -1, -1, -1)
-      : (N == 8)  ? wasm_i32x4_make(0, 0, -1, -1)
-      : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1)
-      : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0)
-      : (N == 2)  ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1)
-      : (N == 6)  ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1)
-      : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1)
-      : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1)
-      : (N == 1)  ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1, -1, -1)
-      : (N == 3)  ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1, -1)
-      : (N == 5)  ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1, -1)
-      : (N == 7)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
-                                   -1, -1, -1)
-      : (N == 9)  ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,
-                                   -1, -1, -1)
-      : (N == 11)
-          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1)
-      : (N == 13)
-          ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1)
-          : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1);
-}
-
-template <typename T>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  return BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask);
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128<T> m) {
-  return PopCount(BitsFromMask(tag, m));
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128<T> m) {
-  return PopCount(BitsFromMask(tag, m));
-}
-
-template <typename T>
-HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
-  const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8);
-  const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift);
-  alignas(32) uint64_t lanes[2];
-  wasm_v128_store(lanes, shifted_bits);
-  return PopCount(lanes[0] | lanes[1]);
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 writable bytes.
-template <typename T>
-HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
-                             uint8_t* bits) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  const size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(&mask_bits, bits);
-  return kNumBytes;
-}
-
-template <typename T>
-HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask128<T> m) {
-  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), m);
-}
-
-template <typename T>
-HWY_API bool AllFalse(const Full256<T> d, const Mask128<T> m) {
-#if 0
-  // Casting followed by wasm_i8x16_any_true results in wasm error:
-  // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128
-  const auto v8 = BitCast(Full256<int8_t>(), VecFromMask(d, m));
-  return !wasm_i8x16_any_true(v8.raw);
-#else
-  (void)d;
-  return (wasm_i64x2_extract_lane(m.raw, 0) |
-          wasm_i64x2_extract_lane(m.raw, 1)) == 0;
-#endif
-}
-
-// Full vector
-namespace detail {
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128<T> m) {
-  return wasm_i8x16_all_true(m.raw);
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128<T> m) {
-  return wasm_i16x8_all_true(m.raw);
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128<T> m) {
-  return wasm_i32x4_all_true(m.raw);
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask128<T> m) {
-  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), m);
-}
-
-template <typename T>
-HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
-                               const Mask256<T> mask) {
-  const uint64_t bits = detail::BitsFromMask(mask);
-  return bits ? Num0BitsBelowLS1Bit_Nonzero64(bits) : -1;
-}
-
-// ------------------------------ Compress
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> Idx16x8FromBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 256);
-  const Full256<T> d;
-  const Rebind<uint8_t, decltype(d)> d8;
-  const Full256<uint16_t> du;
-
-  // We need byte indices for TableLookupBytes (one vector's worth for each of
-  // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We
-  // can instead store lane indices and convert to byte indices (2*lane + 0..1),
-  // with the doubling baked into the table. Unpacking nibbles is likely more
-  // costly than the higher cache footprint from storing bytes.
-  alignas(32) constexpr uint8_t table[256 * 8] = {
-      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,
-      0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,  0,  0,  0,  4,  0,  0,  0,
-      0,  0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  2,  4,  0,  0,  0,  0,
-      0,  0,  0,  2,  4,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,  0,
-      0,  6,  0,  0,  0,  0,  0,  0,  2,  6,  0,  0,  0,  0,  0,  0,  0,  2,
-      6,  0,  0,  0,  0,  0,  4,  6,  0,  0,  0,  0,  0,  0,  0,  4,  6,  0,
-      0,  0,  0,  0,  2,  4,  6,  0,  0,  0,  0,  0,  0,  2,  4,  6,  0,  0,
-      0,  0,  8,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0,  0,
-      2,  8,  0,  0,  0,  0,  0,  0,  0,  2,  8,  0,  0,  0,  0,  0,  4,  8,
-      0,  0,  0,  0,  0,  0,  0,  4,  8,  0,  0,  0,  0,  0,  2,  4,  8,  0,
-      0,  0,  0,  0,  0,  2,  4,  8,  0,  0,  0,  0,  6,  8,  0,  0,  0,  0,
-      0,  0,  0,  6,  8,  0,  0,  0,  0,  0,  2,  6,  8,  0,  0,  0,  0,  0,
-      0,  2,  6,  8,  0,  0,  0,  0,  4,  6,  8,  0,  0,  0,  0,  0,  0,  4,
-      6,  8,  0,  0,  0,  0,  2,  4,  6,  8,  0,  0,  0,  0,  0,  2,  4,  6,
-      8,  0,  0,  0,  10, 0,  0,  0,  0,  0,  0,  0,  0,  10, 0,  0,  0,  0,
-      0,  0,  2,  10, 0,  0,  0,  0,  0,  0,  0,  2,  10, 0,  0,  0,  0,  0,
-      4,  10, 0,  0,  0,  0,  0,  0,  0,  4,  10, 0,  0,  0,  0,  0,  2,  4,
-      10, 0,  0,  0,  0,  0,  0,  2,  4,  10, 0,  0,  0,  0,  6,  10, 0,  0,
-      0,  0,  0,  0,  0,  6,  10, 0,  0,  0,  0,  0,  2,  6,  10, 0,  0,  0,
-      0,  0,  0,  2,  6,  10, 0,  0,  0,  0,  4,  6,  10, 0,  0,  0,  0,  0,
-      0,  4,  6,  10, 0,  0,  0,  0,  2,  4,  6,  10, 0,  0,  0,  0,  0,  2,
-      4,  6,  10, 0,  0,  0,  8,  10, 0,  0,  0,  0,  0,  0,  0,  8,  10, 0,
-      0,  0,  0,  0,  2,  8,  10, 0,  0,  0,  0,  0,  0,  2,  8,  10, 0,  0,
-      0,  0,  4,  8,  10, 0,  0,  0,  0,  0,  0,  4,  8,  10, 0,  0,  0,  0,
-      2,  4,  8,  10, 0,  0,  0,  0,  0,  2,  4,  8,  10, 0,  0,  0,  6,  8,
-      10, 0,  0,  0,  0,  0,  0,  6,  8,  10, 0,  0,  0,  0,  2,  6,  8,  10,
-      0,  0,  0,  0,  0,  2,  6,  8,  10, 0,  0,  0,  4,  6,  8,  10, 0,  0,
-      0,  0,  0,  4,  6,  8,  10, 0,  0,  0,  2,  4,  6,  8,  10, 0,  0,  0,
-      0,  2,  4,  6,  8,  10, 0,  0,  12, 0,  0,  0,  0,  0,  0,  0,  0,  12,
-      0,  0,  0,  0,  0,  0,  2,  12, 0,  0,  0,  0,  0,  0,  0,  2,  12, 0,
-      0,  0,  0,  0,  4,  12, 0,  0,  0,  0,  0,  0,  0,  4,  12, 0,  0,  0,
-      0,  0,  2,  4,  12, 0,  0,  0,  0,  0,  0,  2,  4,  12, 0,  0,  0,  0,
-      6,  12, 0,  0,  0,  0,  0,  0,  0,  6,  12, 0,  0,  0,  0,  0,  2,  6,
-      12, 0,  0,  0,  0,  0,  0,  2,  6,  12, 0,  0,  0,  0,  4,  6,  12, 0,
-      0,  0,  0,  0,  0,  4,  6,  12, 0,  0,  0,  0,  2,  4,  6,  12, 0,  0,
-      0,  0,  0,  2,  4,  6,  12, 0,  0,  0,  8,  12, 0,  0,  0,  0,  0,  0,
-      0,  8,  12, 0,  0,  0,  0,  0,  2,  8,  12, 0,  0,  0,  0,  0,  0,  2,
-      8,  12, 0,  0,  0,  0,  4,  8,  12, 0,  0,  0,  0,  0,  0,  4,  8,  12,
-      0,  0,  0,  0,  2,  4,  8,  12, 0,  0,  0,  0,  0,  2,  4,  8,  12, 0,
-      0,  0,  6,  8,  12, 0,  0,  0,  0,  0,  0,  6,  8,  12, 0,  0,  0,  0,
-      2,  6,  8,  12, 0,  0,  0,  0,  0,  2,  6,  8,  12, 0,  0,  0,  4,  6,
-      8,  12, 0,  0,  0,  0,  0,  4,  6,  8,  12, 0,  0,  0,  2,  4,  6,  8,
-      12, 0,  0,  0,  0,  2,  4,  6,  8,  12, 0,  0,  10, 12, 0,  0,  0,  0,
-      0,  0,  0,  10, 12, 0,  0,  0,  0,  0,  2,  10, 12, 0,  0,  0,  0,  0,
-      0,  2,  10, 12, 0,  0,  0,  0,  4,  10, 12, 0,  0,  0,  0,  0,  0,  4,
-      10, 12, 0,  0,  0,  0,  2,  4,  10, 12, 0,  0,  0,  0,  0,  2,  4,  10,
-      12, 0,  0,  0,  6,  10, 12, 0,  0,  0,  0,  0,  0,  6,  10, 12, 0,  0,
-      0,  0,  2,  6,  10, 12, 0,  0,  0,  0,  0,  2,  6,  10, 12, 0,  0,  0,
-      4,  6,  10, 12, 0,  0,  0,  0,  0,  4,  6,  10, 12, 0,  0,  0,  2,  4,
-      6,  10, 12, 0,  0,  0,  0,  2,  4,  6,  10, 12, 0,  0,  8,  10, 12, 0,
-      0,  0,  0,  0,  0,  8,  10, 12, 0,  0,  0,  0,  2,  8,  10, 12, 0,  0,
-      0,  0,  0,  2,  8,  10, 12, 0,  0,  0,  4,  8,  10, 12, 0,  0,  0,  0,
-      0,  4,  8,  10, 12, 0,  0,  0,  2,  4,  8,  10, 12, 0,  0,  0,  0,  2,
-      4,  8,  10, 12, 0,  0,  6,  8,  10, 12, 0,  0,  0,  0,  0,  6,  8,  10,
-      12, 0,  0,  0,  2,  6,  8,  10, 12, 0,  0,  0,  0,  2,  6,  8,  10, 12,
-      0,  0,  4,  6,  8,  10, 12, 0,  0,  0,  0,  4,  6,  8,  10, 12, 0,  0,
-      2,  4,  6,  8,  10, 12, 0,  0,  0,  2,  4,  6,  8,  10, 12, 0,  14, 0,
-      0,  0,  0,  0,  0,  0,  0,  14, 0,  0,  0,  0,  0,  0,  2,  14, 0,  0,
-      0,  0,  0,  0,  0,  2,  14, 0,  0,  0,  0,  0,  4,  14, 0,  0,  0,  0,
-      0,  0,  0,  4,  14, 0,  0,  0,  0,  0,  2,  4,  14, 0,  0,  0,  0,  0,
-      0,  2,  4,  14, 0,  0,  0,  0,  6,  14, 0,  0,  0,  0,  0,  0,  0,  6,
-      14, 0,  0,  0,  0,  0,  2,  6,  14, 0,  0,  0,  0,  0,  0,  2,  6,  14,
-      0,  0,  0,  0,  4,  6,  14, 0,  0,  0,  0,  0,  0,  4,  6,  14, 0,  0,
-      0,  0,  2,  4,  6,  14, 0,  0,  0,  0,  0,  2,  4,  6,  14, 0,  0,  0,
-      8,  14, 0,  0,  0,  0,  0,  0,  0,  8,  14, 0,  0,  0,  0,  0,  2,  8,
-      14, 0,  0,  0,  0,  0,  0,  2,  8,  14, 0,  0,  0,  0,  4,  8,  14, 0,
-      0,  0,  0,  0,  0,  4,  8,  14, 0,  0,  0,  0,  2,  4,  8,  14, 0,  0,
-      0,  0,  0,  2,  4,  8,  14, 0,  0,  0,  6,  8,  14, 0,  0,  0,  0,  0,
-      0,  6,  8,  14, 0,  0,  0,  0,  2,  6,  8,  14, 0,  0,  0,  0,  0,  2,
-      6,  8,  14, 0,  0,  0,  4,  6,  8,  14, 0,  0,  0,  0,  0,  4,  6,  8,
-      14, 0,  0,  0,  2,  4,  6,  8,  14, 0,  0,  0,  0,  2,  4,  6,  8,  14,
-      0,  0,  10, 14, 0,  0,  0,  0,  0,  0,  0,  10, 14, 0,  0,  0,  0,  0,
-      2,  10, 14, 0,  0,  0,  0,  0,  0,  2,  10, 14, 0,  0,  0,  0,  4,  10,
-      14, 0,  0,  0,  0,  0,  0,  4,  10, 14, 0,  0,  0,  0,  2,  4,  10, 14,
-      0,  0,  0,  0,  0,  2,  4,  10, 14, 0,  0,  0,  6,  10, 14, 0,  0,  0,
-      0,  0,  0,  6,  10, 14, 0,  0,  0,  0,  2,  6,  10, 14, 0,  0,  0,  0,
-      0,  2,  6,  10, 14, 0,  0,  0,  4,  6,  10, 14, 0,  0,  0,  0,  0,  4,
-      6,  10, 14, 0,  0,  0,  2,  4,  6,  10, 14, 0,  0,  0,  0,  2,  4,  6,
-      10, 14, 0,  0,  8,  10, 14, 0,  0,  0,  0,  0,  0,  8,  10, 14, 0,  0,
-      0,  0,  2,  8,  10, 14, 0,  0,  0,  0,  0,  2,  8,  10, 14, 0,  0,  0,
-      4,  8,  10, 14, 0,  0,  0,  0,  0,  4,  8,  10, 14, 0,  0,  0,  2,  4,
-      8,  10, 14, 0,  0,  0,  0,  2,  4,  8,  10, 14, 0,  0,  6,  8,  10, 14,
-      0,  0,  0,  0,  0,  6,  8,  10, 14, 0,  0,  0,  2,  6,  8,  10, 14, 0,
-      0,  0,  0,  2,  6,  8,  10, 14, 0,  0,  4,  6,  8,  10, 14, 0,  0,  0,
-      0,  4,  6,  8,  10, 14, 0,  0,  2,  4,  6,  8,  10, 14, 0,  0,  0,  2,
-      4,  6,  8,  10, 14, 0,  12, 14, 0,  0,  0,  0,  0,  0,  0,  12, 14, 0,
-      0,  0,  0,  0,  2,  12, 14, 0,  0,  0,  0,  0,  0,  2,  12, 14, 0,  0,
-      0,  0,  4,  12, 14, 0,  0,  0,  0,  0,  0,  4,  12, 14, 0,  0,  0,  0,
-      2,  4,  12, 14, 0,  0,  0,  0,  0,  2,  4,  12, 14, 0,  0,  0,  6,  12,
-      14, 0,  0,  0,  0,  0,  0,  6,  12, 14, 0,  0,  0,  0,  2,  6,  12, 14,
-      0,  0,  0,  0,  0,  2,  6,  12, 14, 0,  0,  0,  4,  6,  12, 14, 0,  0,
-      0,  0,  0,  4,  6,  12, 14, 0,  0,  0,  2,  4,  6,  12, 14, 0,  0,  0,
-      0,  2,  4,  6,  12, 14, 0,  0,  8,  12, 14, 0,  0,  0,  0,  0,  0,  8,
-      12, 14, 0,  0,  0,  0,  2,  8,  12, 14, 0,  0,  0,  0,  0,  2,  8,  12,
-      14, 0,  0,  0,  4,  8,  12, 14, 0,  0,  0,  0,  0,  4,  8,  12, 14, 0,
-      0,  0,  2,  4,  8,  12, 14, 0,  0,  0,  0,  2,  4,  8,  12, 14, 0,  0,
-      6,  8,  12, 14, 0,  0,  0,  0,  0,  6,  8,  12, 14, 0,  0,  0,  2,  6,
-      8,  12, 14, 0,  0,  0,  0,  2,  6,  8,  12, 14, 0,  0,  4,  6,  8,  12,
-      14, 0,  0,  0,  0,  4,  6,  8,  12, 14, 0,  0,  2,  4,  6,  8,  12, 14,
-      0,  0,  0,  2,  4,  6,  8,  12, 14, 0,  10, 12, 14, 0,  0,  0,  0,  0,
-      0,  10, 12, 14, 0,  0,  0,  0,  2,  10, 12, 14, 0,  0,  0,  0,  0,  2,
-      10, 12, 14, 0,  0,  0,  4,  10, 12, 14, 0,  0,  0,  0,  0,  4,  10, 12,
-      14, 0,  0,  0,  2,  4,  10, 12, 14, 0,  0,  0,  0,  2,  4,  10, 12, 14,
-      0,  0,  6,  10, 12, 14, 0,  0,  0,  0,  0,  6,  10, 12, 14, 0,  0,  0,
-      2,  6,  10, 12, 14, 0,  0,  0,  0,  2,  6,  10, 12, 14, 0,  0,  4,  6,
-      10, 12, 14, 0,  0,  0,  0,  4,  6,  10, 12, 14, 0,  0,  2,  4,  6,  10,
-      12, 14, 0,  0,  0,  2,  4,  6,  10, 12, 14, 0,  8,  10, 12, 14, 0,  0,
-      0,  0,  0,  8,  10, 12, 14, 0,  0,  0,  2,  8,  10, 12, 14, 0,  0,  0,
-      0,  2,  8,  10, 12, 14, 0,  0,  4,  8,  10, 12, 14, 0,  0,  0,  0,  4,
-      8,  10, 12, 14, 0,  0,  2,  4,  8,  10, 12, 14, 0,  0,  0,  2,  4,  8,
-      10, 12, 14, 0,  6,  8,  10, 12, 14, 0,  0,  0,  0,  6,  8,  10, 12, 14,
-      0,  0,  2,  6,  8,  10, 12, 14, 0,  0,  0,  2,  6,  8,  10, 12, 14, 0,
-      4,  6,  8,  10, 12, 14, 0,  0,  0,  4,  6,  8,  10, 12, 14, 0,  2,  4,
-      6,  8,  10, 12, 14, 0,  0,  2,  4,  6,  8,  10, 12, 14};
-
-  const Vec256<uint8_t> byte_idx{Load(d8, table + mask_bits * 8).raw};
-  const Vec256<uint16_t> pairs = ZipLower(byte_idx, byte_idx);
-  return BitCast(d, pairs + Set(du, 0x0100));
-}
-
-template <typename T>
-HWY_INLINE Vec256<T> Idx32x4FromBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 16);
-
-  // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(32) constexpr uint8_t packed_array[16 * 16] = {
-      0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
-      0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
-      4,  5,  6,  7,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  0,  1,  2,  3,  //
-      8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
-      0,  1,  2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,  //
-      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  0,  1,  2,  3,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  //
-      12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  //
-      0,  1,  2,  3,  12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  //
-      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  //
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,  //
-      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  //
-      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
-
-  const Full256<T> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
-}
-
-#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
-
-template <typename T>
-HWY_INLINE Vec256<T> Idx64x2FromBits(const uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(32) constexpr uint8_t packed_array[4 * 16] = {
-      0, 1, 2,  3,  4,  5,  6,  7,  0, 1, 2,  3,  4,  5,  6,  7,  //
-      0, 1, 2,  3,  4,  5,  6,  7,  0, 1, 2,  3,  4,  5,  6,  7,  //
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,  //
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Full256<T> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, packed_array + 16 * mask_bits));
-}
-
-#endif
-
-// Helper functions called by both Compress and CompressStore - avoids a
-// redundant BitsFromMask in the latter.
-
-template <typename T>
-HWY_INLINE Vec256<T> Compress(hwy::SizeTag<2> /*tag*/, Vec256<T> v,
-                              const uint64_t mask_bits) {
-  const auto idx = detail::Idx16x8FromBits<T>(mask_bits);
-  using D = Full256<T>;
-  const RebindToSigned<D> di;
-  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-}
-
-template <typename T>
-HWY_INLINE Vec256<T> Compress(hwy::SizeTag<4> /*tag*/, Vec256<T> v,
-                              const uint64_t mask_bits) {
-  const auto idx = detail::Idx32x4FromBits<T>(mask_bits);
-  using D = Full256<T>;
-  const RebindToSigned<D> di;
-  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-}
-
-#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
-
-template <typename T>
-HWY_INLINE Vec256<uint64_t> Compress(hwy::SizeTag<8> /*tag*/,
-                                     Vec256<uint64_t> v,
-                                     const uint64_t mask_bits) {
-  const auto idx = detail::Idx64x2FromBits<uint64_t>(mask_bits);
-  using D = Full256<T>;
-  const RebindToSigned<D> di;
-  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
-}
-
-#endif
-
-}  // namespace detail
-
-template <typename T>
-struct CompressIsPartition {
-  enum { value = 1 };
-};
-
-template <typename T>
-HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
-}
-
-// ------------------------------ CompressNot
-template <typename T>
-HWY_API Vec256<T> Compress(Vec256<T> v, const Mask256<T> mask) {
-  return Compress(v, Not(mask));
-}
-
-// ------------------------------ CompressBlocksNot
-HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
-                                           Mask256<uint64_t> mask) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-// ------------------------------ CompressBits
-
-template <typename T>
-HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
-}
-
-// ------------------------------ CompressStore
-template <typename T>
-HWY_API size_t CompressStore(Vec256<T> v, const Mask256<T> mask, Full256<T> d,
-                             T* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
-  StoreU(c, d, unaligned);
-  return PopCount(mask_bits);
-}
-
-// ------------------------------ CompressBlendedStore
-template <typename T>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
-  using TU = TFromD<decltype(du)>;
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  const size_t count = PopCount(mask_bits);
-  const Mask256<TU> store_mask = FirstN(du, count);
-  const Vec256<TU> compressed =
-      detail::Compress(hwy::SizeTag<sizeof(T)>(), BitCast(du, v), mask_bits);
-  const Vec256<TU> prev = BitCast(du, LoadU(d, unaligned));
-  StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned);
-  return count;
-}
-
-// ------------------------------ CompressBitsStore
-
-template <typename T>
-HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
-                                 Full256<T> d, T* HWY_RESTRICT unaligned) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  const auto c = detail::Compress(hwy::SizeTag<sizeof(T)>(), v, mask_bits);
-  StoreU(c, d, unaligned);
-  return PopCount(mask_bits);
-}
-
-// ------------------------------ StoreInterleaved2/3/4
-
-// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
-// generic_ops-inl.h.
-
-// ------------------------------ MulEven/Odd (Load)
-
-HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
-                                    const Vec256<uint64_t> b) {
-  alignas(32) uint64_t mul[2];
-  mul[0] =
-      Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 0)),
-             static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]);
-  return Load(Full256<uint64_t>(), mul);
-}
-
-HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
-                                   const Vec256<uint64_t> b) {
-  alignas(32) uint64_t mul[2];
-  mul[0] =
-      Mul128(static_cast<uint64_t>(wasm_i64x2_extract_lane(a.raw, 1)),
-             static_cast<uint64_t>(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]);
-  return Load(Full256<uint64_t>(), mul);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
-                                                Vec256<bfloat16_t> a,
-                                                Vec256<bfloat16_t> b,
-                                                const Vec256<float> sum0,
-                                                Vec256<float>& sum1) {
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const Vec256<uint16_t> zero = Zero(du16);
-  const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ------------------------------ Reductions
-
-namespace detail {
-
-// u32/i32/f32:
-
-template <typename T>
-HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec256<T> v3210) {
-  const Vec256<T> v1032 = Shuffle1032(v3210);
-  const Vec256<T> v31_20_31_20 = v3210 + v1032;
-  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return v20_31_20_31 + v31_20_31_20;
-}
-template <typename T>
-HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec256<T> v3210) {
-  const Vec256<T> v1032 = Shuffle1032(v3210);
-  const Vec256<T> v31_20_31_20 = Min(v3210, v1032);
-  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Min(v20_31_20_31, v31_20_31_20);
-}
-template <typename T>
-HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec256<T> v3210) {
-  const Vec256<T> v1032 = Shuffle1032(v3210);
-  const Vec256<T> v31_20_31_20 = Max(v3210, v1032);
-  const Vec256<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Max(v20_31_20_31, v31_20_31_20);
-}
-
-// u64/i64/f64:
-
-template <typename T>
-HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec256<T> v10) {
-  const Vec256<T> v01 = Shuffle01(v10);
-  return v10 + v01;
-}
-template <typename T>
-HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec256<T> v10) {
-  const Vec256<T> v01 = Shuffle01(v10);
-  return Min(v10, v01);
-}
-template <typename T>
-HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec256<T> v10) {
-  const Vec256<T> v01 = Shuffle01(v10);
-  return Max(v10, v01);
-}
-
-// u16/i16
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
-  HWY_ASSERT(0);  // Not implemented
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> /*v*/) {
-  HWY_ASSERT(0);  // Not implemented
-}
-
-}  // namespace detail
-
-// Supported for u/i/f 32/64. Returns the same value in each lane.
-template <typename T>
-HWY_API Vec256<T> SumOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
-  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-template <typename T>
-HWY_API Vec256<T> MinOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
-  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-template <typename T>
-HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
-  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-// ------------------------------ Lt128
-
-template <typename T>
-HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-template <typename T>
-HWY_INLINE Mask256<T> Lt128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-template <typename T>
-HWY_INLINE Mask256<T> Eq128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-template <typename T>
-HWY_INLINE Mask256<T> Eq128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-template <typename T>
-HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-template <typename T>
-HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-template <typename T>
-HWY_INLINE Vec256<T> Min128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-template <typename T>
-HWY_INLINE Vec256<T> Max128Upper(Full256<T> d, Vec256<T> a, Vec256<T> b) {}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
diff --git a/third_party/highway/hwy/ops/x86_128-inl.h b/third_party/highway/hwy/ops/x86_128-inl.h
deleted file mode 100644 (file)
index 17f6ce3..0000000
+++ /dev/null
@@ -1,7333 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
-// operations when compiling for those targets.
-// External include guard in highway.h - see comment there.
-
-// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
-#include "hwy/base.h"
-
-// Avoid uninitialized warnings in GCC's emmintrin.h - see
-// https://github.com/google/highway/issues/710 and pull/902)
-HWY_DIAGNOSTICS(push)
-#if HWY_COMPILER_GCC_ACTUAL
-HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
-HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
-#endif
-
-#include <emmintrin.h>
-#include <stdio.h>
-#if HWY_TARGET == HWY_SSSE3
-#include <tmmintrin.h>  // SSSE3
-#else
-#include <smmintrin.h>  // SSE4
-#include <wmmintrin.h>  // CLMUL
-#endif
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcpy
-
-#include "hwy/ops/shared-inl.h"
-
-#if HWY_IS_MSAN
-#include <sanitizer/msan_interface.h>
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-#if HWY_TARGET <= HWY_AVX2
-template <typename T>
-using Full256 = Simd<T, 32 / sizeof(T), 0>;
-#endif
-
-#if HWY_TARGET <= HWY_AVX3
-template <typename T>
-using Full512 = Simd<T, 64 / sizeof(T), 0>;
-#endif
-
-namespace detail {
-
-template <typename T>
-struct Raw128 {
-  using type = __m128i;
-};
-template <>
-struct Raw128<float> {
-  using type = __m128;
-};
-template <>
-struct Raw128<double> {
-  using type = __m128d;
-};
-
-}  // namespace detail
-
-template <typename T, size_t N = 16 / sizeof(T)>
-class Vec128 {
-  using Raw = typename detail::Raw128<T>::type;
-
- public:
-  // Compound assignment. Only usable if there is a corresponding non-member
-  // binary operator overload. For example, only f32 and f64 support division.
-  HWY_INLINE Vec128& operator*=(const Vec128 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec128& operator/=(const Vec128 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec128& operator+=(const Vec128 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec128& operator-=(const Vec128 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec128& operator&=(const Vec128 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec128& operator|=(const Vec128 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec128& operator^=(const Vec128 other) {
-    return *this = (*this ^ other);
-  }
-
-  Raw raw;
-};
-
-template <typename T>
-using Vec64 = Vec128<T, 8 / sizeof(T)>;
-
-template <typename T>
-using Vec32 = Vec128<T, 4 / sizeof(T)>;
-
-#if HWY_TARGET <= HWY_AVX3
-
-// Forward-declare for use by DeduceD, see below.
-template <typename T>
-class Vec512;
-
-namespace detail {
-
-// Template arg: sizeof(lane type)
-template <size_t size>
-struct RawMask128 {};
-template <>
-struct RawMask128<1> {
-  using type = __mmask16;
-};
-template <>
-struct RawMask128<2> {
-  using type = __mmask8;
-};
-template <>
-struct RawMask128<4> {
-  using type = __mmask8;
-};
-template <>
-struct RawMask128<8> {
-  using type = __mmask8;
-};
-
-}  // namespace detail
-
-template <typename T, size_t N = 16 / sizeof(T)>
-struct Mask128 {
-  using Raw = typename detail::RawMask128<sizeof(T)>::type;
-
-  static Mask128<T, N> FromBits(uint64_t mask_bits) {
-    return Mask128<T, N>{static_cast<Raw>(mask_bits)};
-  }
-
-  Raw raw;
-};
-
-#else  // AVX2 or below
-
-// FF..FF or 0.
-template <typename T, size_t N = 16 / sizeof(T)>
-struct Mask128 {
-  typename detail::Raw128<T>::type raw;
-};
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-#if HWY_TARGET <= HWY_AVX2
-// Forward-declare for use by DeduceD, see below.
-template <typename T>
-class Vec256;
-#endif
-
-namespace detail {
-
-// Deduce Simd<T, N, 0> from Vec*<T, N> (pointers because Vec256/512 may be
-// incomplete types at this point; this is simpler than avoiding multiple
-// definitions of DFromV via #if)
-struct DeduceD {
-  template <typename T, size_t N>
-  Simd<T, N, 0> operator()(const Vec128<T, N>*) const {
-    return Simd<T, N, 0>();
-  }
-#if HWY_TARGET <= HWY_AVX2
-  template <typename T>
-  Full256<T> operator()(const hwy::HWY_NAMESPACE::Vec256<T>*) const {
-    return Full256<T>();
-  }
-#endif
-#if HWY_TARGET <= HWY_AVX3
-  template <typename T>
-  Full512<T> operator()(const hwy::HWY_NAMESPACE::Vec512<T>*) const {
-    return Full512<T>();
-  }
-#endif
-};
-
-// Workaround for MSVC v19.14: alias with a dependent type fails to specialize.
-template <class V>
-struct ExpandDFromV {
-  using type = decltype(DeduceD()(static_cast<V*>(nullptr)));
-};
-
-}  // namespace detail
-
-template <class V>
-using DFromV = typename detail::ExpandDFromV<V>::type;
-
-template <class V>
-using TFromV = TFromD<DFromV<V>>;
-
-// ------------------------------ BitCast
-
-namespace detail {
-
-HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; }
-HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); }
-HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); }
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<uint8_t, N * sizeof(T)> BitCastToByte(Vec128<T, N> v) {
-  return Vec128<uint8_t, N * sizeof(T)>{BitCastToInteger(v.raw)};
-}
-
-// Cannot rely on function overloading because return types differ.
-template <typename T>
-struct BitCastFromInteger128 {
-  HWY_INLINE __m128i operator()(__m128i v) { return v; }
-};
-template <>
-struct BitCastFromInteger128<float> {
-  HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); }
-};
-template <>
-struct BitCastFromInteger128<double> {
-  HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); }
-};
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> BitCastFromByte(Simd<T, N, 0> /* tag */,
-                                        Vec128<uint8_t, N * sizeof(T)> v) {
-  return Vec128<T, N>{BitCastFromInteger128<T>()(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, typename FromT>
-HWY_API Vec128<T, N> BitCast(Simd<T, N, 0> d,
-                             Vec128<FromT, N * sizeof(T) / sizeof(FromT)> v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
-}
-
-// ------------------------------ Zero
-
-// Returns an all-zero vector/part.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> Zero(Simd<T, N, 0> /* tag */) {
-  return Vec128<T, N>{_mm_setzero_si128()};
-}
-template <size_t N, HWY_IF_LE128(float, N)>
-HWY_API Vec128<float, N> Zero(Simd<float, N, 0> /* tag */) {
-  return Vec128<float, N>{_mm_setzero_ps()};
-}
-template <size_t N, HWY_IF_LE128(double, N)>
-HWY_API Vec128<double, N> Zero(Simd<double, N, 0> /* tag */) {
-  return Vec128<double, N>{_mm_setzero_pd()};
-}
-
-template <class D>
-using VFromD = decltype(Zero(D()));
-
-// ------------------------------ Set
-
-// Returns a vector/part with all lanes set to "t".
-template <size_t N, HWY_IF_LE128(uint8_t, N)>
-HWY_API Vec128<uint8_t, N> Set(Simd<uint8_t, N, 0> /* tag */, const uint8_t t) {
-  return Vec128<uint8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
-}
-template <size_t N, HWY_IF_LE128(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> Set(Simd<uint16_t, N, 0> /* tag */,
-                                const uint16_t t) {
-  return Vec128<uint16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
-}
-template <size_t N, HWY_IF_LE128(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> Set(Simd<uint32_t, N, 0> /* tag */,
-                                const uint32_t t) {
-  return Vec128<uint32_t, N>{_mm_set1_epi32(static_cast<int>(t))};
-}
-template <size_t N, HWY_IF_LE128(uint64_t, N)>
-HWY_API Vec128<uint64_t, N> Set(Simd<uint64_t, N, 0> /* tag */,
-                                const uint64_t t) {
-  return Vec128<uint64_t, N>{
-      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
-}
-template <size_t N, HWY_IF_LE128(int8_t, N)>
-HWY_API Vec128<int8_t, N> Set(Simd<int8_t, N, 0> /* tag */, const int8_t t) {
-  return Vec128<int8_t, N>{_mm_set1_epi8(static_cast<char>(t))};  // NOLINT
-}
-template <size_t N, HWY_IF_LE128(int16_t, N)>
-HWY_API Vec128<int16_t, N> Set(Simd<int16_t, N, 0> /* tag */, const int16_t t) {
-  return Vec128<int16_t, N>{_mm_set1_epi16(static_cast<short>(t))};  // NOLINT
-}
-template <size_t N, HWY_IF_LE128(int32_t, N)>
-HWY_API Vec128<int32_t, N> Set(Simd<int32_t, N, 0> /* tag */, const int32_t t) {
-  return Vec128<int32_t, N>{_mm_set1_epi32(t)};
-}
-template <size_t N, HWY_IF_LE128(int64_t, N)>
-HWY_API Vec128<int64_t, N> Set(Simd<int64_t, N, 0> /* tag */, const int64_t t) {
-  return Vec128<int64_t, N>{
-      _mm_set1_epi64x(static_cast<long long>(t))};  // NOLINT
-}
-template <size_t N, HWY_IF_LE128(float, N)>
-HWY_API Vec128<float, N> Set(Simd<float, N, 0> /* tag */, const float t) {
-  return Vec128<float, N>{_mm_set1_ps(t)};
-}
-template <size_t N, HWY_IF_LE128(double, N)>
-HWY_API Vec128<double, N> Set(Simd<double, N, 0> /* tag */, const double t) {
-  return Vec128<double, N>{_mm_set1_pd(t)};
-}
-
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
-
-// Returns a vector with uninitialized elements.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> Undefined(Simd<T, N, 0> /* tag */) {
-  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
-  // generate an XOR instruction.
-  return Vec128<T, N>{_mm_undefined_si128()};
-}
-template <size_t N, HWY_IF_LE128(float, N)>
-HWY_API Vec128<float, N> Undefined(Simd<float, N, 0> /* tag */) {
-  return Vec128<float, N>{_mm_undefined_ps()};
-}
-template <size_t N, HWY_IF_LE128(double, N)>
-HWY_API Vec128<double, N> Undefined(Simd<double, N, 0> /* tag */) {
-  return Vec128<double, N>{_mm_undefined_pd()};
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// ------------------------------ GetLane
-
-// Gets the single value stored in a vector/part.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API T GetLane(const Vec128<T, N> v) {
-  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFF);
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API T GetLane(const Vec128<T, N> v) {
-  return static_cast<T>(_mm_cvtsi128_si32(v.raw) & 0xFFFF);
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API T GetLane(const Vec128<T, N> v) {
-  return static_cast<T>(_mm_cvtsi128_si32(v.raw));
-}
-template <size_t N>
-HWY_API float GetLane(const Vec128<float, N> v) {
-  return _mm_cvtss_f32(v.raw);
-}
-template <size_t N>
-HWY_API uint64_t GetLane(const Vec128<uint64_t, N> v) {
-#if HWY_ARCH_X86_32
-  alignas(16) uint64_t lanes[2];
-  Store(v, Simd<uint64_t, N, 0>(), lanes);
-  return lanes[0];
-#else
-  return static_cast<uint64_t>(_mm_cvtsi128_si64(v.raw));
-#endif
-}
-template <size_t N>
-HWY_API int64_t GetLane(const Vec128<int64_t, N> v) {
-#if HWY_ARCH_X86_32
-  alignas(16) int64_t lanes[2];
-  Store(v, Simd<int64_t, N, 0>(), lanes);
-  return lanes[0];
-#else
-  return _mm_cvtsi128_si64(v.raw);
-#endif
-}
-template <size_t N>
-HWY_API double GetLane(const Vec128<double, N> v) {
-  return _mm_cvtsd_f64(v.raw);
-}
-
-// ================================================== LOGICAL
-
-// ------------------------------ And
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> And(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{_mm_and_si128(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<float, N> And(const Vec128<float, N> a,
-                             const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_and_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> And(const Vec128<double, N> a,
-                              const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_and_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ AndNot
-
-// Returns ~not_mask & mask.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> AndNot(Vec128<T, N> not_mask, Vec128<T, N> mask) {
-  return Vec128<T, N>{_mm_andnot_si128(not_mask.raw, mask.raw)};
-}
-template <size_t N>
-HWY_API Vec128<float, N> AndNot(const Vec128<float, N> not_mask,
-                                const Vec128<float, N> mask) {
-  return Vec128<float, N>{_mm_andnot_ps(not_mask.raw, mask.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> AndNot(const Vec128<double, N> not_mask,
-                                 const Vec128<double, N> mask) {
-  return Vec128<double, N>{_mm_andnot_pd(not_mask.raw, mask.raw)};
-}
-
-// ------------------------------ Or
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Or(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{_mm_or_si128(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> Or(const Vec128<float, N> a,
-                            const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_or_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Or(const Vec128<double, N> a,
-                             const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_or_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Xor
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Xor(Vec128<T, N> a, Vec128<T, N> b) {
-  return Vec128<T, N>{_mm_xor_si128(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> Xor(const Vec128<float, N> a,
-                             const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_xor_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Xor(const Vec128<double, N> a,
-                              const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_xor_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Not
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-#if HWY_TARGET <= HWY_AVX3
-  const __m128i vu = BitCast(du, v).raw;
-  return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)});
-#else
-  return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)}));
-#endif
-}
-
-// ------------------------------ Or3
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
-#if HWY_TARGET <= HWY_AVX3
-  const DFromV<decltype(o1)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  const __m128i ret = _mm_ternarylogic_epi64(
-      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
-  return BitCast(d, VU{ret});
-#else
-  return Or(o1, Or(o2, o3));
-#endif
-}
-
-// ------------------------------ OrAnd
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
-#if HWY_TARGET <= HWY_AVX3
-  const DFromV<decltype(o)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  const __m128i ret = _mm_ternarylogic_epi64(
-      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
-  return BitCast(d, VU{ret});
-#else
-  return Or(o, And(a1, a2));
-#endif
-}
-
-// ------------------------------ IfVecThenElse
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-#if HWY_TARGET <= HWY_AVX3
-  const DFromV<decltype(no)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  return BitCast(
-      d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw,
-                                   BitCast(du, no).raw, 0xCA)});
-#else
-  return IfThenElse(MaskFromVec(mask), yes, no);
-#endif
-}
-
-// ------------------------------ Operator overloads (internal-only if float)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return And(a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Or(a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ PopulationCount
-
-// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
-#if HWY_TARGET == HWY_AVX3_DL
-
-#ifdef HWY_NATIVE_POPCNT
-#undef HWY_NATIVE_POPCNT
-#else
-#define HWY_NATIVE_POPCNT
-#endif
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
-                                        Vec128<T, N> v) {
-  return Vec128<T, N>{_mm_popcnt_epi8(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
-                                        Vec128<T, N> v) {
-  return Vec128<T, N>{_mm_popcnt_epi16(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
-                                        Vec128<T, N> v) {
-  return Vec128<T, N>{_mm_popcnt_epi32(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
-                                        Vec128<T, N> v) {
-  return Vec128<T, N>{_mm_popcnt_epi64(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
-  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-// ================================================== SIGN
-
-// ------------------------------ Neg
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Neg(hwy::FloatTag /*tag*/, const Vec128<T, N> v) {
-  return Xor(v, SignBit(DFromV<decltype(v)>()));
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Neg(hwy::NonFloatTag /*tag*/, const Vec128<T, N> v) {
-  return Zero(DFromV<decltype(v)>()) - v;
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> Neg(const Vec128<T, N> v) {
-  return detail::Neg(hwy::IsFloatTag<T>(), v);
-}
-
-// ------------------------------ Abs
-
-// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
-template <size_t N>
-HWY_API Vec128<int8_t, N> Abs(const Vec128<int8_t, N> v) {
-#if HWY_COMPILER_MSVC
-  // Workaround for incorrect codegen? (reaches breakpoint)
-  const auto zero = Zero(DFromV<decltype(v)>());
-  return Vec128<int8_t, N>{_mm_max_epi8(v.raw, (zero - v).raw)};
-#else
-  return Vec128<int8_t, N>{_mm_abs_epi8(v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> Abs(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{_mm_abs_epi16(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Abs(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{_mm_abs_epi32(v.raw)};
-}
-// i64 is implemented after BroadcastSignBit.
-template <size_t N>
-HWY_API Vec128<float, N> Abs(const Vec128<float, N> v) {
-  const Vec128<int32_t, N> mask{_mm_set1_epi32(0x7FFFFFFF)};
-  return v & BitCast(DFromV<decltype(v)>(), mask);
-}
-template <size_t N>
-HWY_API Vec128<double, N> Abs(const Vec128<double, N> v) {
-  const Vec128<int64_t, N> mask{_mm_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
-  return v & BitCast(DFromV<decltype(v)>(), mask);
-}
-
-// ------------------------------ CopySign
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySign(const Vec128<T, N> magn,
-                              const Vec128<T, N> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-
-  const DFromV<decltype(magn)> d;
-  const auto msb = SignBit(d);
-
-#if HWY_TARGET <= HWY_AVX3
-  const RebindToUnsigned<decltype(d)> du;
-  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
-  //                  0    0     0   |  0
-  //                  0    0     1   |  0
-  //                  0    1     0   |  1
-  //                  0    1     1   |  1
-  //                  1    0     0   |  0
-  //                  1    0     1   |  1
-  //                  1    1     0   |  0
-  //                  1    1     1   |  1
-  // The lane size does not matter because we are not using predication.
-  const __m128i out = _mm_ternarylogic_epi32(
-      BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
-  return BitCast(d, VFromD<decltype(du)>{out});
-#else
-  return Or(AndNot(msb, magn), And(msb, sign));
-#endif
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CopySignToAbs(const Vec128<T, N> abs,
-                                   const Vec128<T, N> sign) {
-#if HWY_TARGET <= HWY_AVX3
-  // AVX3 can also handle abs < 0, so no extra action needed.
-  return CopySign(abs, sign);
-#else
-  return Or(abs, And(SignBit(DFromV<decltype(abs)>()), sign));
-#endif
-}
-
-// ================================================== MASK
-
-#if HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ IfThenElse
-
-// Returns mask ? b : a.
-
-namespace detail {
-
-// Templates for signed/unsigned integer of a particular size.
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<1> /* tag */,
-                                   Mask128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<2> /* tag */,
-                                   Mask128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<4> /* tag */,
-                                   Mask128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElse(hwy::SizeTag<8> /* tag */,
-                                   Mask128<T, N> mask, Vec128<T, N> yes,
-                                   Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
-                                Vec128<T, N> no) {
-  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> IfThenElse(Mask128<float, N> mask,
-                                    Vec128<float, N> yes, Vec128<float, N> no) {
-  return Vec128<float, N>{_mm_mask_mov_ps(no.raw, mask.raw, yes.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> IfThenElse(Mask128<double, N> mask,
-                                     Vec128<double, N> yes,
-                                     Vec128<double, N> no) {
-  return Vec128<double, N>{_mm_mask_mov_pd(no.raw, mask.raw, yes.raw)};
-}
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<1> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> yes) {
-  return Vec128<T, N>{_mm_maskz_mov_epi8(mask.raw, yes.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<2> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> yes) {
-  return Vec128<T, N>{_mm_maskz_mov_epi16(mask.raw, yes.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<4> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> yes) {
-  return Vec128<T, N>{_mm_maskz_mov_epi32(mask.raw, yes.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenElseZero(hwy::SizeTag<8> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> yes) {
-  return Vec128<T, N>{_mm_maskz_mov_epi64(mask.raw, yes.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
-  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> IfThenElseZero(Mask128<float, N> mask,
-                                        Vec128<float, N> yes) {
-  return Vec128<float, N>{_mm_maskz_mov_ps(mask.raw, yes.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> IfThenElseZero(Mask128<double, N> mask,
-                                         Vec128<double, N> yes) {
-  return Vec128<double, N>{_mm_maskz_mov_pd(mask.raw, yes.raw)};
-}
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> no) {
-  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
-  return Vec128<T, N>{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
-                                       Mask128<T, N> mask, Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
-  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> IfThenZeroElse(Mask128<float, N> mask,
-                                        Vec128<float, N> no) {
-  return Vec128<float, N>{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> IfThenZeroElse(Mask128<double, N> mask,
-                                         Vec128<double, N> no) {
-  return Vec128<double, N>{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
-}
-
-// ------------------------------ Mask logical
-
-// For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently.
-#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS)
-#if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \
-    HWY_COMPILER_CLANG >= 800
-#define HWY_COMPILER_HAS_MASK_INTRINSICS 1
-#else
-#define HWY_COMPILER_HAS_MASK_INTRINSICS 0
-#endif
-#endif  // HWY_COMPILER_HAS_MASK_INTRINSICS
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> And(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kand_mask16(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask16>(a.raw & b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> And(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> And(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> And(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kand_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw & b.raw)};
-#endif
-}
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
-                                const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kandn_mask16(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask16>(~a.raw & b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
-                                const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
-                                const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> AndNot(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
-                                const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kandn_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(~a.raw & b.raw)};
-#endif
-}
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
-                            const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kor_mask16(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask16>(a.raw | b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
-                            const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
-                            const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Or(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
-                            const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kor_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw | b.raw)};
-#endif
-}
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<1> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kxor_mask16(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask16>(a.raw ^ b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<2> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<4> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
-#endif
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Xor(hwy::SizeTag<8> /*tag*/, const Mask128<T, N> a,
-                             const Mask128<T, N> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask128<T, N>{_kxor_mask8(a.raw, b.raw)};
-#else
-  return Mask128<T, N>{static_cast<__mmask8>(a.raw ^ b.raw)};
-#endif
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
-  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
-  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
-  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
-  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
-  // Flip only the valid bits.
-  // TODO(janwas): use _knot intrinsics if N >= 8.
-  return Xor(m, Mask128<T, N>::FromBits((1ull << N) - 1));
-}
-
-#else  // AVX2 or below
-
-// ------------------------------ Mask
-
-// Mask and Vec are the same (true = FF..FF).
-template <typename T, size_t N>
-HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
-  return Mask128<T, N>{v.raw};
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{v.raw};
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> VecFromMask(const Simd<T, N, 0> /* tag */,
-                                 const Mask128<T, N> v) {
-  return Vec128<T, N>{v.raw};
-}
-
-#if HWY_TARGET == HWY_SSSE3
-
-// mask ? yes : no
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
-                                Vec128<T, N> no) {
-  const auto vmask = VecFromMask(DFromV<decltype(no)>(), mask);
-  return Or(And(vmask, yes), AndNot(vmask, no));
-}
-
-#else  // HWY_TARGET == HWY_SSSE3
-
-// mask ? yes : no
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElse(Mask128<T, N> mask, Vec128<T, N> yes,
-                                Vec128<T, N> no) {
-  return Vec128<T, N>{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)};
-}
-template <size_t N>
-HWY_API Vec128<float, N> IfThenElse(const Mask128<float, N> mask,
-                                    const Vec128<float, N> yes,
-                                    const Vec128<float, N> no) {
-  return Vec128<float, N>{_mm_blendv_ps(no.raw, yes.raw, mask.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> IfThenElse(const Mask128<double, N> mask,
-                                     const Vec128<double, N> yes,
-                                     const Vec128<double, N> no) {
-  return Vec128<double, N>{_mm_blendv_pd(no.raw, yes.raw, mask.raw)};
-}
-
-#endif  // HWY_TARGET == HWY_SSSE3
-
-// mask ? yes : 0
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
-  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
-}
-
-// mask ? 0 : no
-template <typename T, size_t N>
-HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
-  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
-}
-
-// ------------------------------ Mask logical
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
-  return MaskFromVec(Not(VecFromMask(Simd<T, N, 0>(), m)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
-  const Simd<T, N, 0> d;
-  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ ShiftLeft
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint16_t, N> ShiftLeft(const Vec128<uint16_t, N> v) {
-  return Vec128<uint16_t, N>{_mm_slli_epi16(v.raw, kBits)};
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint32_t, N> ShiftLeft(const Vec128<uint32_t, N> v) {
-  return Vec128<uint32_t, N>{_mm_slli_epi32(v.raw, kBits)};
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint64_t, N> ShiftLeft(const Vec128<uint64_t, N> v) {
-  return Vec128<uint64_t, N>{_mm_slli_epi64(v.raw, kBits)};
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<int16_t, N> ShiftLeft(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{_mm_slli_epi16(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int32_t, N> ShiftLeft(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{_mm_slli_epi32(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int64_t, N> ShiftLeft(const Vec128<int64_t, N> v) {
-  return Vec128<int64_t, N>{_mm_slli_epi64(v.raw, kBits)};
-}
-
-template <int kBits, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> ShiftLeft(const Vec128<T, N> v) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<T, N> shifted{ShiftLeft<kBits>(Vec128<MakeWide<T>>{v.raw}).raw};
-  return kBits == 1
-             ? (v + v)
-             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
-}
-
-// ------------------------------ ShiftRight
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint16_t, N> ShiftRight(const Vec128<uint16_t, N> v) {
-  return Vec128<uint16_t, N>{_mm_srli_epi16(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<uint32_t, N> ShiftRight(const Vec128<uint32_t, N> v) {
-  return Vec128<uint32_t, N>{_mm_srli_epi32(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<uint64_t, N> ShiftRight(const Vec128<uint64_t, N> v) {
-  return Vec128<uint64_t, N>{_mm_srli_epi64(v.raw, kBits)};
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint8_t, N> ShiftRight(const Vec128<uint8_t, N> v) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<uint8_t, N> shifted{
-      ShiftRight<kBits>(Vec128<uint16_t>{v.raw}).raw};
-  return shifted & Set(d8, 0xFF >> kBits);
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<int16_t, N> ShiftRight(const Vec128<int16_t, N> v) {
-  return Vec128<int16_t, N>{_mm_srai_epi16(v.raw, kBits)};
-}
-template <int kBits, size_t N>
-HWY_API Vec128<int32_t, N> ShiftRight(const Vec128<int32_t, N> v) {
-  return Vec128<int32_t, N>{_mm_srai_epi32(v.raw, kBits)};
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<int8_t, N> ShiftRight(const Vec128<int8_t, N> v) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
-  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// i64 is implemented after BroadcastSignBit.
-
-// ================================================== SWIZZLE (1)
-
-// ------------------------------ TableLookupBytes
-template <typename T, size_t N, typename TI, size_t NI>
-HWY_API Vec128<TI, NI> TableLookupBytes(const Vec128<T, N> bytes,
-                                        const Vec128<TI, NI> from) {
-  return Vec128<TI, NI>{_mm_shuffle_epi8(bytes.raw, from.raw)};
-}
-
-// ------------------------------ TableLookupBytesOr0
-// For all vector widths; x86 anyway zeroes if >= 0x80.
-template <class V, class VI>
-HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) {
-  return TableLookupBytes(bytes, from);
-}
-
-// ------------------------------ Shuffles (ShiftRight, TableLookupBytes)
-
-// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
-// Shuffle0321 rotates one lane to the right (the previous least-significant
-// lane is now most-significant). These could also be implemented via
-// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
-
-// Swap 32-bit halves in 64-bit halves.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Shuffle2301(const Vec128<T, N> v) {
-  static_assert(sizeof(T) == 4, "Only for 32-bit lanes");
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, 0xB1)};
-}
-template <size_t N>
-HWY_API Vec128<float, N> Shuffle2301(const Vec128<float, N> v) {
-  static_assert(N == 2 || N == 4, "Does not make sense for N=1");
-  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0xB1)};
-}
-
-// These are used by generic_ops-inl to implement LoadInterleaved3. As with
-// Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output
-// comes from the first argument.
-namespace detail {
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const Twice<DFromV<decltype(a)>> d2;
-  const auto ba = Combine(d2, b, a);
-  alignas(16) const T kShuffle[8] = {1, 0, 7, 6};
-  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const Twice<DFromV<decltype(a)>> d2;
-  const auto ba = Combine(d2, b, a);
-  alignas(16) const T kShuffle[8] = {0x0302, 0x0100, 0x0f0e, 0x0d0c};
-  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, 4> Shuffle2301(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToFloat<decltype(d)> df;
-  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
-  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
-                                                    BitCast(df, b).raw, m)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const Twice<DFromV<decltype(a)>> d2;
-  const auto ba = Combine(d2, b, a);
-  alignas(16) const T kShuffle[8] = {0, 3, 6, 5};
-  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const Twice<DFromV<decltype(a)>> d2;
-  const auto ba = Combine(d2, b, a);
-  alignas(16) const T kShuffle[8] = {0x0100, 0x0706, 0x0d0c, 0x0b0a};
-  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, 4> Shuffle1230(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToFloat<decltype(d)> df;
-  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
-  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
-                                                    BitCast(df, b).raw, m)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const Twice<DFromV<decltype(a)>> d2;
-  const auto ba = Combine(d2, b, a);
-  alignas(16) const T kShuffle[8] = {2, 1, 4, 7};
-  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const Twice<DFromV<decltype(a)>> d2;
-  const auto ba = Combine(d2, b, a);
-  alignas(16) const T kShuffle[8] = {0x0504, 0x0302, 0x0908, 0x0f0e};
-  return Vec128<T, 4>{TableLookupBytes(ba, Load(d2, kShuffle)).raw};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, 4> Shuffle3012(const Vec128<T, 4> a, const Vec128<T, 4> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToFloat<decltype(d)> df;
-  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
-  return BitCast(d, Vec128<float, 4>{_mm_shuffle_ps(BitCast(df, a).raw,
-                                                    BitCast(df, b).raw, m)});
-}
-
-}  // namespace detail
-
-// Swap 64-bit halves
-HWY_API Vec128<uint32_t> Shuffle1032(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec128<int32_t> Shuffle1032(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec128<float> Shuffle1032(const Vec128<float> v) {
-  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x4E)};
-}
-HWY_API Vec128<uint64_t> Shuffle01(const Vec128<uint64_t> v) {
-  return Vec128<uint64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec128<int64_t> Shuffle01(const Vec128<int64_t> v) {
-  return Vec128<int64_t>{_mm_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec128<double> Shuffle01(const Vec128<double> v) {
-  return Vec128<double>{_mm_shuffle_pd(v.raw, v.raw, 1)};
-}
-
-// Rotate right 32 bits
-HWY_API Vec128<uint32_t> Shuffle0321(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
-}
-HWY_API Vec128<int32_t> Shuffle0321(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x39)};
-}
-HWY_API Vec128<float> Shuffle0321(const Vec128<float> v) {
-  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x39)};
-}
-// Rotate left 32 bits
-HWY_API Vec128<uint32_t> Shuffle2103(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
-}
-HWY_API Vec128<int32_t> Shuffle2103(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x93)};
-}
-HWY_API Vec128<float> Shuffle2103(const Vec128<float> v) {
-  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x93)};
-}
-
-// Reverse
-HWY_API Vec128<uint32_t> Shuffle0123(const Vec128<uint32_t> v) {
-  return Vec128<uint32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
-}
-HWY_API Vec128<int32_t> Shuffle0123(const Vec128<int32_t> v) {
-  return Vec128<int32_t>{_mm_shuffle_epi32(v.raw, 0x1B)};
-}
-HWY_API Vec128<float> Shuffle0123(const Vec128<float> v) {
-  return Vec128<float>{_mm_shuffle_ps(v.raw, v.raw, 0x1B)};
-}
-
-// ================================================== COMPARE
-
-#if HWY_TARGET <= HWY_AVX3
-
-// Comparisons set a mask bit to 1 if the condition is true, else 0.
-
-template <typename TFrom, size_t NFrom, typename TTo, size_t NTo>
-HWY_API Mask128<TTo, NTo> RebindMask(Simd<TTo, NTo, 0> /*tag*/,
-                                     Mask128<TFrom, NFrom> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask128<TTo, NTo>{m.raw};
-}
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<1> /*tag*/, const Vec128<T, N> v,
-                                 const Vec128<T, N> bit) {
-  return Mask128<T, N>{_mm_test_epi8_mask(v.raw, bit.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<2> /*tag*/, const Vec128<T, N> v,
-                                 const Vec128<T, N> bit) {
-  return Mask128<T, N>{_mm_test_epi16_mask(v.raw, bit.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<4> /*tag*/, const Vec128<T, N> v,
-                                 const Vec128<T, N> bit) {
-  return Mask128<T, N>{_mm_test_epi32_mask(v.raw, bit.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> TestBit(hwy::SizeTag<8> /*tag*/, const Vec128<T, N> v,
-                                 const Vec128<T, N> bit) {
-  return Mask128<T, N>{_mm_test_epi64_mask(v.raw, bit.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> TestBit(const Vec128<T, N> v, const Vec128<T, N> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
-}
-
-// ------------------------------ Equality
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpeq_epi8_mask(a.raw, b.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpeq_epi16_mask(a.raw, b.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpeq_epi32_mask(a.raw, b.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Mask128<T, N> operator==(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpeq_epi64_mask(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Mask128<float, N> operator==(Vec128<float, N> a, Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-template <size_t N>
-HWY_API Mask128<double, N> operator==(Vec128<double, N> a,
-                                      Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-// ------------------------------ Inequality
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpneq_epi8_mask(a.raw, b.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpneq_epi16_mask(a.raw, b.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpneq_epi32_mask(a.raw, b.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Mask128<T, N> operator!=(const Vec128<T, N> a, const Vec128<T, N> b) {
-  return Mask128<T, N>{_mm_cmpneq_epi64_mask(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Mask128<float, N> operator!=(Vec128<float, N> a, Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-
-template <size_t N>
-HWY_API Mask128<double, N> operator!=(Vec128<double, N> a,
-                                      Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-
-// ------------------------------ Strict inequality
-
-// Signed/float <
-template <size_t N>
-HWY_API Mask128<int8_t, N> operator>(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
-  return Mask128<int8_t, N>{_mm_cmpgt_epi8_mask(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int16_t, N> operator>(Vec128<int16_t, N> a,
-                                      Vec128<int16_t, N> b) {
-  return Mask128<int16_t, N>{_mm_cmpgt_epi16_mask(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int32_t, N> operator>(Vec128<int32_t, N> a,
-                                      Vec128<int32_t, N> b) {
-  return Mask128<int32_t, N>{_mm_cmpgt_epi32_mask(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator>(Vec128<int64_t, N> a,
-                                      Vec128<int64_t, N> b) {
-  return Mask128<int64_t, N>{_mm_cmpgt_epi64_mask(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Mask128<uint8_t, N> operator>(Vec128<uint8_t, N> a,
-                                      Vec128<uint8_t, N> b) {
-  return Mask128<uint8_t, N>{_mm_cmpgt_epu8_mask(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint16_t, N> operator>(Vec128<uint16_t, N> a,
-                                       Vec128<uint16_t, N> b) {
-  return Mask128<uint16_t, N>{_mm_cmpgt_epu16_mask(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint32_t, N> operator>(Vec128<uint32_t, N> a,
-                                       Vec128<uint32_t, N> b) {
-  return Mask128<uint32_t, N>{_mm_cmpgt_epu32_mask(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator>(Vec128<uint64_t, N> a,
-                                       Vec128<uint64_t, N> b) {
-  return Mask128<uint64_t, N>{_mm_cmpgt_epu64_mask(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Mask128<float, N> operator>(Vec128<float, N> a, Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
-}
-template <size_t N>
-HWY_API Mask128<double, N> operator>(Vec128<double, N> a, Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
-}
-
-// ------------------------------ Weak inequality
-
-template <size_t N>
-HWY_API Mask128<float, N> operator>=(Vec128<float, N> a, Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
-}
-template <size_t N>
-HWY_API Mask128<double, N> operator>=(Vec128<double, N> a,
-                                      Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
-}
-
-// ------------------------------ Mask
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<1> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi8_mask(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<2> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi16_mask(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<4> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi32_mask(v.raw)};
-}
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> MaskFromVec(hwy::SizeTag<8> /*tag*/,
-                                     const Vec128<T, N> v) {
-  return Mask128<T, N>{_mm_movepi64_mask(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
-  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
-}
-// There do not seem to be native floating-point versions of these instructions.
-template <size_t N>
-HWY_API Mask128<float, N> MaskFromVec(const Vec128<float, N> v) {
-  const RebindToSigned<DFromV<decltype(v)>> di;
-  return Mask128<float, N>{MaskFromVec(BitCast(di, v)).raw};
-}
-template <size_t N>
-HWY_API Mask128<double, N> MaskFromVec(const Vec128<double, N> v) {
-  const RebindToSigned<DFromV<decltype(v)>> di;
-  return Mask128<double, N>{MaskFromVec(BitCast(di, v)).raw};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi8(v.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi16(v.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi32(v.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> VecFromMask(const Mask128<T, N> v) {
-  return Vec128<T, N>{_mm_movm_epi64(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> VecFromMask(const Mask128<float, N> v) {
-  return Vec128<float, N>{_mm_castsi128_ps(_mm_movm_epi32(v.raw))};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> VecFromMask(const Mask128<double, N> v) {
-  return Vec128<double, N>{_mm_castsi128_pd(_mm_movm_epi64(v.raw))};
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> VecFromMask(Simd<T, N, 0> /* tag */,
-                                 const Mask128<T, N> v) {
-  return VecFromMask(v);
-}
-
-#else  // AVX2 or below
-
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
-
-template <typename TFrom, typename TTo, size_t N>
-HWY_API Mask128<TTo, N> RebindMask(Simd<TTo, N, 0> /*tag*/,
-                                   Mask128<TFrom, N> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  const Simd<TFrom, N, 0> d;
-  return MaskFromVec(BitCast(Simd<TTo, N, 0>(), VecFromMask(d, m)));
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> TestBit(Vec128<T, N> v, Vec128<T, N> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return (v & bit) == bit;
-}
-
-// ------------------------------ Equality
-
-// Unsigned
-template <size_t N>
-HWY_API Mask128<uint8_t, N> operator==(const Vec128<uint8_t, N> a,
-                                       const Vec128<uint8_t, N> b) {
-  return Mask128<uint8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint16_t, N> operator==(const Vec128<uint16_t, N> a,
-                                        const Vec128<uint16_t, N> b) {
-  return Mask128<uint16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint32_t, N> operator==(const Vec128<uint32_t, N> a,
-                                        const Vec128<uint32_t, N> b) {
-  return Mask128<uint32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
-                                        const Vec128<uint64_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  const Simd<uint32_t, N * 2, 0> d32;
-  const Simd<uint64_t, N, 0> d64;
-  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
-  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
-  return MaskFromVec(BitCast(d64, cmp64));
-#else
-  return Mask128<uint64_t, N>{_mm_cmpeq_epi64(a.raw, b.raw)};
-#endif
-}
-
-// Signed
-template <size_t N>
-HWY_API Mask128<int8_t, N> operator==(const Vec128<int8_t, N> a,
-                                      const Vec128<int8_t, N> b) {
-  return Mask128<int8_t, N>{_mm_cmpeq_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int16_t, N> operator==(Vec128<int16_t, N> a,
-                                       Vec128<int16_t, N> b) {
-  return Mask128<int16_t, N>{_mm_cmpeq_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int32_t, N> operator==(const Vec128<int32_t, N> a,
-                                       const Vec128<int32_t, N> b) {
-  return Mask128<int32_t, N>{_mm_cmpeq_epi32(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
-                                       const Vec128<int64_t, N> b) {
-  // Same as signed ==; avoid duplicating the SSSE3 version.
-  const DFromV<decltype(a)> d;
-  RebindToUnsigned<decltype(d)> du;
-  return RebindMask(d, BitCast(du, a) == BitCast(du, b));
-}
-
-// Float
-template <size_t N>
-HWY_API Mask128<float, N> operator==(const Vec128<float, N> a,
-                                     const Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmpeq_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<double, N> operator==(const Vec128<double, N> a,
-                                      const Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmpeq_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Inequality
-
-// This cannot have T as a template argument, otherwise it is not more
-// specialized than rewritten operator== in C++20, leading to compile
-// errors: https://gcc.godbolt.org/z/xsrPhPvPT.
-template <size_t N>
-HWY_API Mask128<uint8_t, N> operator!=(Vec128<uint8_t, N> a,
-                                       Vec128<uint8_t, N> b) {
-  return Not(a == b);
-}
-template <size_t N>
-HWY_API Mask128<uint16_t, N> operator!=(Vec128<uint16_t, N> a,
-                                       Vec128<uint16_t, N> b) {
-  return Not(a == b);
-}
-template <size_t N>
-HWY_API Mask128<uint32_t, N> operator!=(Vec128<uint32_t, N> a,
-                                       Vec128<uint32_t, N> b) {
-  return Not(a == b);
-}
-template <size_t N>
-HWY_API Mask128<uint64_t, N> operator!=(Vec128<uint64_t, N> a,
-                                       Vec128<uint64_t, N> b) {
-  return Not(a == b);
-}
-template <size_t N>
-HWY_API Mask128<int8_t, N> operator!=(Vec128<int8_t, N> a,
-                                      Vec128<int8_t, N> b) {
-  return Not(a == b);
-}
-template <size_t N>
-HWY_API Mask128<int16_t, N> operator!=(Vec128<int16_t, N> a,
-                                       Vec128<int16_t, N> b) {
-  return Not(a == b);
-}
-template <size_t N>
-HWY_API Mask128<int32_t, N> operator!=(Vec128<int32_t, N> a,
-                                       Vec128<int32_t, N> b) {
-  return Not(a == b);
-}
-template <size_t N>
-HWY_API Mask128<int64_t, N> operator!=(Vec128<int64_t, N> a,
-                                       Vec128<int64_t, N> b) {
-  return Not(a == b);
-}
-
-template <size_t N>
-HWY_API Mask128<float, N> operator!=(const Vec128<float, N> a,
-                                     const Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmpneq_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<double, N> operator!=(const Vec128<double, N> a,
-                                      const Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmpneq_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Strict inequality
-
-namespace detail {
-
-template <size_t N>
-HWY_INLINE Mask128<int8_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int8_t, N> a,
-                                 Vec128<int8_t, N> b) {
-  return Mask128<int8_t, N>{_mm_cmpgt_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_INLINE Mask128<int16_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int16_t, N> a,
-                                  Vec128<int16_t, N> b) {
-  return Mask128<int16_t, N>{_mm_cmpgt_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_INLINE Mask128<int32_t, N> Gt(hwy::SignedTag /*tag*/, Vec128<int32_t, N> a,
-                                  Vec128<int32_t, N> b) {
-  return Mask128<int32_t, N>{_mm_cmpgt_epi32(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_INLINE Mask128<int64_t, N> Gt(hwy::SignedTag /*tag*/,
-                                  const Vec128<int64_t, N> a,
-                                  const Vec128<int64_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  // See https://stackoverflow.com/questions/65166174/:
-  const Simd<int64_t, N, 0> d;
-  const RepartitionToNarrow<decltype(d)> d32;
-  const Vec128<int64_t, N> m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw};
-  const Vec128<int64_t, N> m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw};
-  // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper:
-  // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0.
-  const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw;
-  // Duplicate upper to lower half.
-  return Mask128<int64_t, N>{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))};
-#else
-  return Mask128<int64_t, N>{_mm_cmpgt_epi64(a.raw, b.raw)};  // SSE4.2
-#endif
-}
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> Gt(hwy::UnsignedTag /*tag*/, Vec128<T, N> a,
-                            Vec128<T, N> b) {
-  const DFromV<decltype(a)> du;
-  const RebindToSigned<decltype(du)> di;
-  const Vec128<T, N> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
-  const auto sa = BitCast(di, Xor(a, msb));
-  const auto sb = BitCast(di, Xor(b, msb));
-  return RebindMask(du, Gt(hwy::SignedTag(), sa, sb));
-}
-
-template <size_t N>
-HWY_INLINE Mask128<float, N> Gt(hwy::FloatTag /*tag*/, Vec128<float, N> a,
-                                Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmpgt_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_INLINE Mask128<double, N> Gt(hwy::FloatTag /*tag*/, Vec128<double, N> a,
-                                 Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmpgt_pd(a.raw, b.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
-  return detail::Gt(hwy::TypeTag<T>(), a, b);
-}
-
-// ------------------------------ Weak inequality
-
-template <size_t N>
-HWY_API Mask128<float, N> operator>=(const Vec128<float, N> a,
-                                     const Vec128<float, N> b) {
-  return Mask128<float, N>{_mm_cmpge_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Mask128<double, N> operator>=(const Vec128<double, N> a,
-                                      const Vec128<double, N> b) {
-  return Mask128<double, N>{_mm_cmpge_pd(a.raw, b.raw)};
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ Reversed comparisons
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator<(Vec128<T, N> a, Vec128<T, N> b) {
-  return b > a;
-}
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> operator<=(Vec128<T, N> a, Vec128<T, N> b) {
-  return b >= a;
-}
-
-// ------------------------------ FirstN (Iota, Lt)
-
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Mask128<T, N> FirstN(const Simd<T, N, 0> d, size_t num) {
-#if HWY_TARGET <= HWY_AVX3
-  (void)d;
-  const uint64_t all = (1ull << N) - 1;
-  // BZHI only looks at the lower 8 bits of num!
-  const uint64_t bits = (num > 255) ? all : _bzhi_u64(all, num);
-  return Mask128<T, N>::FromBits(bits);
-#else
-  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
-  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(num)));
-#endif
-}
-
-template <class D>
-using MFromD = decltype(FirstN(D(), 0));
-
-// ================================================== MEMORY (1)
-
-// Clang static analysis claims the memory immediately after a partial vector
-// store is uninitialized, and also flags the input to partial loads (at least
-// for loadl_pd) as "garbage". This is a false alarm because msan does not
-// raise errors. We work around this by using CopyBytes instead of intrinsics,
-// but only for the analyzer to avoid potentially bad code generation.
-// Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7.
-#ifndef HWY_SAFE_PARTIAL_LOAD_STORE
-#if defined(__clang_analyzer__) || \
-    (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
-#define HWY_SAFE_PARTIAL_LOAD_STORE 1
-#else
-#define HWY_SAFE_PARTIAL_LOAD_STORE 0
-#endif
-#endif  // HWY_SAFE_PARTIAL_LOAD_STORE
-
-// ------------------------------ Load
-
-template <typename T>
-HWY_API Vec128<T> Load(Full128<T> /* tag */, const T* HWY_RESTRICT aligned) {
-  return Vec128<T>{_mm_load_si128(reinterpret_cast<const __m128i*>(aligned))};
-}
-HWY_API Vec128<float> Load(Full128<float> /* tag */,
-                           const float* HWY_RESTRICT aligned) {
-  return Vec128<float>{_mm_load_ps(aligned)};
-}
-HWY_API Vec128<double> Load(Full128<double> /* tag */,
-                            const double* HWY_RESTRICT aligned) {
-  return Vec128<double>{_mm_load_pd(aligned)};
-}
-
-template <typename T>
-HWY_API Vec128<T> LoadU(Full128<T> /* tag */, const T* HWY_RESTRICT p) {
-  return Vec128<T>{_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))};
-}
-HWY_API Vec128<float> LoadU(Full128<float> /* tag */,
-                            const float* HWY_RESTRICT p) {
-  return Vec128<float>{_mm_loadu_ps(p)};
-}
-HWY_API Vec128<double> LoadU(Full128<double> /* tag */,
-                             const double* HWY_RESTRICT p) {
-  return Vec128<double>{_mm_loadu_pd(p)};
-}
-
-template <typename T>
-HWY_API Vec64<T> Load(Full64<T> /* tag */, const T* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  __m128i v = _mm_setzero_si128();
-  CopyBytes<8>(p, &v);  // not same size
-  return Vec64<T>{v};
-#else
-  return Vec64<T>{_mm_loadl_epi64(reinterpret_cast<const __m128i*>(p))};
-#endif
-}
-
-HWY_API Vec128<float, 2> Load(Full64<float> /* tag */,
-                              const float* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  __m128 v = _mm_setzero_ps();
-  CopyBytes<8>(p, &v);  // not same size
-  return Vec128<float, 2>{v};
-#else
-  const __m128 hi = _mm_setzero_ps();
-  return Vec128<float, 2>{_mm_loadl_pi(hi, reinterpret_cast<const __m64*>(p))};
-#endif
-}
-
-HWY_API Vec64<double> Load(Full64<double> /* tag */,
-                           const double* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  __m128d v = _mm_setzero_pd();
-  CopyBytes<8>(p, &v);  // not same size
-  return Vec64<double>{v};
-#else
-  return Vec64<double>{_mm_load_sd(p)};
-#endif
-}
-
-HWY_API Vec128<float, 1> Load(Full32<float> /* tag */,
-                              const float* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  __m128 v = _mm_setzero_ps();
-  CopyBytes<4>(p, &v);  // not same size
-  return Vec128<float, 1>{v};
-#else
-  return Vec128<float, 1>{_mm_load_ss(p)};
-#endif
-}
-
-// Any <= 32 bit except <float, 1>
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API Vec128<T, N> Load(Simd<T, N, 0> /* tag */, const T* HWY_RESTRICT p) {
-  constexpr size_t kSize = sizeof(T) * N;
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  __m128 v = _mm_setzero_ps();
-  CopyBytes<kSize>(p, &v);  // not same size
-  return Vec128<T, N>{v};
-#else
-  int32_t bits = 0;
-  CopyBytes<kSize>(p, &bits);  // not same size
-  return Vec128<T, N>{_mm_cvtsi32_si128(bits)};
-#endif
-}
-
-// For < 128 bit, LoadU == Load.
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> LoadU(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
-  return Load(d, p);
-}
-
-// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> LoadDup128(Simd<T, N, 0> d, const T* HWY_RESTRICT p) {
-  return LoadU(d, p);
-}
-
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, size_t N, typename T2, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> Iota(const Simd<T, N, 0> d, const T2 first) {
-  HWY_ALIGN T lanes[16 / sizeof(T)];
-  for (size_t i = 0; i < 16 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
-  }
-  return Load(d, lanes);
-}
-
-// ------------------------------ MaskedLoad
-
-#if HWY_TARGET <= HWY_AVX3
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
-                                const T* HWY_RESTRICT p) {
-  return Vec128<T, N>{_mm_maskz_loadu_epi8(m.raw, p)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
-                                const T* HWY_RESTRICT p) {
-  return Vec128<T, N>{_mm_maskz_loadu_epi16(m.raw, p)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
-                                const T* HWY_RESTRICT p) {
-  return Vec128<T, N>{_mm_maskz_loadu_epi32(m.raw, p)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
-                                const T* HWY_RESTRICT p) {
-  return Vec128<T, N>{_mm_maskz_loadu_epi64(m.raw, p)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m,
-                                    Simd<float, N, 0> /* tag */,
-                                    const float* HWY_RESTRICT p) {
-  return Vec128<float, N>{_mm_maskz_loadu_ps(m.raw, p)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m,
-                                     Simd<double, N, 0> /* tag */,
-                                     const double* HWY_RESTRICT p) {
-  return Vec128<double, N>{_mm_maskz_loadu_pd(m.raw, p)};
-}
-
-#elif HWY_TARGET == HWY_AVX2
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
-                                const T* HWY_RESTRICT p) {
-  auto p_p = reinterpret_cast<const int*>(p);  // NOLINT
-  return Vec128<T, N>{_mm_maskload_epi32(p_p, m.raw)};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> /* tag */,
-                                const T* HWY_RESTRICT p) {
-  auto p_p = reinterpret_cast<const long long*>(p);  // NOLINT
-  return Vec128<T, N>{_mm_maskload_epi64(p_p, m.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> MaskedLoad(Mask128<float, N> m, Simd<float, N, 0> d,
-                                    const float* HWY_RESTRICT p) {
-  const Vec128<int32_t, N> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  return Vec128<float, N>{_mm_maskload_ps(p, mi.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> MaskedLoad(Mask128<double, N> m, Simd<double, N, 0> d,
-                                     const double* HWY_RESTRICT p) {
-  const Vec128<int64_t, N> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  return Vec128<double, N>{_mm_maskload_pd(p, mi.raw)};
-}
-
-// There is no maskload_epi8/16, so blend instead.
-template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
-                                const T* HWY_RESTRICT p) {
-  return IfThenElseZero(m, Load(d, p));
-}
-
-#else  // <= SSE4
-
-// Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaskedLoad(Mask128<T, N> m, Simd<T, N, 0> d,
-                                const T* HWY_RESTRICT p) {
-  return IfThenElseZero(m, Load(d, p));
-}
-
-#endif
-
-// ------------------------------ Store
-
-template <typename T>
-HWY_API void Store(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT aligned) {
-  _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
-}
-HWY_API void Store(const Vec128<float> v, Full128<float> /* tag */,
-                   float* HWY_RESTRICT aligned) {
-  _mm_store_ps(aligned, v.raw);
-}
-HWY_API void Store(const Vec128<double> v, Full128<double> /* tag */,
-                   double* HWY_RESTRICT aligned) {
-  _mm_store_pd(aligned, v.raw);
-}
-
-template <typename T>
-HWY_API void StoreU(Vec128<T> v, Full128<T> /* tag */, T* HWY_RESTRICT p) {
-  _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw);
-}
-HWY_API void StoreU(const Vec128<float> v, Full128<float> /* tag */,
-                    float* HWY_RESTRICT p) {
-  _mm_storeu_ps(p, v.raw);
-}
-HWY_API void StoreU(const Vec128<double> v, Full128<double> /* tag */,
-                    double* HWY_RESTRICT p) {
-  _mm_storeu_pd(p, v.raw);
-}
-
-template <typename T>
-HWY_API void Store(Vec64<T> v, Full64<T> /* tag */, T* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  CopyBytes<8>(&v, p);  // not same size
-#else
-  _mm_storel_epi64(reinterpret_cast<__m128i*>(p), v.raw);
-#endif
-}
-HWY_API void Store(const Vec128<float, 2> v, Full64<float> /* tag */,
-                   float* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  CopyBytes<8>(&v, p);  // not same size
-#else
-  _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw);
-#endif
-}
-HWY_API void Store(const Vec64<double> v, Full64<double> /* tag */,
-                   double* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  CopyBytes<8>(&v, p);  // not same size
-#else
-  _mm_storel_pd(p, v.raw);
-#endif
-}
-
-// Any <= 32 bit except <float, 1>
-template <typename T, size_t N, HWY_IF_LE32(T, N)>
-HWY_API void Store(Vec128<T, N> v, Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  CopyBytes<sizeof(T) * N>(&v, p);  // not same size
-}
-HWY_API void Store(const Vec128<float, 1> v, Full32<float> /* tag */,
-                   float* HWY_RESTRICT p) {
-#if HWY_SAFE_PARTIAL_LOAD_STORE
-  CopyBytes<4>(&v, p);  // not same size
-#else
-  _mm_store_ss(p, v.raw);
-#endif
-}
-
-// For < 128 bit, StoreU == Store.
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API void StoreU(const Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT p) {
-  Store(v, d, p);
-}
-
-// ------------------------------ BlendedStore
-
-namespace detail {
-
-// There is no maskload_epi8/16 with which we could safely implement
-// BlendedStore. Manual blending is also unsafe because loading a full vector
-// that crosses the array end causes asan faults. Resort to scalar code; the
-// caller should instead use memcpy, assuming m is FirstN(d, n).
-template <typename T, size_t N>
-HWY_API void ScalarMaskedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
-                               T* HWY_RESTRICT p) {
-  const RebindToSigned<decltype(d)> di;  // for testing mask if T=bfloat16_t.
-  using TI = TFromD<decltype(di)>;
-  alignas(16) TI buf[N];
-  alignas(16) TI mask[N];
-  Store(BitCast(di, v), di, buf);
-  Store(BitCast(di, VecFromMask(d, m)), di, mask);
-  for (size_t i = 0; i < N; ++i) {
-    if (mask[i]) {
-      CopySameSize(buf + i, p + i);
-    }
-  }
-}
-}  // namespace detail
-
-#if HWY_TARGET <= HWY_AVX3
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  _mm_mask_storeu_epi8(p, m.raw, v.raw);
-}
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  _mm_mask_storeu_epi16(p, m.raw, v.raw);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  auto pi = reinterpret_cast<int*>(p);  // NOLINT
-  _mm_mask_storeu_epi32(pi, m.raw, v.raw);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  auto pi = reinterpret_cast<long long*>(p);  // NOLINT
-  _mm_mask_storeu_epi64(pi, m.raw, v.raw);
-}
-
-template <size_t N>
-HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
-                          Simd<float, N, 0>, float* HWY_RESTRICT p) {
-  _mm_mask_storeu_ps(p, m.raw, v.raw);
-}
-
-template <size_t N>
-HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
-                          Simd<double, N, 0>, double* HWY_RESTRICT p) {
-  _mm_mask_storeu_pd(p, m.raw, v.raw);
-}
-
-#elif HWY_TARGET == HWY_AVX2
-
-template <typename T, size_t N, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
-                          T* HWY_RESTRICT p) {
-  detail::ScalarMaskedStore(v, m, d, p);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  // For partial vectors, avoid writing other lanes by zeroing their mask.
-  if (N < 4) {
-    const Full128<T> df;
-    const Mask128<T> mf{m.raw};
-    m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
-  }
-
-  auto pi = reinterpret_cast<int*>(p);  // NOLINT
-  _mm_maskstore_epi32(pi, m.raw, v.raw);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                          Simd<T, N, 0> /* tag */, T* HWY_RESTRICT p) {
-  // For partial vectors, avoid writing other lanes by zeroing their mask.
-  if (N < 2) {
-    const Full128<T> df;
-    const Mask128<T> mf{m.raw};
-    m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
-  }
-
-  auto pi = reinterpret_cast<long long*>(p);  // NOLINT
-  _mm_maskstore_epi64(pi, m.raw, v.raw);
-}
-
-template <size_t N>
-HWY_API void BlendedStore(Vec128<float, N> v, Mask128<float, N> m,
-                          Simd<float, N, 0> d, float* HWY_RESTRICT p) {
-  using T = float;
-  // For partial vectors, avoid writing other lanes by zeroing their mask.
-  if (N < 4) {
-    const Full128<T> df;
-    const Mask128<T> mf{m.raw};
-    m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
-  }
-
-  const Vec128<MakeSigned<T>, N> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  _mm_maskstore_ps(p, mi.raw, v.raw);
-}
-
-template <size_t N>
-HWY_API void BlendedStore(Vec128<double, N> v, Mask128<double, N> m,
-                          Simd<double, N, 0> d, double* HWY_RESTRICT p) {
-  using T = double;
-  // For partial vectors, avoid writing other lanes by zeroing their mask.
-  if (N < 2) {
-    const Full128<T> df;
-    const Mask128<T> mf{m.raw};
-    m = Mask128<T, N>{And(mf, FirstN(df, N)).raw};
-  }
-
-  const Vec128<MakeSigned<T>, N> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  _mm_maskstore_pd(p, mi.raw, v.raw);
-}
-
-#else  // <= SSE4
-
-template <typename T, size_t N>
-HWY_API void BlendedStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
-                          T* HWY_RESTRICT p) {
-  // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow).
-  detail::ScalarMaskedStore(v, m, d, p);
-}
-
-#endif  // SSE4
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Addition
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> operator+(const Vec128<uint8_t, N> a,
-                                     const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_add_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> operator+(const Vec128<uint16_t, N> a,
-                                      const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_add_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> operator+(const Vec128<uint32_t, N> a,
-                                      const Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{_mm_add_epi32(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> operator+(const Vec128<uint64_t, N> a,
-                                      const Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{_mm_add_epi64(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> operator+(const Vec128<int8_t, N> a,
-                                    const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{_mm_add_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> operator+(const Vec128<int16_t, N> a,
-                                     const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_add_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> operator+(const Vec128<int32_t, N> a,
-                                     const Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{_mm_add_epi32(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> operator+(const Vec128<int64_t, N> a,
-                                     const Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{_mm_add_epi64(a.raw, b.raw)};
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> operator+(const Vec128<float, N> a,
-                                   const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_add_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> operator+(const Vec128<double, N> a,
-                                    const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_add_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Subtraction
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> operator-(const Vec128<uint8_t, N> a,
-                                     const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> operator-(Vec128<uint16_t, N> a,
-                                      Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> operator-(const Vec128<uint32_t, N> a,
-                                      const Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> operator-(const Vec128<uint64_t, N> a,
-                                      const Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> operator-(const Vec128<int8_t, N> a,
-                                    const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{_mm_sub_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> operator-(const Vec128<int16_t, N> a,
-                                     const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_sub_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> operator-(const Vec128<int32_t, N> a,
-                                     const Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{_mm_sub_epi32(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> operator-(const Vec128<int64_t, N> a,
-                                     const Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{_mm_sub_epi64(a.raw, b.raw)};
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> operator-(const Vec128<float, N> a,
-                                   const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_sub_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> operator-(const Vec128<double, N> a,
-                                    const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_sub_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ SumsOf8
-template <size_t N>
-HWY_API Vec128<uint64_t, N / 8> SumsOf8(const Vec128<uint8_t, N> v) {
-  return Vec128<uint64_t, N / 8>{_mm_sad_epu8(v.raw, _mm_setzero_si128())};
-}
-
-// ------------------------------ SaturatedAdd
-
-// Returns a + b clamped to the destination range.
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> SaturatedAdd(const Vec128<uint8_t, N> a,
-                                        const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_adds_epu8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> SaturatedAdd(const Vec128<uint16_t, N> a,
-                                         const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_adds_epu16(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> SaturatedAdd(const Vec128<int8_t, N> a,
-                                       const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{_mm_adds_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> SaturatedAdd(const Vec128<int16_t, N> a,
-                                        const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_adds_epi16(a.raw, b.raw)};
-}
-
-// ------------------------------ SaturatedSub
-
-// Returns a - b clamped to the destination range.
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> SaturatedSub(const Vec128<uint8_t, N> a,
-                                        const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_subs_epu8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> SaturatedSub(const Vec128<uint16_t, N> a,
-                                         const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_subs_epu16(a.raw, b.raw)};
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> SaturatedSub(const Vec128<int8_t, N> a,
-                                       const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{_mm_subs_epi8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> SaturatedSub(const Vec128<int16_t, N> a,
-                                        const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_subs_epi16(a.raw, b.raw)};
-}
-
-// ------------------------------ AverageRound
-
-// Returns (a + b + 1) / 2
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> AverageRound(const Vec128<uint8_t, N> a,
-                                        const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_avg_epu8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> AverageRound(const Vec128<uint16_t, N> a,
-                                         const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_avg_epu16(a.raw, b.raw)};
-}
-
-// ------------------------------ Integer multiplication
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> operator*(const Vec128<uint16_t, N> a,
-                                      const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> operator*(const Vec128<int16_t, N> a,
-                                     const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_mullo_epi16(a.raw, b.raw)};
-}
-
-// Returns the upper 16 bits of a * b in each lane.
-template <size_t N>
-HWY_API Vec128<uint16_t, N> MulHigh(const Vec128<uint16_t, N> a,
-                                    const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_mulhi_epu16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulHigh(const Vec128<int16_t, N> a,
-                                   const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_mulhi_epi16(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> MulFixedPoint15(const Vec128<int16_t, N> a,
-                                           const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_mulhrs_epi16(a.raw, b.raw)};
-}
-
-// Multiplies even lanes (0, 2 ..) and places the double-wide result into
-// even and the upper half into its odd neighbor lane.
-template <size_t N>
-HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(const Vec128<uint32_t, N> a,
-                                              const Vec128<uint32_t, N> b) {
-  return Vec128<uint64_t, (N + 1) / 2>{_mm_mul_epu32(a.raw, b.raw)};
-}
-
-#if HWY_TARGET == HWY_SSSE3
-
-template <size_t N, HWY_IF_LE64(int32_t, N)>  // N=1 or 2
-HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
-                                             const Vec128<int32_t, N> b) {
-  return Set(Simd<int64_t, (N + 1) / 2, 0>(),
-             static_cast<int64_t>(GetLane(a)) * GetLane(b));
-}
-HWY_API Vec128<int64_t> MulEven(const Vec128<int32_t> a,
-                                const Vec128<int32_t> b) {
-  alignas(16) int32_t a_lanes[4];
-  alignas(16) int32_t b_lanes[4];
-  const Full128<int32_t> di32;
-  Store(a, di32, a_lanes);
-  Store(b, di32, b_lanes);
-  alignas(16) int64_t mul[2];
-  mul[0] = static_cast<int64_t>(a_lanes[0]) * b_lanes[0];
-  mul[1] = static_cast<int64_t>(a_lanes[2]) * b_lanes[2];
-  return Load(Full128<int64_t>(), mul);
-}
-
-#else  // HWY_TARGET == HWY_SSSE3
-
-template <size_t N>
-HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(const Vec128<int32_t, N> a,
-                                             const Vec128<int32_t, N> b) {
-  return Vec128<int64_t, (N + 1) / 2>{_mm_mul_epi32(a.raw, b.raw)};
-}
-
-#endif  // HWY_TARGET == HWY_SSSE3
-
-template <size_t N>
-HWY_API Vec128<uint32_t, N> operator*(const Vec128<uint32_t, N> a,
-                                      const Vec128<uint32_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency.
-  // 64-bit right shift would also work but also needs port 5, so no benefit.
-  // Notation: x=don't care, z=0.
-  const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1));
-  const auto mullo_x2x0 = MulEven(a, b);
-  const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1));
-  const auto mullo_x3x1 =
-      MulEven(Vec128<uint32_t, N>{a_x3x1}, Vec128<uint32_t, N>{b_x3x1});
-  // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating
-  // the latter requires one more instruction or a constant.
-  const __m128i mul_20 =
-      _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0));
-  const __m128i mul_31 =
-      _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0));
-  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(mul_20, mul_31)};
-#else
-  return Vec128<uint32_t, N>{_mm_mullo_epi32(a.raw, b.raw)};
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> operator*(const Vec128<int32_t, N> a,
-                                     const Vec128<int32_t, N> b) {
-  // Same as unsigned; avoid duplicating the SSSE3 code.
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, BitCast(du, a) * BitCast(du, b));
-}
-
-// ------------------------------ RotateRight (ShiftRight, Or)
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint32_t, N> RotateRight(const Vec128<uint32_t, N> v) {
-  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<uint32_t, N>{_mm_ror_epi32(v.raw, kBits)};
-#else
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
-#endif
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<uint64_t, N> RotateRight(const Vec128<uint64_t, N> v) {
-  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<uint64_t, N>{_mm_ror_epi64(v.raw, kBits)};
-#else
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
-#endif
-}
-
-// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> BroadcastSignBit(const Vec128<int8_t, N> v) {
-  const DFromV<decltype(v)> d;
-  return VecFromMask(v < Zero(d));
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> BroadcastSignBit(const Vec128<int16_t, N> v) {
-  return ShiftRight<15>(v);
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> BroadcastSignBit(const Vec128<int32_t, N> v) {
-  return ShiftRight<31>(v);
-}
-
-template <size_t N>
-HWY_API Vec128<int64_t, N> BroadcastSignBit(const Vec128<int64_t, N> v) {
-  const DFromV<decltype(v)> d;
-#if HWY_TARGET <= HWY_AVX3
-  (void)d;
-  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, 63)};
-#elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4
-  return VecFromMask(v < Zero(d));
-#else
-  // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift
-  // avoids generating a zero.
-  const RepartitionToNarrow<decltype(d)> d32;
-  const auto sign = ShiftRight<31>(BitCast(d32, v));
-  return Vec128<int64_t, N>{
-      _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))};
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<int64_t, N> Abs(const Vec128<int64_t, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int64_t, N>{_mm_abs_epi64(v.raw)};
-#else
-  const auto zero = Zero(DFromV<decltype(v)>());
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-
-template <int kBits, size_t N>
-HWY_API Vec128<int64_t, N> ShiftRight(const Vec128<int64_t, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int64_t, N>{_mm_srai_epi64(v.raw, kBits)};
-#else
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
-  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
-  return right | sign;
-#endif
-}
-
-// ------------------------------ ZeroIfNegative (BroadcastSignBit)
-template <typename T, size_t N>
-HWY_API Vec128<T, N> ZeroIfNegative(Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only works for float");
-  const DFromV<decltype(v)> d;
-#if HWY_TARGET == HWY_SSSE3
-  const RebindToSigned<decltype(d)> di;
-  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
-#else
-  const auto mask = MaskFromVec(v);  // MSB is sufficient for BLENDVPS
-#endif
-  return IfThenElse(mask, Zero(d), v);
-}
-
-// ------------------------------ IfNegativeThenElse
-template <size_t N>
-HWY_API Vec128<int8_t, N> IfNegativeThenElse(const Vec128<int8_t, N> v,
-                                             const Vec128<int8_t, N> yes,
-                                             const Vec128<int8_t, N> no) {
-  // int8: IfThenElse only looks at the MSB.
-  return IfThenElse(MaskFromVec(v), yes, no);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
-                                        Vec128<T, N> no) {
-  static_assert(IsSigned<T>(), "Only works for signed/float");
-  const DFromV<decltype(v)> d;
-  const RebindToSigned<decltype(d)> di;
-
-  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
-  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
-  return IfThenElse(MaskFromVec(v), yes, no);
-}
-
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
-                                        Vec128<T, N> no) {
-  static_assert(IsSigned<T>(), "Only works for signed/float");
-  const DFromV<decltype(v)> d;
-  const RebindToFloat<decltype(d)> df;
-
-  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
-  return BitCast(d, IfThenElse(MaskFromVec(BitCast(df, v)), BitCast(df, yes),
-                               BitCast(df, no)));
-}
-
-// ------------------------------ ShiftLeftSame
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> ShiftLeftSame(const Vec128<uint16_t, N> v,
-                                          const int bits) {
-  return Vec128<uint16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> ShiftLeftSame(const Vec128<uint32_t, N> v,
-                                          const int bits) {
-  return Vec128<uint32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> ShiftLeftSame(const Vec128<uint64_t, N> v,
-                                          const int bits) {
-  return Vec128<uint64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> ShiftLeftSame(const Vec128<int16_t, N> v,
-                                         const int bits) {
-  return Vec128<int16_t, N>{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> ShiftLeftSame(const Vec128<int32_t, N> v,
-                                         const int bits) {
-  return Vec128<int32_t, N>{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <size_t N>
-HWY_API Vec128<int64_t, N> ShiftLeftSame(const Vec128<int64_t, N> v,
-                                         const int bits) {
-  return Vec128<int64_t, N>{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, const int bits) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<T, N> shifted{
-      ShiftLeftSame(Vec128<MakeWide<T>>{v.raw}, bits).raw};
-  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
-}
-
-// ------------------------------ ShiftRightSame (BroadcastSignBit)
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> ShiftRightSame(const Vec128<uint16_t, N> v,
-                                           const int bits) {
-  return Vec128<uint16_t, N>{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> ShiftRightSame(const Vec128<uint32_t, N> v,
-                                           const int bits) {
-  return Vec128<uint32_t, N>{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> ShiftRightSame(const Vec128<uint64_t, N> v,
-                                           const int bits) {
-  return Vec128<uint64_t, N>{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> ShiftRightSame(Vec128<uint8_t, N> v,
-                                          const int bits) {
-  const DFromV<decltype(v)> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec128<uint8_t, N> shifted{
-      ShiftRightSame(Vec128<uint16_t>{v.raw}, bits).raw};
-  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> ShiftRightSame(const Vec128<int16_t, N> v,
-                                          const int bits) {
-  return Vec128<int16_t, N>{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> ShiftRightSame(const Vec128<int32_t, N> v,
-                                          const int bits) {
-  return Vec128<int32_t, N>{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> ShiftRightSame(const Vec128<int64_t, N> v,
-                                          const int bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int64_t, N>{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-#else
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
-  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
-  return right | sign;
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> ShiftRightSame(Vec128<int8_t, N> v, const int bits) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
-  const auto shifted_sign =
-      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ------------------------------ Floating-point mul / div
-
-template <size_t N>
-HWY_API Vec128<float, N> operator*(Vec128<float, N> a, Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_mul_ps(a.raw, b.raw)};
-}
-HWY_API Vec128<float, 1> operator*(const Vec128<float, 1> a,
-                                   const Vec128<float, 1> b) {
-  return Vec128<float, 1>{_mm_mul_ss(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> operator*(const Vec128<double, N> a,
-                                    const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_mul_pd(a.raw, b.raw)};
-}
-HWY_API Vec64<double> operator*(const Vec64<double> a, const Vec64<double> b) {
-  return Vec64<double>{_mm_mul_sd(a.raw, b.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> operator/(const Vec128<float, N> a,
-                                   const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_div_ps(a.raw, b.raw)};
-}
-HWY_API Vec128<float, 1> operator/(const Vec128<float, 1> a,
-                                   const Vec128<float, 1> b) {
-  return Vec128<float, 1>{_mm_div_ss(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> operator/(const Vec128<double, N> a,
-                                    const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_div_pd(a.raw, b.raw)};
-}
-HWY_API Vec64<double> operator/(const Vec64<double> a, const Vec64<double> b) {
-  return Vec64<double>{_mm_div_sd(a.raw, b.raw)};
-}
-
-// Approximate reciprocal
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocal(const Vec128<float, N> v) {
-  return Vec128<float, N>{_mm_rcp_ps(v.raw)};
-}
-HWY_API Vec128<float, 1> ApproximateReciprocal(const Vec128<float, 1> v) {
-  return Vec128<float, 1>{_mm_rcp_ss(v.raw)};
-}
-
-// Absolute value of difference.
-template <size_t N>
-HWY_API Vec128<float, N> AbsDiff(const Vec128<float, N> a,
-                                 const Vec128<float, N> b) {
-  return Abs(a - b);
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-// Returns mul * x + add
-template <size_t N>
-HWY_API Vec128<float, N> MulAdd(const Vec128<float, N> mul,
-                                const Vec128<float, N> x,
-                                const Vec128<float, N> add) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return mul * x + add;
-#else
-  return Vec128<float, N>{_mm_fmadd_ps(mul.raw, x.raw, add.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<double, N> MulAdd(const Vec128<double, N> mul,
-                                 const Vec128<double, N> x,
-                                 const Vec128<double, N> add) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return mul * x + add;
-#else
-  return Vec128<double, N>{_mm_fmadd_pd(mul.raw, x.raw, add.raw)};
-#endif
-}
-
-// Returns add - mul * x
-template <size_t N>
-HWY_API Vec128<float, N> NegMulAdd(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> add) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return add - mul * x;
-#else
-  return Vec128<float, N>{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<double, N> NegMulAdd(const Vec128<double, N> mul,
-                                    const Vec128<double, N> x,
-                                    const Vec128<double, N> add) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return add - mul * x;
-#else
-  return Vec128<double, N>{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)};
-#endif
-}
-
-// Returns mul * x - sub
-template <size_t N>
-HWY_API Vec128<float, N> MulSub(const Vec128<float, N> mul,
-                                const Vec128<float, N> x,
-                                const Vec128<float, N> sub) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return mul * x - sub;
-#else
-  return Vec128<float, N>{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<double, N> MulSub(const Vec128<double, N> mul,
-                                 const Vec128<double, N> x,
-                                 const Vec128<double, N> sub) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return mul * x - sub;
-#else
-  return Vec128<double, N>{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)};
-#endif
-}
-
-// Returns -mul * x - sub
-template <size_t N>
-HWY_API Vec128<float, N> NegMulSub(const Vec128<float, N> mul,
-                                   const Vec128<float, N> x,
-                                   const Vec128<float, N> sub) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return Neg(mul) * x - sub;
-#else
-  return Vec128<float, N>{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<double, N> NegMulSub(const Vec128<double, N> mul,
-                                    const Vec128<double, N> x,
-                                    const Vec128<double, N> sub) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return Neg(mul) * x - sub;
-#else
-  return Vec128<double, N>{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)};
-#endif
-}
-
-// ------------------------------ Floating-point square root
-
-// Full precision square root
-template <size_t N>
-HWY_API Vec128<float, N> Sqrt(const Vec128<float, N> v) {
-  return Vec128<float, N>{_mm_sqrt_ps(v.raw)};
-}
-HWY_API Vec128<float, 1> Sqrt(const Vec128<float, 1> v) {
-  return Vec128<float, 1>{_mm_sqrt_ss(v.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Sqrt(const Vec128<double, N> v) {
-  return Vec128<double, N>{_mm_sqrt_pd(v.raw)};
-}
-HWY_API Vec64<double> Sqrt(const Vec64<double> v) {
-  return Vec64<double>{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)};
-}
-
-// Approximate reciprocal square root
-template <size_t N>
-HWY_API Vec128<float, N> ApproximateReciprocalSqrt(const Vec128<float, N> v) {
-  return Vec128<float, N>{_mm_rsqrt_ps(v.raw)};
-}
-HWY_API Vec128<float, 1> ApproximateReciprocalSqrt(const Vec128<float, 1> v) {
-  return Vec128<float, 1>{_mm_rsqrt_ss(v.raw)};
-}
-
-// ------------------------------ Min (Gt, IfThenElse)
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MinU(const Vec128<T, N> a,
-                                              const Vec128<T, N> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;
-  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
-  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
-  return IfThenElse(gt, b, a);
-}
-
-}  // namespace detail
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> Min(const Vec128<uint8_t, N> a,
-                               const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_min_epu8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> Min(const Vec128<uint16_t, N> a,
-                                const Vec128<uint16_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return detail::MinU(a, b);
-#else
-  return Vec128<uint16_t, N>{_mm_min_epu16(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Min(const Vec128<uint32_t, N> a,
-                                const Vec128<uint32_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return detail::MinU(a, b);
-#else
-  return Vec128<uint32_t, N>{_mm_min_epu32(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Min(const Vec128<uint64_t, N> a,
-                                const Vec128<uint64_t, N> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<uint64_t, N>{_mm_min_epu64(a.raw, b.raw)};
-#else
-  return detail::MinU(a, b);
-#endif
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> Min(const Vec128<int8_t, N> a,
-                              const Vec128<int8_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return IfThenElse(a < b, a, b);
-#else
-  return Vec128<int8_t, N>{_mm_min_epi8(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> Min(const Vec128<int16_t, N> a,
-                               const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_min_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Min(const Vec128<int32_t, N> a,
-                               const Vec128<int32_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return IfThenElse(a < b, a, b);
-#else
-  return Vec128<int32_t, N>{_mm_min_epi32(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Min(const Vec128<int64_t, N> a,
-                               const Vec128<int64_t, N> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int64_t, N>{_mm_min_epi64(a.raw, b.raw)};
-#else
-  return IfThenElse(a < b, a, b);
-#endif
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> Min(const Vec128<float, N> a,
-                             const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_min_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Min(const Vec128<double, N> a,
-                              const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_min_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Max (Gt, IfThenElse)
-
-namespace detail {
-template <typename T, size_t N>
-HWY_INLINE HWY_MAYBE_UNUSED Vec128<T, N> MaxU(const Vec128<T, N> a,
-                                              const Vec128<T, N> b) {
-  const DFromV<decltype(a)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;
-  const auto msb = Set(du, static_cast<T>(T(1) << (sizeof(T) * 8 - 1)));
-  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
-  return IfThenElse(gt, a, b);
-}
-
-}  // namespace detail
-
-// Unsigned
-template <size_t N>
-HWY_API Vec128<uint8_t, N> Max(const Vec128<uint8_t, N> a,
-                               const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_max_epu8(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<uint16_t, N> Max(const Vec128<uint16_t, N> a,
-                                const Vec128<uint16_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return detail::MaxU(a, b);
-#else
-  return Vec128<uint16_t, N>{_mm_max_epu16(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Max(const Vec128<uint32_t, N> a,
-                                const Vec128<uint32_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return detail::MaxU(a, b);
-#else
-  return Vec128<uint32_t, N>{_mm_max_epu32(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> Max(const Vec128<uint64_t, N> a,
-                                const Vec128<uint64_t, N> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<uint64_t, N>{_mm_max_epu64(a.raw, b.raw)};
-#else
-  return detail::MaxU(a, b);
-#endif
-}
-
-// Signed
-template <size_t N>
-HWY_API Vec128<int8_t, N> Max(const Vec128<int8_t, N> a,
-                              const Vec128<int8_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return IfThenElse(a < b, b, a);
-#else
-  return Vec128<int8_t, N>{_mm_max_epi8(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int16_t, N> Max(const Vec128<int16_t, N> a,
-                               const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_max_epi16(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> Max(const Vec128<int32_t, N> a,
-                               const Vec128<int32_t, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  return IfThenElse(a < b, b, a);
-#else
-  return Vec128<int32_t, N>{_mm_max_epi32(a.raw, b.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> Max(const Vec128<int64_t, N> a,
-                               const Vec128<int64_t, N> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int64_t, N>{_mm_max_epi64(a.raw, b.raw)};
-#else
-  return IfThenElse(a < b, b, a);
-#endif
-}
-
-// Float
-template <size_t N>
-HWY_API Vec128<float, N> Max(const Vec128<float, N> a,
-                             const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_max_ps(a.raw, b.raw)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Max(const Vec128<double, N> a,
-                              const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_max_pd(a.raw, b.raw)};
-}
-
-// ================================================== MEMORY (2)
-
-// ------------------------------ Non-temporal stores
-
-// On clang6, we see incorrect code generated for _mm_stream_pi, so
-// round even partial vectors up to 16 bytes.
-template <typename T, size_t N>
-HWY_API void Stream(Vec128<T, N> v, Simd<T, N, 0> /* tag */,
-                    T* HWY_RESTRICT aligned) {
-  _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), v.raw);
-}
-template <size_t N>
-HWY_API void Stream(const Vec128<float, N> v, Simd<float, N, 0> /* tag */,
-                    float* HWY_RESTRICT aligned) {
-  _mm_stream_ps(aligned, v.raw);
-}
-template <size_t N>
-HWY_API void Stream(const Vec128<double, N> v, Simd<double, N, 0> /* tag */,
-                    double* HWY_RESTRICT aligned) {
-  _mm_stream_pd(aligned, v.raw);
-}
-
-// ------------------------------ Scatter
-
-// Work around warnings in the intrinsic definitions (passing -1 as a mask).
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-
-// Unfortunately the GCC/Clang intrinsics do not accept int64_t*.
-using GatherIndex64 = long long int;  // NOLINT(runtime/int)
-static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type");
-
-#if HWY_TARGET <= HWY_AVX3
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
-                              Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
-                              const Vec128<int32_t, N> offset) {
-  if (N == 4) {
-    _mm_i32scatter_epi32(base, offset.raw, v.raw, 1);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i32scatter_epi32(base, mask, offset.raw, v.raw, 1);
-  }
-}
-template <typename T, size_t N>
-HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec128<T, N> v,
-                             Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
-                             const Vec128<int32_t, N> index) {
-  if (N == 4) {
-    _mm_i32scatter_epi32(base, index.raw, v.raw, 4);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, 4);
-  }
-}
-
-template <typename T, size_t N>
-HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
-                              Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
-                              const Vec128<int64_t, N> offset) {
-  if (N == 2) {
-    _mm_i64scatter_epi64(base, offset.raw, v.raw, 1);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i64scatter_epi64(base, mask, offset.raw, v.raw, 1);
-  }
-}
-template <typename T, size_t N>
-HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec128<T, N> v,
-                             Simd<T, N, 0> /* tag */, T* HWY_RESTRICT base,
-                             const Vec128<int64_t, N> index) {
-  if (N == 2) {
-    _mm_i64scatter_epi64(base, index.raw, v.raw, 8);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, 8);
-  }
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, typename Offset>
-HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
-                           T* HWY_RESTRICT base,
-                           const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
-}
-template <typename T, size_t N, typename Index>
-HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
-                          const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
-}
-
-template <size_t N>
-HWY_API void ScatterOffset(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
-                           float* HWY_RESTRICT base,
-                           const Vec128<int32_t, N> offset) {
-  if (N == 4) {
-    _mm_i32scatter_ps(base, offset.raw, v.raw, 1);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i32scatter_ps(base, mask, offset.raw, v.raw, 1);
-  }
-}
-template <size_t N>
-HWY_API void ScatterIndex(Vec128<float, N> v, Simd<float, N, 0> /* tag */,
-                          float* HWY_RESTRICT base,
-                          const Vec128<int32_t, N> index) {
-  if (N == 4) {
-    _mm_i32scatter_ps(base, index.raw, v.raw, 4);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, 4);
-  }
-}
-
-template <size_t N>
-HWY_API void ScatterOffset(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
-                           double* HWY_RESTRICT base,
-                           const Vec128<int64_t, N> offset) {
-  if (N == 2) {
-    _mm_i64scatter_pd(base, offset.raw, v.raw, 1);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i64scatter_pd(base, mask, offset.raw, v.raw, 1);
-  }
-}
-template <size_t N>
-HWY_API void ScatterIndex(Vec128<double, N> v, Simd<double, N, 0> /* tag */,
-                          double* HWY_RESTRICT base,
-                          const Vec128<int64_t, N> index) {
-  if (N == 2) {
-    _mm_i64scatter_pd(base, index.raw, v.raw, 8);
-  } else {
-    const __mmask8 mask = (1u << N) - 1;
-    _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, 8);
-  }
-}
-#else  // HWY_TARGET <= HWY_AVX3
-
-template <typename T, size_t N, typename Offset, HWY_IF_LE128(T, N)>
-HWY_API void ScatterOffset(Vec128<T, N> v, Simd<T, N, 0> d,
-                           T* HWY_RESTRICT base,
-                           const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(16) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(16) Offset offset_lanes[N];
-  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
-
-  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
-  }
-}
-
-template <typename T, size_t N, typename Index, HWY_IF_LE128(T, N)>
-HWY_API void ScatterIndex(Vec128<T, N> v, Simd<T, N, 0> d, T* HWY_RESTRICT base,
-                          const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(16) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(16) Index index_lanes[N];
-  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
-
-  for (size_t i = 0; i < N; ++i) {
-    base[index_lanes[i]] = lanes[i];
-  }
-}
-
-#endif
-
-// ------------------------------ Gather (Load/Store)
-
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-
-template <typename T, size_t N, typename Offset>
-HWY_API Vec128<T, N> GatherOffset(const Simd<T, N, 0> d,
-                                  const T* HWY_RESTRICT base,
-                                  const Vec128<Offset, N> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  alignas(16) Offset offset_lanes[N];
-  Store(offset, Rebind<Offset, decltype(d)>(), offset_lanes);
-
-  alignas(16) T lanes[N];
-  const uint8_t* base_bytes = reinterpret_cast<const uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(base_bytes + offset_lanes[i], &lanes[i]);
-  }
-  return Load(d, lanes);
-}
-
-template <typename T, size_t N, typename Index>
-HWY_API Vec128<T, N> GatherIndex(const Simd<T, N, 0> d,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec128<Index, N> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  alignas(16) Index index_lanes[N];
-  Store(index, Rebind<Index, decltype(d)>(), index_lanes);
-
-  alignas(16) T lanes[N];
-  for (size_t i = 0; i < N; ++i) {
-    lanes[i] = base[index_lanes[i]];
-  }
-  return Load(d, lanes);
-}
-
-#else
-
-namespace detail {
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<4> /* tag */,
-                                     Simd<T, N, 0> /* d */,
-                                     const T* HWY_RESTRICT base,
-                                     const Vec128<int32_t, N> offset) {
-  return Vec128<T, N>{_mm_i32gather_epi32(
-      reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<4> /* tag */,
-                                    Simd<T, N, 0> /* d */,
-                                    const T* HWY_RESTRICT base,
-                                    const Vec128<int32_t, N> index) {
-  return Vec128<T, N>{_mm_i32gather_epi32(
-      reinterpret_cast<const int32_t*>(base), index.raw, 4)};
-}
-
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> GatherOffset(hwy::SizeTag<8> /* tag */,
-                                     Simd<T, N, 0> /* d */,
-                                     const T* HWY_RESTRICT base,
-                                     const Vec128<int64_t, N> offset) {
-  return Vec128<T, N>{_mm_i64gather_epi64(
-      reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
-}
-template <typename T, size_t N>
-HWY_INLINE Vec128<T, N> GatherIndex(hwy::SizeTag<8> /* tag */,
-                                    Simd<T, N, 0> /* d */,
-                                    const T* HWY_RESTRICT base,
-                                    const Vec128<int64_t, N> index) {
-  return Vec128<T, N>{_mm_i64gather_epi64(
-      reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
-}
-
-}  // namespace detail
-
-template <typename T, size_t N, typename Offset>
-HWY_API Vec128<T, N> GatherOffset(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
-                                  const Vec128<Offset, N> offset) {
-  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
-}
-template <typename T, size_t N, typename Index>
-HWY_API Vec128<T, N> GatherIndex(Simd<T, N, 0> d, const T* HWY_RESTRICT base,
-                                 const Vec128<Index, N> index) {
-  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> GatherOffset(Simd<float, N, 0> /* tag */,
-                                      const float* HWY_RESTRICT base,
-                                      const Vec128<int32_t, N> offset) {
-  return Vec128<float, N>{_mm_i32gather_ps(base, offset.raw, 1)};
-}
-template <size_t N>
-HWY_API Vec128<float, N> GatherIndex(Simd<float, N, 0> /* tag */,
-                                     const float* HWY_RESTRICT base,
-                                     const Vec128<int32_t, N> index) {
-  return Vec128<float, N>{_mm_i32gather_ps(base, index.raw, 4)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> GatherOffset(Simd<double, N, 0> /* tag */,
-                                       const double* HWY_RESTRICT base,
-                                       const Vec128<int64_t, N> offset) {
-  return Vec128<double, N>{_mm_i64gather_pd(base, offset.raw, 1)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> GatherIndex(Simd<double, N, 0> /* tag */,
-                                      const double* HWY_RESTRICT base,
-                                      const Vec128<int64_t, N> index) {
-  return Vec128<double, N>{_mm_i64gather_pd(base, index.raw, 8)};
-}
-
-#endif  // HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-
-HWY_DIAGNOSTICS(pop)
-
-// ================================================== SWIZZLE (2)
-
-// ------------------------------ LowerHalf
-
-// Returns upper/lower half of a vector.
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Simd<T, N / 2, 0> /* tag */,
-                                   Vec128<T, N> v) {
-  return Vec128<T, N / 2>{v.raw};
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
-  return LowerHalf(Simd<T, N / 2, 0>(), v);
-}
-
-// ------------------------------ ShiftLeftBytes
-
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  return Vec128<T, N>{_mm_slli_si128(v.raw, kBytes)};
-}
-
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftBytes(const Vec128<T, N> v) {
-  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftLeftLanes(const Vec128<T, N> v) {
-  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
-}
-
-// ------------------------------ ShiftRightBytes
-template <int kBytes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightBytes(Simd<T, N, 0> /* tag */, Vec128<T, N> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  // For partial vectors, clear upper lanes so we shift in zeros.
-  if (N != 16 / sizeof(T)) {
-    const Vec128<T> vfull{v.raw};
-    v = Vec128<T, N>{IfThenElseZero(FirstN(Full128<T>(), N), vfull).raw};
-  }
-  return Vec128<T, N>{_mm_srli_si128(v.raw, kBytes)};
-}
-
-// ------------------------------ ShiftRightLanes
-template <int kLanes, typename T, size_t N>
-HWY_API Vec128<T, N> ShiftRightLanes(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
-}
-
-// ------------------------------ UpperHalf (ShiftRightBytes)
-
-// Full input: copy hi into lo (smaller instruction encoding than shifts).
-template <typename T>
-HWY_API Vec64<T> UpperHalf(Half<Full128<T>> /* tag */, Vec128<T> v) {
-  return Vec64<T>{_mm_unpackhi_epi64(v.raw, v.raw)};
-}
-HWY_API Vec128<float, 2> UpperHalf(Full64<float> /* tag */, Vec128<float> v) {
-  return Vec128<float, 2>{_mm_movehl_ps(v.raw, v.raw)};
-}
-HWY_API Vec64<double> UpperHalf(Full64<double> /* tag */, Vec128<double> v) {
-  return Vec64<double>{_mm_unpackhi_pd(v.raw, v.raw)};
-}
-
-// Partial
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, (N + 1) / 2> UpperHalf(Half<Simd<T, N, 0>> /* tag */,
-                                         Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto vu = BitCast(du, v);
-  const auto upper = BitCast(d, ShiftRightBytes<N * sizeof(T) / 2>(du, vu));
-  return Vec128<T, (N + 1) / 2>{upper.raw};
-}
-
-// ------------------------------ ExtractLane (UpperHalf)
-
-namespace detail {
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3
-  const int pair = _mm_extract_epi16(v.raw, kLane / 2);
-  constexpr int kShift = kLane & 1 ? 8 : 0;
-  return static_cast<T>((pair >> kShift) & 0xFF);
-#else
-  return static_cast<T>(_mm_extract_epi8(v.raw, kLane) & 0xFF);
-#endif
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  static_assert(kLane < N, "Lane index out of bounds");
-  return static_cast<T>(_mm_extract_epi16(v.raw, kLane) & 0xFFFF);
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3
-  alignas(16) T lanes[4];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[kLane];
-#else
-  return static_cast<T>(_mm_extract_epi32(v.raw, kLane));
-#endif
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE T ExtractLane(const Vec128<T, N> v) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
-  alignas(16) T lanes[2];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[kLane];
-#else
-  return static_cast<T>(_mm_extract_epi64(v.raw, kLane));
-#endif
-}
-
-template <size_t kLane, size_t N>
-HWY_INLINE float ExtractLane(const Vec128<float, N> v) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3
-  alignas(16) float lanes[4];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[kLane];
-#else
-  // Bug in the intrinsic, returns int but should be float.
-  const int32_t bits = _mm_extract_ps(v.raw, kLane);
-  float ret;
-  CopySameSize(&bits, &ret);
-  return ret;
-#endif
-}
-
-// There is no extract_pd; two overloads because there is no UpperHalf for N=1.
-template <size_t kLane>
-HWY_INLINE double ExtractLane(const Vec128<double, 1> v) {
-  static_assert(kLane == 0, "Lane index out of bounds");
-  return GetLane(v);
-}
-
-template <size_t kLane>
-HWY_INLINE double ExtractLane(const Vec128<double> v) {
-  static_assert(kLane < 2, "Lane index out of bounds");
-  const Half<DFromV<decltype(v)>> dh;
-  return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v));
-}
-
-}  // namespace detail
-
-// Requires one overload per vector length because ExtractLane<3> may be a
-// compile error if it calls _mm_extract_epi64.
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  return GetLane(v);
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[2];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-      case 2:
-        return detail::ExtractLane<2>(v);
-      case 3:
-        return detail::ExtractLane<3>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[4];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-      case 2:
-        return detail::ExtractLane<2>(v);
-      case 3:
-        return detail::ExtractLane<3>(v);
-      case 4:
-        return detail::ExtractLane<4>(v);
-      case 5:
-        return detail::ExtractLane<5>(v);
-      case 6:
-        return detail::ExtractLane<6>(v);
-      case 7:
-        return detail::ExtractLane<7>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[8];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-template <typename T>
-HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::ExtractLane<0>(v);
-      case 1:
-        return detail::ExtractLane<1>(v);
-      case 2:
-        return detail::ExtractLane<2>(v);
-      case 3:
-        return detail::ExtractLane<3>(v);
-      case 4:
-        return detail::ExtractLane<4>(v);
-      case 5:
-        return detail::ExtractLane<5>(v);
-      case 6:
-        return detail::ExtractLane<6>(v);
-      case 7:
-        return detail::ExtractLane<7>(v);
-      case 8:
-        return detail::ExtractLane<8>(v);
-      case 9:
-        return detail::ExtractLane<9>(v);
-      case 10:
-        return detail::ExtractLane<10>(v);
-      case 11:
-        return detail::ExtractLane<11>(v);
-      case 12:
-        return detail::ExtractLane<12>(v);
-      case 13:
-        return detail::ExtractLane<13>(v);
-      case 14:
-        return detail::ExtractLane<14>(v);
-      case 15:
-        return detail::ExtractLane<15>(v);
-    }
-  }
-#endif
-  alignas(16) T lanes[16];
-  Store(v, DFromV<decltype(v)>(), lanes);
-  return lanes[i];
-}
-
-// ------------------------------ InsertLane (UpperHalf)
-
-namespace detail {
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[16];
-  Store(v, d, lanes);
-  lanes[kLane] = t;
-  return Load(d, lanes);
-#else
-  return Vec128<T, N>{_mm_insert_epi8(v.raw, t, kLane)};
-#endif
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-  return Vec128<T, N>{_mm_insert_epi16(v.raw, t, kLane)};
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3
-  alignas(16) T lanes[4];
-  const DFromV<decltype(v)> d;
-  Store(v, d, lanes);
-  lanes[kLane] = t;
-  return Load(d, lanes);
-#else
-  MakeSigned<T> ti;
-  CopySameSize(&t, &ti);  // don't just cast because T might be float.
-  return Vec128<T, N>{_mm_insert_epi32(v.raw, ti, kLane)};
-#endif
-}
-
-template <size_t kLane, typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Vec128<T, N> InsertLane(const Vec128<T, N> v, T t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3 || HWY_ARCH_X86_32
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[2];
-  Store(v, d, lanes);
-  lanes[kLane] = t;
-  return Load(d, lanes);
-#else
-  MakeSigned<T> ti;
-  CopySameSize(&t, &ti);  // don't just cast because T might be float.
-  return Vec128<T, N>{_mm_insert_epi64(v.raw, ti, kLane)};
-#endif
-}
-
-template <size_t kLane, size_t N>
-HWY_INLINE Vec128<float, N> InsertLane(const Vec128<float, N> v, float t) {
-  static_assert(kLane < N, "Lane index out of bounds");
-#if HWY_TARGET == HWY_SSSE3
-  const DFromV<decltype(v)> d;
-  alignas(16) float lanes[4];
-  Store(v, d, lanes);
-  lanes[kLane] = t;
-  return Load(d, lanes);
-#else
-  return Vec128<float, N>{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)};
-#endif
-}
-
-// There is no insert_pd; two overloads because there is no UpperHalf for N=1.
-template <size_t kLane>
-HWY_INLINE Vec128<double, 1> InsertLane(const Vec128<double, 1> v, double t) {
-  static_assert(kLane == 0, "Lane index out of bounds");
-  return Set(DFromV<decltype(v)>(), t);
-}
-
-template <size_t kLane>
-HWY_INLINE Vec128<double> InsertLane(const Vec128<double> v, double t) {
-  static_assert(kLane < 2, "Lane index out of bounds");
-  const DFromV<decltype(v)> d;
-  const Vec128<double> vt = Set(d, t);
-  if (kLane == 0) {
-    return Vec128<double>{_mm_shuffle_pd(vt.raw, v.raw, 2)};
-  }
-  return Vec128<double>{_mm_shuffle_pd(v.raw, vt.raw, 0)};
-}
-
-}  // namespace detail
-
-// Requires one overload per vector length because InsertLane<3> may be a
-// compile error if it calls _mm_insert_epi64.
-
-template <typename T>
-HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
-  HWY_DASSERT(i == 0);
-  (void)i;
-  return Set(DFromV<decltype(v)>(), t);
-}
-
-template <typename T>
-HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[2];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[4];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-      case 4:
-        return detail::InsertLane<4>(v, t);
-      case 5:
-        return detail::InsertLane<5>(v, t);
-      case 6:
-        return detail::InsertLane<6>(v, t);
-      case 7:
-        return detail::InsertLane<7>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[8];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-template <typename T>
-HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
-#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
-  if (__builtin_constant_p(i)) {
-    switch (i) {
-      case 0:
-        return detail::InsertLane<0>(v, t);
-      case 1:
-        return detail::InsertLane<1>(v, t);
-      case 2:
-        return detail::InsertLane<2>(v, t);
-      case 3:
-        return detail::InsertLane<3>(v, t);
-      case 4:
-        return detail::InsertLane<4>(v, t);
-      case 5:
-        return detail::InsertLane<5>(v, t);
-      case 6:
-        return detail::InsertLane<6>(v, t);
-      case 7:
-        return detail::InsertLane<7>(v, t);
-      case 8:
-        return detail::InsertLane<8>(v, t);
-      case 9:
-        return detail::InsertLane<9>(v, t);
-      case 10:
-        return detail::InsertLane<10>(v, t);
-      case 11:
-        return detail::InsertLane<11>(v, t);
-      case 12:
-        return detail::InsertLane<12>(v, t);
-      case 13:
-        return detail::InsertLane<13>(v, t);
-      case 14:
-        return detail::InsertLane<14>(v, t);
-      case 15:
-        return detail::InsertLane<15>(v, t);
-    }
-  }
-#endif
-  const DFromV<decltype(v)> d;
-  alignas(16) T lanes[16];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-// ------------------------------ CombineShiftRightBytes
-
-template <int kBytes, typename T, class V = Vec128<T>>
-HWY_API V CombineShiftRightBytes(Full128<T> d, V hi, V lo) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Vec128<uint8_t>{_mm_alignr_epi8(
-                        BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
-}
-
-template <int kBytes, typename T, size_t N, HWY_IF_LE64(T, N),
-          class V = Vec128<T, N>>
-HWY_API V CombineShiftRightBytes(Simd<T, N, 0> d, V hi, V lo) {
-  constexpr size_t kSize = N * sizeof(T);
-  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Full128<uint8_t> d_full8;
-  using V8 = VFromD<decltype(d_full8)>;
-  const V8 hi8{BitCast(d8, hi).raw};
-  // Move into most-significant bytes
-  const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw});
-  const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8);
-  return V{BitCast(Full128<T>(), r).raw};
-}
-
-// ------------------------------ Broadcast/splat any lane
-
-// Unsigned
-template <int kLane, size_t N>
-HWY_API Vec128<uint16_t, N> Broadcast(const Vec128<uint16_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  if (kLane < 4) {
-    const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
-    return Vec128<uint16_t, N>{_mm_unpacklo_epi64(lo, lo)};
-  } else {
-    const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
-    return Vec128<uint16_t, N>{_mm_unpackhi_epi64(hi, hi)};
-  }
-}
-template <int kLane, size_t N>
-HWY_API Vec128<uint32_t, N> Broadcast(const Vec128<uint32_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<uint32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
-}
-template <int kLane, size_t N>
-HWY_API Vec128<uint64_t, N> Broadcast(const Vec128<uint64_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<uint64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
-}
-
-// Signed
-template <int kLane, size_t N>
-HWY_API Vec128<int16_t, N> Broadcast(const Vec128<int16_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  if (kLane < 4) {
-    const __m128i lo = _mm_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
-    return Vec128<int16_t, N>{_mm_unpacklo_epi64(lo, lo)};
-  } else {
-    const __m128i hi = _mm_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
-    return Vec128<int16_t, N>{_mm_unpackhi_epi64(hi, hi)};
-  }
-}
-template <int kLane, size_t N>
-HWY_API Vec128<int32_t, N> Broadcast(const Vec128<int32_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<int32_t, N>{_mm_shuffle_epi32(v.raw, 0x55 * kLane)};
-}
-template <int kLane, size_t N>
-HWY_API Vec128<int64_t, N> Broadcast(const Vec128<int64_t, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<int64_t, N>{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
-}
-
-// Float
-template <int kLane, size_t N>
-HWY_API Vec128<float, N> Broadcast(const Vec128<float, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<float, N>{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
-}
-template <int kLane, size_t N>
-HWY_API Vec128<double, N> Broadcast(const Vec128<double, N> v) {
-  static_assert(0 <= kLane && kLane < N, "Invalid lane");
-  return Vec128<double, N>{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)};
-}
-
-// ------------------------------ TableLookupLanes (Shuffle01)
-
-// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
-template <typename T, size_t N = 16 / sizeof(T)>
-struct Indices128 {
-  __m128i raw;
-};
-
-template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
-          HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-#if HWY_IS_DEBUG_BUILD
-  const Rebind<TI, decltype(d)> di;
-  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
-              AllTrue(di, Lt(vec, Set(di, N))));
-#endif
-
-#if HWY_TARGET <= HWY_AVX2
-  (void)d;
-  return Indices128<T, N>{vec.raw};
-#else
-  const Repartition<uint8_t, decltype(d)> d8;
-  using V8 = VFromD<decltype(d8)>;
-  alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3,
-                                                    0, 1, 2, 3, 0, 1, 2, 3};
-
-  // Broadcast each lane index to all 4 bytes of T
-  alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = {
-      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
-  const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes));
-
-  // Shift to bytes
-  const Repartition<uint16_t, decltype(d)> d16;
-  const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices)));
-
-  return Indices128<T, N>{Add(byte_indices, Load(d8, kByteOffsets)).raw};
-#endif
-}
-
-template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N),
-          HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Indices128<T, N> IndicesFromVec(Simd<T, N, 0> d, Vec128<TI, N> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-#if HWY_IS_DEBUG_BUILD
-  const Rebind<TI, decltype(d)> di;
-  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
-              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(N)))));
-#else
-  (void)d;
-#endif
-
-  // No change - even without AVX3, we can shuffle+blend.
-  return Indices128<T, N>{vec.raw};
-}
-
-template <typename T, size_t N, typename TI, HWY_IF_LE128(T, N)>
-HWY_API Indices128<T, N> SetTableIndices(Simd<T, N, 0> d, const TI* idx) {
-  const Rebind<TI, decltype(d)> di;
-  return IndicesFromVec(d, LoadU(di, idx));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
-#if HWY_TARGET <= HWY_AVX2
-  const DFromV<decltype(v)> d;
-  const RebindToFloat<decltype(d)> df;
-  const Vec128<float, N> perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)};
-  return BitCast(d, perm);
-#else
-  return TableLookupBytes(v, Vec128<T, N>{idx.raw});
-#endif
-}
-
-template <size_t N, HWY_IF_GE64(float, N)>
-HWY_API Vec128<float, N> TableLookupLanes(Vec128<float, N> v,
-                                          Indices128<float, N> idx) {
-#if HWY_TARGET <= HWY_AVX2
-  return Vec128<float, N>{_mm_permutevar_ps(v.raw, idx.raw)};
-#else
-  const DFromV<decltype(v)> df;
-  const RebindToSigned<decltype(df)> di;
-  return BitCast(df,
-                 TableLookupBytes(BitCast(di, v), Vec128<int32_t, N>{idx.raw}));
-#endif
-}
-
-// Single lane: no change
-template <typename T>
-HWY_API Vec128<T, 1> TableLookupLanes(Vec128<T, 1> v,
-                                      Indices128<T, 1> /* idx */) {
-  return v;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> TableLookupLanes(Vec128<T> v, Indices128<T> idx) {
-  const Full128<T> d;
-  Vec128<int64_t> vidx{idx.raw};
-#if HWY_TARGET <= HWY_AVX2
-  // There is no _mm_permute[x]var_epi64.
-  vidx += vidx;  // bit1 is the decider (unusual)
-  const Full128<double> df;
-  return BitCast(
-      d, Vec128<double>{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)});
-#else
-  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
-  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
-  // to obtain an all-zero or all-one mask.
-  const Full128<int64_t> di;
-  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
-  const Mask128<T> mask_same = RebindMask(d, MaskFromVec(same));
-  return IfThenElse(mask_same, v, Shuffle01(v));
-#endif
-}
-
-HWY_API Vec128<double> TableLookupLanes(Vec128<double> v,
-                                        Indices128<double> idx) {
-  Vec128<int64_t> vidx{idx.raw};
-#if HWY_TARGET <= HWY_AVX2
-  vidx += vidx;  // bit1 is the decider (unusual)
-  return Vec128<double>{_mm_permutevar_pd(v.raw, vidx.raw)};
-#else
-  // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit
-  // comparison (expensive on SSSE3), just invert the upper lane and subtract 1
-  // to obtain an all-zero or all-one mask.
-  const Full128<double> d;
-  const Full128<int64_t> di;
-  const Vec128<int64_t> same = (vidx ^ Iota(di, 0)) - Set(di, 1);
-  const Mask128<double> mask_same = RebindMask(d, MaskFromVec(same));
-  return IfThenElse(mask_same, v, Shuffle01(v));
-#endif
-}
-
-// ------------------------------ ReverseBlocks
-
-// Single block: no change
-template <typename T>
-HWY_API Vec128<T> ReverseBlocks(Full128<T> /* tag */, const Vec128<T> v) {
-  return v;
-}
-
-// ------------------------------ Reverse (Shuffle0123, Shuffle2301)
-
-// Single lane: no change
-template <typename T>
-HWY_API Vec128<T, 1> Reverse(Simd<T, 1, 0> /* tag */, const Vec128<T, 1> v) {
-  return v;
-}
-
-// Two lanes: shuffle
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, 2> Reverse(Full64<T> /* tag */, const Vec128<T, 2> v) {
-  return Vec128<T, 2>{Shuffle2301(Vec128<T>{v.raw}).raw};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
-  return Shuffle01(v);
-}
-
-// Four lanes: shuffle
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> Reverse(Full128<T> /* tag */, const Vec128<T> v) {
-  return Shuffle0123(v);
-}
-
-// 16-bit
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse(Simd<T, N, 0> d, const Vec128<T, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  if (N == 1) return v;
-  if (N == 2) {
-    const Repartition<uint32_t, decltype(d)> du32;
-    return BitCast(d, RotateRight<16>(BitCast(du32, v)));
-  }
-  const RebindToSigned<decltype(d)> di;
-  alignas(16) constexpr int16_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
-  const Vec128<int16_t, N> idx = Load(di, kReverse + (N == 8 ? 0 : 4));
-  return BitCast(d, Vec128<int16_t, N>{
-                        _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-#else
-  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
-  return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v))));
-#endif
-}
-
-// ------------------------------ Reverse2
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const Repartition<uint32_t, decltype(d)> du32;
-  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return Shuffle2301(v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Reverse2(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return Shuffle01(v);
-}
-
-// ------------------------------ Reverse4
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> d, const Vec128<T, N> v) {
-  const RebindToSigned<decltype(d)> di;
-  // 4x 16-bit: a single shufflelo suffices.
-  if (N == 4) {
-    return BitCast(d, Vec128<int16_t, N>{_mm_shufflelo_epi16(
-                          BitCast(di, v).raw, _MM_SHUFFLE(0, 1, 2, 3))});
-  }
-
-#if HWY_TARGET <= HWY_AVX3
-  alignas(16) constexpr int16_t kReverse4[8] = {3, 2, 1, 0, 7, 6, 5, 4};
-  const Vec128<int16_t, N> idx = Load(di, kReverse4);
-  return BitCast(d, Vec128<int16_t, N>{
-                        _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-#else
-  const RepartitionToWide<decltype(di)> dw;
-  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
-#endif
-}
-
-// 4x 32-bit: use Shuffle0123
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> Reverse4(Full128<T> /* tag */, const Vec128<T> v) {
-  return Shuffle0123(v);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Reverse4(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
-  HWY_ASSERT(0);  // don't have 4 u64 lanes
-}
-
-// ------------------------------ Reverse8
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> d, const Vec128<T, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  const RebindToSigned<decltype(d)> di;
-  alignas(32) constexpr int16_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
-                                                 15, 14, 13, 12, 11, 10, 9, 8};
-  const Vec128<int16_t, N> idx = Load(di, kReverse8);
-  return BitCast(d, Vec128<int16_t, N>{
-                        _mm_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-#else
-  const RepartitionToWide<decltype(d)> dw;
-  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Reverse8(Simd<T, N, 0> /* tag */, Vec128<T, N> /* v */) {
-  HWY_ASSERT(0);  // don't have 8 lanes unless 16-bit
-}
-
-// ------------------------------ InterleaveLower
-
-// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
-// the least-significant lane) and "b". To concatenate two half-width integers
-// into one, use ZipLower/Upper instead (also works with scalar).
-
-template <size_t N, HWY_IF_LE128(uint8_t, N)>
-HWY_API Vec128<uint8_t, N> InterleaveLower(const Vec128<uint8_t, N> a,
-                                           const Vec128<uint8_t, N> b) {
-  return Vec128<uint8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
-}
-template <size_t N, HWY_IF_LE128(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> InterleaveLower(const Vec128<uint16_t, N> a,
-                                            const Vec128<uint16_t, N> b) {
-  return Vec128<uint16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
-}
-template <size_t N, HWY_IF_LE128(uint32_t, N)>
-HWY_API Vec128<uint32_t, N> InterleaveLower(const Vec128<uint32_t, N> a,
-                                            const Vec128<uint32_t, N> b) {
-  return Vec128<uint32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
-}
-template <size_t N, HWY_IF_LE128(uint64_t, N)>
-HWY_API Vec128<uint64_t, N> InterleaveLower(const Vec128<uint64_t, N> a,
-                                            const Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
-}
-
-template <size_t N, HWY_IF_LE128(int8_t, N)>
-HWY_API Vec128<int8_t, N> InterleaveLower(const Vec128<int8_t, N> a,
-                                          const Vec128<int8_t, N> b) {
-  return Vec128<int8_t, N>{_mm_unpacklo_epi8(a.raw, b.raw)};
-}
-template <size_t N, HWY_IF_LE128(int16_t, N)>
-HWY_API Vec128<int16_t, N> InterleaveLower(const Vec128<int16_t, N> a,
-                                           const Vec128<int16_t, N> b) {
-  return Vec128<int16_t, N>{_mm_unpacklo_epi16(a.raw, b.raw)};
-}
-template <size_t N, HWY_IF_LE128(int32_t, N)>
-HWY_API Vec128<int32_t, N> InterleaveLower(const Vec128<int32_t, N> a,
-                                           const Vec128<int32_t, N> b) {
-  return Vec128<int32_t, N>{_mm_unpacklo_epi32(a.raw, b.raw)};
-}
-template <size_t N, HWY_IF_LE128(int64_t, N)>
-HWY_API Vec128<int64_t, N> InterleaveLower(const Vec128<int64_t, N> a,
-                                           const Vec128<int64_t, N> b) {
-  return Vec128<int64_t, N>{_mm_unpacklo_epi64(a.raw, b.raw)};
-}
-
-template <size_t N, HWY_IF_LE128(float, N)>
-HWY_API Vec128<float, N> InterleaveLower(const Vec128<float, N> a,
-                                         const Vec128<float, N> b) {
-  return Vec128<float, N>{_mm_unpacklo_ps(a.raw, b.raw)};
-}
-template <size_t N, HWY_IF_LE128(double, N)>
-HWY_API Vec128<double, N> InterleaveLower(const Vec128<double, N> a,
-                                          const Vec128<double, N> b) {
-  return Vec128<double, N>{_mm_unpacklo_pd(a.raw, b.raw)};
-}
-
-// Additional overload for the optional tag (also for 256/512).
-template <class V>
-HWY_API V InterleaveLower(DFromV<V> /* tag */, V a, V b) {
-  return InterleaveLower(a, b);
-}
-
-// ------------------------------ InterleaveUpper (UpperHalf)
-
-// All functions inside detail lack the required D parameter.
-namespace detail {
-
-HWY_API Vec128<uint8_t> InterleaveUpper(const Vec128<uint8_t> a,
-                                        const Vec128<uint8_t> b) {
-  return Vec128<uint8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
-}
-HWY_API Vec128<uint16_t> InterleaveUpper(const Vec128<uint16_t> a,
-                                         const Vec128<uint16_t> b) {
-  return Vec128<uint16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
-}
-HWY_API Vec128<uint32_t> InterleaveUpper(const Vec128<uint32_t> a,
-                                         const Vec128<uint32_t> b) {
-  return Vec128<uint32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
-}
-HWY_API Vec128<uint64_t> InterleaveUpper(const Vec128<uint64_t> a,
-                                         const Vec128<uint64_t> b) {
-  return Vec128<uint64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec128<int8_t> InterleaveUpper(const Vec128<int8_t> a,
-                                       const Vec128<int8_t> b) {
-  return Vec128<int8_t>{_mm_unpackhi_epi8(a.raw, b.raw)};
-}
-HWY_API Vec128<int16_t> InterleaveUpper(const Vec128<int16_t> a,
-                                        const Vec128<int16_t> b) {
-  return Vec128<int16_t>{_mm_unpackhi_epi16(a.raw, b.raw)};
-}
-HWY_API Vec128<int32_t> InterleaveUpper(const Vec128<int32_t> a,
-                                        const Vec128<int32_t> b) {
-  return Vec128<int32_t>{_mm_unpackhi_epi32(a.raw, b.raw)};
-}
-HWY_API Vec128<int64_t> InterleaveUpper(const Vec128<int64_t> a,
-                                        const Vec128<int64_t> b) {
-  return Vec128<int64_t>{_mm_unpackhi_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec128<float> InterleaveUpper(const Vec128<float> a,
-                                      const Vec128<float> b) {
-  return Vec128<float>{_mm_unpackhi_ps(a.raw, b.raw)};
-}
-HWY_API Vec128<double> InterleaveUpper(const Vec128<double> a,
-                                       const Vec128<double> b) {
-  return Vec128<double>{_mm_unpackhi_pd(a.raw, b.raw)};
-}
-
-}  // namespace detail
-
-// Full
-template <typename T, class V = Vec128<T>>
-HWY_API V InterleaveUpper(Full128<T> /* tag */, V a, V b) {
-  return detail::InterleaveUpper(a, b);
-}
-
-// Partial
-template <typename T, size_t N, HWY_IF_LE64(T, N), class V = Vec128<T, N>>
-HWY_API V InterleaveUpper(Simd<T, N, 0> d, V a, V b) {
-  const Half<decltype(d)> d2;
-  return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw});
-}
-
-// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
-
-// Same as Interleave*, except that the return lanes are double-width integers;
-// this is necessary because the single-lane scalar cannot return two values.
-template <class V, class DW = RepartitionToWide<DFromV<V>>>
-HWY_API VFromD<DW> ZipLower(V a, V b) {
-  return BitCast(DW(), InterleaveLower(a, b));
-}
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveLower(D(), a, b));
-}
-
-template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
-HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
-  return BitCast(dw, InterleaveUpper(D(), a, b));
-}
-
-// ================================================== COMBINE
-
-// ------------------------------ Combine (InterleaveLower)
-
-// N = N/2 + N/2 (upper half undefined)
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Vec128<T, N> Combine(Simd<T, N, 0> d, Vec128<T, N / 2> hi_half,
-                             Vec128<T, N / 2> lo_half) {
-  const Half<decltype(d)> d2;
-  const RebindToUnsigned<decltype(d2)> du2;
-  // Treat half-width input as one lane, and expand to two lanes.
-  using VU = Vec128<UnsignedFromSize<N * sizeof(T) / 2>, 2>;
-  const VU lo{BitCast(du2, lo_half).raw};
-  const VU hi{BitCast(du2, hi_half).raw};
-  return BitCast(d, InterleaveLower(lo, hi));
-}
-
-// ------------------------------ ZeroExtendVector (Combine, IfThenElseZero)
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec128<T> ZeroExtendVector(hwy::NonFloatTag /*tag*/,
-                                      Full128<T> /* d */, Vec64<T> lo) {
-  return Vec128<T>{_mm_move_epi64(lo.raw)};
-}
-
-template <typename T>
-HWY_INLINE Vec128<T> ZeroExtendVector(hwy::FloatTag /*tag*/, Full128<T> d,
-                                      Vec64<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  return BitCast(d, ZeroExtendVector(du, BitCast(Half<decltype(du)>(), lo)));
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec128<T> ZeroExtendVector(Full128<T> d, Vec64<T> lo) {
-  return detail::ZeroExtendVector(hwy::IsFloatTag<T>(), d, lo);
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ZeroExtendVector(Simd<T, N, 0> d, Vec128<T, N / 2> lo) {
-  return IfThenElseZero(FirstN(d, N / 2), Vec128<T, N>{lo.raw});
-}
-
-// ------------------------------ Concat full (InterleaveLower)
-
-// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
-template <typename T>
-HWY_API Vec128<T> ConcatLowerLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const Repartition<uint64_t, decltype(d)> d64;
-  return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi)));
-}
-
-// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
-template <typename T>
-HWY_API Vec128<T> ConcatUpperUpper(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const Repartition<uint64_t, decltype(d)> d64;
-  return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi)));
-}
-
-// hiH,hiL loH,loL |-> hiL,loH (= inner halves)
-template <typename T>
-HWY_API Vec128<T> ConcatLowerUpper(Full128<T> d, const Vec128<T> hi,
-                                   const Vec128<T> lo) {
-  return CombineShiftRightBytes<8>(d, hi, lo);
-}
-
-// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
-template <typename T>
-HWY_API Vec128<T> ConcatUpperLower(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const Repartition<double, decltype(d)> dd;
-#if HWY_TARGET == HWY_SSSE3
-  return BitCast(
-      d, Vec128<double>{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw,
-                                       _MM_SHUFFLE2(1, 0))});
-#else
-  // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle.
-  return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
-                                                BitCast(dd, lo).raw, 1)});
-#endif
-}
-HWY_API Vec128<float> ConcatUpperLower(Full128<float> d, Vec128<float> hi,
-                                       Vec128<float> lo) {
-#if HWY_TARGET == HWY_SSSE3
-  (void)d;
-  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))};
-#else
-  // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
-  const RepartitionToWide<decltype(d)> dd;
-  return BitCast(d, Vec128<double>{_mm_blend_pd(BitCast(dd, hi).raw,
-                                                BitCast(dd, lo).raw, 1)});
-#endif
-}
-HWY_API Vec128<double> ConcatUpperLower(Full128<double> /* tag */,
-                                        Vec128<double> hi, Vec128<double> lo) {
-#if HWY_TARGET == HWY_SSSE3
-  return Vec128<double>{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))};
-#else
-  // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
-  return Vec128<double>{_mm_blend_pd(hi.raw, lo.raw, 1)};
-#endif
-}
-
-// ------------------------------ Concat partial (Combine, LowerHalf)
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ConcatLowerLower(Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  const Half<decltype(d)> d2;
-  return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo));
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ConcatUpperUpper(Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  const Half<decltype(d)> d2;
-  return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo));
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ConcatLowerUpper(Simd<T, N, 0> d, const Vec128<T, N> hi,
-                                      const Vec128<T, N> lo) {
-  const Half<decltype(d)> d2;
-  return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo));
-}
-
-template <typename T, size_t N, HWY_IF_LE64(T, N)>
-HWY_API Vec128<T, N> ConcatUpperLower(Simd<T, N, 0> d, Vec128<T, N> hi,
-                                      Vec128<T, N> lo) {
-  const Half<decltype(d)> d2;
-  return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo));
-}
-
-// ------------------------------ ConcatOdd
-
-// 8-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const Repartition<uint16_t, decltype(d)> dw;
-  // Right-shift 8 bits per u16 so we can pack.
-  const Vec128<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
-  const Vec128<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
-  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
-}
-
-// 8-bit x8
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec64<T> ConcatOdd(Simd<T, 8, 0> d, Vec64<T> hi, Vec64<T> lo) {
-  const Repartition<uint32_t, decltype(d)> du32;
-  // Don't care about upper half, no need to zero.
-  alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7};
-  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU8));
-  const Vec64<T> L = TableLookupBytes(lo, shuf);
-  const Vec64<T> H = TableLookupBytes(hi, shuf);
-  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
-}
-
-// 8-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec32<T> ConcatOdd(Simd<T, 4, 0> d, Vec32<T> hi, Vec32<T> lo) {
-  const Repartition<uint16_t, decltype(d)> du16;
-  // Don't care about upper half, no need to zero.
-  alignas(16) const uint8_t kCompactOddU8[4] = {1, 3};
-  const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactOddU8));
-  const Vec32<T> L = TableLookupBytes(lo, shuf);
-  const Vec32<T> H = TableLookupBytes(hi, shuf);
-  return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
-}
-
-// 16-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns
-  // 0xFFFF8000, which correctly saturates to 0x8000.
-  const Repartition<int32_t, decltype(d)> dw;
-  const Vec128<int32_t> uH = ShiftRight<16>(BitCast(dw, hi));
-  const Vec128<int32_t> uL = ShiftRight<16>(BitCast(dw, lo));
-  return Vec128<T>{_mm_packs_epi32(uL.raw, uH.raw)};
-}
-
-// 16-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec64<T> ConcatOdd(Simd<T, 4, 0> d, Vec64<T> hi, Vec64<T> lo) {
-  const Repartition<uint32_t, decltype(d)> du32;
-  // Don't care about upper half, no need to zero.
-  alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7};
-  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactOddU16));
-  const Vec64<T> L = TableLookupBytes(lo, shuf);
-  const Vec64<T> H = TableLookupBytes(hi, shuf);
-  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
-}
-
-// 32-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> ConcatOdd(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const RebindToFloat<decltype(d)> df;
-  return BitCast(
-      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
-                                      _MM_SHUFFLE(3, 1, 3, 1))});
-}
-template <size_t N>
-HWY_API Vec128<float> ConcatOdd(Full128<float> /* tag */, Vec128<float> hi,
-                                Vec128<float> lo) {
-  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
-}
-
-// Any type x2
-template <typename T>
-HWY_API Vec128<T, 2> ConcatOdd(Simd<T, 2, 0> d, Vec128<T, 2> hi,
-                               Vec128<T, 2> lo) {
-  return InterleaveUpper(d, lo, hi);
-}
-
-// ------------------------------ ConcatEven (InterleaveLower)
-
-// 8-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const Repartition<uint16_t, decltype(d)> dw;
-  // Isolate lower 8 bits per u16 so we can pack.
-  const Vec128<uint16_t> mask = Set(dw, 0x00FF);
-  const Vec128<uint16_t> uH = And(BitCast(dw, hi), mask);
-  const Vec128<uint16_t> uL = And(BitCast(dw, lo), mask);
-  return Vec128<T>{_mm_packus_epi16(uL.raw, uH.raw)};
-}
-
-// 8-bit x8
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec64<T> ConcatEven(Simd<T, 8, 0> d, Vec64<T> hi, Vec64<T> lo) {
-  const Repartition<uint32_t, decltype(d)> du32;
-  // Don't care about upper half, no need to zero.
-  alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6};
-  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU8));
-  const Vec64<T> L = TableLookupBytes(lo, shuf);
-  const Vec64<T> H = TableLookupBytes(hi, shuf);
-  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
-}
-
-// 8-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec32<T> ConcatEven(Simd<T, 4, 0> d, Vec32<T> hi, Vec32<T> lo) {
-  const Repartition<uint16_t, decltype(d)> du16;
-  // Don't care about upper half, no need to zero.
-  alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2};
-  const Vec32<T> shuf = BitCast(d, Load(Full32<uint8_t>(), kCompactEvenU8));
-  const Vec32<T> L = TableLookupBytes(lo, shuf);
-  const Vec32<T> H = TableLookupBytes(hi, shuf);
-  return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H)));
-}
-
-// 16-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-#if HWY_TARGET <= HWY_SSE4
-  // Isolate lower 16 bits per u32 so we can pack.
-  const Repartition<uint32_t, decltype(d)> dw;
-  const Vec128<uint32_t> mask = Set(dw, 0x0000FFFF);
-  const Vec128<uint32_t> uH = And(BitCast(dw, hi), mask);
-  const Vec128<uint32_t> uL = And(BitCast(dw, lo), mask);
-  return Vec128<T>{_mm_packus_epi32(uL.raw, uH.raw)};
-#else
-  // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two
-  // inputs, then concatenate them.
-  alignas(16) const T kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C};
-  const Vec128<T> shuf = BitCast(d, Load(d, kCompactEvenU16));
-  const Vec128<T> L = TableLookupBytes(lo, shuf);
-  const Vec128<T> H = TableLookupBytes(hi, shuf);
-  return ConcatLowerLower(d, H, L);
-#endif
-}
-
-// 16-bit x4
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec64<T> ConcatEven(Simd<T, 4, 0> d, Vec64<T> hi, Vec64<T> lo) {
-  const Repartition<uint32_t, decltype(d)> du32;
-  // Don't care about upper half, no need to zero.
-  alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5};
-  const Vec64<T> shuf = BitCast(d, Load(Full64<uint8_t>(), kCompactEvenU16));
-  const Vec64<T> L = TableLookupBytes(lo, shuf);
-  const Vec64<T> H = TableLookupBytes(hi, shuf);
-  return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H)));
-}
-
-// 32-bit full
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T> ConcatEven(Full128<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const RebindToFloat<decltype(d)> df;
-  return BitCast(
-      d, Vec128<float>{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw,
-                                      _MM_SHUFFLE(2, 0, 2, 0))});
-}
-HWY_API Vec128<float> ConcatEven(Full128<float> /* tag */, Vec128<float> hi,
-                                 Vec128<float> lo) {
-  return Vec128<float>{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
-}
-
-// Any T x2
-template <typename T>
-HWY_API Vec128<T, 2> ConcatEven(Simd<T, 2, 0> d, Vec128<T, 2> hi,
-                                Vec128<T, 2> lo) {
-  return InterleaveLower(d, lo, hi);
-}
-
-// ------------------------------ DupEven (InterleaveLower)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
-  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
-}
-template <size_t N>
-HWY_API Vec128<float, N> DupEven(Vec128<float, N> v) {
-  return Vec128<float, N>{
-      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> DupEven(const Vec128<T, N> v) {
-  return InterleaveLower(DFromV<decltype(v)>(), v, v);
-}
-
-// ------------------------------ DupOdd (InterleaveUpper)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
-  return Vec128<T, N>{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
-}
-template <size_t N>
-HWY_API Vec128<float, N> DupOdd(Vec128<float, N> v) {
-  return Vec128<float, N>{
-      _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> DupOdd(const Vec128<T, N> v) {
-  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
-}
-
-// ------------------------------ OddEven (IfThenElse)
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
-  const DFromV<decltype(a)> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
-                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
-  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  const DFromV<decltype(a)> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  alignas(16) constexpr uint8_t mask[16] = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
-                                            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0};
-  return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a);
-#else
-  return Vec128<T, N>{_mm_blend_epi16(a.raw, b.raw, 0x55)};
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1));
-  const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0));
-  return Vec128<T, N>{_mm_unpacklo_epi32(even, odd)};
-#else
-  // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle.
-  const DFromV<decltype(a)> d;
-  const RebindToFloat<decltype(d)> df;
-  return BitCast(d, Vec128<float, N>{_mm_blend_ps(BitCast(df, a).raw,
-                                                  BitCast(df, b).raw, 5)});
-#endif
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
-  // Same as ConcatUpperLower for full vectors; do not call that because this
-  // is more efficient for 64x1 vectors.
-  const DFromV<decltype(a)> d;
-  const RebindToFloat<decltype(d)> dd;
-#if HWY_TARGET == HWY_SSSE3
-  return BitCast(
-      d, Vec128<double, N>{_mm_shuffle_pd(
-             BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))});
-#else
-  // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle.
-  return BitCast(d, Vec128<double, N>{_mm_blend_pd(BitCast(dd, a).raw,
-                                                   BitCast(dd, b).raw, 1)});
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> OddEven(Vec128<float, N> a, Vec128<float, N> b) {
-#if HWY_TARGET == HWY_SSSE3
-  // SHUFPS must fill the lower half of the output from one input, so we
-  // need another shuffle. Unpack avoids another immediate byte.
-  const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1));
-  const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0));
-  return Vec128<float, N>{_mm_unpacklo_ps(even, odd)};
-#else
-  return Vec128<float, N>{_mm_blend_ps(a.raw, b.raw, 5)};
-#endif
-}
-
-// ------------------------------ OddEvenBlocks
-template <typename T, size_t N>
-HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
-  return even;
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
-  return v;
-}
-
-// ------------------------------ Shl (ZipLower, Mul)
-
-// Use AVX2/3 variable shifts where available, otherwise multiply by powers of
-// two from loading float exponents, which is considerably faster (according
-// to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v.
-
-namespace detail {
-#if HWY_TARGET > HWY_AVX3  // AVX2 or older
-
-// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const RepartitionToWide<decltype(d)> dw;
-  const Rebind<float, decltype(dw)> df;
-  const auto zero = Zero(d);
-  // Move into exponent (this u16 will become the upper half of an f32)
-  const auto exp = ShiftLeft<23 - 16>(v);
-  const auto upper = exp + Set(d, 0x3F80);  // upper half of 1.0f
-  // Insert 0 into lower halves for reinterpreting as binary32.
-  const auto f0 = ZipLower(dw, zero, upper);
-  const auto f1 = ZipUpper(dw, zero, upper);
-  // See comment below.
-  const Vec128<int32_t, N> bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)};
-  const Vec128<int32_t, N> bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)};
-  return Vec128<MakeUnsigned<T>, N>{_mm_packus_epi32(bits0.raw, bits1.raw)};
-}
-
-// Same, for 32-bit shifts.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Vec128<MakeUnsigned<T>, N> Pow2(const Vec128<T, N> v) {
-  const DFromV<decltype(v)> d;
-  const auto exp = ShiftLeft<23>(v);
-  const auto f = exp + Set(d, 0x3F800000);  // 1.0f
-  // Do not use ConvertTo because we rely on the native 0x80..00 overflow
-  // behavior. cvt instead of cvtt should be equivalent, but avoids test
-  // failure under GCC 10.2.1.
-  return Vec128<MakeUnsigned<T>, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))};
-}
-
-#endif  // HWY_TARGET > HWY_AVX3
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, N> v,
-                                Vec128<uint16_t, N> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<uint16_t, N>{_mm_sllv_epi16(v.raw, bits.raw)};
-#else
-  return v * Pow2(bits);
-#endif
-}
-HWY_API Vec128<uint16_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint16_t, 1> v,
-                                Vec128<uint16_t, 1> bits) {
-  return Vec128<uint16_t, 1>{_mm_sll_epi16(v.raw, bits.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<uint32_t, N> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, N> v,
-                                Vec128<uint32_t, N> bits) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  return v * Pow2(bits);
-#else
-  return Vec128<uint32_t, N>{_mm_sllv_epi32(v.raw, bits.raw)};
-#endif
-}
-HWY_API Vec128<uint32_t, 1> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint32_t, 1> v,
-                                const Vec128<uint32_t, 1> bits) {
-  return Vec128<uint32_t, 1>{_mm_sll_epi32(v.raw, bits.raw)};
-}
-
-HWY_API Vec128<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec128<uint64_t> v,
-                             Vec128<uint64_t> bits) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  // Individual shifts and combine
-  const Vec128<uint64_t> out0{_mm_sll_epi64(v.raw, bits.raw)};
-  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
-  const Vec128<uint64_t> out1{_mm_sll_epi64(v.raw, bits1)};
-  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
-#else
-  return Vec128<uint64_t>{_mm_sllv_epi64(v.raw, bits.raw)};
-#endif
-}
-HWY_API Vec64<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec64<uint64_t> v,
-                            Vec64<uint64_t> bits) {
-  return Vec64<uint64_t>{_mm_sll_epi64(v.raw, bits.raw)};
-}
-
-// Signed left shift is the same as unsigned.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Shl(hwy::SignedTag /*tag*/, Vec128<T, N> v,
-                         Vec128<T, N> bits) {
-  const DFromV<decltype(v)> di;
-  const RebindToUnsigned<decltype(di)> du;
-  return BitCast(di,
-                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
-}
-
-}  // namespace detail
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> operator<<(Vec128<T, N> v, Vec128<T, N> bits) {
-  return detail::Shl(hwy::TypeTag<T>(), v, bits);
-}
-
-// ------------------------------ Shr (mul, mask, BroadcastSignBit)
-
-// Use AVX2+ variable shifts except for SSSE3/SSE4 or 16-bit. There, we use
-// widening multiplication by powers of two obtained by loading float exponents,
-// followed by a constant right-shift. This is still faster than a scalar or
-// bit-test approach: https://gcc.godbolt.org/z/9G7Y9v.
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> operator>>(const Vec128<uint16_t, N> in,
-                                       const Vec128<uint16_t, N> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<uint16_t, N>{_mm_srlv_epi16(in.raw, bits.raw)};
-#else
-  const Simd<uint16_t, N, 0> d;
-  // For bits=0, we cannot mul by 2^16, so fix the result later.
-  const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits));
-  // Replace output with input where bits == 0.
-  return IfThenElse(bits == Zero(d), in, out);
-#endif
-}
-HWY_API Vec128<uint16_t, 1> operator>>(const Vec128<uint16_t, 1> in,
-                                       const Vec128<uint16_t, 1> bits) {
-  return Vec128<uint16_t, 1>{_mm_srl_epi16(in.raw, bits.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<uint32_t, N> operator>>(const Vec128<uint32_t, N> in,
-                                       const Vec128<uint32_t, N> bits) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  // 32x32 -> 64 bit mul, then shift right by 32.
-  const Simd<uint32_t, N, 0> d32;
-  // Move odd lanes into position for the second mul. Shuffle more gracefully
-  // handles N=1 than repartitioning to u64 and shifting 32 bits right.
-  const Vec128<uint32_t, N> in31{_mm_shuffle_epi32(in.raw, 0x31)};
-  // For bits=0, we cannot mul by 2^32, so fix the result later.
-  const auto mul = detail::Pow2(Set(d32, 32) - bits);
-  const auto out20 = ShiftRight<32>(MulEven(in, mul));  // z 2 z 0
-  const Vec128<uint32_t, N> mul31{_mm_shuffle_epi32(mul.raw, 0x31)};
-  // No need to shift right, already in the correct position.
-  const auto out31 = BitCast(d32, MulEven(in31, mul31));  // 3 ? 1 ?
-  const Vec128<uint32_t, N> out = OddEven(out31, BitCast(d32, out20));
-  // Replace output with input where bits == 0.
-  return IfThenElse(bits == Zero(d32), in, out);
-#else
-  return Vec128<uint32_t, N>{_mm_srlv_epi32(in.raw, bits.raw)};
-#endif
-}
-HWY_API Vec128<uint32_t, 1> operator>>(const Vec128<uint32_t, 1> in,
-                                       const Vec128<uint32_t, 1> bits) {
-  return Vec128<uint32_t, 1>{_mm_srl_epi32(in.raw, bits.raw)};
-}
-
-HWY_API Vec128<uint64_t> operator>>(const Vec128<uint64_t> v,
-                                    const Vec128<uint64_t> bits) {
-#if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4
-  // Individual shifts and combine
-  const Vec128<uint64_t> out0{_mm_srl_epi64(v.raw, bits.raw)};
-  const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw);
-  const Vec128<uint64_t> out1{_mm_srl_epi64(v.raw, bits1)};
-  return ConcatUpperLower(Full128<uint64_t>(), out1, out0);
-#else
-  return Vec128<uint64_t>{_mm_srlv_epi64(v.raw, bits.raw)};
-#endif
-}
-HWY_API Vec64<uint64_t> operator>>(const Vec64<uint64_t> v,
-                                   const Vec64<uint64_t> bits) {
-  return Vec64<uint64_t>{_mm_srl_epi64(v.raw, bits.raw)};
-}
-
-#if HWY_TARGET > HWY_AVX3  // AVX2 or older
-namespace detail {
-
-// Also used in x86_256-inl.h.
-template <class DI, class V>
-HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) {
-  const RebindToUnsigned<DI> du;
-  const auto count = BitCast(du, count_i);  // same type as value to shift
-  // Clear sign and restore afterwards. This is preferable to shifting the MSB
-  // downwards because Shr is somewhat more expensive than Shl.
-  const auto sign = BroadcastSignBit(v);
-  const auto abs = BitCast(du, v ^ sign);  // off by one, but fixed below
-  return BitCast(di, abs >> count) ^ sign;
-}
-
-}  // namespace detail
-#endif  // HWY_TARGET > HWY_AVX3
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> operator>>(const Vec128<int16_t, N> v,
-                                      const Vec128<int16_t, N> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int16_t, N>{_mm_srav_epi16(v.raw, bits.raw)};
-#else
-  return detail::SignedShr(Simd<int16_t, N, 0>(), v, bits);
-#endif
-}
-HWY_API Vec128<int16_t, 1> operator>>(const Vec128<int16_t, 1> v,
-                                      const Vec128<int16_t, 1> bits) {
-  return Vec128<int16_t, 1>{_mm_sra_epi16(v.raw, bits.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> operator>>(const Vec128<int32_t, N> v,
-                                      const Vec128<int32_t, N> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int32_t, N>{_mm_srav_epi32(v.raw, bits.raw)};
-#else
-  return detail::SignedShr(Simd<int32_t, N, 0>(), v, bits);
-#endif
-}
-HWY_API Vec128<int32_t, 1> operator>>(const Vec128<int32_t, 1> v,
-                                      const Vec128<int32_t, 1> bits) {
-  return Vec128<int32_t, 1>{_mm_sra_epi32(v.raw, bits.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int64_t, N> operator>>(const Vec128<int64_t, N> v,
-                                      const Vec128<int64_t, N> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<int64_t, N>{_mm_srav_epi64(v.raw, bits.raw)};
-#else
-  return detail::SignedShr(Simd<int64_t, N, 0>(), v, bits);
-#endif
-}
-
-// ------------------------------ MulEven/Odd 64x64 (UpperHalf)
-
-HWY_INLINE Vec128<uint64_t> MulEven(const Vec128<uint64_t> a,
-                                    const Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
-}
-
-HWY_INLINE Vec128<uint64_t> MulOdd(const Vec128<uint64_t> a,
-                                   const Vec128<uint64_t> b) {
-  alignas(16) uint64_t mul[2];
-  const Half<Full128<uint64_t>> d2;
-  mul[0] =
-      Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]);
-  return Load(Full128<uint64_t>(), mul);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-template <size_t N>
-HWY_API Vec128<float, N> ReorderWidenMulAccumulate(Simd<float, N, 0> df32,
-                                                   Vec128<bfloat16_t, 2 * N> a,
-                                                   Vec128<bfloat16_t, 2 * N> b,
-                                                   const Vec128<float, N> sum0,
-                                                   Vec128<float, N>& sum1) {
-  // TODO(janwas): _mm_dpbf16_ps when available
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const Vec128<uint16_t, 2 * N> zero = Zero(du16);
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo.
-  const Vec128<uint32_t, N> a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const Vec128<uint32_t, N> b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const Vec128<uint32_t, N> b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ================================================== CONVERT
-
-// ------------------------------ Promotions (part w/ narrow lanes -> full)
-
-// Unsigned: zero-extend.
-template <size_t N>
-HWY_API Vec128<uint16_t, N> PromoteTo(Simd<uint16_t, N, 0> /* tag */,
-                                      const Vec128<uint8_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  const __m128i zero = _mm_setzero_si128();
-  return Vec128<uint16_t, N>{_mm_unpacklo_epi8(v.raw, zero)};
-#else
-  return Vec128<uint16_t, N>{_mm_cvtepu8_epi16(v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
-                                      const Vec128<uint16_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())};
-#else
-  return Vec128<uint32_t, N>{_mm_cvtepu16_epi32(v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<uint64_t, N> PromoteTo(Simd<uint64_t, N, 0> /* tag */,
-                                      const Vec128<uint32_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  return Vec128<uint64_t, N>{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())};
-#else
-  return Vec128<uint64_t, N>{_mm_cvtepu32_epi64(v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<uint32_t, N> PromoteTo(Simd<uint32_t, N, 0> /* tag */,
-                                      const Vec128<uint8_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero);
-  return Vec128<uint32_t, N>{_mm_unpacklo_epi16(u16, zero)};
-#else
-  return Vec128<uint32_t, N>{_mm_cvtepu8_epi32(v.raw)};
-#endif
-}
-
-// Unsigned to signed: same plus cast.
-template <size_t N>
-HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> di,
-                                     const Vec128<uint8_t, N> v) {
-  return BitCast(di, PromoteTo(Simd<uint16_t, N, 0>(), v));
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
-                                     const Vec128<uint16_t, N> v) {
-  return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> di,
-                                     const Vec128<uint8_t, N> v) {
-  return BitCast(di, PromoteTo(Simd<uint32_t, N, 0>(), v));
-}
-
-// Signed: replicate sign bit.
-template <size_t N>
-HWY_API Vec128<int16_t, N> PromoteTo(Simd<int16_t, N, 0> /* tag */,
-                                     const Vec128<int8_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  return ShiftRight<8>(Vec128<int16_t, N>{_mm_unpacklo_epi8(v.raw, v.raw)});
-#else
-  return Vec128<int16_t, N>{_mm_cvtepi8_epi16(v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<int16_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  return ShiftRight<16>(Vec128<int32_t, N>{_mm_unpacklo_epi16(v.raw, v.raw)});
-#else
-  return Vec128<int32_t, N>{_mm_cvtepi16_epi32(v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int64_t, N> PromoteTo(Simd<int64_t, N, 0> /* tag */,
-                                     const Vec128<int32_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  return ShiftRight<32>(Vec128<int64_t, N>{_mm_unpacklo_epi32(v.raw, v.raw)});
-#else
-  return Vec128<int64_t, N>{_mm_cvtepi32_epi64(v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Vec128<int32_t, N> PromoteTo(Simd<int32_t, N, 0> /* tag */,
-                                     const Vec128<int8_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw);
-  const __m128i x4 = _mm_unpacklo_epi16(x2, x2);
-  return ShiftRight<24>(Vec128<int32_t, N>{x4});
-#else
-  return Vec128<int32_t, N>{_mm_cvtepi8_epi32(v.raw)};
-#endif
-}
-
-// Workaround for origin tracking bug in Clang msan prior to 11.0
-// (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid")
-#if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100)
-#define HWY_INLINE_F16 HWY_NOINLINE
-#else
-#define HWY_INLINE_F16 HWY_INLINE
-#endif
-template <size_t N>
-HWY_INLINE_F16 Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
-                                          const Vec128<float16_t, N> v) {
-#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
-  const RebindToSigned<decltype(df32)> di32;
-  const RebindToUnsigned<decltype(df32)> du32;
-  // Expand to u32 so we can shift.
-  const auto bits16 = PromoteTo(du32, Vec128<uint16_t, N>{v.raw});
-  const auto sign = ShiftRight<15>(bits16);
-  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
-  const auto mantissa = bits16 & Set(du32, 0x3FF);
-  const auto subnormal =
-      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
-                        Set(df32, 1.0f / 16384 / 1024));
-
-  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
-  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
-  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
-  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
-  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
-#else
-  (void)df32;
-  return Vec128<float, N>{_mm_cvtph_ps(v.raw)};
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> PromoteTo(Simd<float, N, 0> df32,
-                                   const Vec128<bfloat16_t, N> v) {
-  const Rebind<uint16_t, decltype(df32)> du16;
-  const RebindToSigned<decltype(df32)> di32;
-  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
-                                    const Vec128<float, N> v) {
-  return Vec128<double, N>{_mm_cvtps_pd(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> PromoteTo(Simd<double, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  return Vec128<double, N>{_mm_cvtepi32_pd(v.raw)};
-}
-
-// ------------------------------ Demotions (full -> part w/ narrow lanes)
-
-template <size_t N>
-HWY_API Vec128<uint16_t, N> DemoteTo(Simd<uint16_t, N, 0> /* tag */,
-                                     const Vec128<int32_t, N> v) {
-#if HWY_TARGET == HWY_SSSE3
-  const Simd<int32_t, N, 0> di32;
-  const Simd<uint16_t, N * 2, 0> du16;
-  const auto zero_if_neg = AndNot(ShiftRight<31>(v), v);
-  const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF)));
-  const auto clamped = Or(zero_if_neg, too_big);
-  // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts.
-  alignas(16) constexpr uint16_t kLower2Bytes[16] = {
-      0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080};
-  const auto lo2 = Load(du16, kLower2Bytes);
-  return Vec128<uint16_t, N>{TableLookupBytes(BitCast(du16, clamped), lo2).raw};
-#else
-  return Vec128<uint16_t, N>{_mm_packus_epi32(v.raw, v.raw)};
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<int16_t, N> DemoteTo(Simd<int16_t, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  return Vec128<int16_t, N>{_mm_packs_epi32(v.raw, v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
-                                    const Vec128<int32_t, N> v) {
-  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
-  return Vec128<uint8_t, N>{_mm_packus_epi16(i16, i16)};
-}
-
-template <size_t N>
-HWY_API Vec128<uint8_t, N> DemoteTo(Simd<uint8_t, N, 0> /* tag */,
-                                    const Vec128<int16_t, N> v) {
-  return Vec128<uint8_t, N>{_mm_packus_epi16(v.raw, v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
-                                   const Vec128<int32_t, N> v) {
-  const __m128i i16 = _mm_packs_epi32(v.raw, v.raw);
-  return Vec128<int8_t, N>{_mm_packs_epi16(i16, i16)};
-}
-
-template <size_t N>
-HWY_API Vec128<int8_t, N> DemoteTo(Simd<int8_t, N, 0> /* tag */,
-                                   const Vec128<int16_t, N> v) {
-  return Vec128<int8_t, N>{_mm_packs_epi16(v.raw, v.raw)};
-}
-
-// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate).
-// clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain.
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain")
-
-template <size_t N>
-HWY_API Vec128<float16_t, N> DemoteTo(Simd<float16_t, N, 0> df16,
-                                      const Vec128<float, N> v) {
-#if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_F16C)
-  const RebindToUnsigned<decltype(df16)> du16;
-  const Rebind<uint32_t, decltype(df16)> du;
-  const RebindToSigned<decltype(du)> di;
-  const auto bits32 = BitCast(du, v);
-  const auto sign = ShiftRight<31>(bits32);
-  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
-  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
-
-  const auto k15 = Set(di, 15);
-  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
-  const auto is_tiny = exp < Set(di, -24);
-
-  const auto is_subnormal = exp < Set(di, -14);
-  const auto biased_exp16 =
-      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
-  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
-  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
-                     (mantissa32 >> (Set(du, 13) + sub_exp));
-  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
-                                     ShiftRight<13>(mantissa32));  // <1024
-
-  const auto sign16 = ShiftLeft<15>(sign);
-  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
-  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
-  return BitCast(df16, DemoteTo(du16, bits16));
-#else
-  (void)df16;
-  return Vec128<float16_t, N>{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
-#endif
-}
-
-HWY_DIAGNOSTICS(pop)
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, N> DemoteTo(Simd<bfloat16_t, N, 0> dbf16,
-                                       const Vec128<float, N> v) {
-  // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16.
-  const Rebind<int32_t, decltype(dbf16)> di32;
-  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
-  const Rebind<uint16_t, decltype(dbf16)> du16;
-  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
-  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
-}
-
-template <size_t N>
-HWY_API Vec128<bfloat16_t, 2 * N> ReorderDemote2To(
-    Simd<bfloat16_t, 2 * N, 0> dbf16, Vec128<float, N> a, Vec128<float, N> b) {
-  // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16.
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec128<uint32_t, N> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> DemoteTo(Simd<float, N, 0> /* tag */,
-                                  const Vec128<double, N> v) {
-  return Vec128<float, N>{_mm_cvtpd_ps(v.raw)};
-}
-
-namespace detail {
-
-// For well-defined float->int demotion in all x86_*-inl.h.
-
-template <size_t N>
-HWY_INLINE auto ClampF64ToI32Max(Simd<double, N, 0> d, decltype(Zero(d)) v)
-    -> decltype(Zero(d)) {
-  // The max can be exactly represented in binary64, so clamping beforehand
-  // prevents x86 conversion from raising an exception and returning 80..00.
-  return Min(v, Set(d, 2147483647.0));
-}
-
-// For ConvertTo float->int of same size, clamping before conversion would
-// change the result because the max integer value is not exactly representable.
-// Instead detect the overflow result after conversion and fix it.
-template <class DI, class DF = RebindToFloat<DI>>
-HWY_INLINE auto FixConversionOverflow(DI di, VFromD<DF> original,
-                                      decltype(Zero(di).raw) converted_raw)
-    -> VFromD<DI> {
-  // Combinations of original and output sign:
-  //   --: normal <0 or -huge_val to 80..00: OK
-  //   -+: -0 to 0                         : OK
-  //   +-: +huge_val to 80..00             : xor with FF..FF to get 7F..FF
-  //   ++: normal >0                       : OK
-  const auto converted = VFromD<DI>{converted_raw};
-  const auto sign_wrong = AndNot(BitCast(di, original), converted);
-#if HWY_COMPILER_GCC_ACTUAL
-  // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also
-  // Add() if using that instead. Work around with one more instruction.
-  const RebindToUnsigned<DI> du;
-  const VFromD<DI> mask = BroadcastSignBit(sign_wrong);
-  const VFromD<DI> max = BitCast(di, ShiftRight<1>(BitCast(du, mask)));
-  return IfVecThenElse(mask, max, converted);
-#else
-  return Xor(converted, BroadcastSignBit(sign_wrong));
-#endif
-}
-
-}  // namespace detail
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> DemoteTo(Simd<int32_t, N, 0> /* tag */,
-                                    const Vec128<double, N> v) {
-  const auto clamped = detail::ClampF64ToI32Max(Simd<double, N, 0>(), v);
-  return Vec128<int32_t, N>{_mm_cvttpd_epi32(clamped.raw)};
-}
-
-// For already range-limited input [0, 255].
-template <size_t N>
-HWY_API Vec128<uint8_t, N> U8FromU32(const Vec128<uint32_t, N> v) {
-  const Simd<uint32_t, N, 0> d32;
-  const Simd<uint8_t, N * 4, 0> d8;
-  alignas(16) static constexpr uint32_t k8From32[4] = {
-      0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u};
-  // Also replicate bytes into all 32 bit lanes for safety.
-  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
-  return LowerHalf(LowerHalf(BitCast(d8, quad)));
-}
-
-// ------------------------------ Truncations
-
-template <typename From, typename To,
-          hwy::EnableIf<(sizeof(To) < sizeof(From))>* = nullptr>
-HWY_API Vec128<To, 1> TruncateTo(Simd<To, 1, 0> /* tag */,
-                                 const Vec128<From, 1> v) {
-  static_assert(!IsSigned<To>() && !IsSigned<From>(), "Unsigned only");
-  const Repartition<To, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  return Vec128<To, 1>{v1.raw};
-}
-
-HWY_API Vec128<uint8_t, 2> TruncateTo(Simd<uint8_t, 2, 0> /* tag */,
-                                      const Vec128<uint64_t, 2> v) {
-  const Full128<uint8_t> d8;
-  alignas(16) static constexpr uint8_t kMap[16] = {0, 8, 0, 8, 0, 8, 0, 8,
-                                                   0, 8, 0, 8, 0, 8, 0, 8};
-  return LowerHalf(LowerHalf(LowerHalf(TableLookupBytes(v, Load(d8, kMap)))));
-}
-
-HWY_API Vec128<uint16_t, 2> TruncateTo(Simd<uint16_t, 2, 0> /* tag */,
-                                       const Vec128<uint64_t, 2> v) {
-  const Full128<uint16_t> d16;
-  alignas(16) static constexpr uint16_t kMap[8] = {
-      0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u};
-  return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d16, kMap))));
-}
-
-HWY_API Vec128<uint32_t, 2> TruncateTo(Simd<uint32_t, 2, 0> /* tag */,
-                                       const Vec128<uint64_t, 2> v) {
-  return Vec128<uint32_t, 2>{_mm_shuffle_epi32(v.raw, 0x88)};
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint32_t, N> v) {
-  const Repartition<uint8_t, DFromV<decltype(v)>> d;
-  alignas(16) static constexpr uint8_t kMap[16] = {
-      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu,
-      0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu};
-  return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kMap))));
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint16_t, N> TruncateTo(Simd<uint16_t, N, 0> /* tag */,
-                                       const Vec128<uint32_t, N> v) {
-  const Repartition<uint16_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  return LowerHalf(ConcatEven(d, v1, v1));
-}
-
-template <size_t N, hwy::EnableIf<N >= 2>* = nullptr>
-HWY_API Vec128<uint8_t, N> TruncateTo(Simd<uint8_t, N, 0> /* tag */,
-                                      const Vec128<uint16_t, N> v) {
-  const Repartition<uint8_t, DFromV<decltype(v)>> d;
-  const auto v1 = BitCast(d, v);
-  return LowerHalf(ConcatEven(d, v1, v1));
-}
-
-// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
-
-template <size_t N>
-HWY_API Vec128<float, N> ConvertTo(Simd<float, N, 0> /* tag */,
-                                   const Vec128<int32_t, N> v) {
-  return Vec128<float, N>{_mm_cvtepi32_ps(v.raw)};
-}
-
-template <size_t N>
-HWY_API Vec128<float, N> ConvertTo(HWY_MAYBE_UNUSED Simd<float, N, 0> df,
-                                   const Vec128<uint32_t, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<float, N>{_mm_cvtepu32_ps(v.raw)};
-#else
-  // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
-  const RebindToUnsigned<decltype(df)> du32;
-  const RebindToSigned<decltype(df)> d32;
-
-  const auto msk_lo = Set(du32, 0xFFFF);
-  const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
-
-  // Extract the 16 lowest/highest significant bits of v and cast to signed int
-  const auto v_lo = BitCast(d32, And(v, msk_lo));
-  const auto v_hi = BitCast(d32, ShiftRight<16>(v));
-  return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> ConvertTo(Simd<double, N, 0> dd,
-                                    const Vec128<int64_t, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  (void)dd;
-  return Vec128<double, N>{_mm_cvtepi64_pd(v.raw)};
-#else
-  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
-  const Repartition<uint32_t, decltype(dd)> d32;
-  const Repartition<uint64_t, decltype(dd)> d64;
-
-  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
-  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
-  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
-
-  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
-  const auto k52 = Set(d32, 0x43300000);
-  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
-
-  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
-  return (v_upper - k84_63_52) + v_lower;  // order matters!
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<double, N> ConvertTo(HWY_MAYBE_UNUSED Simd<double, N, 0> dd,
-                                    const Vec128<uint64_t, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec128<double, N>{_mm_cvtepu64_pd(v.raw)};
-#else
-  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
-  const RebindToUnsigned<decltype(dd)> d64;
-  using VU = VFromD<decltype(d64)>;
-
-  const VU msk_lo = Set(d64, 0xFFFFFFFF);
-  const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
-
-  // Extract the 32 lowest/highest significant bits of v
-  const VU v_lo = And(v, msk_lo);
-  const VU v_hi = ShiftRight<32>(v);
-
-  auto uint64_to_double128_fast = [&dd](VU w) HWY_ATTR {
-    w = Or(w, VU{detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
-    return BitCast(dd, w) - Set(dd, 0x0010000000000000);
-  };
-
-  const auto v_lo_dbl = uint64_to_double128_fast(v_lo);
-  return MulAdd(cnst2_32_dbl, uint64_to_double128_fast(v_hi), v_lo_dbl);
-#endif
-}
-
-// Truncates (rounds toward zero).
-template <size_t N>
-HWY_API Vec128<int32_t, N> ConvertTo(const Simd<int32_t, N, 0> di,
-                                     const Vec128<float, N> v) {
-  return detail::FixConversionOverflow(di, v, _mm_cvttps_epi32(v.raw));
-}
-
-// Full (partial handled below)
-HWY_API Vec128<int64_t> ConvertTo(Full128<int64_t> di, const Vec128<double> v) {
-#if HWY_TARGET <= HWY_AVX3 && HWY_ARCH_X86_64
-  return detail::FixConversionOverflow(di, v, _mm_cvttpd_epi64(v.raw));
-#elif HWY_ARCH_X86_64
-  const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw));
-  const Half<Full128<double>> dd2;
-  const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw));
-  return detail::FixConversionOverflow(di, v, _mm_unpacklo_epi64(i0, i1));
-#else
-  using VI = VFromD<decltype(di)>;
-  const VI k0 = Zero(di);
-  const VI k1 = Set(di, 1);
-  const VI k51 = Set(di, 51);
-
-  // Exponent indicates whether the number can be represented as int64_t.
-  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
-  const VI exp = biased_exp - Set(di, 0x3FF);
-  const auto in_range = exp < Set(di, 63);
-
-  // If we were to cap the exponent at 51 and add 2^52, the number would be in
-  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
-  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
-  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
-  // manually shift the mantissa into place (we already have many of the
-  // inputs anyway).
-  const VI shift_mnt = Max(k51 - exp, k0);
-  const VI shift_int = Max(exp - k51, k0);
-  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
-  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
-  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
-  // For inputs larger than 2^52, insert zeros at the bottom.
-  const VI shifted = int52 << shift_int;
-  // Restore the one bit lost when shifting in the implicit 1-bit.
-  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
-
-  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
-  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
-  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
-  const VI magnitude = IfThenElse(in_range, restored, limit);
-
-  // If the input was negative, negate the integer (two's complement).
-  return (magnitude ^ sign_mask) - sign_mask;
-#endif
-}
-HWY_API Vec64<int64_t> ConvertTo(Full64<int64_t> di, const Vec64<double> v) {
-  // Only need to specialize for non-AVX3, 64-bit (single scalar op)
-#if HWY_TARGET > HWY_AVX3 && HWY_ARCH_X86_64
-  const Vec64<int64_t> i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))};
-  return detail::FixConversionOverflow(di, v, i0.raw);
-#else
-  (void)di;
-  const auto full = ConvertTo(Full128<int64_t>(), Vec128<double>{v.raw});
-  return Vec64<int64_t>{full.raw};
-#endif
-}
-
-template <size_t N>
-HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
-  const Simd<int32_t, N, 0> di;
-  return detail::FixConversionOverflow(di, v, _mm_cvtps_epi32(v.raw));
-}
-
-// ------------------------------ Floating-point rounding (ConvertTo)
-
-#if HWY_TARGET == HWY_SSSE3
-
-// Toward nearest integer, ties to even
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Round(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  // Rely on rounding after addition with a large value such that no mantissa
-  // bits remain (assuming the current mode is nearest-even). We may need a
-  // compiler flag for precise floating-point to prevent "optimizing" this out.
-  const Simd<T, N, 0> df;
-  const auto max = Set(df, MantissaEnd<T>());
-  const auto large = CopySignToAbs(max, v);
-  const auto added = large + v;
-  const auto rounded = added - large;
-  // Keep original if NaN or the magnitude is large (already an int).
-  return IfThenElse(Abs(v) < max, rounded, v);
-}
-
-namespace detail {
-
-// Truncating to integer and converting back to float is correct except when the
-// input magnitude is large, in which case the input was already an integer
-// (because mantissa >> exponent is zero).
-template <typename T, size_t N>
-HWY_INLINE Mask128<T, N> UseInt(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  return Abs(v) < Set(Simd<T, N, 0>(), MantissaEnd<T>());
-}
-
-}  // namespace detail
-
-// Toward zero, aka truncate
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Trunc(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Simd<T, N, 0> df;
-  const RebindToSigned<decltype(df)> di;
-
-  const auto integer = ConvertTo(di, v);  // round toward 0
-  const auto int_f = ConvertTo(df, integer);
-
-  return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
-}
-
-// Toward +infinity, aka ceiling
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Ceil(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Simd<T, N, 0> df;
-  const RebindToSigned<decltype(df)> di;
-
-  const auto integer = ConvertTo(di, v);  // round toward 0
-  const auto int_f = ConvertTo(df, integer);
-
-  // Truncating a positive non-integer ends up smaller; if so, add 1.
-  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
-
-  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
-}
-
-// Toward -infinity, aka floor
-template <typename T, size_t N>
-HWY_API Vec128<T, N> Floor(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Simd<T, N, 0> df;
-  const RebindToSigned<decltype(df)> di;
-
-  const auto integer = ConvertTo(di, v);  // round toward 0
-  const auto int_f = ConvertTo(df, integer);
-
-  // Truncating a negative non-integer ends up larger; if so, subtract 1.
-  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
-
-  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
-}
-
-#else
-
-// Toward nearest integer, ties to even
-template <size_t N>
-HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
-  return Vec128<float, N>{
-      _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Round(const Vec128<double, N> v) {
-  return Vec128<double, N>{
-      _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
-}
-
-// Toward zero, aka truncate
-template <size_t N>
-HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
-  return Vec128<float, N>{
-      _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Trunc(const Vec128<double, N> v) {
-  return Vec128<double, N>{
-      _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
-}
-
-// Toward +infinity, aka ceiling
-template <size_t N>
-HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
-  return Vec128<float, N>{
-      _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Ceil(const Vec128<double, N> v) {
-  return Vec128<double, N>{
-      _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
-}
-
-// Toward -infinity, aka floor
-template <size_t N>
-HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
-  return Vec128<float, N>{
-      _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
-}
-template <size_t N>
-HWY_API Vec128<double, N> Floor(const Vec128<double, N> v) {
-  return Vec128<double, N>{
-      _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
-}
-
-#endif  // !HWY_SSSE3
-
-// ------------------------------ Floating-point classification
-
-template <size_t N>
-HWY_API Mask128<float, N> IsNaN(const Vec128<float, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x81)};
-#else
-  return Mask128<float, N>{_mm_cmpunord_ps(v.raw, v.raw)};
-#endif
-}
-template <size_t N>
-HWY_API Mask128<double, N> IsNaN(const Vec128<double, N> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x81)};
-#else
-  return Mask128<double, N>{_mm_cmpunord_pd(v.raw, v.raw)};
-#endif
-}
-
-#if HWY_TARGET <= HWY_AVX3
-
-template <size_t N>
-HWY_API Mask128<float, N> IsInf(const Vec128<float, N> v) {
-  return Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x18)};
-}
-template <size_t N>
-HWY_API Mask128<double, N> IsInf(const Vec128<double, N> v) {
-  return Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x18)};
-}
-
-// Returns whether normal/subnormal/zero.
-template <size_t N>
-HWY_API Mask128<float, N> IsFinite(const Vec128<float, N> v) {
-  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
-  // and negate the mask.
-  return Not(Mask128<float, N>{_mm_fpclass_ps_mask(v.raw, 0x99)});
-}
-template <size_t N>
-HWY_API Mask128<double, N> IsFinite(const Vec128<double, N> v) {
-  return Not(Mask128<double, N>{_mm_fpclass_pd_mask(v.raw, 0x99)});
-}
-
-#else
-
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsInf(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Simd<T, N, 0> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T, size_t N>
-HWY_API Mask128<T, N> IsFinite(const Vec128<T, N> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // Shift left to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater). MSVC seems to generate
-  // incorrect code if we instead add vu + vu.
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ================================================== CRYPTO
-
-#if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
-
-// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
-#ifdef HWY_NATIVE_AES
-#undef HWY_NATIVE_AES
-#else
-#define HWY_NATIVE_AES
-#endif
-
-HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
-                                 Vec128<uint8_t> round_key) {
-  return Vec128<uint8_t>{_mm_aesenc_si128(state.raw, round_key.raw)};
-}
-
-HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
-                                     Vec128<uint8_t> round_key) {
-  return Vec128<uint8_t>{_mm_aesenclast_si128(state.raw, round_key.raw)};
-}
-
-template <size_t N, HWY_IF_LE128(uint64_t, N)>
-HWY_API Vec128<uint64_t, N> CLMulLower(Vec128<uint64_t, N> a,
-                                       Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)};
-}
-
-template <size_t N, HWY_IF_LE128(uint64_t, N)>
-HWY_API Vec128<uint64_t, N> CLMulUpper(Vec128<uint64_t, N> a,
-                                       Vec128<uint64_t, N> b) {
-  return Vec128<uint64_t, N>{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)};
-}
-
-#endif  // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET != HWY_SSSE3
-
-// ================================================== MISC
-
-template <typename T>
-struct CompressIsPartition {
-#if HWY_TARGET <= HWY_AVX3
-  // AVX3 supports native compress, but a table-based approach allows
-  // 'partitioning' (also moving mask=false lanes to the top), which helps
-  // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8
-  // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3
-  // u32x8 etc.).
-  enum { value = (sizeof(T) == 8) };
-#else
-  enum { value = 1 };
-#endif
-};
-
-#if HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ LoadMaskBits
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> /* tag */,
-                                   const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return Mask128<T, N>::FromBits(mask_bits);
-}
-
-// ------------------------------ StoreMaskBits
-
-// `p` points to at least 8 writable bytes.
-template <typename T, size_t N>
-HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
-                             const Mask128<T, N> mask, uint8_t* bits) {
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(&mask.raw, bits);
-
-  // Non-full byte, need to clear the undefined upper bits.
-  if (N < 8) {
-    const int mask_bits = (1 << N) - 1;
-    bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
-  }
-
-  return kNumBytes;
-}
-
-// ------------------------------ Mask testing
-
-// Beware: the suffix indicates the number of mask bits, not lane size!
-
-template <typename T, size_t N>
-HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
-                         const Mask128<T, N> mask) {
-  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
-  return PopCount(mask_bits);
-}
-
-template <typename T, size_t N>
-HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
-                               const Mask128<T, N> mask) {
-  const uint32_t mask_bits = static_cast<uint32_t>(mask.raw) & ((1u << N) - 1);
-  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
-}
-
-template <typename T, size_t N>
-HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
-  return mask_bits == 0;
-}
-
-template <typename T, size_t N>
-HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  const uint64_t mask_bits = static_cast<uint64_t>(mask.raw) & ((1u << N) - 1);
-  // Cannot use _kortestc because we may have less than 8 mask bits.
-  return mask_bits == (1u << N) - 1;
-}
-
-// ------------------------------ Compress
-
-#if HWY_TARGET != HWY_AVX3_DL
-namespace detail {
-
-// Returns permutevar_epi16 indices for 16-bit Compress. Also used by x86_256.
-HWY_INLINE Vec128<uint16_t> IndicesForCompress16(uint64_t mask_bits) {
-  Full128<uint16_t> du16;
-  // Table of u16 indices packed into bytes to reduce L1 usage. Will be unpacked
-  // to u16. Ideally we would broadcast 8*3 (half of the 8 bytes currently used)
-  // bits into each lane and then varshift, but that does not fit in 16 bits.
-  Rebind<uint8_t, decltype(du16)> du8;
-  alignas(16) constexpr uint8_t tbl[2048] = {
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
-      1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 2,
-      0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
-      0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 2, 3, 0, 0,
-      0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 2, 3, 0,
-      0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 1, 4, 0, 0, 0, 0,
-      0, 0, 0, 1, 4, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0,
-      0, 1, 2, 4, 0, 0, 0, 0, 0, 0, 1, 2, 4, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0,
-      0, 3, 4, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 0, 0, 1, 3, 4, 0, 0, 0, 0, 2,
-      3, 4, 0, 0, 0, 0, 0, 0, 2, 3, 4, 0, 0, 0, 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 1,
-      2, 3, 4, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 1, 5, 0,
-      0, 0, 0, 0, 0, 0, 1, 5, 0, 0, 0, 0, 0, 2, 5, 0, 0, 0, 0, 0, 0, 0, 2, 5, 0,
-      0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 0, 0, 1, 2, 5, 0, 0, 0, 0, 3, 5, 0, 0, 0,
-      0, 0, 0, 0, 3, 5, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0, 0, 0, 0, 0, 1, 3, 5, 0, 0,
-      0, 0, 2, 3, 5, 0, 0, 0, 0, 0, 0, 2, 3, 5, 0, 0, 0, 0, 1, 2, 3, 5, 0, 0, 0,
-      0, 0, 1, 2, 3, 5, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0, 0, 0, 4, 5, 0, 0, 0, 0, 0,
-      1, 4, 5, 0, 0, 0, 0, 0, 0, 1, 4, 5, 0, 0, 0, 0, 2, 4, 5, 0, 0, 0, 0, 0, 0,
-      2, 4, 5, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 0, 0, 1, 2, 4, 5, 0, 0, 0, 3, 4,
-      5, 0, 0, 0, 0, 0, 0, 3, 4, 5, 0, 0, 0, 0, 1, 3, 4, 5, 0, 0, 0, 0, 0, 1, 3,
-      4, 5, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 0, 0, 2, 3, 4, 5, 0, 0, 0, 1, 2, 3, 4,
-      5, 0, 0, 0, 0, 1, 2, 3, 4, 5, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0,
-      0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 0, 0, 1, 6, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0,
-      0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 0, 0,
-      0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 3, 6, 0, 0, 0, 0, 0, 1, 3, 6, 0, 0, 0, 0, 0,
-      0, 1, 3, 6, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 0, 0, 2, 3, 6, 0, 0, 0, 0, 1,
-      2, 3, 6, 0, 0, 0, 0, 0, 1, 2, 3, 6, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4,
-      6, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 0, 0, 1, 4, 6, 0, 0, 0, 0, 2, 4, 6,
-      0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 1, 2, 4, 6, 0, 0, 0, 0, 0, 1, 2, 4,
-      6, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 0, 0, 3, 4, 6, 0, 0, 0, 0, 1, 3, 4, 6, 0,
-      0, 0, 0, 0, 1, 3, 4, 6, 0, 0, 0, 2, 3, 4, 6, 0, 0, 0, 0, 0, 2, 3, 4, 6, 0,
-      0, 0, 1, 2, 3, 4, 6, 0, 0, 0, 0, 1, 2, 3, 4, 6, 0, 0, 5, 6, 0, 0, 0, 0, 0,
-      0, 0, 5, 6, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0, 0, 0, 1, 5, 6, 0, 0, 0, 0,
-      2, 5, 6, 0, 0, 0, 0, 0, 0, 2, 5, 6, 0, 0, 0, 0, 1, 2, 5, 6, 0, 0, 0, 0, 0,
-      1, 2, 5, 6, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 0, 0, 3, 5, 6, 0, 0, 0, 0, 1, 3,
-      5, 6, 0, 0, 0, 0, 0, 1, 3, 5, 6, 0, 0, 0, 2, 3, 5, 6, 0, 0, 0, 0, 0, 2, 3,
-      5, 6, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 0, 0, 1, 2, 3, 5, 6, 0, 0, 4, 5, 6, 0,
-      0, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0, 0, 1, 4, 5, 6, 0, 0, 0, 0, 0, 1, 4, 5, 6,
-      0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 0, 0, 2, 4, 5, 6, 0, 0, 0, 1, 2, 4, 5, 6, 0,
-      0, 0, 0, 1, 2, 4, 5, 6, 0, 0, 3, 4, 5, 6, 0, 0, 0, 0, 0, 3, 4, 5, 6, 0, 0,
-      0, 1, 3, 4, 5, 6, 0, 0, 0, 0, 1, 3, 4, 5, 6, 0, 0, 2, 3, 4, 5, 6, 0, 0, 0,
-      0, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 0, 0, 0, 1, 2, 3, 4, 5, 6, 0, 7,
-      0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 1, 7, 0, 0, 0, 0, 0, 0, 0, 1,
-      7, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 0, 0, 2, 7, 0, 0, 0, 0, 0, 1, 2, 7,
-      0, 0, 0, 0, 0, 0, 1, 2, 7, 0, 0, 0, 0, 3, 7, 0, 0, 0, 0, 0, 0, 0, 3, 7, 0,
-      0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 0, 0, 1, 3, 7, 0, 0, 0, 0, 2, 3, 7, 0, 0,
-      0, 0, 0, 0, 2, 3, 7, 0, 0, 0, 0, 1, 2, 3, 7, 0, 0, 0, 0, 0, 1, 2, 3, 7, 0,
-      0, 0, 4, 7, 0, 0, 0, 0, 0, 0, 0, 4, 7, 0, 0, 0, 0, 0, 1, 4, 7, 0, 0, 0, 0,
-      0, 0, 1, 4, 7, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0, 0, 0, 2, 4, 7, 0, 0, 0, 0,
-      1, 2, 4, 7, 0, 0, 0, 0, 0, 1, 2, 4, 7, 0, 0, 0, 3, 4, 7, 0, 0, 0, 0, 0, 0,
-      3, 4, 7, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 0, 0, 1, 3, 4, 7, 0, 0, 0, 2, 3,
-      4, 7, 0, 0, 0, 0, 0, 2, 3, 4, 7, 0, 0, 0, 1, 2, 3, 4, 7, 0, 0, 0, 0, 1, 2,
-      3, 4, 7, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 1, 5, 7, 0,
-      0, 0, 0, 0, 0, 1, 5, 7, 0, 0, 0, 0, 2, 5, 7, 0, 0, 0, 0, 0, 0, 2, 5, 7, 0,
-      0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 0, 0, 1, 2, 5, 7, 0, 0, 0, 3, 5, 7, 0, 0, 0,
-      0, 0, 0, 3, 5, 7, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0, 0, 0, 0, 1, 3, 5, 7, 0, 0,
-      0, 2, 3, 5, 7, 0, 0, 0, 0, 0, 2, 3, 5, 7, 0, 0, 0, 1, 2, 3, 5, 7, 0, 0, 0,
-      0, 1, 2, 3, 5, 7, 0, 0, 4, 5, 7, 0, 0, 0, 0, 0, 0, 4, 5, 7, 0, 0, 0, 0, 1,
-      4, 5, 7, 0, 0, 0, 0, 0, 1, 4, 5, 7, 0, 0, 0, 2, 4, 5, 7, 0, 0, 0, 0, 0, 2,
-      4, 5, 7, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 0, 0, 1, 2, 4, 5, 7, 0, 0, 3, 4, 5,
-      7, 0, 0, 0, 0, 0, 3, 4, 5, 7, 0, 0, 0, 1, 3, 4, 5, 7, 0, 0, 0, 0, 1, 3, 4,
-      5, 7, 0, 0, 2, 3, 4, 5, 7, 0, 0, 0, 0, 2, 3, 4, 5, 7, 0, 0, 1, 2, 3, 4, 5,
-      7, 0, 0, 0, 1, 2, 3, 4, 5, 7, 0, 6, 7, 0, 0, 0, 0, 0, 0, 0, 6, 7, 0, 0, 0,
-      0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 2, 6, 7, 0, 0, 0, 0,
-      0, 0, 2, 6, 7, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0, 0, 0, 1, 2, 6, 7, 0, 0, 0,
-      3, 6, 7, 0, 0, 0, 0, 0, 0, 3, 6, 7, 0, 0, 0, 0, 1, 3, 6, 7, 0, 0, 0, 0, 0,
-      1, 3, 6, 7, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 0, 0, 2, 3, 6, 7, 0, 0, 0, 1, 2,
-      3, 6, 7, 0, 0, 0, 0, 1, 2, 3, 6, 7, 0, 0, 4, 6, 7, 0, 0, 0, 0, 0, 0, 4, 6,
-      7, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 0, 0, 1, 4, 6, 7, 0, 0, 0, 2, 4, 6, 7,
-      0, 0, 0, 0, 0, 2, 4, 6, 7, 0, 0, 0, 1, 2, 4, 6, 7, 0, 0, 0, 0, 1, 2, 4, 6,
-      7, 0, 0, 3, 4, 6, 7, 0, 0, 0, 0, 0, 3, 4, 6, 7, 0, 0, 0, 1, 3, 4, 6, 7, 0,
-      0, 0, 0, 1, 3, 4, 6, 7, 0, 0, 2, 3, 4, 6, 7, 0, 0, 0, 0, 2, 3, 4, 6, 7, 0,
-      0, 1, 2, 3, 4, 6, 7, 0, 0, 0, 1, 2, 3, 4, 6, 7, 0, 5, 6, 7, 0, 0, 0, 0, 0,
-      0, 5, 6, 7, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 0, 0, 1, 5, 6, 7, 0, 0, 0, 2,
-      5, 6, 7, 0, 0, 0, 0, 0, 2, 5, 6, 7, 0, 0, 0, 1, 2, 5, 6, 7, 0, 0, 0, 0, 1,
-      2, 5, 6, 7, 0, 0, 3, 5, 6, 7, 0, 0, 0, 0, 0, 3, 5, 6, 7, 0, 0, 0, 1, 3, 5,
-      6, 7, 0, 0, 0, 0, 1, 3, 5, 6, 7, 0, 0, 2, 3, 5, 6, 7, 0, 0, 0, 0, 2, 3, 5,
-      6, 7, 0, 0, 1, 2, 3, 5, 6, 7, 0, 0, 0, 1, 2, 3, 5, 6, 7, 0, 4, 5, 6, 7, 0,
-      0, 0, 0, 0, 4, 5, 6, 7, 0, 0, 0, 1, 4, 5, 6, 7, 0, 0, 0, 0, 1, 4, 5, 6, 7,
-      0, 0, 2, 4, 5, 6, 7, 0, 0, 0, 0, 2, 4, 5, 6, 7, 0, 0, 1, 2, 4, 5, 6, 7, 0,
-      0, 0, 1, 2, 4, 5, 6, 7, 0, 3, 4, 5, 6, 7, 0, 0, 0, 0, 3, 4, 5, 6, 7, 0, 0,
-      1, 3, 4, 5, 6, 7, 0, 0, 0, 1, 3, 4, 5, 6, 7, 0, 2, 3, 4, 5, 6, 7, 0, 0, 0,
-      2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 1, 2, 3, 4, 5, 6, 7};
-  return PromoteTo(du16, Load(du8, tbl + mask_bits * 8));
-}
-
-}  // namespace detail
-#endif  // HWY_TARGET != HWY_AVX3_DL
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
-  const Simd<T, N, 0> d;
-  const Rebind<uint16_t, decltype(d)> du;
-  const auto vu = BitCast(du, v);  // (required for float16_t inputs)
-
-#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
-  const Vec128<uint16_t, N> cu{_mm_maskz_compress_epi16(mask.raw, vu.raw)};
-#else
-  const auto idx = detail::IndicesForCompress16(uint64_t{mask.raw});
-  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
-#endif  // HWY_TARGET != HWY_AVX3_DL
-  return BitCast(d, cu);
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
-  return Vec128<T, N>{_mm_maskz_compress_epi32(mask.raw, v.raw)};
-}
-
-template <size_t N, HWY_IF_GE64(float, N)>
-HWY_API Vec128<float, N> Compress(Vec128<float, N> v, Mask128<float, N> mask) {
-  return Vec128<float, N>{_mm_maskz_compress_ps(mask.raw, v.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
-  HWY_DASSERT(mask.raw < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[64] = {
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Full128<T> d;
-  const Repartition<uint8_t, decltype(d)> d8;
-  const auto index = Load(d8, u8_indices + 16 * mask.raw);
-  return BitCast(d, TableLookupBytes(BitCast(d8, v), index));
-}
-
-// ------------------------------ CompressNot (Compress)
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
-  return Compress(v, Not(mask));
-}
-
-// ------------------------------ CompressBlocksNot
-HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
-                                           Mask128<uint64_t> /* m */) {
-  return v;
-}
-
-// ------------------------------ CompressBits (LoadMaskBits)
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
-                                  const uint8_t* HWY_RESTRICT bits) {
-  return Compress(v, LoadMaskBits(Simd<T, N, 0>(), bits));
-}
-
-// ------------------------------ CompressStore
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
-                             Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const Rebind<uint16_t, decltype(d)> du;
-  const auto vu = BitCast(du, v);  // (required for float16_t inputs)
-
-  const uint64_t mask_bits{mask.raw};
-
-#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
-  _mm_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
-#else
-  const auto idx = detail::IndicesForCompress16(mask_bits);
-  const Vec128<uint16_t, N> cu{_mm_permutexvar_epi16(idx.raw, vu.raw)};
-  StoreU(BitCast(d, cu), d, unaligned);
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-  const size_t count = PopCount(mask_bits & ((1ull << N) - 1));
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
-                             Simd<T, N, 0> /* tag */,
-                             T* HWY_RESTRICT unaligned) {
-  _mm_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> mask,
-                             Simd<T, N, 0> /* tag */,
-                             T* HWY_RESTRICT unaligned) {
-  _mm_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <size_t N, HWY_IF_LE128(float, N)>
-HWY_API size_t CompressStore(Vec128<float, N> v, Mask128<float, N> mask,
-                             Simd<float, N, 0> /* tag */,
-                             float* HWY_RESTRICT unaligned) {
-  _mm_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(float));
-#endif
-  return count;
-}
-
-template <size_t N, HWY_IF_LE128(double, N)>
-HWY_API size_t CompressStore(Vec128<double, N> v, Mask128<double, N> mask,
-                             Simd<double, N, 0> /* tag */,
-                             double* HWY_RESTRICT unaligned) {
-  _mm_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw} & ((1ull << N) - 1));
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(double));
-#endif
-  return count;
-}
-
-// ------------------------------ CompressBlendedStore (CompressStore)
-template <typename T, size_t N>
-HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                                    Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  // AVX-512 already does the blending at no extra cost (latency 11,
-  // rthroughput 2 - same as compress plus store).
-  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
-    // We're relying on the mask to blend. Clear the undefined upper bits.
-    if (N != 16 / sizeof(T)) {
-      m = And(m, FirstN(d, N));
-    }
-    return CompressStore(v, m, d, unaligned);
-  } else {
-    const size_t count = CountTrue(d, m);
-    const Vec128<T, N> compressed = Compress(v, m);
-#if HWY_MEM_OPS_MIGHT_FAULT
-    // BlendedStore tests mask for each lane, but we know that the mask is
-    // FirstN, so we can just copy.
-    alignas(16) T buf[N];
-    Store(compressed, d, buf);
-    memcpy(unaligned, buf, count * sizeof(T));
-#else
-    BlendedStore(compressed, FirstN(d, count), d, unaligned);
-#endif
-    // Workaround: as of 2022-02-23 MSAN does not mark the output as
-    // initialized.
-#if HWY_IS_MSAN
-    __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-    return count;
-  }
-}
-
-// ------------------------------ CompressBitsStore (LoadMaskBits)
-
-template <typename T, size_t N>
-HWY_API size_t CompressBitsStore(Vec128<T, N> v,
-                                 const uint8_t* HWY_RESTRICT bits,
-                                 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
-}
-
-#else  // AVX2 or below
-
-// ------------------------------ LoadMaskBits (TestBit)
-
-namespace detail {
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  // Easier than Set(), which would require an >8-bit type, which would not
-  // compile for T=uint8_t, N=1.
-  const Vec128<T, N> vbits{_mm_cvtsi32_si128(static_cast<int>(mask_bits))};
-
-  // Replicate bytes 8x such that each byte contains the bit that governs it.
-  alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
-                                             1, 1, 1, 1, 1, 1, 1, 1};
-  const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8));
-
-  alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
-                                            1, 2, 4, 8, 16, 32, 64, 128};
-  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
-  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8};
-  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
-  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(16) constexpr uint64_t kBit[8] = {1, 2};
-  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T, size_t N, HWY_IF_LE128(T, N)>
-HWY_API Mask128<T, N> LoadMaskBits(Simd<T, N, 0> d,
-                                   const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return detail::LoadMaskBits(d, mask_bits);
-}
-
-// ------------------------------ StoreMaskBits
-
-namespace detail {
-
-constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) {
-  return static_cast<uint64_t>(static_cast<unsigned>(mask_bits));
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  const Simd<T, N, 0> d;
-  const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw;
-  return U64FromInt(_mm_movemask_epi8(sign_bits));
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  // Remove useless lower half of each u16 while preserving the sign bit.
-  const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128());
-  return U64FromInt(_mm_movemask_epi8(sign_bits));
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  const Simd<T, N, 0> d;
-  const Simd<float, N, 0> df;
-  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
-  return U64FromInt(_mm_movemask_ps(sign_bits.raw));
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/,
-                                 const Mask128<T, N> mask) {
-  const Simd<T, N, 0> d;
-  const Simd<double, N, 0> df;
-  const auto sign_bits = BitCast(df, VecFromMask(d, mask));
-  return U64FromInt(_mm_movemask_pd(sign_bits.raw));
-}
-
-// Returns the lowest N of the _mm_movemask* bits.
-template <typename T, size_t N>
-constexpr uint64_t OnlyActive(uint64_t mask_bits) {
-  return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1);
-}
-
-template <typename T, size_t N>
-HWY_INLINE uint64_t BitsFromMask(const Mask128<T, N> mask) {
-  return OnlyActive<T, N>(BitsFromMask(hwy::SizeTag<sizeof(T)>(), mask));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 writable bytes.
-template <typename T, size_t N>
-HWY_API size_t StoreMaskBits(const Simd<T, N, 0> /* tag */,
-                             const Mask128<T, N> mask, uint8_t* bits) {
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  CopyBytes<kNumBytes>(&mask_bits, bits);
-  return kNumBytes;
-}
-
-// ------------------------------ Mask testing
-
-template <typename T, size_t N>
-HWY_API bool AllFalse(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  // Cheaper than PTEST, which is 2 uop / 3L.
-  return detail::BitsFromMask(mask) == 0;
-}
-
-template <typename T, size_t N>
-HWY_API bool AllTrue(const Simd<T, N, 0> /* tag */, const Mask128<T, N> mask) {
-  constexpr uint64_t kAllBits =
-      detail::OnlyActive<T, N>((1ull << (16 / sizeof(T))) - 1);
-  return detail::BitsFromMask(mask) == kAllBits;
-}
-
-template <typename T, size_t N>
-HWY_API size_t CountTrue(const Simd<T, N, 0> /* tag */,
-                         const Mask128<T, N> mask) {
-  return PopCount(detail::BitsFromMask(mask));
-}
-
-template <typename T, size_t N>
-HWY_API intptr_t FindFirstTrue(const Simd<T, N, 0> /* tag */,
-                               const Mask128<T, N> mask) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
-}
-
-// ------------------------------ Compress, CompressBits
-
-namespace detail {
-
-// Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6.
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 256);
-  const Rebind<uint8_t, decltype(d)> d8;
-  const Simd<uint16_t, N, 0> du;
-
-  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
-  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
-  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
-  // store lane indices and convert to byte indices (2*lane + 0..1), with the
-  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
-  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
-  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
-  // is likely more costly than the higher cache footprint from storing bytes.
-  alignas(16) constexpr uint8_t table[2048] = {
-      // PrintCompress16x8Tables
-      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
-      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0, 6,  2,  4,  8,  10, 12, 14,  //
-      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0, 2,  6,  4,  8,  10, 12, 14,  //
-      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0, 4,  6,  2,  8,  10, 12, 14,  //
-      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0, 8,  2,  4,  6,  10, 12, 14,  //
-      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0, 2,  8,  4,  6,  10, 12, 14,  //
-      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0, 4,  8,  2,  6,  10, 12, 14,  //
-      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0, 2,  4,  8,  6,  10, 12, 14,  //
-      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0, 6,  8,  2,  4,  10, 12, 14,  //
-      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0, 2,  6,  8,  4,  10, 12, 14,  //
-      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0, 4,  6,  8,  2,  10, 12, 14,  //
-      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0, 10, 2,  4,  6,  8,  12, 14,  //
-      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0, 2,  10, 4,  6,  8,  12, 14,  //
-      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0, 4,  10, 2,  6,  8,  12, 14,  //
-      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0, 2,  4,  10, 6,  8,  12, 14,  //
-      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0, 6,  10, 2,  4,  8,  12, 14,  //
-      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0, 2,  6,  10, 4,  8,  12, 14,  //
-      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0, 4,  6,  10, 2,  8,  12, 14,  //
-      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0, 2,  4,  6,  10, 8,  12, 14,  //
-      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0, 8,  10, 2,  4,  6,  12, 14,  //
-      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0, 2,  8,  10, 4,  6,  12, 14,  //
-      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0, 4,  8,  10, 2,  6,  12, 14,  //
-      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0, 2,  4,  8,  10, 6,  12, 14,  //
-      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0, 6,  8,  10, 2,  4,  12, 14,  //
-      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0, 2,  6,  8,  10, 4,  12, 14,  //
-      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0, 4,  6,  8,  10, 2,  12, 14,  //
-      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0, 12, 2,  4,  6,  8,  10, 14,  //
-      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0, 2,  12, 4,  6,  8,  10, 14,  //
-      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0, 4,  12, 2,  6,  8,  10, 14,  //
-      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0, 2,  4,  12, 6,  8,  10, 14,  //
-      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0, 6,  12, 2,  4,  8,  10, 14,  //
-      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0, 2,  6,  12, 4,  8,  10, 14,  //
-      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0, 4,  6,  12, 2,  8,  10, 14,  //
-      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0, 2,  4,  6,  12, 8,  10, 14,  //
-      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0, 8,  12, 2,  4,  6,  10, 14,  //
-      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0, 2,  8,  12, 4,  6,  10, 14,  //
-      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0, 4,  8,  12, 2,  6,  10, 14,  //
-      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0, 2,  4,  8,  12, 6,  10, 14,  //
-      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0, 6,  8,  12, 2,  4,  10, 14,  //
-      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0, 2,  6,  8,  12, 4,  10, 14,  //
-      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0, 4,  6,  8,  12, 2,  10, 14,  //
-      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0, 2,  4,  6,  8,  12, 10, 14,  //
-      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0, 10, 12, 2,  4,  6,  8,  14,  //
-      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0, 2,  10, 12, 4,  6,  8,  14,  //
-      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0, 4,  10, 12, 2,  6,  8,  14,  //
-      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0, 2,  4,  10, 12, 6,  8,  14,  //
-      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0, 6,  10, 12, 2,  4,  8,  14,  //
-      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0, 2,  6,  10, 12, 4,  8,  14,  //
-      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0, 4,  6,  10, 12, 2,  8,  14,  //
-      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0, 2,  4,  6,  10, 12, 8,  14,  //
-      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0, 8,  10, 12, 2,  4,  6,  14,  //
-      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0, 2,  8,  10, 12, 4,  6,  14,  //
-      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0, 4,  8,  10, 12, 2,  6,  14,  //
-      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0, 2,  4,  8,  10, 12, 6,  14,  //
-      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0, 6,  8,  10, 12, 2,  4,  14,  //
-      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0, 2,  6,  8,  10, 12, 4,  14,  //
-      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0, 4,  6,  8,  10, 12, 2,  14,  //
-      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
-      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0, 14, 2,  4,  6,  8,  10, 12,  //
-      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0, 2,  14, 4,  6,  8,  10, 12,  //
-      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0, 4,  14, 2,  6,  8,  10, 12,  //
-      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0, 2,  4,  14, 6,  8,  10, 12,  //
-      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0, 6,  14, 2,  4,  8,  10, 12,  //
-      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0, 2,  6,  14, 4,  8,  10, 12,  //
-      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0, 4,  6,  14, 2,  8,  10, 12,  //
-      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0, 2,  4,  6,  14, 8,  10, 12,  //
-      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0, 8,  14, 2,  4,  6,  10, 12,  //
-      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0, 2,  8,  14, 4,  6,  10, 12,  //
-      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0, 4,  8,  14, 2,  6,  10, 12,  //
-      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0, 2,  4,  8,  14, 6,  10, 12,  //
-      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0, 6,  8,  14, 2,  4,  10, 12,  //
-      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0, 2,  6,  8,  14, 4,  10, 12,  //
-      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0, 4,  6,  8,  14, 2,  10, 12,  //
-      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0, 2,  4,  6,  8,  14, 10, 12,  //
-      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0, 10, 14, 2,  4,  6,  8,  12,  //
-      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0, 2,  10, 14, 4,  6,  8,  12,  //
-      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0, 4,  10, 14, 2,  6,  8,  12,  //
-      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0, 2,  4,  10, 14, 6,  8,  12,  //
-      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0, 6,  10, 14, 2,  4,  8,  12,  //
-      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0, 2,  6,  10, 14, 4,  8,  12,  //
-      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0, 4,  6,  10, 14, 2,  8,  12,  //
-      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0, 2,  4,  6,  10, 14, 8,  12,  //
-      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0, 8,  10, 14, 2,  4,  6,  12,  //
-      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0, 2,  8,  10, 14, 4,  6,  12,  //
-      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0, 4,  8,  10, 14, 2,  6,  12,  //
-      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0, 2,  4,  8,  10, 14, 6,  12,  //
-      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0, 6,  8,  10, 14, 2,  4,  12,  //
-      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0, 2,  6,  8,  10, 14, 4,  12,  //
-      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0, 4,  6,  8,  10, 14, 2,  12,  //
-      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0, 2,  4,  6,  8,  10, 14, 12,  //
-      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0, 12, 14, 2,  4,  6,  8,  10,  //
-      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0, 2,  12, 14, 4,  6,  8,  10,  //
-      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0, 4,  12, 14, 2,  6,  8,  10,  //
-      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0, 2,  4,  12, 14, 6,  8,  10,  //
-      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0, 6,  12, 14, 2,  4,  8,  10,  //
-      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0, 2,  6,  12, 14, 4,  8,  10,  //
-      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0, 4,  6,  12, 14, 2,  8,  10,  //
-      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0, 2,  4,  6,  12, 14, 8,  10,  //
-      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0, 8,  12, 14, 2,  4,  6,  10,  //
-      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0, 2,  8,  12, 14, 4,  6,  10,  //
-      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0, 4,  8,  12, 14, 2,  6,  10,  //
-      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0, 2,  4,  8,  12, 14, 6,  10,  //
-      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0, 6,  8,  12, 14, 2,  4,  10,  //
-      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0, 2,  6,  8,  12, 14, 4,  10,  //
-      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0, 4,  6,  8,  12, 14, 2,  10,  //
-      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0, 2,  4,  6,  8,  12, 14, 10,  //
-      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0, 10, 12, 14, 2,  4,  6,  8,   //
-      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0, 2,  10, 12, 14, 4,  6,  8,   //
-      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0, 4,  10, 12, 14, 2,  6,  8,   //
-      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0, 2,  4,  10, 12, 14, 6,  8,   //
-      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0, 6,  10, 12, 14, 2,  4,  8,   //
-      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0, 2,  6,  10, 12, 14, 4,  8,   //
-      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0, 4,  6,  10, 12, 14, 2,  8,   //
-      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0, 2,  4,  6,  10, 12, 14, 8,   //
-      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0, 8,  10, 12, 14, 2,  4,  6,   //
-      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0, 2,  8,  10, 12, 14, 4,  6,   //
-      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0, 4,  8,  10, 12, 14, 2,  6,   //
-      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0, 2,  4,  8,  10, 12, 14, 6,   //
-      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0, 6,  8,  10, 12, 14, 2,  4,   //
-      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0, 2,  6,  8,  10, 12, 14, 4,   //
-      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0, 4,  6,  8,  10, 12, 14, 2,   //
-      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0, 2,  4,  6,  8,  10, 12, 14};
-
-  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
-  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
-  return BitCast(d, pairs + Set(du, 0x0100));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
-                                           uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 256);
-  const Rebind<uint8_t, decltype(d)> d8;
-  const Simd<uint16_t, N, 0> du;
-
-  // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need
-  // byte indices for PSHUFB (one vector's worth for each of 256 combinations of
-  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
-  // store lane indices and convert to byte indices (2*lane + 0..1), with the
-  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
-  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
-  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
-  // is likely more costly than the higher cache footprint from storing bytes.
-  alignas(16) constexpr uint8_t table[2048] = {
-      // PrintCompressNot16x8Tables
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
-      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
-      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
-      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
-      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
-      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
-      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
-      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
-      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
-      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
-      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
-      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
-      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
-      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
-      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
-      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
-      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
-      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
-      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
-      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
-      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
-      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
-      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
-      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
-      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
-      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
-      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
-      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
-      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
-      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
-      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
-      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
-      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
-      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
-      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
-      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
-      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
-      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
-      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
-      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
-      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
-      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
-      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
-      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
-      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
-      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
-      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
-      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
-      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
-      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
-      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
-      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
-      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
-      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
-      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
-      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
-      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
-      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
-      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
-      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
-      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
-      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
-      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
-      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
-      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
-      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
-      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
-      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
-      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
-      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
-      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
-      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
-      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
-      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
-      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
-      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
-      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
-      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
-      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
-      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
-      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
-      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
-      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
-      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
-      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
-      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
-      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
-      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
-      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
-      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
-      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
-      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
-      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
-      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
-      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
-      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
-      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
-      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
-      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
-      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
-      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
-      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
-      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
-      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
-      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
-      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
-      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
-      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
-      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
-      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
-      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
-      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
-      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
-      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
-      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
-      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
-      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
-      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
-      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
-      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
-      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
-      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};
-
-  const Vec128<uint8_t, 2 * N> byte_idx{Load(d8, table + mask_bits * 8).raw};
-  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
-  return BitCast(d, pairs + Set(du, 0x0100));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
-HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 16);
-
-  // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[256] = {
-      // PrintCompress32x4Tables
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
-      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
-      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
-      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
-      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
-      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
-      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
-      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
-      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
-
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 4), HWY_IF_LE128(T, N)>
-HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
-                                           uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 16);
-
-  // There are only 4 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[256] = {
-      // PrintCompressNot32x4Tables
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
-      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
-      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
-      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
-      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
-      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
-      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
-      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
-      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
-      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
-      12, 13, 14, 15};
-
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
-HWY_INLINE Vec128<T, N> IndicesFromBits(Simd<T, N, 0> d, uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[64] = {
-      // PrintCompress64x2Tables
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N, HWY_IF_LANE_SIZE(T, 8), HWY_IF_LE128(T, N)>
-HWY_INLINE Vec128<T, N> IndicesFromNotBits(Simd<T, N, 0> d,
-                                           uint64_t mask_bits) {
-  HWY_DASSERT(mask_bits < 4);
-
-  // There are only 2 lanes, so we can afford to load the index vector directly.
-  alignas(16) constexpr uint8_t u8_indices[64] = {
-      // PrintCompressNot64x2Tables
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
-      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
-
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v, uint64_t mask_bits) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-
-  HWY_DASSERT(mask_bits < (1ull << N));
-  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
-  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressNotBits(Vec128<T, N> v, uint64_t mask_bits) {
-  const Simd<T, N, 0> d;
-  const RebindToUnsigned<decltype(d)> du;
-
-  HWY_DASSERT(mask_bits < (1ull << N));
-  const auto indices = BitCast(du, detail::IndicesFromNotBits(d, mask_bits));
-  return BitCast(d, TableLookupBytes(BitCast(du, v), indices));
-}
-
-}  // namespace detail
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-// Two lanes: conditional swap
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> Compress(Vec128<T> v, Mask128<T> mask) {
-  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
-  const Full128<T> d;
-  const Vec128<T> m = VecFromMask(d, mask);
-  const Vec128<T> maskL = DupEven(m);
-  const Vec128<T> maskH = DupOdd(m);
-  const Vec128<T> swap = AndNot(maskL, maskH);
-  return IfVecThenElse(swap, Shuffle01(v), v);
-}
-
-// General case
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
-  return detail::CompressBits(v, detail::BitsFromMask(mask));
-}
-
-// Single lane: no-op
-template <typename T>
-HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
-  return v;
-}
-
-// Two lanes: conditional swap
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
-  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
-  const Full128<T> d;
-  const Vec128<T> m = VecFromMask(d, mask);
-  const Vec128<T> maskL = DupEven(m);
-  const Vec128<T> maskH = DupOdd(m);
-  const Vec128<T> swap = AndNot(maskH, maskL);
-  return IfVecThenElse(swap, Shuffle01(v), v);
-}
-
-// General case
-template <typename T, size_t N, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
-  // For partial vectors, we cannot pull the Not() into the table because
-  // BitsFromMask clears the upper bits.
-  if (N < 16 / sizeof(T)) {
-    return detail::CompressBits(v, detail::BitsFromMask(Not(mask)));
-  }
-  return detail::CompressNotBits(v, detail::BitsFromMask(mask));
-}
-
-// ------------------------------ CompressBlocksNot
-HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
-                                           Mask128<uint64_t> /* m */) {
-  return v;
-}
-
-template <typename T, size_t N>
-HWY_API Vec128<T, N> CompressBits(Vec128<T, N> v,
-                                  const uint8_t* HWY_RESTRICT bits) {
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return detail::CompressBits(v, mask_bits);
-}
-
-// ------------------------------ CompressStore, CompressBitsStore
-
-template <typename T, size_t N>
-HWY_API size_t CompressStore(Vec128<T, N> v, Mask128<T, N> m, Simd<T, N, 0> d,
-                             T* HWY_RESTRICT unaligned) {
-  const RebindToUnsigned<decltype(d)> du;
-
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  HWY_DASSERT(mask_bits < (1ull << N));
-  const size_t count = PopCount(mask_bits);
-
-  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
-  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
-  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
-  StoreU(compressed, d, unaligned);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-
-  return count;
-}
-
-template <typename T, size_t N>
-HWY_API size_t CompressBlendedStore(Vec128<T, N> v, Mask128<T, N> m,
-                                    Simd<T, N, 0> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const RebindToUnsigned<decltype(d)> du;
-
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  HWY_DASSERT(mask_bits < (1ull << N));
-  const size_t count = PopCount(mask_bits);
-
-  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
-  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
-  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
-  BlendedStore(compressed, FirstN(d, count), d, unaligned);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <typename T, size_t N>
-HWY_API size_t CompressBitsStore(Vec128<T, N> v,
-                                 const uint8_t* HWY_RESTRICT bits,
-                                 Simd<T, N, 0> d, T* HWY_RESTRICT unaligned) {
-  const RebindToUnsigned<decltype(d)> du;
-
-  uint64_t mask_bits = 0;
-  constexpr size_t kNumBytes = (N + 7) / 8;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-  const size_t count = PopCount(mask_bits);
-
-  // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches).
-  const auto indices = BitCast(du, detail::IndicesFromBits(d, mask_bits));
-  const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices));
-  StoreU(compressed, d, unaligned);
-
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ StoreInterleaved2/3/4
-
-// HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in
-// generic_ops-inl.h.
-
-// ------------------------------ Reductions
-
-namespace detail {
-
-// N=1 for any T: no-op
-template <typename T>
-HWY_INLINE Vec128<T, 1> SumOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MinOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-template <typename T>
-HWY_INLINE Vec128<T, 1> MaxOfLanes(hwy::SizeTag<sizeof(T)> /* tag */,
-                                   const Vec128<T, 1> v) {
-  return v;
-}
-
-// u32/i32/f32:
-
-// N=2
-template <typename T>
-HWY_INLINE Vec128<T, 2> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return v10 + Shuffle2301(v10);
-}
-template <typename T>
-HWY_INLINE Vec128<T, 2> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return Min(v10, Shuffle2301(v10));
-}
-template <typename T>
-HWY_INLINE Vec128<T, 2> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                   const Vec128<T, 2> v10) {
-  return Max(v10, Shuffle2301(v10));
-}
-
-// N=4 (full)
-template <typename T>
-HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = v3210 + v1032;
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return v20_31_20_31 + v31_20_31_20;
-}
-template <typename T>
-HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = Min(v3210, v1032);
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Min(v20_31_20_31, v31_20_31_20);
-}
-template <typename T>
-HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec128<T> v3210) {
-  const Vec128<T> v1032 = Shuffle1032(v3210);
-  const Vec128<T> v31_20_31_20 = Max(v3210, v1032);
-  const Vec128<T> v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Max(v20_31_20_31, v31_20_31_20);
-}
-
-// u64/i64/f64:
-
-// N=2 (full)
-template <typename T>
-HWY_INLINE Vec128<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return v10 + v01;
-}
-template <typename T>
-HWY_INLINE Vec128<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Min(v10, v01);
-}
-template <typename T>
-HWY_INLINE Vec128<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec128<T> v10) {
-  const Vec128<T> v01 = Shuffle01(v10);
-  return Max(v10, v01);
-}
-
-template <size_t N, HWY_IF_GE32(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                       Vec128<uint16_t, N> v) {
-  const Simd<uint16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <size_t N, HWY_IF_GE32(int16_t, N)>
-HWY_API Vec128<int16_t, N> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                      Vec128<int16_t, N> v) {
-  const Simd<int16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-template <size_t N, HWY_IF_GE32(uint16_t, N)>
-HWY_API Vec128<uint16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                       Vec128<uint16_t, N> v) {
-  const Simd<uint16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-template <size_t N, HWY_IF_GE32(int16_t, N)>
-HWY_API Vec128<int16_t, N> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                      Vec128<int16_t, N> v) {
-  const Simd<int16_t, N, 0> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-}  // namespace detail
-
-// Supported for u/i/f 32/64. Returns the same value in each lane.
-template <typename T, size_t N>
-HWY_API Vec128<T, N> SumOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MinOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-template <typename T, size_t N>
-HWY_API Vec128<T, N> MaxOfLanes(Simd<T, N, 0> /* tag */, const Vec128<T, N> v) {
-  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-// ------------------------------ Lt128
-
-namespace detail {
-
-// Returns vector-mask for Lt128. Also used by x86_256/x86_512.
-template <class D, class V = VFromD<D>>
-HWY_INLINE V Lt128Vec(const D d, const V a, const V b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  // Truth table of Eq and Lt for Hi and Lo u64.
-  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
-  // =H =L cH cL  | out = cH | (=H & cL)
-  //  0  0  0  0  |  0
-  //  0  0  0  1  |  0
-  //  0  0  1  0  |  1
-  //  0  0  1  1  |  1
-  //  0  1  0  0  |  0
-  //  0  1  0  1  |  0
-  //  0  1  1  0  |  1
-  //  1  0  0  0  |  0
-  //  1  0  0  1  |  1
-  //  1  1  0  0  |  0
-  const auto eqHL = Eq(a, b);
-  const V ltHL = VecFromMask(d, Lt(a, b));
-  const V ltLX = ShiftLeftLanes<1>(ltHL);
-  const V vecHx = IfThenElse(eqHL, ltLX, ltHL);
-  return InterleaveUpper(d, vecHx, vecHx);
-}
-
-// Returns vector-mask for Eq128. Also used by x86_256/x86_512.
-template <class D, class V = VFromD<D>>
-HWY_INLINE V Eq128Vec(const D d, const V a, const V b) {
-  static_assert(!IsSigned<TFromD<D>>() && sizeof(TFromD<D>) == 8, "Use u64");
-  const auto eqHL = VecFromMask(d, Eq(a, b));
-  const auto eqLH = Reverse2(d, eqHL);
-  return And(eqHL, eqLH);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_INLINE V Lt128UpperVec(const D d, const V a, const V b) {
-  // No specialization required for AVX-512: Mask <-> Vec is fast, and
-  // copying mask bits to their neighbor seems infeasible.
-  const V ltHL = VecFromMask(d, Lt(a, b));
-  return InterleaveUpper(d, ltHL, ltHL);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_INLINE V Eq128UpperVec(const D d, const V a, const V b) {
-  // No specialization required for AVX-512: Mask <-> Vec is fast, and
-  // copying mask bits to their neighbor seems infeasible.
-  const V eqHL = VecFromMask(d, Eq(a, b));
-  return InterleaveUpper(d, eqHL, eqHL);
-}
-
-}  // namespace detail
-
-template <class D, class V = VFromD<D>>
-HWY_API MFromD<D> Lt128(D d, const V a, const V b) {
-  return MaskFromVec(detail::Lt128Vec(d, a, b));
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API MFromD<D> Eq128(D d, const V a, const V b) {
-  return MaskFromVec(detail::Eq128Vec(d, a, b));
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API MFromD<D> Lt128Upper(D d, const V a, const V b) {
-  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API MFromD<D> Eq128Upper(D d, const V a, const V b) {
-  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
-}
-
-// ------------------------------ Min128, Max128 (Lt128)
-
-// Avoids the extra MaskFromVec in Lt128.
-template <class D, class V = VFromD<D>>
-HWY_API V Min128(D d, const V a, const V b) {
-  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API V Max128(D d, const V a, const V b) {
-  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API V Min128Upper(D d, const V a, const V b) {
-  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
-}
-
-template <class D, class V = VFromD<D>>
-HWY_API V Max128Upper(D d, const V a, const V b) {
-  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
-// the warning seems to be issued at the call site of intrinsics, i.e. our code.
-HWY_DIAGNOSTICS(pop)
diff --git a/third_party/highway/hwy/ops/x86_256-inl.h b/third_party/highway/hwy/ops/x86_256-inl.h
deleted file mode 100644 (file)
index b4c29a7..0000000
+++ /dev/null
@@ -1,5502 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
-// compiling for that target.
-// External include guard in highway.h - see comment there.
-
-// WARNING: most operations do not cross 128-bit block boundaries. In
-// particular, "Broadcast", pack and zip behavior may be surprising.
-
-// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
-#include "hwy/base.h"
-
-// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
-// https://github.com/google/highway/issues/710)
-HWY_DIAGNOSTICS(push)
-#if HWY_COMPILER_GCC_ACTUAL
-HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
-HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
-#endif
-
-// Must come before HWY_COMPILER_CLANGCL
-#include <immintrin.h>  // AVX2+
-
-#if HWY_COMPILER_CLANGCL
-// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
-// including these headers when _MSC_VER is defined, like when using clang-cl.
-// Include these directly here.
-#include <avxintrin.h>
-// avxintrin defines __m256i and must come before avx2intrin.
-#include <avx2intrin.h>
-#include <bmi2intrin.h>  // _pext_u64
-#include <f16cintrin.h>
-#include <fmaintrin.h>
-#include <smmintrin.h>
-#endif  // HWY_COMPILER_CLANGCL
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcpy
-
-#if HWY_IS_MSAN
-#include <sanitizer/msan_interface.h>
-#endif
-
-// For half-width vectors. Already includes base.h and shared-inl.h.
-#include "hwy/ops/x86_128-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-namespace detail {
-
-template <typename T>
-struct Raw256 {
-  using type = __m256i;
-};
-template <>
-struct Raw256<float> {
-  using type = __m256;
-};
-template <>
-struct Raw256<double> {
-  using type = __m256d;
-};
-
-}  // namespace detail
-
-template <typename T>
-class Vec256 {
-  using Raw = typename detail::Raw256<T>::type;
-
- public:
-  // Compound assignment. Only usable if there is a corresponding non-member
-  // binary operator overload. For example, only f32 and f64 support division.
-  HWY_INLINE Vec256& operator*=(const Vec256 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec256& operator/=(const Vec256 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec256& operator+=(const Vec256 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec256& operator-=(const Vec256 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec256& operator&=(const Vec256 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec256& operator|=(const Vec256 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec256& operator^=(const Vec256 other) {
-    return *this = (*this ^ other);
-  }
-
-  Raw raw;
-};
-
-#if HWY_TARGET <= HWY_AVX3
-
-namespace detail {
-
-// Template arg: sizeof(lane type)
-template <size_t size>
-struct RawMask256 {};
-template <>
-struct RawMask256<1> {
-  using type = __mmask32;
-};
-template <>
-struct RawMask256<2> {
-  using type = __mmask16;
-};
-template <>
-struct RawMask256<4> {
-  using type = __mmask8;
-};
-template <>
-struct RawMask256<8> {
-  using type = __mmask8;
-};
-
-}  // namespace detail
-
-template <typename T>
-struct Mask256 {
-  using Raw = typename detail::RawMask256<sizeof(T)>::type;
-
-  static Mask256<T> FromBits(uint64_t mask_bits) {
-    return Mask256<T>{static_cast<Raw>(mask_bits)};
-  }
-
-  Raw raw;
-};
-
-#else  // AVX2
-
-// FF..FF or 0.
-template <typename T>
-struct Mask256 {
-  typename detail::Raw256<T>::type raw;
-};
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ BitCast
-
-namespace detail {
-
-HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; }
-HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); }
-HWY_INLINE __m256i BitCastToInteger(__m256d v) {
-  return _mm256_castpd_si256(v);
-}
-
-template <typename T>
-HWY_INLINE Vec256<uint8_t> BitCastToByte(Vec256<T> v) {
-  return Vec256<uint8_t>{BitCastToInteger(v.raw)};
-}
-
-// Cannot rely on function overloading because return types differ.
-template <typename T>
-struct BitCastFromInteger256 {
-  HWY_INLINE __m256i operator()(__m256i v) { return v; }
-};
-template <>
-struct BitCastFromInteger256<float> {
-  HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); }
-};
-template <>
-struct BitCastFromInteger256<double> {
-  HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); }
-};
-
-template <typename T>
-HWY_INLINE Vec256<T> BitCastFromByte(Full256<T> /* tag */, Vec256<uint8_t> v) {
-  return Vec256<T>{BitCastFromInteger256<T>()(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, typename FromT>
-HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
-}
-
-// ------------------------------ Set
-
-// Returns an all-zero vector.
-template <typename T>
-HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
-  return Vec256<T>{_mm256_setzero_si256()};
-}
-HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
-  return Vec256<float>{_mm256_setzero_ps()};
-}
-HWY_API Vec256<double> Zero(Full256<double> /* tag */) {
-  return Vec256<double>{_mm256_setzero_pd()};
-}
-
-// Returns a vector with all lanes set to "t".
-HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
-  return Vec256<uint8_t>{_mm256_set1_epi8(static_cast<char>(t))};  // NOLINT
-}
-HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
-  return Vec256<uint16_t>{_mm256_set1_epi16(static_cast<short>(t))};  // NOLINT
-}
-HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
-  return Vec256<uint32_t>{_mm256_set1_epi32(static_cast<int>(t))};
-}
-HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
-  return Vec256<uint64_t>{
-      _mm256_set1_epi64x(static_cast<long long>(t))};  // NOLINT
-}
-HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
-  return Vec256<int8_t>{_mm256_set1_epi8(static_cast<char>(t))};  // NOLINT
-}
-HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
-  return Vec256<int16_t>{_mm256_set1_epi16(static_cast<short>(t))};  // NOLINT
-}
-HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
-  return Vec256<int32_t>{_mm256_set1_epi32(t)};
-}
-HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
-  return Vec256<int64_t>{
-      _mm256_set1_epi64x(static_cast<long long>(t))};  // NOLINT
-}
-HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
-  return Vec256<float>{_mm256_set1_ps(t)};
-}
-HWY_API Vec256<double> Set(Full256<double> /* tag */, const double t) {
-  return Vec256<double>{_mm256_set1_pd(t)};
-}
-
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
-
-// Returns a vector with uninitialized elements.
-template <typename T>
-HWY_API Vec256<T> Undefined(Full256<T> /* tag */) {
-  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
-  // generate an XOR instruction.
-  return Vec256<T>{_mm256_undefined_si256()};
-}
-HWY_API Vec256<float> Undefined(Full256<float> /* tag */) {
-  return Vec256<float>{_mm256_undefined_ps()};
-}
-HWY_API Vec256<double> Undefined(Full256<double> /* tag */) {
-  return Vec256<double>{_mm256_undefined_pd()};
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// ================================================== LOGICAL
-
-// ------------------------------ And
-
-template <typename T>
-HWY_API Vec256<T> And(Vec256<T> a, Vec256<T> b) {
-  return Vec256<T>{_mm256_and_si256(a.raw, b.raw)};
-}
-
-HWY_API Vec256<float> And(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_and_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> And(const Vec256<double> a, const Vec256<double> b) {
-  return Vec256<double>{_mm256_and_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ AndNot
-
-// Returns ~not_mask & mask.
-template <typename T>
-HWY_API Vec256<T> AndNot(Vec256<T> not_mask, Vec256<T> mask) {
-  return Vec256<T>{_mm256_andnot_si256(not_mask.raw, mask.raw)};
-}
-HWY_API Vec256<float> AndNot(const Vec256<float> not_mask,
-                             const Vec256<float> mask) {
-  return Vec256<float>{_mm256_andnot_ps(not_mask.raw, mask.raw)};
-}
-HWY_API Vec256<double> AndNot(const Vec256<double> not_mask,
-                              const Vec256<double> mask) {
-  return Vec256<double>{_mm256_andnot_pd(not_mask.raw, mask.raw)};
-}
-
-// ------------------------------ Or
-
-template <typename T>
-HWY_API Vec256<T> Or(Vec256<T> a, Vec256<T> b) {
-  return Vec256<T>{_mm256_or_si256(a.raw, b.raw)};
-}
-
-HWY_API Vec256<float> Or(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_or_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> Or(const Vec256<double> a, const Vec256<double> b) {
-  return Vec256<double>{_mm256_or_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Xor
-
-template <typename T>
-HWY_API Vec256<T> Xor(Vec256<T> a, Vec256<T> b) {
-  return Vec256<T>{_mm256_xor_si256(a.raw, b.raw)};
-}
-
-HWY_API Vec256<float> Xor(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_xor_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> Xor(const Vec256<double> a, const Vec256<double> b) {
-  return Vec256<double>{_mm256_xor_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Not
-
-template <typename T>
-HWY_API Vec256<T> Not(const Vec256<T> v) {
-  using TU = MakeUnsigned<T>;
-#if HWY_TARGET <= HWY_AVX3
-  const __m256i vu = BitCast(Full256<TU>(), v).raw;
-  return BitCast(Full256<T>(),
-                 Vec256<TU>{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)});
-#else
-  return Xor(v, BitCast(Full256<T>(), Vec256<TU>{_mm256_set1_epi32(-1)}));
-#endif
-}
-
-// ------------------------------ Or3
-
-template <typename T>
-HWY_API Vec256<T> Or3(Vec256<T> o1, Vec256<T> o2, Vec256<T> o3) {
-#if HWY_TARGET <= HWY_AVX3
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  const __m256i ret = _mm256_ternarylogic_epi64(
-      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
-  return BitCast(d, VU{ret});
-#else
-  return Or(o1, Or(o2, o3));
-#endif
-}
-
-// ------------------------------ OrAnd
-
-template <typename T>
-HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) {
-#if HWY_TARGET <= HWY_AVX3
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  const __m256i ret = _mm256_ternarylogic_epi64(
-      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
-  return BitCast(d, VU{ret});
-#else
-  return Or(o, And(a1, a2));
-#endif
-}
-
-// ------------------------------ IfVecThenElse
-
-template <typename T>
-HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) {
-#if HWY_TARGET <= HWY_AVX3
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw,
-                                                 BitCast(du, yes).raw,
-                                                 BitCast(du, no).raw, 0xCA)});
-#else
-  return IfThenElse(MaskFromVec(mask), yes, no);
-#endif
-}
-
-// ------------------------------ Operator overloads (internal-only if float)
-
-template <typename T>
-HWY_API Vec256<T> operator&(const Vec256<T> a, const Vec256<T> b) {
-  return And(a, b);
-}
-
-template <typename T>
-HWY_API Vec256<T> operator|(const Vec256<T> a, const Vec256<T> b) {
-  return Or(a, b);
-}
-
-template <typename T>
-HWY_API Vec256<T> operator^(const Vec256<T> a, const Vec256<T> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ PopulationCount
-
-// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
-#if HWY_TARGET == HWY_AVX3_DL
-
-#ifdef HWY_NATIVE_POPCNT
-#undef HWY_NATIVE_POPCNT
-#else
-#define HWY_NATIVE_POPCNT
-#endif
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec256<T> v) {
-  return Vec256<T>{_mm256_popcnt_epi8(v.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
-  return Vec256<T>{_mm256_popcnt_epi16(v.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec256<T> v) {
-  return Vec256<T>{_mm256_popcnt_epi32(v.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec256<T> v) {
-  return Vec256<T>{_mm256_popcnt_epi64(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> PopulationCount(Vec256<T> v) {
-  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-// ================================================== SIGN
-
-// ------------------------------ CopySign
-
-template <typename T>
-HWY_API Vec256<T> CopySign(const Vec256<T> magn, const Vec256<T> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-
-  const Full256<T> d;
-  const auto msb = SignBit(d);
-
-#if HWY_TARGET <= HWY_AVX3
-  const Rebind<MakeUnsigned<T>, decltype(d)> du;
-  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
-  //                  0    0     0   |  0
-  //                  0    0     1   |  0
-  //                  0    1     0   |  1
-  //                  0    1     1   |  1
-  //                  1    0     0   |  0
-  //                  1    0     1   |  1
-  //                  1    1     0   |  0
-  //                  1    1     1   |  1
-  // The lane size does not matter because we are not using predication.
-  const __m256i out = _mm256_ternarylogic_epi32(
-      BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
-  return BitCast(d, decltype(Zero(du)){out});
-#else
-  return Or(AndNot(msb, magn), And(msb, sign));
-#endif
-}
-
-template <typename T>
-HWY_API Vec256<T> CopySignToAbs(const Vec256<T> abs, const Vec256<T> sign) {
-#if HWY_TARGET <= HWY_AVX3
-  // AVX3 can also handle abs < 0, so no extra action needed.
-  return CopySign(abs, sign);
-#else
-  return Or(abs, And(SignBit(Full256<T>()), sign));
-#endif
-}
-
-// ================================================== MASK
-
-#if HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ IfThenElse
-
-// Returns mask ? b : a.
-
-namespace detail {
-
-// Templates for signed/unsigned integer of a particular size.
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
-                                Vec256<T> yes, Vec256<T> no) {
-  return Vec256<T>{_mm256_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
-                                Vec256<T> yes, Vec256<T> no) {
-  return Vec256<T>{_mm256_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
-                                Vec256<T> yes, Vec256<T> no) {
-  return Vec256<T>{_mm256_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
-                                Vec256<T> yes, Vec256<T> no) {
-  return Vec256<T>{_mm256_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> IfThenElse(Mask256<T> mask, Vec256<T> yes, Vec256<T> no) {
-  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
-}
-HWY_API Vec256<float> IfThenElse(Mask256<float> mask, Vec256<float> yes,
-                                 Vec256<float> no) {
-  return Vec256<float>{_mm256_mask_mov_ps(no.raw, mask.raw, yes.raw)};
-}
-HWY_API Vec256<double> IfThenElse(Mask256<double> mask, Vec256<double> yes,
-                                  Vec256<double> no) {
-  return Vec256<double>{_mm256_mask_mov_pd(no.raw, mask.raw, yes.raw)};
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
-                                    Vec256<T> yes) {
-  return Vec256<T>{_mm256_maskz_mov_epi8(mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
-                                    Vec256<T> yes) {
-  return Vec256<T>{_mm256_maskz_mov_epi16(mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
-                                    Vec256<T> yes) {
-  return Vec256<T>{_mm256_maskz_mov_epi32(mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
-                                    Vec256<T> yes) {
-  return Vec256<T>{_mm256_maskz_mov_epi64(mask.raw, yes.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
-  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
-}
-HWY_API Vec256<float> IfThenElseZero(Mask256<float> mask, Vec256<float> yes) {
-  return Vec256<float>{_mm256_maskz_mov_ps(mask.raw, yes.raw)};
-}
-HWY_API Vec256<double> IfThenElseZero(Mask256<double> mask,
-                                      Vec256<double> yes) {
-  return Vec256<double>{_mm256_maskz_mov_pd(mask.raw, yes.raw)};
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256<T> mask,
-                                    Vec256<T> no) {
-  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
-  return Vec256<T>{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256<T> mask,
-                                    Vec256<T> no) {
-  return Vec256<T>{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256<T> mask,
-                                    Vec256<T> no) {
-  return Vec256<T>{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256<T> mask,
-                                    Vec256<T> no) {
-  return Vec256<T>{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
-  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
-}
-HWY_API Vec256<float> IfThenZeroElse(Mask256<float> mask, Vec256<float> no) {
-  return Vec256<float>{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
-}
-HWY_API Vec256<double> IfThenZeroElse(Mask256<double> mask, Vec256<double> no) {
-  return Vec256<double>{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
-}
-
-template <typename T>
-HWY_API Vec256<T> ZeroIfNegative(const Vec256<T> v) {
-  static_assert(IsSigned<T>(), "Only for float");
-  // AVX3 MaskFromVec only looks at the MSB
-  return IfThenZeroElse(MaskFromVec(v), v);
-}
-
-// ------------------------------ Mask logical
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Mask256<T> And(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kand_mask32(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask32>(a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> And(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kand_mask16(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask16>(a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> And(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> And(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kand_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(a.raw & b.raw)};
-#endif
-}
-
-template <typename T>
-HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
-                             const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kandn_mask32(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask32>(~a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
-                             const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kandn_mask16(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask16>(~a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
-                             const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> AndNot(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
-                             const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kandn_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(~a.raw & b.raw)};
-#endif
-}
-
-template <typename T>
-HWY_INLINE Mask256<T> Or(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
-                         const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kor_mask32(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask32>(a.raw | b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> Or(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
-                         const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kor_mask16(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask16>(a.raw | b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> Or(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
-                         const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> Or(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
-                         const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kor_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(a.raw | b.raw)};
-#endif
-}
-
-template <typename T>
-HWY_INLINE Mask256<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kxor_mask32(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask32>(a.raw ^ b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> Xor(hwy::SizeTag<2> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kxor_mask16(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask16>(a.raw ^ b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> Xor(hwy::SizeTag<4> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask256<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask256<T> a,
-                          const Mask256<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask256<T>{_kxor_mask8(a.raw, b.raw)};
-#else
-  return Mask256<T>{static_cast<__mmask8>(a.raw ^ b.raw)};
-#endif
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
-  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T>
-HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
-  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T>
-HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
-  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T>
-HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
-  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T>
-HWY_API Mask256<T> Not(const Mask256<T> m) {
-  // Flip only the valid bits.
-  constexpr size_t N = 32 / sizeof(T);
-  return Xor(m, Mask256<T>::FromBits((1ull << N) - 1));
-}
-
-#else  // AVX2
-
-// ------------------------------ Mask
-
-// Mask and Vec are the same (true = FF..FF).
-template <typename T>
-HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
-  return Mask256<T>{v.raw};
-}
-
-template <typename T>
-HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
-  return Vec256<T>{v.raw};
-}
-
-template <typename T>
-HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
-  return Vec256<T>{v.raw};
-}
-
-// ------------------------------ IfThenElse
-
-// mask ? yes : no
-template <typename T>
-HWY_API Vec256<T> IfThenElse(const Mask256<T> mask, const Vec256<T> yes,
-                             const Vec256<T> no) {
-  return Vec256<T>{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)};
-}
-HWY_API Vec256<float> IfThenElse(const Mask256<float> mask,
-                                 const Vec256<float> yes,
-                                 const Vec256<float> no) {
-  return Vec256<float>{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)};
-}
-HWY_API Vec256<double> IfThenElse(const Mask256<double> mask,
-                                  const Vec256<double> yes,
-                                  const Vec256<double> no) {
-  return Vec256<double>{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)};
-}
-
-// mask ? yes : 0
-template <typename T>
-HWY_API Vec256<T> IfThenElseZero(Mask256<T> mask, Vec256<T> yes) {
-  return yes & VecFromMask(Full256<T>(), mask);
-}
-
-// mask ? 0 : no
-template <typename T>
-HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) {
-  return AndNot(VecFromMask(Full256<T>(), mask), no);
-}
-
-template <typename T>
-HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) {
-  static_assert(IsSigned<T>(), "Only for float");
-  const auto zero = Zero(Full256<T>());
-  // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
-  return IfThenElse(MaskFromVec(v), zero, v);
-}
-
-// ------------------------------ Mask logical
-
-template <typename T>
-HWY_API Mask256<T> Not(const Mask256<T> m) {
-  return MaskFromVec(Not(VecFromMask(Full256<T>(), m)));
-}
-
-template <typename T>
-HWY_API Mask256<T> And(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask256<T> AndNot(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask256<T> Or(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-template <typename T>
-HWY_API Mask256<T> Xor(const Mask256<T> a, Mask256<T> b) {
-  const Full256<T> d;
-  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ================================================== COMPARE
-
-#if HWY_TARGET <= HWY_AVX3
-
-// Comparisons set a mask bit to 1 if the condition is true, else 0.
-
-template <typename TFrom, typename TTo>
-HWY_API Mask256<TTo> RebindMask(Full256<TTo> /*tag*/, Mask256<TFrom> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask256<TTo>{m.raw};
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec256<T> v,
-                              const Vec256<T> bit) {
-  return Mask256<T>{_mm256_test_epi8_mask(v.raw, bit.raw)};
-}
-template <typename T>
-HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec256<T> v,
-                              const Vec256<T> bit) {
-  return Mask256<T>{_mm256_test_epi16_mask(v.raw, bit.raw)};
-}
-template <typename T>
-HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec256<T> v,
-                              const Vec256<T> bit) {
-  return Mask256<T>{_mm256_test_epi32_mask(v.raw, bit.raw)};
-}
-template <typename T>
-HWY_INLINE Mask256<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec256<T> v,
-                              const Vec256<T> bit) {
-  return Mask256<T>{_mm256_test_epi64_mask(v.raw, bit.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
-}
-
-// ------------------------------ Equality
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask256<float> operator==(Vec256<float> a, Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-HWY_API Mask256<double> operator==(Vec256<double> a, Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-// ------------------------------ Inequality
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask256<float> operator!=(Vec256<float> a, Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-
-HWY_API Mask256<double> operator!=(Vec256<double> a, Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-
-// ------------------------------ Strict inequality
-
-HWY_API Mask256<int8_t> operator>(Vec256<int8_t> a, Vec256<int8_t> b) {
-  return Mask256<int8_t>{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
-}
-HWY_API Mask256<int16_t> operator>(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Mask256<int16_t>{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
-}
-HWY_API Mask256<int32_t> operator>(Vec256<int32_t> a, Vec256<int32_t> b) {
-  return Mask256<int32_t>{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
-}
-HWY_API Mask256<int64_t> operator>(Vec256<int64_t> a, Vec256<int64_t> b) {
-  return Mask256<int64_t>{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask256<uint8_t> operator>(Vec256<uint8_t> a, Vec256<uint8_t> b) {
-  return Mask256<uint8_t>{_mm256_cmpgt_epu8_mask(a.raw, b.raw)};
-}
-HWY_API Mask256<uint16_t> operator>(const Vec256<uint16_t> a,
-                                    const Vec256<uint16_t> b) {
-  return Mask256<uint16_t>{_mm256_cmpgt_epu16_mask(a.raw, b.raw)};
-}
-HWY_API Mask256<uint32_t> operator>(const Vec256<uint32_t> a,
-                                    const Vec256<uint32_t> b) {
-  return Mask256<uint32_t>{_mm256_cmpgt_epu32_mask(a.raw, b.raw)};
-}
-HWY_API Mask256<uint64_t> operator>(const Vec256<uint64_t> a,
-                                    const Vec256<uint64_t> b) {
-  return Mask256<uint64_t>{_mm256_cmpgt_epu64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask256<float> operator>(Vec256<float> a, Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
-}
-HWY_API Mask256<double> operator>(Vec256<double> a, Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
-}
-
-// ------------------------------ Weak inequality
-
-HWY_API Mask256<float> operator>=(Vec256<float> a, Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
-}
-HWY_API Mask256<double> operator>=(Vec256<double> a, Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
-}
-
-// ------------------------------ Mask
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256<T> v) {
-  return Mask256<T>{_mm256_movepi8_mask(v.raw)};
-}
-template <typename T>
-HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256<T> v) {
-  return Mask256<T>{_mm256_movepi16_mask(v.raw)};
-}
-template <typename T>
-HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256<T> v) {
-  return Mask256<T>{_mm256_movepi32_mask(v.raw)};
-}
-template <typename T>
-HWY_INLINE Mask256<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256<T> v) {
-  return Mask256<T>{_mm256_movepi64_mask(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Mask256<T> MaskFromVec(const Vec256<T> v) {
-  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
-}
-// There do not seem to be native floating-point versions of these instructions.
-HWY_API Mask256<float> MaskFromVec(const Vec256<float> v) {
-  return Mask256<float>{MaskFromVec(BitCast(Full256<int32_t>(), v)).raw};
-}
-HWY_API Mask256<double> MaskFromVec(const Vec256<double> v) {
-  return Mask256<double>{MaskFromVec(BitCast(Full256<int64_t>(), v)).raw};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
-  return Vec256<T>{_mm256_movm_epi8(v.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
-  return Vec256<T>{_mm256_movm_epi16(v.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
-  return Vec256<T>{_mm256_movm_epi32(v.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> VecFromMask(const Mask256<T> v) {
-  return Vec256<T>{_mm256_movm_epi64(v.raw)};
-}
-
-HWY_API Vec256<float> VecFromMask(const Mask256<float> v) {
-  return Vec256<float>{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
-}
-
-HWY_API Vec256<double> VecFromMask(const Mask256<double> v) {
-  return Vec256<double>{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
-}
-
-template <typename T>
-HWY_API Vec256<T> VecFromMask(Full256<T> /* tag */, const Mask256<T> v) {
-  return VecFromMask(v);
-}
-
-#else  // AVX2
-
-// Comparisons fill a lane with 1-bits if the condition is true, else 0.
-
-template <typename TFrom, typename TTo>
-HWY_API Mask256<TTo> RebindMask(Full256<TTo> d_to, Mask256<TFrom> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return MaskFromVec(BitCast(d_to, VecFromMask(Full256<TFrom>(), m)));
-}
-
-template <typename T>
-HWY_API Mask256<T> TestBit(const Vec256<T> v, const Vec256<T> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return (v & bit) == bit;
-}
-
-// ------------------------------ Equality
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi8(a.raw, b.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi16(a.raw, b.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi32(a.raw, b.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Mask256<T> operator==(const Vec256<T> a, const Vec256<T> b) {
-  return Mask256<T>{_mm256_cmpeq_epi64(a.raw, b.raw)};
-}
-
-HWY_API Mask256<float> operator==(const Vec256<float> a,
-                                  const Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-HWY_API Mask256<double> operator==(const Vec256<double> a,
-                                   const Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-// ------------------------------ Inequality
-
-template <typename T>
-HWY_API Mask256<T> operator!=(const Vec256<T> a, const Vec256<T> b) {
-  return Not(a == b);
-}
-HWY_API Mask256<float> operator!=(const Vec256<float> a,
-                                  const Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-HWY_API Mask256<double> operator!=(const Vec256<double> a,
-                                   const Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-
-// ------------------------------ Strict inequality
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
-// to perform an unsigned comparison instead of the intended signed. Workaround
-// is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
-#if HWY_COMPILER_GCC != 0 && HWY_COMPILER_GCC < 930
-#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
-#else
-#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
-#endif
-
-HWY_API Mask256<int8_t> Gt(hwy::SignedTag /*tag*/, Vec256<int8_t> a,
-                           Vec256<int8_t> b) {
-#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
-  using i8x32 = signed char __attribute__((__vector_size__(32)));
-  return Mask256<int8_t>{static_cast<__m256i>(reinterpret_cast<i8x32>(a.raw) >
-                                              reinterpret_cast<i8x32>(b.raw))};
-#else
-  return Mask256<int8_t>{_mm256_cmpgt_epi8(a.raw, b.raw)};
-#endif
-}
-HWY_API Mask256<int16_t> Gt(hwy::SignedTag /*tag*/, Vec256<int16_t> a,
-                            Vec256<int16_t> b) {
-  return Mask256<int16_t>{_mm256_cmpgt_epi16(a.raw, b.raw)};
-}
-HWY_API Mask256<int32_t> Gt(hwy::SignedTag /*tag*/, Vec256<int32_t> a,
-                            Vec256<int32_t> b) {
-  return Mask256<int32_t>{_mm256_cmpgt_epi32(a.raw, b.raw)};
-}
-HWY_API Mask256<int64_t> Gt(hwy::SignedTag /*tag*/, Vec256<int64_t> a,
-                            Vec256<int64_t> b) {
-  return Mask256<int64_t>{_mm256_cmpgt_epi64(a.raw, b.raw)};
-}
-
-template <typename T>
-HWY_INLINE Mask256<T> Gt(hwy::UnsignedTag /*tag*/, Vec256<T> a, Vec256<T> b) {
-  const Full256<T> du;
-  const RebindToSigned<decltype(du)> di;
-  const Vec256<T> msb = Set(du, (LimitsMax<T>() >> 1) + 1);
-  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
-}
-
-HWY_API Mask256<float> Gt(hwy::FloatTag /*tag*/, Vec256<float> a,
-                          Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
-}
-HWY_API Mask256<double> Gt(hwy::FloatTag /*tag*/, Vec256<double> a,
-                           Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Mask256<T> operator>(Vec256<T> a, Vec256<T> b) {
-  return detail::Gt(hwy::TypeTag<T>(), a, b);
-}
-
-// ------------------------------ Weak inequality
-
-HWY_API Mask256<float> operator>=(const Vec256<float> a,
-                                  const Vec256<float> b) {
-  return Mask256<float>{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
-}
-HWY_API Mask256<double> operator>=(const Vec256<double> a,
-                                   const Vec256<double> b) {
-  return Mask256<double>{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ Reversed comparisons
-
-template <typename T>
-HWY_API Mask256<T> operator<(const Vec256<T> a, const Vec256<T> b) {
-  return b > a;
-}
-
-template <typename T>
-HWY_API Mask256<T> operator<=(const Vec256<T> a, const Vec256<T> b) {
-  return b >= a;
-}
-
-// ------------------------------ Min (Gt, IfThenElse)
-
-// Unsigned
-HWY_API Vec256<uint8_t> Min(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_min_epu8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> Min(const Vec256<uint16_t> a,
-                             const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_min_epu16(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> Min(const Vec256<uint32_t> a,
-                             const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{_mm256_min_epu32(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> Min(const Vec256<uint64_t> a,
-                             const Vec256<uint64_t> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<uint64_t>{_mm256_min_epu64(a.raw, b.raw)};
-#else
-  const Full256<uint64_t> du;
-  const Full256<int64_t> di;
-  const auto msb = Set(du, 1ull << 63);
-  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
-  return IfThenElse(gt, b, a);
-#endif
-}
-
-// Signed
-HWY_API Vec256<int8_t> Min(const Vec256<int8_t> a, const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_min_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> Min(const Vec256<int16_t> a, const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_min_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> Min(const Vec256<int32_t> a, const Vec256<int32_t> b) {
-  return Vec256<int32_t>{_mm256_min_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> Min(const Vec256<int64_t> a, const Vec256<int64_t> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_min_epi64(a.raw, b.raw)};
-#else
-  return IfThenElse(a < b, a, b);
-#endif
-}
-
-// Float
-HWY_API Vec256<float> Min(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_min_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> Min(const Vec256<double> a, const Vec256<double> b) {
-  return Vec256<double>{_mm256_min_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Max (Gt, IfThenElse)
-
-// Unsigned
-HWY_API Vec256<uint8_t> Max(const Vec256<uint8_t> a, const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_max_epu8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> Max(const Vec256<uint16_t> a,
-                             const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_max_epu16(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> Max(const Vec256<uint32_t> a,
-                             const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{_mm256_max_epu32(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> Max(const Vec256<uint64_t> a,
-                             const Vec256<uint64_t> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<uint64_t>{_mm256_max_epu64(a.raw, b.raw)};
-#else
-  const Full256<uint64_t> du;
-  const Full256<int64_t> di;
-  const auto msb = Set(du, 1ull << 63);
-  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
-  return IfThenElse(gt, a, b);
-#endif
-}
-
-// Signed
-HWY_API Vec256<int8_t> Max(const Vec256<int8_t> a, const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_max_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> Max(const Vec256<int16_t> a, const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_max_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> Max(const Vec256<int32_t> a, const Vec256<int32_t> b) {
-  return Vec256<int32_t>{_mm256_max_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> Max(const Vec256<int64_t> a, const Vec256<int64_t> b) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_max_epi64(a.raw, b.raw)};
-#else
-  return IfThenElse(a < b, b, a);
-#endif
-}
-
-// Float
-HWY_API Vec256<float> Max(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_max_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> Max(const Vec256<double> a, const Vec256<double> b) {
-  return Vec256<double>{_mm256_max_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ FirstN (Iota, Lt)
-
-template <typename T>
-HWY_API Mask256<T> FirstN(const Full256<T> d, size_t n) {
-#if HWY_TARGET <= HWY_AVX3
-  (void)d;
-  constexpr size_t N = 32 / sizeof(T);
-#if HWY_ARCH_X86_64
-  const uint64_t all = (1ull << N) - 1;
-  // BZHI only looks at the lower 8 bits of n!
-  return Mask256<T>::FromBits((n > 255) ? all : _bzhi_u64(all, n));
-#else
-  const uint32_t all = static_cast<uint32_t>((1ull << N) - 1);
-  // BZHI only looks at the lower 8 bits of n!
-  return Mask256<T>::FromBits(
-      (n > 255) ? all : _bzhi_u32(all, static_cast<uint32_t>(n)));
-#endif  // HWY_ARCH_X86_64
-#else
-  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
-  return RebindMask(d, Iota(di, 0) < Set(di, static_cast<MakeSigned<T>>(n)));
-#endif
-}
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Addition
-
-// Unsigned
-HWY_API Vec256<uint8_t> operator+(const Vec256<uint8_t> a,
-                                  const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_add_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> operator+(const Vec256<uint16_t> a,
-                                   const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_add_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> operator+(const Vec256<uint32_t> a,
-                                   const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{_mm256_add_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> operator+(const Vec256<uint64_t> a,
-                                   const Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{_mm256_add_epi64(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> operator+(const Vec256<int8_t> a,
-                                 const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_add_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> operator+(const Vec256<int16_t> a,
-                                  const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_add_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> operator+(const Vec256<int32_t> a,
-                                  const Vec256<int32_t> b) {
-  return Vec256<int32_t>{_mm256_add_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> operator+(const Vec256<int64_t> a,
-                                  const Vec256<int64_t> b) {
-  return Vec256<int64_t>{_mm256_add_epi64(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec256<float> operator+(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_add_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> operator+(const Vec256<double> a,
-                                 const Vec256<double> b) {
-  return Vec256<double>{_mm256_add_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Subtraction
-
-// Unsigned
-HWY_API Vec256<uint8_t> operator-(const Vec256<uint8_t> a,
-                                  const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_sub_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> operator-(const Vec256<uint16_t> a,
-                                   const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_sub_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> operator-(const Vec256<uint32_t> a,
-                                   const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{_mm256_sub_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> operator-(const Vec256<uint64_t> a,
-                                   const Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{_mm256_sub_epi64(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> operator-(const Vec256<int8_t> a,
-                                 const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_sub_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> operator-(const Vec256<int16_t> a,
-                                  const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_sub_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> operator-(const Vec256<int32_t> a,
-                                  const Vec256<int32_t> b) {
-  return Vec256<int32_t>{_mm256_sub_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> operator-(const Vec256<int64_t> a,
-                                  const Vec256<int64_t> b) {
-  return Vec256<int64_t>{_mm256_sub_epi64(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec256<float> operator-(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_sub_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> operator-(const Vec256<double> a,
-                                 const Vec256<double> b) {
-  return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ SumsOf8
-HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) {
-  return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
-}
-
-// ------------------------------ SaturatedAdd
-
-// Returns a + b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec256<uint8_t> SaturatedAdd(const Vec256<uint8_t> a,
-                                     const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_adds_epu8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> SaturatedAdd(const Vec256<uint16_t> a,
-                                      const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_adds_epu16(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> SaturatedAdd(const Vec256<int8_t> a,
-                                    const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_adds_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a,
-                                     const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)};
-}
-
-// ------------------------------ SaturatedSub
-
-// Returns a - b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec256<uint8_t> SaturatedSub(const Vec256<uint8_t> a,
-                                     const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_subs_epu8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> SaturatedSub(const Vec256<uint16_t> a,
-                                      const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_subs_epu16(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int8_t> SaturatedSub(const Vec256<int8_t> a,
-                                    const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_subs_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> SaturatedSub(const Vec256<int16_t> a,
-                                     const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_subs_epi16(a.raw, b.raw)};
-}
-
-// ------------------------------ Average
-
-// Returns (a + b + 1) / 2
-
-// Unsigned
-HWY_API Vec256<uint8_t> AverageRound(const Vec256<uint8_t> a,
-                                     const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_avg_epu8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> AverageRound(const Vec256<uint16_t> a,
-                                      const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_avg_epu16(a.raw, b.raw)};
-}
-
-// ------------------------------ Abs (Sub)
-
-// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
-HWY_API Vec256<int8_t> Abs(const Vec256<int8_t> v) {
-#if HWY_COMPILER_MSVC
-  // Workaround for incorrect codegen? (wrong result)
-  const auto zero = Zero(Full256<int8_t>());
-  return Vec256<int8_t>{_mm256_max_epi8(v.raw, (zero - v).raw)};
-#else
-  return Vec256<int8_t>{_mm256_abs_epi8(v.raw)};
-#endif
-}
-HWY_API Vec256<int16_t> Abs(const Vec256<int16_t> v) {
-  return Vec256<int16_t>{_mm256_abs_epi16(v.raw)};
-}
-HWY_API Vec256<int32_t> Abs(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{_mm256_abs_epi32(v.raw)};
-}
-// i64 is implemented after BroadcastSignBit.
-
-HWY_API Vec256<float> Abs(const Vec256<float> v) {
-  const Vec256<int32_t> mask{_mm256_set1_epi32(0x7FFFFFFF)};
-  return v & BitCast(Full256<float>(), mask);
-}
-HWY_API Vec256<double> Abs(const Vec256<double> v) {
-  const Vec256<int64_t> mask{_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL)};
-  return v & BitCast(Full256<double>(), mask);
-}
-
-// ------------------------------ Integer multiplication
-
-// Unsigned
-HWY_API Vec256<uint16_t> operator*(Vec256<uint16_t> a, Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> operator*(Vec256<uint32_t> a, Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec256<int16_t> operator*(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_mullo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> operator*(Vec256<int32_t> a, Vec256<int32_t> b) {
-  return Vec256<int32_t>{_mm256_mullo_epi32(a.raw, b.raw)};
-}
-
-// Returns the upper 16 bits of a * b in each lane.
-HWY_API Vec256<uint16_t> MulHigh(Vec256<uint16_t> a, Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_mulhi_epu16(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> MulHigh(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_mulhi_epi16(a.raw, b.raw)};
-}
-
-HWY_API Vec256<int16_t> MulFixedPoint15(Vec256<int16_t> a, Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_mulhrs_epi16(a.raw, b.raw)};
-}
-
-// Multiplies even lanes (0, 2 ..) and places the double-wide result into
-// even and the upper half into its odd neighbor lane.
-HWY_API Vec256<int64_t> MulEven(Vec256<int32_t> a, Vec256<int32_t> b) {
-  return Vec256<int64_t>{_mm256_mul_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> MulEven(Vec256<uint32_t> a, Vec256<uint32_t> b) {
-  return Vec256<uint64_t>{_mm256_mul_epu32(a.raw, b.raw)};
-}
-
-// ------------------------------ ShiftLeft
-
-template <int kBits>
-HWY_API Vec256<uint16_t> ShiftLeft(const Vec256<uint16_t> v) {
-  return Vec256<uint16_t>{_mm256_slli_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<uint32_t> ShiftLeft(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{_mm256_slli_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<uint64_t> ShiftLeft(const Vec256<uint64_t> v) {
-  return Vec256<uint64_t>{_mm256_slli_epi64(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<int16_t> ShiftLeft(const Vec256<int16_t> v) {
-  return Vec256<int16_t>{_mm256_slli_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<int32_t> ShiftLeft(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{_mm256_slli_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<int64_t> ShiftLeft(const Vec256<int64_t> v) {
-  return Vec256<int64_t>{_mm256_slli_epi64(v.raw, kBits)};
-}
-
-template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> ShiftLeft(const Vec256<T> v) {
-  const Full256<T> d8;
-  const RepartitionToWide<decltype(d8)> d16;
-  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
-  return kBits == 1
-             ? (v + v)
-             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
-}
-
-// ------------------------------ ShiftRight
-
-template <int kBits>
-HWY_API Vec256<uint16_t> ShiftRight(const Vec256<uint16_t> v) {
-  return Vec256<uint16_t>{_mm256_srli_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<uint32_t> ShiftRight(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{_mm256_srli_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<uint64_t> ShiftRight(const Vec256<uint64_t> v) {
-  return Vec256<uint64_t>{_mm256_srli_epi64(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<uint8_t> ShiftRight(const Vec256<uint8_t> v) {
-  const Full256<uint8_t> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec256<uint8_t> shifted{ShiftRight<kBits>(Vec256<uint16_t>{v.raw}).raw};
-  return shifted & Set(d8, 0xFF >> kBits);
-}
-
-template <int kBits>
-HWY_API Vec256<int16_t> ShiftRight(const Vec256<int16_t> v) {
-  return Vec256<int16_t>{_mm256_srai_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<int32_t> ShiftRight(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{_mm256_srai_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec256<int8_t> ShiftRight(const Vec256<int8_t> v) {
-  const Full256<int8_t> di;
-  const Full256<uint8_t> du;
-  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
-  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// i64 is implemented after BroadcastSignBit.
-
-// ------------------------------ RotateRight
-
-template <int kBits>
-HWY_API Vec256<uint32_t> RotateRight(const Vec256<uint32_t> v) {
-  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<uint32_t>{_mm256_ror_epi32(v.raw, kBits)};
-#else
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(31, 32 - kBits)>(v));
-#endif
-}
-
-template <int kBits>
-HWY_API Vec256<uint64_t> RotateRight(const Vec256<uint64_t> v) {
-  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<uint64_t>{_mm256_ror_epi64(v.raw, kBits)};
-#else
-  if (kBits == 0) return v;
-  return Or(ShiftRight<kBits>(v), ShiftLeft<HWY_MIN(63, 64 - kBits)>(v));
-#endif
-}
-
-// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
-
-HWY_API Vec256<int8_t> BroadcastSignBit(const Vec256<int8_t> v) {
-  return VecFromMask(v < Zero(Full256<int8_t>()));
-}
-
-HWY_API Vec256<int16_t> BroadcastSignBit(const Vec256<int16_t> v) {
-  return ShiftRight<15>(v);
-}
-
-HWY_API Vec256<int32_t> BroadcastSignBit(const Vec256<int32_t> v) {
-  return ShiftRight<31>(v);
-}
-
-HWY_API Vec256<int64_t> BroadcastSignBit(const Vec256<int64_t> v) {
-#if HWY_TARGET == HWY_AVX2
-  return VecFromMask(v < Zero(Full256<int64_t>()));
-#else
-  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, 63)};
-#endif
-}
-
-template <int kBits>
-HWY_API Vec256<int64_t> ShiftRight(const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_srai_epi64(v.raw, kBits)};
-#else
-  const Full256<int64_t> di;
-  const Full256<uint64_t> du;
-  const auto right = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
-  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
-  return right | sign;
-#endif
-}
-
-HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_abs_epi64(v.raw)};
-#else
-  const auto zero = Zero(Full256<int64_t>());
-  return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v);
-#endif
-}
-
-// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
-HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes,
-                                          Vec256<int8_t> no) {
-  // int8: AVX2 IfThenElse only looks at the MSB.
-  return IfThenElse(MaskFromVec(v), yes, no);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
-  static_assert(IsSigned<T>(), "Only works for signed/float");
-  const Full256<T> d;
-  const RebindToSigned<decltype(d)> di;
-
-  // 16-bit: no native blendv, so copy sign to lower byte's MSB.
-  v = BitCast(d, BroadcastSignBit(BitCast(di, v)));
-  return IfThenElse(MaskFromVec(v), yes, no);
-}
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) {
-  static_assert(IsSigned<T>(), "Only works for signed/float");
-  const Full256<T> d;
-  const RebindToFloat<decltype(d)> df;
-
-  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
-  const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v));
-  return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
-}
-
-// ------------------------------ ShiftLeftSame
-
-HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v,
-                                       const int bits) {
-  return Vec256<uint16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec256<uint32_t> ShiftLeftSame(const Vec256<uint32_t> v,
-                                       const int bits) {
-  return Vec256<uint32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec256<uint64_t> ShiftLeftSame(const Vec256<uint64_t> v,
-                                       const int bits) {
-  return Vec256<uint64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec256<int16_t> ShiftLeftSame(const Vec256<int16_t> v, const int bits) {
-  return Vec256<int16_t>{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec256<int32_t> ShiftLeftSame(const Vec256<int32_t> v, const int bits) {
-  return Vec256<int32_t>{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec256<int64_t> ShiftLeftSame(const Vec256<int64_t> v, const int bits) {
-  return Vec256<int64_t>{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> ShiftLeftSame(const Vec256<T> v, const int bits) {
-  const Full256<T> d8;
-  const RepartitionToWide<decltype(d8)> d16;
-  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
-  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
-}
-
-// ------------------------------ ShiftRightSame (BroadcastSignBit)
-
-HWY_API Vec256<uint16_t> ShiftRightSame(const Vec256<uint16_t> v,
-                                        const int bits) {
-  return Vec256<uint16_t>{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec256<uint32_t> ShiftRightSame(const Vec256<uint32_t> v,
-                                        const int bits) {
-  return Vec256<uint32_t>{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec256<uint64_t> ShiftRightSame(const Vec256<uint64_t> v,
-                                        const int bits) {
-  return Vec256<uint64_t>{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec256<uint8_t> ShiftRightSame(Vec256<uint8_t> v, const int bits) {
-  const Full256<uint8_t> d8;
-  const RepartitionToWide<decltype(d8)> d16;
-  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
-  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
-}
-
-HWY_API Vec256<int16_t> ShiftRightSame(const Vec256<int16_t> v,
-                                       const int bits) {
-  return Vec256<int16_t>{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec256<int32_t> ShiftRightSame(const Vec256<int32_t> v,
-                                       const int bits) {
-  return Vec256<int32_t>{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec256<int64_t> ShiftRightSame(const Vec256<int64_t> v,
-                                       const int bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-#else
-  const Full256<int64_t> di;
-  const Full256<uint64_t> du;
-  const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
-  const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits);
-  return right | sign;
-#endif
-}
-
-HWY_API Vec256<int8_t> ShiftRightSame(Vec256<int8_t> v, const int bits) {
-  const Full256<int8_t> di;
-  const Full256<uint8_t> du;
-  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
-  const auto shifted_sign =
-      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ------------------------------ Neg (Xor, Sub)
-
-// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> Neg(hwy::FloatTag /*tag*/, const Vec256<T> v) {
-  return Xor(v, SignBit(Full256<T>()));
-}
-
-// Not floating-point
-template <typename T>
-HWY_INLINE Vec256<T> Neg(hwy::NonFloatTag /*tag*/, const Vec256<T> v) {
-  return Zero(Full256<T>()) - v;
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> Neg(const Vec256<T> v) {
-  return detail::Neg(hwy::IsFloatTag<T>(), v);
-}
-
-// ------------------------------ Floating-point mul / div
-
-HWY_API Vec256<float> operator*(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_mul_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> operator*(const Vec256<double> a,
-                                 const Vec256<double> b) {
-  return Vec256<double>{_mm256_mul_pd(a.raw, b.raw)};
-}
-
-HWY_API Vec256<float> operator/(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_div_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> operator/(const Vec256<double> a,
-                                 const Vec256<double> b) {
-  return Vec256<double>{_mm256_div_pd(a.raw, b.raw)};
-}
-
-// Approximate reciprocal
-HWY_API Vec256<float> ApproximateReciprocal(const Vec256<float> v) {
-  return Vec256<float>{_mm256_rcp_ps(v.raw)};
-}
-
-// Absolute value of difference.
-HWY_API Vec256<float> AbsDiff(const Vec256<float> a, const Vec256<float> b) {
-  return Abs(a - b);
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-// Returns mul * x + add
-HWY_API Vec256<float> MulAdd(const Vec256<float> mul, const Vec256<float> x,
-                             const Vec256<float> add) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return mul * x + add;
-#else
-  return Vec256<float>{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)};
-#endif
-}
-HWY_API Vec256<double> MulAdd(const Vec256<double> mul, const Vec256<double> x,
-                              const Vec256<double> add) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return mul * x + add;
-#else
-  return Vec256<double>{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)};
-#endif
-}
-
-// Returns add - mul * x
-HWY_API Vec256<float> NegMulAdd(const Vec256<float> mul, const Vec256<float> x,
-                                const Vec256<float> add) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return add - mul * x;
-#else
-  return Vec256<float>{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)};
-#endif
-}
-HWY_API Vec256<double> NegMulAdd(const Vec256<double> mul,
-                                 const Vec256<double> x,
-                                 const Vec256<double> add) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return add - mul * x;
-#else
-  return Vec256<double>{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)};
-#endif
-}
-
-// Returns mul * x - sub
-HWY_API Vec256<float> MulSub(const Vec256<float> mul, const Vec256<float> x,
-                             const Vec256<float> sub) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return mul * x - sub;
-#else
-  return Vec256<float>{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)};
-#endif
-}
-HWY_API Vec256<double> MulSub(const Vec256<double> mul, const Vec256<double> x,
-                              const Vec256<double> sub) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return mul * x - sub;
-#else
-  return Vec256<double>{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)};
-#endif
-}
-
-// Returns -mul * x - sub
-HWY_API Vec256<float> NegMulSub(const Vec256<float> mul, const Vec256<float> x,
-                                const Vec256<float> sub) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return Neg(mul * x) - sub;
-#else
-  return Vec256<float>{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)};
-#endif
-}
-HWY_API Vec256<double> NegMulSub(const Vec256<double> mul,
-                                 const Vec256<double> x,
-                                 const Vec256<double> sub) {
-#ifdef HWY_DISABLE_BMI2_FMA
-  return Neg(mul * x) - sub;
-#else
-  return Vec256<double>{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)};
-#endif
-}
-
-// ------------------------------ Floating-point square root
-
-// Full precision square root
-HWY_API Vec256<float> Sqrt(const Vec256<float> v) {
-  return Vec256<float>{_mm256_sqrt_ps(v.raw)};
-}
-HWY_API Vec256<double> Sqrt(const Vec256<double> v) {
-  return Vec256<double>{_mm256_sqrt_pd(v.raw)};
-}
-
-// Approximate reciprocal square root
-HWY_API Vec256<float> ApproximateReciprocalSqrt(const Vec256<float> v) {
-  return Vec256<float>{_mm256_rsqrt_ps(v.raw)};
-}
-
-// ------------------------------ Floating-point rounding
-
-// Toward nearest integer, tie to even
-HWY_API Vec256<float> Round(const Vec256<float> v) {
-  return Vec256<float>{
-      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec256<double> Round(const Vec256<double> v) {
-  return Vec256<double>{
-      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
-}
-
-// Toward zero, aka truncate
-HWY_API Vec256<float> Trunc(const Vec256<float> v) {
-  return Vec256<float>{
-      _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec256<double> Trunc(const Vec256<double> v) {
-  return Vec256<double>{
-      _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
-}
-
-// Toward +infinity, aka ceiling
-HWY_API Vec256<float> Ceil(const Vec256<float> v) {
-  return Vec256<float>{
-      _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec256<double> Ceil(const Vec256<double> v) {
-  return Vec256<double>{
-      _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
-}
-
-// Toward -infinity, aka floor
-HWY_API Vec256<float> Floor(const Vec256<float> v) {
-  return Vec256<float>{
-      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec256<double> Floor(const Vec256<double> v) {
-  return Vec256<double>{
-      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
-}
-
-// ------------------------------ Floating-point classification
-
-HWY_API Mask256<float> IsNaN(const Vec256<float> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x81)};
-#else
-  return Mask256<float>{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)};
-#endif
-}
-HWY_API Mask256<double> IsNaN(const Vec256<double> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x81)};
-#else
-  return Mask256<double>{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)};
-#endif
-}
-
-#if HWY_TARGET <= HWY_AVX3
-
-HWY_API Mask256<float> IsInf(const Vec256<float> v) {
-  return Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x18)};
-}
-HWY_API Mask256<double> IsInf(const Vec256<double> v) {
-  return Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x18)};
-}
-
-HWY_API Mask256<float> IsFinite(const Vec256<float> v) {
-  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
-  // and negate the mask.
-  return Not(Mask256<float>{_mm256_fpclass_ps_mask(v.raw, 0x99)});
-}
-HWY_API Mask256<double> IsFinite(const Vec256<double> v) {
-  return Not(Mask256<double>{_mm256_fpclass_pd_mask(v.raw, 0x99)});
-}
-
-#else
-
-template <typename T>
-HWY_API Mask256<T> IsInf(const Vec256<T> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Full256<T> d;
-  const RebindToSigned<decltype(d)> di;
-  const VFromD<decltype(di)> vi = BitCast(di, v);
-  // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
-  return RebindMask(d, Eq(Add(vi, vi), Set(di, hwy::MaxExponentTimes2<T>())));
-}
-
-// Returns whether normal/subnormal/zero.
-template <typename T>
-HWY_API Mask256<T> IsFinite(const Vec256<T> v) {
-  static_assert(IsFloat<T>(), "Only for float");
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const RebindToSigned<decltype(d)> di;  // cheaper than unsigned comparison
-  const VFromD<decltype(du)> vu = BitCast(du, v);
-  // Shift left to clear the sign bit, then right so we can compare with the
-  // max exponent (cannot compare with MaxExponentTimes2 directly because it is
-  // negative and non-negative floats would be greater). MSVC seems to generate
-  // incorrect code if we instead add vu + vu.
-  const VFromD<decltype(di)> exp =
-      BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(ShiftLeft<1>(vu)));
-  return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField<T>())));
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ================================================== MEMORY
-
-// ------------------------------ Load
-
-template <typename T>
-HWY_API Vec256<T> Load(Full256<T> /* tag */, const T* HWY_RESTRICT aligned) {
-  return Vec256<T>{
-      _mm256_load_si256(reinterpret_cast<const __m256i*>(aligned))};
-}
-HWY_API Vec256<float> Load(Full256<float> /* tag */,
-                           const float* HWY_RESTRICT aligned) {
-  return Vec256<float>{_mm256_load_ps(aligned)};
-}
-HWY_API Vec256<double> Load(Full256<double> /* tag */,
-                            const double* HWY_RESTRICT aligned) {
-  return Vec256<double>{_mm256_load_pd(aligned)};
-}
-
-template <typename T>
-HWY_API Vec256<T> LoadU(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
-  return Vec256<T>{_mm256_loadu_si256(reinterpret_cast<const __m256i*>(p))};
-}
-HWY_API Vec256<float> LoadU(Full256<float> /* tag */,
-                            const float* HWY_RESTRICT p) {
-  return Vec256<float>{_mm256_loadu_ps(p)};
-}
-HWY_API Vec256<double> LoadU(Full256<double> /* tag */,
-                             const double* HWY_RESTRICT p) {
-  return Vec256<double>{_mm256_loadu_pd(p)};
-}
-
-// ------------------------------ MaskedLoad
-
-#if HWY_TARGET <= HWY_AVX3
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec256<T>{_mm256_maskz_loadu_epi8(m.raw, p)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec256<T>{_mm256_maskz_loadu_epi16(m.raw, p)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec256<T>{_mm256_maskz_loadu_epi32(m.raw, p)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec256<T>{_mm256_maskz_loadu_epi64(m.raw, p)};
-}
-
-HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> /* tag */,
-                                 const float* HWY_RESTRICT p) {
-  return Vec256<float>{_mm256_maskz_loadu_ps(m.raw, p)};
-}
-
-HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> /* tag */,
-                                  const double* HWY_RESTRICT p) {
-  return Vec256<double>{_mm256_maskz_loadu_pd(m.raw, p)};
-}
-
-#else  //  AVX2
-
-// There is no maskload_epi8/16, so blend instead.
-template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> d,
-                             const T* HWY_RESTRICT p) {
-  return IfThenElseZero(m, LoadU(d, p));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  auto pi = reinterpret_cast<const int*>(p);  // NOLINT
-  return Vec256<T>{_mm256_maskload_epi32(pi, m.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> MaskedLoad(Mask256<T> m, Full256<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  auto pi = reinterpret_cast<const long long*>(p);  // NOLINT
-  return Vec256<T>{_mm256_maskload_epi64(pi, m.raw)};
-}
-
-HWY_API Vec256<float> MaskedLoad(Mask256<float> m, Full256<float> d,
-                                 const float* HWY_RESTRICT p) {
-  const Vec256<int32_t> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  return Vec256<float>{_mm256_maskload_ps(p, mi.raw)};
-}
-
-HWY_API Vec256<double> MaskedLoad(Mask256<double> m, Full256<double> d,
-                                  const double* HWY_RESTRICT p) {
-  const Vec256<int64_t> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  return Vec256<double>{_mm256_maskload_pd(p, mi.raw)};
-}
-
-#endif
-
-// ------------------------------ LoadDup128
-
-// Loads 128 bit and duplicates into both 128-bit halves. This avoids the
-// 3-cycle cost of moving data between 128-bit halves and avoids port 5.
-template <typename T>
-HWY_API Vec256<T> LoadDup128(Full256<T> /* tag */, const T* HWY_RESTRICT p) {
-#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
-  // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
-  // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
-  // upper half undefined) is fine because we're overwriting that anyway.
-  // This workaround seems in turn to generate incorrect code in MSVC 2022
-  // (19.31), so use broadcastsi128 there.
-  const __m128i v128 = LoadU(Full128<T>(), p).raw;
-  return Vec256<T>{
-      _mm256_inserti128_si256(_mm256_castsi128_si256(v128), v128, 1)};
-#else
-  return Vec256<T>{_mm256_broadcastsi128_si256(LoadU(Full128<T>(), p).raw)};
-#endif
-}
-HWY_API Vec256<float> LoadDup128(Full256<float> /* tag */,
-                                 const float* const HWY_RESTRICT p) {
-#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
-  const __m128 v128 = LoadU(Full128<float>(), p).raw;
-  return Vec256<float>{
-      _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
-#else
-  return Vec256<float>{_mm256_broadcast_ps(reinterpret_cast<const __m128*>(p))};
-#endif
-}
-HWY_API Vec256<double> LoadDup128(Full256<double> /* tag */,
-                                  const double* const HWY_RESTRICT p) {
-#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
-  const __m128d v128 = LoadU(Full128<double>(), p).raw;
-  return Vec256<double>{
-      _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
-#else
-  return Vec256<double>{
-      _mm256_broadcast_pd(reinterpret_cast<const __m128d*>(p))};
-#endif
-}
-
-// ------------------------------ Store
-
-template <typename T>
-HWY_API void Store(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT aligned) {
-  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
-}
-HWY_API void Store(const Vec256<float> v, Full256<float> /* tag */,
-                   float* HWY_RESTRICT aligned) {
-  _mm256_store_ps(aligned, v.raw);
-}
-HWY_API void Store(const Vec256<double> v, Full256<double> /* tag */,
-                   double* HWY_RESTRICT aligned) {
-  _mm256_store_pd(aligned, v.raw);
-}
-
-template <typename T>
-HWY_API void StoreU(Vec256<T> v, Full256<T> /* tag */, T* HWY_RESTRICT p) {
-  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
-}
-HWY_API void StoreU(const Vec256<float> v, Full256<float> /* tag */,
-                    float* HWY_RESTRICT p) {
-  _mm256_storeu_ps(p, v.raw);
-}
-HWY_API void StoreU(const Vec256<double> v, Full256<double> /* tag */,
-                    double* HWY_RESTRICT p) {
-  _mm256_storeu_pd(p, v.raw);
-}
-
-// ------------------------------ BlendedStore
-
-#if HWY_TARGET <= HWY_AVX3
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm256_mask_storeu_epi8(p, m.raw, v.raw);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm256_mask_storeu_epi16(p, m.raw, v.raw);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm256_mask_storeu_epi32(p, m.raw, v.raw);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm256_mask_storeu_epi64(p, m.raw, v.raw);
-}
-
-HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m,
-                          Full256<float> /* tag */, float* HWY_RESTRICT p) {
-  _mm256_mask_storeu_ps(p, m.raw, v.raw);
-}
-
-HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
-                          Full256<double> /* tag */, double* HWY_RESTRICT p) {
-  _mm256_mask_storeu_pd(p, m.raw, v.raw);
-}
-
-#else  //  AVX2
-
-// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD
-// allows AC# if "Alignment checking enabled and: 256-bit memory operand not
-// 32-byte aligned". Fortunately AC# is not enabled by default and requires both
-// OS support (CR0) and the application to set rflags.AC. We assume these remain
-// disabled because x86/x64 code and compiler output often contain misaligned
-// scalar accesses, which would also fault.
-//
-// Caveat: these are slow on AMD Jaguar/Bulldozer.
-
-template <typename T, hwy::EnableIf<sizeof(T) <= 2>* = nullptr>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                          T* HWY_RESTRICT p) {
-  // There is no maskload_epi8/16. Blending is also unsafe because loading a
-  // full vector that crosses the array end causes asan faults. Resort to scalar
-  // code; the caller should instead use memcpy, assuming m is FirstN(d, n).
-  const RebindToUnsigned<decltype(d)> du;
-  using TU = TFromD<decltype(du)>;
-  alignas(32) TU buf[32 / sizeof(T)];
-  alignas(32) TU mask[32 / sizeof(T)];
-  Store(BitCast(du, v), du, buf);
-  Store(BitCast(du, VecFromMask(d, m)), du, mask);
-  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
-    if (mask[i]) {
-      CopySameSize(buf + i, p + i);
-    }
-  }
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  auto pi = reinterpret_cast<int*>(p);  // NOLINT
-  _mm256_maskstore_epi32(pi, m.raw, v.raw);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void BlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  auto pi = reinterpret_cast<long long*>(p);  // NOLINT
-  _mm256_maskstore_epi64(pi, m.raw, v.raw);
-}
-
-HWY_API void BlendedStore(Vec256<float> v, Mask256<float> m, Full256<float> d,
-                          float* HWY_RESTRICT p) {
-  const Vec256<int32_t> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  _mm256_maskstore_ps(p, mi.raw, v.raw);
-}
-
-HWY_API void BlendedStore(Vec256<double> v, Mask256<double> m,
-                          Full256<double> d, double* HWY_RESTRICT p) {
-  const Vec256<int64_t> mi =
-      BitCast(RebindToSigned<decltype(d)>(), VecFromMask(d, m));
-  _mm256_maskstore_pd(p, mi.raw, v.raw);
-}
-
-#endif
-
-// ------------------------------ Non-temporal stores
-
-template <typename T>
-HWY_API void Stream(Vec256<T> v, Full256<T> /* tag */,
-                    T* HWY_RESTRICT aligned) {
-  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
-}
-HWY_API void Stream(const Vec256<float> v, Full256<float> /* tag */,
-                    float* HWY_RESTRICT aligned) {
-  _mm256_stream_ps(aligned, v.raw);
-}
-HWY_API void Stream(const Vec256<double> v, Full256<double> /* tag */,
-                    double* HWY_RESTRICT aligned) {
-  _mm256_stream_pd(aligned, v.raw);
-}
-
-// ------------------------------ Scatter
-
-// Work around warnings in the intrinsic definitions (passing -1 as a mask).
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-
-#if HWY_TARGET <= HWY_AVX3
-namespace detail {
-
-template <typename T>
-HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec256<T> v,
-                              Full256<T> /* tag */, T* HWY_RESTRICT base,
-                              const Vec256<int32_t> offset) {
-  _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1);
-}
-template <typename T>
-HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec256<T> v,
-                             Full256<T> /* tag */, T* HWY_RESTRICT base,
-                             const Vec256<int32_t> index) {
-  _mm256_i32scatter_epi32(base, index.raw, v.raw, 4);
-}
-
-template <typename T>
-HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec256<T> v,
-                              Full256<T> /* tag */, T* HWY_RESTRICT base,
-                              const Vec256<int64_t> offset) {
-  _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1);
-}
-template <typename T>
-HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec256<T> v,
-                             Full256<T> /* tag */, T* HWY_RESTRICT base,
-                             const Vec256<int64_t> index) {
-  _mm256_i64scatter_epi64(base, index.raw, v.raw, 8);
-}
-
-}  // namespace detail
-
-template <typename T, typename Offset>
-HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
-                           const Vec256<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
-}
-template <typename T, typename Index>
-HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
-                          const Vec256<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
-}
-
-HWY_API void ScatterOffset(Vec256<float> v, Full256<float> /* tag */,
-                           float* HWY_RESTRICT base,
-                           const Vec256<int32_t> offset) {
-  _mm256_i32scatter_ps(base, offset.raw, v.raw, 1);
-}
-HWY_API void ScatterIndex(Vec256<float> v, Full256<float> /* tag */,
-                          float* HWY_RESTRICT base,
-                          const Vec256<int32_t> index) {
-  _mm256_i32scatter_ps(base, index.raw, v.raw, 4);
-}
-
-HWY_API void ScatterOffset(Vec256<double> v, Full256<double> /* tag */,
-                           double* HWY_RESTRICT base,
-                           const Vec256<int64_t> offset) {
-  _mm256_i64scatter_pd(base, offset.raw, v.raw, 1);
-}
-HWY_API void ScatterIndex(Vec256<double> v, Full256<double> /* tag */,
-                          double* HWY_RESTRICT base,
-                          const Vec256<int64_t> index) {
-  _mm256_i64scatter_pd(base, index.raw, v.raw, 8);
-}
-
-#else
-
-template <typename T, typename Offset>
-HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
-                           const Vec256<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-
-  constexpr size_t N = 32 / sizeof(T);
-  alignas(32) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(32) Offset offset_lanes[N];
-  Store(offset, Full256<Offset>(), offset_lanes);
-
-  uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base);
-  for (size_t i = 0; i < N; ++i) {
-    CopyBytes<sizeof(T)>(&lanes[i], base_bytes + offset_lanes[i]);
-  }
-}
-
-template <typename T, typename Index>
-HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base,
-                          const Vec256<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-
-  constexpr size_t N = 32 / sizeof(T);
-  alignas(32) T lanes[N];
-  Store(v, d, lanes);
-
-  alignas(32) Index index_lanes[N];
-  Store(index, Full256<Index>(), index_lanes);
-
-  for (size_t i = 0; i < N; ++i) {
-    base[index_lanes[i]] = lanes[i];
-  }
-}
-
-#endif
-
-// ------------------------------ Gather
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<4> /* tag */,
-                                  Full256<T> /* tag */,
-                                  const T* HWY_RESTRICT base,
-                                  const Vec256<int32_t> offset) {
-  return Vec256<T>{_mm256_i32gather_epi32(
-      reinterpret_cast<const int32_t*>(base), offset.raw, 1)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<4> /* tag */,
-                                 Full256<T> /* tag */,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec256<int32_t> index) {
-  return Vec256<T>{_mm256_i32gather_epi32(
-      reinterpret_cast<const int32_t*>(base), index.raw, 4)};
-}
-
-template <typename T>
-HWY_INLINE Vec256<T> GatherOffset(hwy::SizeTag<8> /* tag */,
-                                  Full256<T> /* tag */,
-                                  const T* HWY_RESTRICT base,
-                                  const Vec256<int64_t> offset) {
-  return Vec256<T>{_mm256_i64gather_epi64(
-      reinterpret_cast<const GatherIndex64*>(base), offset.raw, 1)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> GatherIndex(hwy::SizeTag<8> /* tag */,
-                                 Full256<T> /* tag */,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec256<int64_t> index) {
-  return Vec256<T>{_mm256_i64gather_epi64(
-      reinterpret_cast<const GatherIndex64*>(base), index.raw, 8)};
-}
-
-}  // namespace detail
-
-template <typename T, typename Offset>
-HWY_API Vec256<T> GatherOffset(Full256<T> d, const T* HWY_RESTRICT base,
-                               const Vec256<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
-}
-template <typename T, typename Index>
-HWY_API Vec256<T> GatherIndex(Full256<T> d, const T* HWY_RESTRICT base,
-                              const Vec256<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
-}
-
-HWY_API Vec256<float> GatherOffset(Full256<float> /* tag */,
-                                   const float* HWY_RESTRICT base,
-                                   const Vec256<int32_t> offset) {
-  return Vec256<float>{_mm256_i32gather_ps(base, offset.raw, 1)};
-}
-HWY_API Vec256<float> GatherIndex(Full256<float> /* tag */,
-                                  const float* HWY_RESTRICT base,
-                                  const Vec256<int32_t> index) {
-  return Vec256<float>{_mm256_i32gather_ps(base, index.raw, 4)};
-}
-
-HWY_API Vec256<double> GatherOffset(Full256<double> /* tag */,
-                                    const double* HWY_RESTRICT base,
-                                    const Vec256<int64_t> offset) {
-  return Vec256<double>{_mm256_i64gather_pd(base, offset.raw, 1)};
-}
-HWY_API Vec256<double> GatherIndex(Full256<double> /* tag */,
-                                   const double* HWY_RESTRICT base,
-                                   const Vec256<int64_t> index) {
-  return Vec256<double>{_mm256_i64gather_pd(base, index.raw, 8)};
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// ================================================== SWIZZLE
-
-// ------------------------------ LowerHalf
-
-template <typename T>
-HWY_API Vec128<T> LowerHalf(Full128<T> /* tag */, Vec256<T> v) {
-  return Vec128<T>{_mm256_castsi256_si128(v.raw)};
-}
-HWY_API Vec128<float> LowerHalf(Full128<float> /* tag */, Vec256<float> v) {
-  return Vec128<float>{_mm256_castps256_ps128(v.raw)};
-}
-HWY_API Vec128<double> LowerHalf(Full128<double> /* tag */, Vec256<double> v) {
-  return Vec128<double>{_mm256_castpd256_pd128(v.raw)};
-}
-
-template <typename T>
-HWY_API Vec128<T> LowerHalf(Vec256<T> v) {
-  return LowerHalf(Full128<T>(), v);
-}
-
-// ------------------------------ UpperHalf
-
-template <typename T>
-HWY_API Vec128<T> UpperHalf(Full128<T> /* tag */, Vec256<T> v) {
-  return Vec128<T>{_mm256_extracti128_si256(v.raw, 1)};
-}
-HWY_API Vec128<float> UpperHalf(Full128<float> /* tag */, Vec256<float> v) {
-  return Vec128<float>{_mm256_extractf128_ps(v.raw, 1)};
-}
-HWY_API Vec128<double> UpperHalf(Full128<double> /* tag */, Vec256<double> v) {
-  return Vec128<double>{_mm256_extractf128_pd(v.raw, 1)};
-}
-
-// ------------------------------ ExtractLane (Store)
-template <typename T>
-HWY_API T ExtractLane(const Vec256<T> v, size_t i) {
-  const Full256<T> d;
-  HWY_DASSERT(i < Lanes(d));
-  alignas(32) T lanes[32 / sizeof(T)];
-  Store(v, d, lanes);
-  return lanes[i];
-}
-
-// ------------------------------ InsertLane (Store)
-template <typename T>
-HWY_API Vec256<T> InsertLane(const Vec256<T> v, size_t i, T t) {
-  const Full256<T> d;
-  HWY_DASSERT(i < Lanes(d));
-  alignas(64) T lanes[64 / sizeof(T)];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-// ------------------------------ GetLane (LowerHalf)
-template <typename T>
-HWY_API T GetLane(const Vec256<T> v) {
-  return GetLane(LowerHalf(v));
-}
-
-// ------------------------------ ZeroExtendVector
-
-// Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper
-// bits undefined. Although it makes sense for them to be zero (VEX encoded
-// 128-bit instructions zero the upper lanes to avoid large penalties), a
-// compiler could decide to optimize out code that relies on this.
-//
-// The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the
-// zeroing, but it is not available on MSVC until 15.7 nor GCC until 10.1. For
-// older GCC, we can still obtain the desired code thanks to pattern
-// recognition; note that the expensive insert instruction is not actually
-// generated, see https://gcc.godbolt.org/z/1MKGaP.
-
-#if !defined(HWY_HAVE_ZEXT)
-#if (HWY_COMPILER_MSVC && HWY_COMPILER_MSVC >= 1915) ||  \
-    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \
-    (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000)
-#define HWY_HAVE_ZEXT 1
-#else
-#define HWY_HAVE_ZEXT 0
-#endif
-#endif  // defined(HWY_HAVE_ZEXT)
-
-template <typename T>
-HWY_API Vec256<T> ZeroExtendVector(Full256<T> /* tag */, Vec128<T> lo) {
-#if HWY_HAVE_ZEXT
-return Vec256<T>{_mm256_zextsi128_si256(lo.raw)};
-#else
-  return Vec256<T>{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)};
-#endif
-}
-HWY_API Vec256<float> ZeroExtendVector(Full256<float> /* tag */,
-                                       Vec128<float> lo) {
-#if HWY_HAVE_ZEXT
-  return Vec256<float>{_mm256_zextps128_ps256(lo.raw)};
-#else
-  return Vec256<float>{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)};
-#endif
-}
-HWY_API Vec256<double> ZeroExtendVector(Full256<double> /* tag */,
-                                        Vec128<double> lo) {
-#if HWY_HAVE_ZEXT
-  return Vec256<double>{_mm256_zextpd128_pd256(lo.raw)};
-#else
-  return Vec256<double>{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)};
-#endif
-}
-
-// ------------------------------ Combine
-
-template <typename T>
-HWY_API Vec256<T> Combine(Full256<T> d, Vec128<T> hi, Vec128<T> lo) {
-  const auto lo256 = ZeroExtendVector(d, lo);
-  return Vec256<T>{_mm256_inserti128_si256(lo256.raw, hi.raw, 1)};
-}
-HWY_API Vec256<float> Combine(Full256<float> d, Vec128<float> hi,
-                              Vec128<float> lo) {
-  const auto lo256 = ZeroExtendVector(d, lo);
-  return Vec256<float>{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)};
-}
-HWY_API Vec256<double> Combine(Full256<double> d, Vec128<double> hi,
-                               Vec128<double> lo) {
-  const auto lo256 = ZeroExtendVector(d, lo);
-  return Vec256<double>{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)};
-}
-
-// ------------------------------ ShiftLeftBytes
-
-template <int kBytes, typename T>
-HWY_API Vec256<T> ShiftLeftBytes(Full256<T> /* tag */, const Vec256<T> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  // This is the same operation as _mm256_bslli_epi128.
-  return Vec256<T>{_mm256_slli_si256(v.raw, kBytes)};
-}
-
-template <int kBytes, typename T>
-HWY_API Vec256<T> ShiftLeftBytes(const Vec256<T> v) {
-  return ShiftLeftBytes<kBytes>(Full256<T>(), v);
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <int kLanes, typename T>
-HWY_API Vec256<T> ShiftLeftLanes(Full256<T> d, const Vec256<T> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-template <int kLanes, typename T>
-HWY_API Vec256<T> ShiftLeftLanes(const Vec256<T> v) {
-  return ShiftLeftLanes<kLanes>(Full256<T>(), v);
-}
-
-// ------------------------------ ShiftRightBytes
-
-template <int kBytes, typename T>
-HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  // This is the same operation as _mm256_bsrli_epi128.
-  return Vec256<T>{_mm256_srli_si256(v.raw, kBytes)};
-}
-
-// ------------------------------ ShiftRightLanes
-template <int kLanes, typename T>
-HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
-}
-
-// ------------------------------ CombineShiftRightBytes
-
-// Extracts 128 bits from <hi, lo> by skipping the least-significant kBytes.
-template <int kBytes, typename T, class V = Vec256<T>>
-HWY_API V CombineShiftRightBytes(Full256<T> d, V hi, V lo) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Vec256<uint8_t>{_mm256_alignr_epi8(
-                        BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
-}
-
-// ------------------------------ Broadcast/splat any lane
-
-// Unsigned
-template <int kLane>
-HWY_API Vec256<uint16_t> Broadcast(const Vec256<uint16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  if (kLane < 4) {
-    const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
-    return Vec256<uint16_t>{_mm256_unpacklo_epi64(lo, lo)};
-  } else {
-    const __m256i hi =
-        _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
-    return Vec256<uint16_t>{_mm256_unpackhi_epi64(hi, hi)};
-  }
-}
-template <int kLane>
-HWY_API Vec256<uint32_t> Broadcast(const Vec256<uint32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
-}
-template <int kLane>
-HWY_API Vec256<uint64_t> Broadcast(const Vec256<uint64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
-}
-
-// Signed
-template <int kLane>
-HWY_API Vec256<int16_t> Broadcast(const Vec256<int16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  if (kLane < 4) {
-    const __m256i lo = _mm256_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
-    return Vec256<int16_t>{_mm256_unpacklo_epi64(lo, lo)};
-  } else {
-    const __m256i hi =
-        _mm256_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
-    return Vec256<int16_t>{_mm256_unpackhi_epi64(hi, hi)};
-  }
-}
-template <int kLane>
-HWY_API Vec256<int32_t> Broadcast(const Vec256<int32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)};
-}
-template <int kLane>
-HWY_API Vec256<int64_t> Broadcast(const Vec256<int64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)};
-}
-
-// Float
-template <int kLane>
-HWY_API Vec256<float> Broadcast(Vec256<float> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)};
-}
-template <int kLane>
-HWY_API Vec256<double> Broadcast(const Vec256<double> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)};
-}
-
-// ------------------------------ Hard-coded shuffles
-
-// Notation: let Vec256<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
-// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
-// right (the previous least-significant lane is now most-significant =>
-// 47650321). These could also be implemented via CombineShiftRightBytes but
-// the shuffle_abcd notation is more convenient.
-
-// Swap 32-bit halves in 64-bit halves.
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Shuffle2301(const Vec256<T> v) {
-  return Vec256<T>{_mm256_shuffle_epi32(v.raw, 0xB1)};
-}
-HWY_API Vec256<float> Shuffle2301(const Vec256<float> v) {
-  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0xB1)};
-}
-
-namespace detail {
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Shuffle2301(const Vec256<T> a, const Vec256<T> b) {
-  const Full256<T> d;
-  const RebindToFloat<decltype(d)> df;
-  constexpr int m = _MM_SHUFFLE(2, 3, 0, 1);
-  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
-                                                    BitCast(df, b).raw, m)});
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Shuffle1230(const Vec256<T> a, const Vec256<T> b) {
-  const Full256<T> d;
-  const RebindToFloat<decltype(d)> df;
-  constexpr int m = _MM_SHUFFLE(1, 2, 3, 0);
-  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
-                                                    BitCast(df, b).raw, m)});
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Shuffle3012(const Vec256<T> a, const Vec256<T> b) {
-  const Full256<T> d;
-  const RebindToFloat<decltype(d)> df;
-  constexpr int m = _MM_SHUFFLE(3, 0, 1, 2);
-  return BitCast(d, Vec256<float>{_mm256_shuffle_ps(BitCast(df, a).raw,
-                                                    BitCast(df, b).raw, m)});
-}
-
-}  // namespace detail
-
-// Swap 64-bit halves
-HWY_API Vec256<uint32_t> Shuffle1032(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec256<int32_t> Shuffle1032(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec256<float> Shuffle1032(const Vec256<float> v) {
-  // Shorter encoding than _mm256_permute_ps.
-  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x4E)};
-}
-HWY_API Vec256<uint64_t> Shuffle01(const Vec256<uint64_t> v) {
-  return Vec256<uint64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec256<int64_t> Shuffle01(const Vec256<int64_t> v) {
-  return Vec256<int64_t>{_mm256_shuffle_epi32(v.raw, 0x4E)};
-}
-HWY_API Vec256<double> Shuffle01(const Vec256<double> v) {
-  // Shorter encoding than _mm256_permute_pd.
-  return Vec256<double>{_mm256_shuffle_pd(v.raw, v.raw, 5)};
-}
-
-// Rotate right 32 bits
-HWY_API Vec256<uint32_t> Shuffle0321(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
-}
-HWY_API Vec256<int32_t> Shuffle0321(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x39)};
-}
-HWY_API Vec256<float> Shuffle0321(const Vec256<float> v) {
-  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x39)};
-}
-// Rotate left 32 bits
-HWY_API Vec256<uint32_t> Shuffle2103(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
-}
-HWY_API Vec256<int32_t> Shuffle2103(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x93)};
-}
-HWY_API Vec256<float> Shuffle2103(const Vec256<float> v) {
-  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x93)};
-}
-
-// Reverse
-HWY_API Vec256<uint32_t> Shuffle0123(const Vec256<uint32_t> v) {
-  return Vec256<uint32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
-}
-HWY_API Vec256<int32_t> Shuffle0123(const Vec256<int32_t> v) {
-  return Vec256<int32_t>{_mm256_shuffle_epi32(v.raw, 0x1B)};
-}
-HWY_API Vec256<float> Shuffle0123(const Vec256<float> v) {
-  return Vec256<float>{_mm256_shuffle_ps(v.raw, v.raw, 0x1B)};
-}
-
-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
-template <typename T>
-struct Indices256 {
-  __m256i raw;
-};
-
-// Native 8x32 instruction: indices remain unchanged
-template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Indices256<T> IndicesFromVec(Full256<T> /* tag */, Vec256<TI> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-#if HWY_IS_DEBUG_BUILD
-  const Full256<TI> di;
-  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
-              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(32 / sizeof(T))))));
-#endif
-  return Indices256<T>{vec.raw};
-}
-
-// 64-bit lanes: convert indices to 8x32 unless AVX3 is available
-template <typename T, typename TI, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Indices256<T> IndicesFromVec(Full256<T> d, Vec256<TI> idx64) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-  const Rebind<TI, decltype(d)> di;
-  (void)di;  // potentially unused
-#if HWY_IS_DEBUG_BUILD
-  HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) &&
-              AllTrue(di, Lt(idx64, Set(di, static_cast<TI>(32 / sizeof(T))))));
-#endif
-
-#if HWY_TARGET <= HWY_AVX3
-  (void)d;
-  return Indices256<T>{idx64.raw};
-#else
-  const Repartition<float, decltype(d)> df;  // 32-bit!
-  // Replicate 64-bit index into upper 32 bits
-  const Vec256<TI> dup =
-      BitCast(di, Vec256<float>{_mm256_moveldup_ps(BitCast(df, idx64).raw)});
-  // For each idx64 i, idx32 are 2*i and 2*i+1.
-  const Vec256<TI> idx32 = dup + dup + Set(di, TI(1) << 32);
-  return Indices256<T>{idx32.raw};
-#endif
-}
-
-template <typename T, typename TI>
-HWY_API Indices256<T> SetTableIndices(const Full256<T> d, const TI* idx) {
-  const Rebind<TI, decltype(d)> di;
-  return IndicesFromVec(d, LoadU(di, idx));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
-  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> TableLookupLanes(Vec256<T> v, Indices256<T> idx) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<T>{_mm256_permutexvar_epi64(idx.raw, v.raw)};
-#else
-  return Vec256<T>{_mm256_permutevar8x32_epi32(v.raw, idx.raw)};
-#endif
-}
-
-HWY_API Vec256<float> TableLookupLanes(const Vec256<float> v,
-                                       const Indices256<float> idx) {
-  return Vec256<float>{_mm256_permutevar8x32_ps(v.raw, idx.raw)};
-}
-
-HWY_API Vec256<double> TableLookupLanes(const Vec256<double> v,
-                                        const Indices256<double> idx) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<double>{_mm256_permutexvar_pd(idx.raw, v.raw)};
-#else
-  const Full256<double> df;
-  const Full256<uint64_t> du;
-  return BitCast(df, Vec256<uint64_t>{_mm256_permutevar8x32_epi32(
-                         BitCast(du, v).raw, idx.raw)});
-#endif
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T>
-HWY_API Vec256<T> SwapAdjacentBlocks(Vec256<T> v) {
-  return Vec256<T>{_mm256_permute2x128_si256(v.raw, v.raw, 0x01)};
-}
-
-HWY_API Vec256<float> SwapAdjacentBlocks(Vec256<float> v) {
-  return Vec256<float>{_mm256_permute2f128_ps(v.raw, v.raw, 0x01)};
-}
-
-HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) {
-  return Vec256<double>{_mm256_permute2f128_pd(v.raw, v.raw, 0x01)};
-}
-
-// ------------------------------ Reverse (RotateRight)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
-  alignas(32) constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
-  return TableLookupLanes(v, SetTableIndices(d, kReverse));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
-  alignas(32) constexpr int64_t kReverse[4] = {3, 2, 1, 0};
-  return TableLookupLanes(v, SetTableIndices(d, kReverse));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) {
-#if HWY_TARGET <= HWY_AVX3
-  const RebindToSigned<decltype(d)> di;
-  alignas(32) constexpr int16_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
-                                                7,  6,  5,  4,  3,  2,  1, 0};
-  const Vec256<int16_t> idx = Load(di, kReverse);
-  return BitCast(d, Vec256<int16_t>{
-                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-#else
-  const RepartitionToWide<RebindToUnsigned<decltype(d)>> du32;
-  const Vec256<uint32_t> rev32 = Reverse(du32, BitCast(du32, v));
-  return BitCast(d, RotateRight<16>(rev32));
-#endif
-}
-
-// ------------------------------ Reverse2
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) {
-  const Full256<uint32_t> du32;
-  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
-  return Shuffle2301(v);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) {
-  return Shuffle01(v);
-}
-
-// ------------------------------ Reverse4 (SwapAdjacentBlocks)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) {
-#if HWY_TARGET <= HWY_AVX3
-  const RebindToSigned<decltype(d)> di;
-  alignas(32) constexpr int16_t kReverse4[16] = {3,  2,  1, 0, 7,  6,  5,  4,
-                                                 11, 10, 9, 8, 15, 14, 13, 12};
-  const Vec256<int16_t> idx = Load(di, kReverse4);
-  return BitCast(d, Vec256<int16_t>{
-                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-#else
-  const RepartitionToWide<decltype(d)> dw;
-  return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v))));
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
-  return Shuffle0123(v);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) {
-  // Could also use _mm256_permute4x64_epi64.
-  return SwapAdjacentBlocks(Shuffle01(v));
-}
-
-// ------------------------------ Reverse8
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
-#if HWY_TARGET <= HWY_AVX3
-  const RebindToSigned<decltype(d)> di;
-  alignas(32) constexpr int16_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
-                                                 15, 14, 13, 12, 11, 10, 9, 8};
-  const Vec256<int16_t> idx = Load(di, kReverse8);
-  return BitCast(d, Vec256<int16_t>{
-                        _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-#else
-  const RepartitionToWide<decltype(d)> dw;
-  return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v))));
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) {
-  return Reverse(d, v);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) {
-  HWY_ASSERT(0);  // AVX2 does not have 8 64-bit lanes
-}
-
-// ------------------------------ InterleaveLower
-
-// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
-// the least-significant lane) and "b". To concatenate two half-width integers
-// into one, use ZipLower/Upper instead (also works with scalar).
-
-HWY_API Vec256<uint8_t> InterleaveLower(const Vec256<uint8_t> a,
-                                        const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> InterleaveLower(const Vec256<uint16_t> a,
-                                         const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> InterleaveLower(const Vec256<uint32_t> a,
-                                         const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> InterleaveLower(const Vec256<uint64_t> a,
-                                         const Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec256<int8_t> InterleaveLower(const Vec256<int8_t> a,
-                                       const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_unpacklo_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> InterleaveLower(const Vec256<int16_t> a,
-                                        const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_unpacklo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> InterleaveLower(const Vec256<int32_t> a,
-                                        const Vec256<int32_t> b) {
-  return Vec256<int32_t>{_mm256_unpacklo_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> InterleaveLower(const Vec256<int64_t> a,
-                                        const Vec256<int64_t> b) {
-  return Vec256<int64_t>{_mm256_unpacklo_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec256<float> InterleaveLower(const Vec256<float> a,
-                                      const Vec256<float> b) {
-  return Vec256<float>{_mm256_unpacklo_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> InterleaveLower(const Vec256<double> a,
-                                       const Vec256<double> b) {
-  return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ InterleaveUpper
-
-// All functions inside detail lack the required D parameter.
-namespace detail {
-
-HWY_API Vec256<uint8_t> InterleaveUpper(const Vec256<uint8_t> a,
-                                        const Vec256<uint8_t> b) {
-  return Vec256<uint8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<uint16_t> InterleaveUpper(const Vec256<uint16_t> a,
-                                         const Vec256<uint16_t> b) {
-  return Vec256<uint16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<uint32_t> InterleaveUpper(const Vec256<uint32_t> a,
-                                         const Vec256<uint32_t> b) {
-  return Vec256<uint32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> InterleaveUpper(const Vec256<uint64_t> a,
-                                         const Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec256<int8_t> InterleaveUpper(const Vec256<int8_t> a,
-                                       const Vec256<int8_t> b) {
-  return Vec256<int8_t>{_mm256_unpackhi_epi8(a.raw, b.raw)};
-}
-HWY_API Vec256<int16_t> InterleaveUpper(const Vec256<int16_t> a,
-                                        const Vec256<int16_t> b) {
-  return Vec256<int16_t>{_mm256_unpackhi_epi16(a.raw, b.raw)};
-}
-HWY_API Vec256<int32_t> InterleaveUpper(const Vec256<int32_t> a,
-                                        const Vec256<int32_t> b) {
-  return Vec256<int32_t>{_mm256_unpackhi_epi32(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> InterleaveUpper(const Vec256<int64_t> a,
-                                        const Vec256<int64_t> b) {
-  return Vec256<int64_t>{_mm256_unpackhi_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec256<float> InterleaveUpper(const Vec256<float> a,
-                                      const Vec256<float> b) {
-  return Vec256<float>{_mm256_unpackhi_ps(a.raw, b.raw)};
-}
-HWY_API Vec256<double> InterleaveUpper(const Vec256<double> a,
-                                       const Vec256<double> b) {
-  return Vec256<double>{_mm256_unpackhi_pd(a.raw, b.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, class V = Vec256<T>>
-HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) {
-  return detail::InterleaveUpper(a, b);
-}
-
-// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
-
-// Same as Interleave*, except that the return lanes are double-width integers;
-// this is necessary because the single-lane scalar cannot return two values.
-template <typename T, typename TW = MakeWide<T>>
-HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) {
-  return BitCast(Full256<TW>(), InterleaveLower(a, b));
-}
-template <typename T, typename TW = MakeWide<T>>
-HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
-  return BitCast(dw, InterleaveLower(a, b));
-}
-
-template <typename T, typename TW = MakeWide<T>>
-HWY_API Vec256<TW> ZipUpper(Full256<TW> dw, Vec256<T> a, Vec256<T> b) {
-  return BitCast(dw, InterleaveUpper(Full256<T>(), a, b));
-}
-
-// ------------------------------ Blocks (LowerHalf, ZeroExtendVector)
-
-// _mm256_broadcastsi128_si256 has 7 cycle latency on ICL.
-// _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no
-// extra cost) for LowerLower and UpperLower.
-
-// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
-template <typename T>
-HWY_API Vec256<T> ConcatLowerLower(Full256<T> d, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  const Half<decltype(d)> d2;
-  return Vec256<T>{_mm256_inserti128_si256(lo.raw, LowerHalf(d2, hi).raw, 1)};
-}
-HWY_API Vec256<float> ConcatLowerLower(Full256<float> d, const Vec256<float> hi,
-                                       const Vec256<float> lo) {
-  const Half<decltype(d)> d2;
-  return Vec256<float>{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)};
-}
-HWY_API Vec256<double> ConcatLowerLower(Full256<double> d,
-                                        const Vec256<double> hi,
-                                        const Vec256<double> lo) {
-  const Half<decltype(d)> d2;
-  return Vec256<double>{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)};
-}
-
-// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
-template <typename T>
-HWY_API Vec256<T> ConcatLowerUpper(Full256<T> /* tag */, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x21)};
-}
-HWY_API Vec256<float> ConcatLowerUpper(Full256<float> /* tag */,
-                                       const Vec256<float> hi,
-                                       const Vec256<float> lo) {
-  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)};
-}
-HWY_API Vec256<double> ConcatLowerUpper(Full256<double> /* tag */,
-                                        const Vec256<double> hi,
-                                        const Vec256<double> lo) {
-  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)};
-}
-
-// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
-template <typename T>
-HWY_API Vec256<T> ConcatUpperLower(Full256<T> /* tag */, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  return Vec256<T>{_mm256_blend_epi32(hi.raw, lo.raw, 0x0F)};
-}
-HWY_API Vec256<float> ConcatUpperLower(Full256<float> /* tag */,
-                                       const Vec256<float> hi,
-                                       const Vec256<float> lo) {
-  return Vec256<float>{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)};
-}
-HWY_API Vec256<double> ConcatUpperLower(Full256<double> /* tag */,
-                                        const Vec256<double> hi,
-                                        const Vec256<double> lo) {
-  return Vec256<double>{_mm256_blend_pd(hi.raw, lo.raw, 3)};
-}
-
-// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
-template <typename T>
-HWY_API Vec256<T> ConcatUpperUpper(Full256<T> /* tag */, const Vec256<T> hi,
-                                   const Vec256<T> lo) {
-  return Vec256<T>{_mm256_permute2x128_si256(lo.raw, hi.raw, 0x31)};
-}
-HWY_API Vec256<float> ConcatUpperUpper(Full256<float> /* tag */,
-                                       const Vec256<float> hi,
-                                       const Vec256<float> lo) {
-  return Vec256<float>{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
-}
-HWY_API Vec256<double> ConcatUpperUpper(Full256<double> /* tag */,
-                                        const Vec256<double> hi,
-                                        const Vec256<double> lo) {
-  return Vec256<double>{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
-}
-
-// ------------------------------ ConcatOdd
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET == HWY_AVX3_DL
-  alignas(32) constexpr uint8_t kIdx[32] = {
-      1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
-      33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
-  return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi8(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
-#else
-  const RepartitionToWide<decltype(du)> dw;
-  // Unsigned 8-bit shift so we can pack.
-  const Vec256<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
-  const Vec256<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
-  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
-  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(32) constexpr uint16_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
-                                             17, 19, 21, 23, 25, 27, 29, 31};
-  return BitCast(d, Vec256<uint16_t>{_mm256_mask2_permutex2var_epi16(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
-#else
-  const RepartitionToWide<decltype(du)> dw;
-  // Unsigned 16-bit shift so we can pack.
-  const Vec256<uint32_t> uH = ShiftRight<16>(BitCast(dw, hi));
-  const Vec256<uint32_t> uL = ShiftRight<16>(BitCast(dw, lo));
-  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
-  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
-                        BitCast(du, hi).raw)});
-#else
-  const RebindToFloat<decltype(d)> df;
-  const Vec256<float> v3131{_mm256_shuffle_ps(
-      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
-  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
-                                            _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-HWY_API Vec256<float> ConcatOdd(Full256<float> d, Vec256<float> hi,
-                                Vec256<float> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(32) constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
-  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
-                                                    __mmask8{0xFF}, hi.raw)};
-#else
-  const Vec256<float> v3131{
-      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
-  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
-                        BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> ConcatOdd(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
-  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
-                        BitCast(du, hi).raw)});
-#else
-  const RebindToFloat<decltype(d)> df;
-  const Vec256<double> v31{
-      _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
-  return Vec256<T>{
-      _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-HWY_API Vec256<double> ConcatOdd(Full256<double> d, Vec256<double> hi,
-                                 Vec256<double> lo) {
-#if HWY_TARGET <= HWY_AVX3
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
-  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
-                                                     __mmask8{0xFF}, hi.raw)};
-#else
-  (void)d;
-  const Vec256<double> v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
-  return Vec256<double>{
-      _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-// ------------------------------ ConcatEven
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET == HWY_AVX3_DL
-  alignas(64) constexpr uint8_t kIdx[32] = {
-      0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-      32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
-  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi8(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
-#else
-  const RepartitionToWide<decltype(du)> dw;
-  // Isolate lower 8 bits per u16 so we can pack.
-  const Vec256<uint16_t> mask = Set(dw, 0x00FF);
-  const Vec256<uint16_t> uH = And(BitCast(dw, hi), mask);
-  const Vec256<uint16_t> uL = And(BitCast(dw, lo), mask);
-  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
-  return Vec256<T>{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(64) constexpr uint16_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
-                                             16, 18, 20, 22, 24, 26, 28, 30};
-  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi16(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
-#else
-  const RepartitionToWide<decltype(du)> dw;
-  // Isolate lower 16 bits per u32 so we can pack.
-  const Vec256<uint32_t> mask = Set(dw, 0x0000FFFF);
-  const Vec256<uint32_t> uH = And(BitCast(dw, hi), mask);
-  const Vec256<uint32_t> uL = And(BitCast(dw, lo), mask);
-  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
-  return Vec256<T>{_mm256_permute4x64_epi64(u16, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
-  return BitCast(d, Vec256<uint32_t>{_mm256_mask2_permutex2var_epi32(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
-                        BitCast(du, hi).raw)});
-#else
-  const RebindToFloat<decltype(d)> df;
-  const Vec256<float> v2020{_mm256_shuffle_ps(
-      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
-  return Vec256<T>{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
-                                            _MM_SHUFFLE(3, 1, 2, 0))};
-
-#endif
-}
-
-HWY_API Vec256<float> ConcatEven(Full256<float> d, Vec256<float> hi,
-                                 Vec256<float> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(64) constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
-  return Vec256<float>{_mm256_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
-                                                    __mmask8{0xFF}, hi.raw)};
-#else
-  const Vec256<float> v2020{
-      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
-  return BitCast(d, Vec256<uint32_t>{_mm256_permute4x64_epi64(
-                        BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
-
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> ConcatEven(Full256<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET <= HWY_AVX3
-  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
-  return BitCast(d, Vec256<uint64_t>{_mm256_mask2_permutex2var_epi64(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
-                        BitCast(du, hi).raw)});
-#else
-  const RebindToFloat<decltype(d)> df;
-  const Vec256<double> v20{
-      _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
-  return Vec256<T>{
-      _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
-
-#endif
-}
-
-HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi,
-                                  Vec256<double> lo) {
-#if HWY_TARGET <= HWY_AVX3
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
-  return Vec256<double>{_mm256_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
-                                                     __mmask8{0xFF}, hi.raw)};
-#else
-  (void)d;
-  const Vec256<double> v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
-  return Vec256<double>{
-      _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
-#endif
-}
-
-// ------------------------------ DupEven (InterleaveLower)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> DupEven(Vec256<T> v) {
-  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
-}
-HWY_API Vec256<float> DupEven(Vec256<float> v) {
-  return Vec256<float>{
-      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> DupEven(const Vec256<T> v) {
-  return InterleaveLower(Full256<T>(), v, v);
-}
-
-// ------------------------------ DupOdd (InterleaveUpper)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> DupOdd(Vec256<T> v) {
-  return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
-}
-HWY_API Vec256<float> DupOdd(Vec256<float> v) {
-  return Vec256<float>{
-      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> DupOdd(const Vec256<T> v) {
-  return InterleaveUpper(Full256<T>(), v, v);
-}
-
-// ------------------------------ OddEven
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<1> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  const Full256<T> d;
-  const Full256<uint8_t> d8;
-  alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0,
-                                            0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0};
-  return IfThenElse(MaskFromVec(BitCast(d, LoadDup128(d8, mask))), b, a);
-}
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<2> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  return Vec256<T>{_mm256_blend_epi16(a.raw, b.raw, 0x55)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<4> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
-}
-template <typename T>
-HWY_INLINE Vec256<T> OddEven(hwy::SizeTag<8> /* tag */, const Vec256<T> a,
-                             const Vec256<T> b) {
-  return Vec256<T>{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> OddEven(const Vec256<T> a, const Vec256<T> b) {
-  return detail::OddEven(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-HWY_API Vec256<float> OddEven(const Vec256<float> a, const Vec256<float> b) {
-  return Vec256<float>{_mm256_blend_ps(a.raw, b.raw, 0x55)};
-}
-
-HWY_API Vec256<double> OddEven(const Vec256<double> a, const Vec256<double> b) {
-  return Vec256<double>{_mm256_blend_pd(a.raw, b.raw, 5)};
-}
-
-// ------------------------------ OddEvenBlocks
-
-template <typename T>
-Vec256<T> OddEvenBlocks(Vec256<T> odd, Vec256<T> even) {
-  return Vec256<T>{_mm256_blend_epi32(odd.raw, even.raw, 0xFu)};
-}
-
-HWY_API Vec256<float> OddEvenBlocks(Vec256<float> odd, Vec256<float> even) {
-  return Vec256<float>{_mm256_blend_ps(odd.raw, even.raw, 0xFu)};
-}
-
-HWY_API Vec256<double> OddEvenBlocks(Vec256<double> odd, Vec256<double> even) {
-  return Vec256<double>{_mm256_blend_pd(odd.raw, even.raw, 0x3u)};
-}
-
-// ------------------------------ ReverseBlocks (ConcatLowerUpper)
-
-template <typename T>
-HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) {
-  return ConcatLowerUpper(d, v, v);
-}
-
-// ------------------------------ TableLookupBytes (ZeroExtendVector)
-
-// Both full
-template <typename T, typename TI>
-HWY_API Vec256<TI> TableLookupBytes(const Vec256<T> bytes,
-                                    const Vec256<TI> from) {
-  return Vec256<TI>{_mm256_shuffle_epi8(bytes.raw, from.raw)};
-}
-
-// Partial index vector
-template <typename T, typename TI, size_t NI>
-HWY_API Vec128<TI, NI> TableLookupBytes(const Vec256<T> bytes,
-                                        const Vec128<TI, NI> from) {
-  // First expand to full 128, then 256.
-  const auto from_256 = ZeroExtendVector(Full256<TI>(), Vec128<TI>{from.raw});
-  const auto tbl_full = TableLookupBytes(bytes, from_256);
-  // Shrink to 128, then partial.
-  return Vec128<TI, NI>{LowerHalf(Full128<TI>(), tbl_full).raw};
-}
-
-// Partial table vector
-template <typename T, size_t N, typename TI>
-HWY_API Vec256<TI> TableLookupBytes(const Vec128<T, N> bytes,
-                                    const Vec256<TI> from) {
-  // First expand to full 128, then 256.
-  const auto bytes_256 = ZeroExtendVector(Full256<T>(), Vec128<T>{bytes.raw});
-  return TableLookupBytes(bytes_256, from);
-}
-
-// Partial both are handled by x86_128.
-
-// ------------------------------ Shl (Mul, ZipLower)
-
-namespace detail {
-
-#if HWY_TARGET > HWY_AVX3  // AVX2 or older
-
-// Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts.
-template <typename T>
-HWY_INLINE Vec256<MakeUnsigned<T>> Pow2(const Vec256<T> v) {
-  static_assert(sizeof(T) == 2, "Only for 16-bit");
-  const Full256<T> d;
-  const RepartitionToWide<decltype(d)> dw;
-  const Rebind<float, decltype(dw)> df;
-  const auto zero = Zero(d);
-  // Move into exponent (this u16 will become the upper half of an f32)
-  const auto exp = ShiftLeft<23 - 16>(v);
-  const auto upper = exp + Set(d, 0x3F80);  // upper half of 1.0f
-  // Insert 0 into lower halves for reinterpreting as binary32.
-  const auto f0 = ZipLower(dw, zero, upper);
-  const auto f1 = ZipUpper(dw, zero, upper);
-  // Do not use ConvertTo because it checks for overflow, which is redundant
-  // because we only care about v in [0, 16).
-  const Vec256<int32_t> bits0{_mm256_cvttps_epi32(BitCast(df, f0).raw)};
-  const Vec256<int32_t> bits1{_mm256_cvttps_epi32(BitCast(df, f1).raw)};
-  return Vec256<MakeUnsigned<T>>{_mm256_packus_epi32(bits0.raw, bits1.raw)};
-}
-
-#endif  // HWY_TARGET > HWY_AVX3
-
-HWY_INLINE Vec256<uint16_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint16_t> v,
-                                Vec256<uint16_t> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<uint16_t>{_mm256_sllv_epi16(v.raw, bits.raw)};
-#else
-  return v * Pow2(bits);
-#endif
-}
-
-HWY_INLINE Vec256<uint32_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint32_t> v,
-                                Vec256<uint32_t> bits) {
-  return Vec256<uint32_t>{_mm256_sllv_epi32(v.raw, bits.raw)};
-}
-
-HWY_INLINE Vec256<uint64_t> Shl(hwy::UnsignedTag /*tag*/, Vec256<uint64_t> v,
-                                Vec256<uint64_t> bits) {
-  return Vec256<uint64_t>{_mm256_sllv_epi64(v.raw, bits.raw)};
-}
-
-template <typename T>
-HWY_INLINE Vec256<T> Shl(hwy::SignedTag /*tag*/, Vec256<T> v, Vec256<T> bits) {
-  // Signed left shifts are the same as unsigned.
-  const Full256<T> di;
-  const Full256<MakeUnsigned<T>> du;
-  return BitCast(di,
-                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> operator<<(Vec256<T> v, Vec256<T> bits) {
-  return detail::Shl(hwy::TypeTag<T>(), v, bits);
-}
-
-// ------------------------------ Shr (MulHigh, IfThenElse, Not)
-
-HWY_API Vec256<uint16_t> operator>>(Vec256<uint16_t> v, Vec256<uint16_t> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<uint16_t>{_mm256_srlv_epi16(v.raw, bits.raw)};
-#else
-  Full256<uint16_t> d;
-  // For bits=0, we cannot mul by 2^16, so fix the result later.
-  auto out = MulHigh(v, detail::Pow2(Set(d, 16) - bits));
-  // Replace output with input where bits == 0.
-  return IfThenElse(bits == Zero(d), v, out);
-#endif
-}
-
-HWY_API Vec256<uint32_t> operator>>(Vec256<uint32_t> v, Vec256<uint32_t> bits) {
-  return Vec256<uint32_t>{_mm256_srlv_epi32(v.raw, bits.raw)};
-}
-
-HWY_API Vec256<uint64_t> operator>>(Vec256<uint64_t> v, Vec256<uint64_t> bits) {
-  return Vec256<uint64_t>{_mm256_srlv_epi64(v.raw, bits.raw)};
-}
-
-HWY_API Vec256<int16_t> operator>>(Vec256<int16_t> v, Vec256<int16_t> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int16_t>{_mm256_srav_epi16(v.raw, bits.raw)};
-#else
-  return detail::SignedShr(Full256<int16_t>(), v, bits);
-#endif
-}
-
-HWY_API Vec256<int32_t> operator>>(Vec256<int32_t> v, Vec256<int32_t> bits) {
-  return Vec256<int32_t>{_mm256_srav_epi32(v.raw, bits.raw)};
-}
-
-HWY_API Vec256<int64_t> operator>>(Vec256<int64_t> v, Vec256<int64_t> bits) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<int64_t>{_mm256_srav_epi64(v.raw, bits.raw)};
-#else
-  return detail::SignedShr(Full256<int64_t>(), v, bits);
-#endif
-}
-
-HWY_INLINE Vec256<uint64_t> MulEven(const Vec256<uint64_t> a,
-                                    const Vec256<uint64_t> b) {
-  const DFromV<decltype(a)> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need the lower 32 bits
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
-  // the even (lower 64 bits of every 128-bit block) results. See
-  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveLower(mulL, mulH);
-}
-
-HWY_INLINE Vec256<uint64_t> MulOdd(const Vec256<uint64_t> a,
-                                   const Vec256<uint64_t> b) {
-  const DFromV<decltype(a)> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Same as above, but we're using the odd results (upper 64 bits per block).
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveUpper(du64, mulL, mulH);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-HWY_API Vec256<float> ReorderWidenMulAccumulate(Full256<float> df32,
-                                                Vec256<bfloat16_t> a,
-                                                Vec256<bfloat16_t> b,
-                                                const Vec256<float> sum0,
-                                                Vec256<float>& sum1) {
-  // TODO(janwas): _mm256_dpbf16_ps when available
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const Vec256<uint16_t> zero = Zero(du16);
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo.
-  const Vec256<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const Vec256<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const Vec256<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const Vec256<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ================================================== CONVERT
-
-// ------------------------------ Promotions (part w/ narrow lanes -> full)
-
-HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
-                                 const Vec128<float, 4> v) {
-  return Vec256<double>{_mm256_cvtps_pd(v.raw)};
-}
-
-HWY_API Vec256<double> PromoteTo(Full256<double> /* tag */,
-                                 const Vec128<int32_t, 4> v) {
-  return Vec256<double>{_mm256_cvtepi32_pd(v.raw)};
-}
-
-// Unsigned: zero-extend.
-// Note: these have 3 cycle latency; if inputs are already split across the
-// 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
-HWY_API Vec256<uint16_t> PromoteTo(Full256<uint16_t> /* tag */,
-                                   Vec128<uint8_t> v) {
-  return Vec256<uint16_t>{_mm256_cvtepu8_epi16(v.raw)};
-}
-HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
-                                   Vec128<uint8_t, 8> v) {
-  return Vec256<uint32_t>{_mm256_cvtepu8_epi32(v.raw)};
-}
-HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
-                                  Vec128<uint8_t> v) {
-  return Vec256<int16_t>{_mm256_cvtepu8_epi16(v.raw)};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  Vec128<uint8_t, 8> v) {
-  return Vec256<int32_t>{_mm256_cvtepu8_epi32(v.raw)};
-}
-HWY_API Vec256<uint32_t> PromoteTo(Full256<uint32_t> /* tag */,
-                                   Vec128<uint16_t> v) {
-  return Vec256<uint32_t>{_mm256_cvtepu16_epi32(v.raw)};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  Vec128<uint16_t> v) {
-  return Vec256<int32_t>{_mm256_cvtepu16_epi32(v.raw)};
-}
-HWY_API Vec256<uint64_t> PromoteTo(Full256<uint64_t> /* tag */,
-                                   Vec128<uint32_t> v) {
-  return Vec256<uint64_t>{_mm256_cvtepu32_epi64(v.raw)};
-}
-
-// Signed: replicate sign bit.
-// Note: these have 3 cycle latency; if inputs are already split across the
-// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
-// signed shift would be faster.
-HWY_API Vec256<int16_t> PromoteTo(Full256<int16_t> /* tag */,
-                                  Vec128<int8_t> v) {
-  return Vec256<int16_t>{_mm256_cvtepi8_epi16(v.raw)};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  Vec128<int8_t, 8> v) {
-  return Vec256<int32_t>{_mm256_cvtepi8_epi32(v.raw)};
-}
-HWY_API Vec256<int32_t> PromoteTo(Full256<int32_t> /* tag */,
-                                  Vec128<int16_t> v) {
-  return Vec256<int32_t>{_mm256_cvtepi16_epi32(v.raw)};
-}
-HWY_API Vec256<int64_t> PromoteTo(Full256<int64_t> /* tag */,
-                                  Vec128<int32_t> v) {
-  return Vec256<int64_t>{_mm256_cvtepi32_epi64(v.raw)};
-}
-
-// ------------------------------ Demotions (full -> part w/ narrow lanes)
-
-HWY_API Vec128<uint16_t> DemoteTo(Full128<uint16_t> /* tag */,
-                                  const Vec256<int32_t> v) {
-  const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw);
-  // Concatenating lower halves of both 128-bit blocks afterward is more
-  // efficient than an extra input with low block = high block of v.
-  return Vec128<uint16_t>{
-      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))};
-}
-
-HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */,
-                                 const Vec256<int32_t> v) {
-  const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw);
-  return Vec128<int16_t>{
-      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))};
-}
-
-HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */,
-                                    const Vec256<int32_t> v) {
-  const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw);
-  // Concatenate lower 64 bits of each 128-bit block
-  const __m256i u16_concat = _mm256_permute4x64_epi64(u16_blocks, 0x88);
-  const __m128i u16 = _mm256_castsi256_si128(u16_concat);
-  // packus treats the input as signed; we want unsigned. Clear the MSB to get
-  // unsigned saturation to u8.
-  const __m128i i16 = _mm_and_si128(u16, _mm_set1_epi16(0x7FFF));
-  return Vec128<uint8_t, 8>{_mm_packus_epi16(i16, i16)};
-}
-
-HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */,
-                                 const Vec256<int16_t> v) {
-  const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw);
-  return Vec128<uint8_t>{
-      _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))};
-}
-
-HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */,
-                                   const Vec256<int32_t> v) {
-  const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw);
-  // Concatenate lower 64 bits of each 128-bit block
-  const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88);
-  const __m128i i16 = _mm256_castsi256_si128(i16_concat);
-  return Vec128<int8_t, 8>{_mm_packs_epi16(i16, i16)};
-}
-
-HWY_API Vec128<int8_t> DemoteTo(Full128<int8_t> /* tag */,
-                                const Vec256<int16_t> v) {
-  const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw);
-  return Vec128<int8_t>{
-      _mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))};
-}
-
-  // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'".
-  // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here.
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion")
-
-HWY_API Vec128<float16_t> DemoteTo(Full128<float16_t> df16,
-                                   const Vec256<float> v) {
-#ifdef HWY_DISABLE_F16C
-  const RebindToUnsigned<decltype(df16)> du16;
-  const Rebind<uint32_t, decltype(df16)> du;
-  const RebindToSigned<decltype(du)> di;
-  const auto bits32 = BitCast(du, v);
-  const auto sign = ShiftRight<31>(bits32);
-  const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF);
-  const auto mantissa32 = bits32 & Set(du, 0x7FFFFF);
-
-  const auto k15 = Set(di, 15);
-  const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15);
-  const auto is_tiny = exp < Set(di, -24);
-
-  const auto is_subnormal = exp < Set(di, -14);
-  const auto biased_exp16 =
-      BitCast(du, IfThenZeroElse(is_subnormal, exp + k15));
-  const auto sub_exp = BitCast(du, Set(di, -14) - exp);  // [1, 11)
-  const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) +
-                     (mantissa32 >> (Set(du, 13) + sub_exp));
-  const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m,
-                                     ShiftRight<13>(mantissa32));  // <1024
-
-  const auto sign16 = ShiftLeft<15>(sign);
-  const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16;
-  const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16));
-  return BitCast(df16, DemoteTo(du16, bits16));
-#else
-  (void)df16;
-  return Vec128<float16_t>{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
-#endif
-}
-
-HWY_DIAGNOSTICS(pop)
-
-HWY_API Vec128<bfloat16_t> DemoteTo(Full128<bfloat16_t> dbf16,
-                                    const Vec256<float> v) {
-  // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16.
-  const Rebind<int32_t, decltype(dbf16)> di32;
-  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
-  const Rebind<uint16_t, decltype(dbf16)> du16;
-  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
-  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
-}
-
-HWY_API Vec256<bfloat16_t> ReorderDemote2To(Full256<bfloat16_t> dbf16,
-                                            Vec256<float> a, Vec256<float> b) {
-  // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16.
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec256<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-HWY_API Vec128<float> DemoteTo(Full128<float> /* tag */,
-                               const Vec256<double> v) {
-  return Vec128<float>{_mm256_cvtpd_ps(v.raw)};
-}
-
-HWY_API Vec128<int32_t> DemoteTo(Full128<int32_t> /* tag */,
-                                 const Vec256<double> v) {
-  const auto clamped = detail::ClampF64ToI32Max(Full256<double>(), v);
-  return Vec128<int32_t>{_mm256_cvttpd_epi32(clamped.raw)};
-}
-
-// For already range-limited input [0, 255].
-HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) {
-  const Full256<uint32_t> d32;
-  alignas(32) static constexpr uint32_t k8From32[8] = {
-      0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u};
-  // Place first four bytes in lo[0], remaining 4 in hi[1].
-  const auto quad = TableLookupBytes(v, Load(d32, k8From32));
-  // Interleave both quadruplets - OR instead of unpack reduces port5 pressure.
-  const auto lo = LowerHalf(quad);
-  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
-  const auto pair = LowerHalf(lo | hi);
-  return BitCast(Full64<uint8_t>(), pair);
-}
-
-// ------------------------------ Truncations
-
-namespace detail {
-
-// LO and HI each hold four indices of bytes within a 128-bit block.
-template <uint32_t LO, uint32_t HI, typename T>
-HWY_INLINE Vec128<uint32_t> LookupAndConcatHalves(Vec256<T> v) {
-  const Full256<uint32_t> d32;
-
-#if HWY_TARGET <= HWY_AVX3_DL
-  alignas(32) constexpr uint32_t kMap[8] = {
-      LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
-  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
-#else
-  alignas(32) static constexpr uint32_t kMap[8] = {LO,  HI,  ~0u, ~0u,
-                                                   ~0u, ~0u, LO,  HI};
-  const auto quad = TableLookupBytes(v, Load(d32, kMap));
-  const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC);
-  // Possible alternative:
-  // const auto lo = LowerHalf(quad);
-  // const auto hi = UpperHalf(Full128<uint32_t>(), quad);
-  // const auto result = lo | hi;
-#endif
-
-  return Vec128<uint32_t>{_mm256_castsi256_si128(result)};
-}
-
-// LO and HI each hold two indices of bytes within a 128-bit block.
-template <uint16_t LO, uint16_t HI, typename T>
-HWY_INLINE Vec128<uint32_t, 2> LookupAndConcatQuarters(Vec256<T> v) {
-  const Full256<uint16_t> d16;
-
-#if HWY_TARGET <= HWY_AVX3_DL
-  alignas(32) constexpr uint16_t kMap[16] = {
-      LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
-  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d16, kMap).raw);
-  return LowerHalf(Vec128<uint32_t>{_mm256_castsi256_si128(result)});
-#else
-  constexpr uint16_t ff = static_cast<uint16_t>(~0u);
-  alignas(32) static constexpr uint16_t kMap[16] = {
-      LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff};
-  const auto quad = TableLookupBytes(v, Load(d16, kMap));
-  const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC);
-  const auto half = _mm256_castsi256_si128(mixed);
-  return LowerHalf(Vec128<uint32_t>{_mm_packus_epi32(half, half)});
-#endif
-}
-
-}  // namespace detail
-
-HWY_API Vec128<uint8_t, 4> TruncateTo(Simd<uint8_t, 4, 0> /* tag */,
-                                      const Vec256<uint64_t> v) {
-  const Full256<uint32_t> d32;
-#if HWY_TARGET <= HWY_AVX3_DL
-  alignas(32) constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0};
-  const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
-  return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{result})));
-#else
-  alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
-                                                   0x0800FFFFu, ~0u, ~0u, ~0u};
-  const auto quad = TableLookupBytes(v, Load(d32, kMap));
-  const auto lo = LowerHalf(quad);
-  const auto hi = UpperHalf(Full128<uint32_t>(), quad);
-  const auto result = lo | hi;
-  return LowerHalf(LowerHalf(Vec128<uint8_t>{result.raw}));
-#endif
-}
-
-HWY_API Vec128<uint16_t, 4> TruncateTo(Simd<uint16_t, 4, 0> /* tag */,
-                                       const Vec256<uint64_t> v) {
-  const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v);
-  return Vec128<uint16_t, 4>{result.raw};
-}
-
-HWY_API Vec128<uint32_t> TruncateTo(Simd<uint32_t, 4, 0> /* tag */,
-                                    const Vec256<uint64_t> v) {
-  const Full256<uint32_t> d32;
-  alignas(32) constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-  const auto v32 =
-      TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven));
-  return LowerHalf(Vec256<uint32_t>{v32.raw});
-}
-
-HWY_API Vec128<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> /* tag */,
-                                      const Vec256<uint32_t> v) {
-  const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v);
-  return Vec128<uint8_t, 8>{full.raw};
-}
-
-HWY_API Vec128<uint16_t> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
-                                    const Vec256<uint32_t> v) {
-  const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v);
-  return Vec128<uint16_t>{full.raw};
-}
-
-HWY_API Vec128<uint8_t> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
-                                   const Vec256<uint16_t> v) {
-  const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v);
-  return Vec128<uint8_t>{full.raw};
-}
-
-// ------------------------------ Integer <=> fp (ShiftRight, OddEven)
-
-HWY_API Vec256<float> ConvertTo(Full256<float> /* tag */,
-                                const Vec256<int32_t> v) {
-  return Vec256<float>{_mm256_cvtepi32_ps(v.raw)};
-}
-
-HWY_API Vec256<double> ConvertTo(Full256<double> dd, const Vec256<int64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  (void)dd;
-  return Vec256<double>{_mm256_cvtepi64_pd(v.raw)};
-#else
-  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
-  const Repartition<uint32_t, decltype(dd)> d32;
-  const Repartition<uint64_t, decltype(dd)> d64;
-
-  // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63
-  const auto k84_63 = Set(d64, 0x4530000080000000ULL);
-  const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63);
-
-  // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven)
-  const auto k52 = Set(d32, 0x43300000);
-  const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v)));
-
-  const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL));
-  return (v_upper - k84_63_52) + v_lower;  // order matters!
-#endif
-}
-
-HWY_API Vec256<float> ConvertTo(HWY_MAYBE_UNUSED Full256<float> df,
-                                const Vec256<uint32_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<float>{_mm256_cvtepu32_ps(v.raw)};
-#else
-  // Based on wim's approach (https://stackoverflow.com/questions/34066228/)
-  const RebindToUnsigned<decltype(df)> du32;
-  const RebindToSigned<decltype(df)> d32;
-
-  const auto msk_lo = Set(du32, 0xFFFF);
-  const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16
-
-  // Extract the 16 lowest/highest significant bits of v and cast to signed int
-  const auto v_lo = BitCast(d32, And(v, msk_lo));
-  const auto v_hi = BitCast(d32, ShiftRight<16>(v));
-
-  return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo));
-#endif
-}
-
-HWY_API Vec256<double> ConvertTo(HWY_MAYBE_UNUSED Full256<double> dd,
-                                  const Vec256<uint64_t> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return Vec256<double>{_mm256_cvtepu64_pd(v.raw)};
-#else
-  // Based on wim's approach (https://stackoverflow.com/questions/41144668/)
-  const RebindToUnsigned<decltype(dd)> d64;
-  using VU = VFromD<decltype(d64)>;
-
-  const VU msk_lo = Set(d64, 0xFFFFFFFFULL);
-  const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32
-
-   // Extract the 32 lowest significant bits of v
-  const VU v_lo = And(v, msk_lo);
-  const VU v_hi = ShiftRight<32>(v);
-
-  auto uint64_to_double256_fast = [&dd](Vec256<uint64_t> w) HWY_ATTR {
-    w = Or(w, Vec256<uint64_t>{
-                  detail::BitCastToInteger(Set(dd, 0x0010000000000000).raw)});
-    return BitCast(dd, w) - Set(dd, 0x0010000000000000);
-  };
-
-  const auto v_lo_dbl = uint64_to_double256_fast(v_lo);
-  return MulAdd(cnst2_32_dbl, uint64_to_double256_fast(v_hi), v_lo_dbl);
-#endif
-}
-
-// Truncates (rounds toward zero).
-HWY_API Vec256<int32_t> ConvertTo(Full256<int32_t> d, const Vec256<float> v) {
-  return detail::FixConversionOverflow(d, v, _mm256_cvttps_epi32(v.raw));
-}
-
-HWY_API Vec256<int64_t> ConvertTo(Full256<int64_t> di, const Vec256<double> v) {
-#if HWY_TARGET <= HWY_AVX3
-  return detail::FixConversionOverflow(di, v, _mm256_cvttpd_epi64(v.raw));
-#else
-  using VI = decltype(Zero(di));
-  const VI k0 = Zero(di);
-  const VI k1 = Set(di, 1);
-  const VI k51 = Set(di, 51);
-
-  // Exponent indicates whether the number can be represented as int64_t.
-  const VI biased_exp = ShiftRight<52>(BitCast(di, v)) & Set(di, 0x7FF);
-  const VI exp = biased_exp - Set(di, 0x3FF);
-  const auto in_range = exp < Set(di, 63);
-
-  // If we were to cap the exponent at 51 and add 2^52, the number would be in
-  // [2^52, 2^53) and mantissa bits could be read out directly. We need to
-  // round-to-0 (truncate), but changing rounding mode in MXCSR hits a
-  // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead
-  // manually shift the mantissa into place (we already have many of the
-  // inputs anyway).
-  const VI shift_mnt = Max(k51 - exp, k0);
-  const VI shift_int = Max(exp - k51, k0);
-  const VI mantissa = BitCast(di, v) & Set(di, (1ULL << 52) - 1);
-  // Include implicit 1-bit; shift by one more to ensure it's in the mantissa.
-  const VI int52 = (mantissa | Set(di, 1ULL << 52)) >> (shift_mnt + k1);
-  // For inputs larger than 2^52, insert zeros at the bottom.
-  const VI shifted = int52 << shift_int;
-  // Restore the one bit lost when shifting in the implicit 1-bit.
-  const VI restored = shifted | ((mantissa & k1) << (shift_int - k1));
-
-  // Saturate to LimitsMin (unchanged when negating below) or LimitsMax.
-  const VI sign_mask = BroadcastSignBit(BitCast(di, v));
-  const VI limit = Set(di, LimitsMax<int64_t>()) - sign_mask;
-  const VI magnitude = IfThenElse(in_range, restored, limit);
-
-  // If the input was negative, negate the integer (two's complement).
-  return (magnitude ^ sign_mask) - sign_mask;
-#endif
-}
-
-HWY_API Vec256<int32_t> NearestInt(const Vec256<float> v) {
-  const Full256<int32_t> di;
-  return detail::FixConversionOverflow(di, v, _mm256_cvtps_epi32(v.raw));
-}
-
-
-HWY_API Vec256<float> PromoteTo(Full256<float> df32,
-                                const Vec128<float16_t> v) {
-#ifdef HWY_DISABLE_F16C
-  const RebindToSigned<decltype(df32)> di32;
-  const RebindToUnsigned<decltype(df32)> du32;
-  // Expand to u32 so we can shift.
-  const auto bits16 = PromoteTo(du32, Vec128<uint16_t>{v.raw});
-  const auto sign = ShiftRight<15>(bits16);
-  const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F);
-  const auto mantissa = bits16 & Set(du32, 0x3FF);
-  const auto subnormal =
-      BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) *
-                        Set(df32, 1.0f / 16384 / 1024));
-
-  const auto biased_exp32 = biased_exp + Set(du32, 127 - 15);
-  const auto mantissa32 = ShiftLeft<23 - 10>(mantissa);
-  const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32;
-  const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal);
-  return BitCast(df32, ShiftLeft<31>(sign) | bits32);
-#else
-  (void)df32;
-  return Vec256<float>{_mm256_cvtph_ps(v.raw)};
-#endif
-}
-
-HWY_API Vec256<float> PromoteTo(Full256<float> df32,
-                                const Vec128<bfloat16_t> v) {
-  const Rebind<uint16_t, decltype(df32)> du16;
-  const RebindToSigned<decltype(df32)> di32;
-  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
-}
-
-// ================================================== CRYPTO
-
-#if !defined(HWY_DISABLE_PCLMUL_AES)
-
-// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
-#ifdef HWY_NATIVE_AES
-#undef HWY_NATIVE_AES
-#else
-#define HWY_NATIVE_AES
-#endif
-
-HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state,
-                                 Vec256<uint8_t> round_key) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec256<uint8_t>{_mm256_aesenc_epi128(state.raw, round_key.raw)};
-#else
-  const Full256<uint8_t> d;
-  const Half<decltype(d)> d2;
-  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
-                 AESRound(LowerHalf(state), LowerHalf(round_key)));
-#endif
-}
-
-HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state,
-                                     Vec256<uint8_t> round_key) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)};
-#else
-  const Full256<uint8_t> d;
-  const Half<decltype(d)> d2;
-  return Combine(d,
-                 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
-                 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
-#endif
-}
-
-HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)};
-#else
-  const Full256<uint64_t> d;
-  const Half<decltype(d)> d2;
-  return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)),
-                 CLMulLower(LowerHalf(a), LowerHalf(b)));
-#endif
-}
-
-HWY_API Vec256<uint64_t> CLMulUpper(Vec256<uint64_t> a, Vec256<uint64_t> b) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)};
-#else
-  const Full256<uint64_t> d;
-  const Half<decltype(d)> d2;
-  return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)),
-                 CLMulUpper(LowerHalf(a), LowerHalf(b)));
-#endif
-}
-
-#endif  // HWY_DISABLE_PCLMUL_AES
-
-// ================================================== MISC
-
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, typename T2>
-HWY_API Vec256<T> Iota(const Full256<T> d, const T2 first) {
-  HWY_ALIGN T lanes[32 / sizeof(T)];
-  for (size_t i = 0; i < 32 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
-  }
-  return Load(d, lanes);
-}
-
-#if HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ LoadMaskBits
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T>
-HWY_API Mask256<T> LoadMaskBits(const Full256<T> /* tag */,
-                                const uint8_t* HWY_RESTRICT bits) {
-  constexpr size_t N = 32 / sizeof(T);
-  constexpr size_t kNumBytes = (N + 7) / 8;
-
-  uint64_t mask_bits = 0;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return Mask256<T>::FromBits(mask_bits);
-}
-
-// ------------------------------ StoreMaskBits
-
-// `p` points to at least 8 writable bytes.
-template <typename T>
-HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
-                             uint8_t* bits) {
-  constexpr size_t N = 32 / sizeof(T);
-  constexpr size_t kNumBytes = (N + 7) / 8;
-
-  CopyBytes<kNumBytes>(&mask.raw, bits);
-
-  // Non-full byte, need to clear the undefined upper bits.
-  if (N < 8) {
-    const int mask_bits = static_cast<int>((1ull << N) - 1);
-    bits[0] = static_cast<uint8_t>(bits[0] & mask_bits);
-  }
-  return kNumBytes;
-}
-
-// ------------------------------ Mask testing
-
-template <typename T>
-HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
-  return PopCount(static_cast<uint64_t>(mask.raw));
-}
-
-template <typename T>
-HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
-                               const Mask256<T> mask) {
-  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
-}
-
-// Beware: the suffix indicates the number of mask bits, not lane size!
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestz_mask32_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestz_mask16_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestz_mask8_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
-  return (uint64_t{mask.raw} & 0xF) == 0;
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
-  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestc_mask32_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0xFFFFFFFFu;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestc_mask16_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0xFFFFu;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestc_mask8_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0xFFu;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256<T> mask) {
-  // Cannot use _kortestc because we have less than 8 mask bits.
-  return mask.raw == 0xFu;
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
-  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
-}
-
-// ------------------------------ Compress
-
-// 16-bit is defined in x86_512 so we can use 512-bit vectors.
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
-  return Vec256<T>{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
-}
-
-HWY_API Vec256<float> Compress(Vec256<float> v, Mask256<float> mask) {
-  return Vec256<float>{_mm256_maskz_compress_ps(mask.raw, v.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
-  // See CompressIsPartition.
-  alignas(16) constexpr uint64_t packed_array[16] = {
-      // PrintCompress64x4NibbleTables
-      0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
-      0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
-      0x00001032, 0x00001320, 0x00000321, 0x00003210};
-
-  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
-  // _mm256_permutexvar_epi64 will ignore the upper bits.
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du64;
-  const auto packed = Set(du64, packed_array[mask.raw]);
-  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
-  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
-  return TableLookupLanes(v, indices);
-}
-
-// ------------------------------ CompressNot (Compress)
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> CompressNot(Vec256<T> v, const Mask256<T> mask) {
-  return Compress(v, Not(mask));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> mask) {
-  // See CompressIsPartition.
-  alignas(16) constexpr uint64_t packed_array[16] = {
-      // PrintCompressNot64x4NibbleTables
-      0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
-      0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
-      0x00003210, 0x00003201, 0x00003210, 0x00003210};
-
-  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
-  // _mm256_permutexvar_epi64 will ignore the upper bits.
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du64;
-  const auto packed = Set(du64, packed_array[mask.raw]);
-  alignas(64) constexpr uint64_t shifts[4] = {0, 4, 8, 12};
-  const auto indices = Indices256<T>{(packed >> Load(du64, shifts)).raw};
-  return TableLookupLanes(v, indices);
-}
-
-// ------------------------------ CompressBlocksNot
-HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
-                                           Mask256<uint64_t> mask) {
-  return CompressNot(v, mask);
-}
-
-// ------------------------------ CompressBits (LoadMaskBits)
-template <typename T>
-HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
-  return Compress(v, LoadMaskBits(Full256<T>(), bits));
-}
-
-// ------------------------------ CompressStore
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> d,
-                             T* HWY_RESTRICT unaligned) {
-  const Rebind<uint16_t, decltype(d)> du;
-  const auto vu = BitCast(du, v);  // (required for float16_t inputs)
-
-  const uint64_t mask_bits{mask.raw};
-
-#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
-  _mm256_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
-#else
-  // Split into halves to keep the table size manageable.
-  const Half<decltype(du)> duh;
-  const auto vL = LowerHalf(duh, vu);
-  const auto vH = UpperHalf(duh, vu);
-
-  const uint64_t mask_bitsL = mask_bits & 0xFF;
-  const uint64_t mask_bitsH = mask_bits >> 8;
-
-  const auto idxL = detail::IndicesForCompress16(mask_bitsL);
-  const auto idxH = detail::IndicesForCompress16(mask_bitsH);
-
-  // Compress and 128-bit halves.
-  const Vec128<uint16_t> cL{_mm_permutexvar_epi16(idxL.raw, vL.raw)};
-  const Vec128<uint16_t> cH{_mm_permutexvar_epi16(idxH.raw, vH.raw)};
-  const Half<decltype(d)> dh;
-  StoreU(BitCast(dh, cL), dh, unaligned);
-  StoreU(BitCast(dh, cH), dh, unaligned + PopCount(mask_bitsL));
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-  return PopCount(mask_bits);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
-                             T* HWY_RESTRICT unaligned) {
-  _mm256_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw});
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> mask, Full256<T> /* tag */,
-                             T* HWY_RESTRICT unaligned) {
-  _mm256_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-HWY_API size_t CompressStore(Vec256<float> v, Mask256<float> mask,
-                             Full256<float> /* tag */,
-                             float* HWY_RESTRICT unaligned) {
-  _mm256_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw});
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(float));
-#endif
-  return count;
-}
-
-HWY_API size_t CompressStore(Vec256<double> v, Mask256<double> mask,
-                             Full256<double> /* tag */,
-                             double* HWY_RESTRICT unaligned) {
-  _mm256_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw} & 0xFull);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(double));
-#endif
-  return count;
-}
-
-// ------------------------------ CompressBlendedStore (CompressStore)
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  // Native (32 or 64-bit) AVX-512 instruction already does the blending at no
-  // extra cost (latency 11, rthroughput 2 - same as compress plus store).
-  return CompressStore(v, m, d, unaligned);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-#if HWY_TARGET <= HWY_AVX3_DL
-  return CompressStore(v, m, d, unaligned);  // also native
-#else
-  const size_t count = CountTrue(d, m);
-  BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-#endif
-}
-
-// ------------------------------ CompressBitsStore (LoadMaskBits)
-
-template <typename T>
-HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
-                                 Full256<T> d, T* HWY_RESTRICT unaligned) {
-  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
-}
-
-#else  // AVX2
-
-// ------------------------------ LoadMaskBits (TestBit)
-
-namespace detail {
-
-// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_LE128 there.
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  const Repartition<uint32_t, decltype(d)> du32;
-  const auto vbits = BitCast(du, Set(du32, static_cast<uint32_t>(mask_bits)));
-
-  // Replicate bytes 8x such that each byte contains the bit that governs it.
-  const Repartition<uint64_t, decltype(d)> du64;
-  alignas(32) constexpr uint64_t kRep8[4] = {
-      0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
-      0x0303030303030303ull};
-  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));
-
-  alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
-                                            1, 2, 4, 8, 16, 32, 64, 128};
-  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(32) constexpr uint16_t kBit[16] = {
-      1,     2,     4,     8,     16,     32,     64,     128,
-      0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
-  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
-  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
-  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
-  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Mask256<T> LoadMaskBits256(Full256<T> d, uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(32) constexpr uint64_t kBit[8] = {1, 2, 4, 8};
-  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T>
-HWY_API Mask256<T> LoadMaskBits(Full256<T> d,
-                                const uint8_t* HWY_RESTRICT bits) {
-  constexpr size_t N = 32 / sizeof(T);
-  constexpr size_t kNumBytes = (N + 7) / 8;
-
-  uint64_t mask_bits = 0;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return detail::LoadMaskBits256(d, mask_bits);
-}
-
-// ------------------------------ StoreMaskBits
-
-namespace detail {
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  const Full256<T> d;
-  const Full256<uint8_t> d8;
-  const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
-  // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
-  return static_cast<uint32_t>(_mm256_movemask_epi8(sign_bits));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-#if HWY_ARCH_X86_64
-  const Full256<T> d;
-  const Full256<uint8_t> d8;
-  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  const uint64_t sign_bits8 = BitsFromMask(mask8);
-  // Skip the bits from the lower byte of each u16 (better not to use the
-  // same packs_epi16 as SSE4, because that requires an extra swizzle here).
-  return _pext_u64(sign_bits8, 0xAAAAAAAAull);
-#else
-  // Slow workaround for 32-bit builds, which lack _pext_u64.
-  // Remove useless lower half of each u16 while preserving the sign bit.
-  // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
-  const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
-  // Move odd qwords (value zero) to top so they don't affect the mask value.
-  const auto compressed =
-      _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0));
-  return static_cast<unsigned>(_mm256_movemask_epi8(compressed));
-#endif  // HWY_ARCH_X86_64
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  const Full256<T> d;
-  const Full256<float> df;
-  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
-  return static_cast<unsigned>(_mm256_movemask_ps(sign_bits));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE uint64_t BitsFromMask(const Mask256<T> mask) {
-  const Full256<T> d;
-  const Full256<double> df;
-  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
-  return static_cast<unsigned>(_mm256_movemask_pd(sign_bits));
-}
-
-}  // namespace detail
-
-// `p` points to at least 8 writable bytes.
-template <typename T>
-HWY_API size_t StoreMaskBits(const Full256<T> /* tag */, const Mask256<T> mask,
-                             uint8_t* bits) {
-  constexpr size_t N = 32 / sizeof(T);
-  constexpr size_t kNumBytes = (N + 7) / 8;
-
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  CopyBytes<kNumBytes>(&mask_bits, bits);
-  return kNumBytes;
-}
-
-// ------------------------------ Mask testing
-
-// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask
-// lane is 0 or ~0.
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API bool AllFalse(const Full256<T> d, const Mask256<T> mask) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return detail::BitsFromMask(mask8) == 0;
-}
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API bool AllFalse(const Full256<T> /* tag */, const Mask256<T> mask) {
-  // Cheaper than PTEST, which is 2 uop / 3L.
-  return detail::BitsFromMask(mask) == 0;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API bool AllTrue(const Full256<T> d, const Mask256<T> mask) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
-}
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API bool AllTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
-  constexpr uint64_t kAllBits = (1ull << (32 / sizeof(T))) - 1;
-  return detail::BitsFromMask(mask) == kAllBits;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CountTrue(const Full256<T> d, const Mask256<T> mask) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  const Mask256<uint8_t> mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
-  return PopCount(detail::BitsFromMask(mask8)) >> 1;
-}
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API size_t CountTrue(const Full256<T> /* tag */, const Mask256<T> mask) {
-  return PopCount(detail::BitsFromMask(mask));
-}
-
-template <typename T>
-HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */,
-                               const Mask256<T> mask) {
-  const uint64_t mask_bits = detail::BitsFromMask(mask);
-  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1;
-}
-
-// ------------------------------ Compress, CompressBits
-
-namespace detail {
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
-                                                uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> d32;
-  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
-  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
-  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
-  // and unavailable in 32-bit builds. We instead compress each index into 4
-  // bits, for a total of 1 KiB.
-  alignas(16) constexpr uint32_t packed_array[256] = {
-      // PrintCompress32x8Tables
-      0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
-      0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
-      0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
-      0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
-      0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
-      0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
-      0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
-      0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
-      0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
-      0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
-      0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
-      0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
-      0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
-      0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
-      0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
-      0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
-      0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
-      0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
-      0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
-      0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
-      0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
-      0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
-      0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
-      0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
-      0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
-      0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
-      0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
-      0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
-      0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
-      0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
-      0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
-      0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
-      0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
-      0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
-      0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
-      0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
-      0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
-      0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
-      0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
-      0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
-      0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
-      0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
-      0x10765432, 0x17654320, 0x07654321, 0x76543210};
-
-  // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
-  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
-  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
-  // latency, it may be faster to use LoadDup128 and PSHUFB.
-  const auto packed = Set(d32, packed_array[mask_bits]);
-  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
-  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d,
-                                                uint64_t mask_bits) {
-  const Repartition<uint32_t, decltype(d)> d32;
-
-  // For 64-bit, we still need 32-bit indices because there is no 64-bit
-  // permutevar, but there are only 4 lanes, so we can afford to skip the
-  // unpacking and load the entire index vector directly.
-  alignas(32) constexpr uint32_t u32_indices[128] = {
-      // PrintCompress64x4PairTables
-      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
-      6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 4, 5,
-      2, 3, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 6, 7,
-      0, 1, 2, 3, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 2, 3, 6, 7, 0, 1, 4, 5,
-      0, 1, 2, 3, 6, 7, 4, 5, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 4, 5, 6, 7,
-      2, 3, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7};
-  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
-                                                   uint64_t mask_bits) {
-  const RebindToUnsigned<decltype(d)> d32;
-  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
-  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
-  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
-  // and unavailable in 32-bit builds. We instead compress each index into 4
-  // bits, for a total of 1 KiB.
-  alignas(16) constexpr uint32_t packed_array[256] = {
-      // PrintCompressNot32x8Tables
-      0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
-      0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
-      0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
-      0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
-      0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
-      0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
-      0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
-      0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
-      0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
-      0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
-      0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
-      0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
-      0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
-      0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
-      0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
-      0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
-      0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
-      0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
-      0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
-      0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
-      0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
-      0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
-      0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
-      0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
-      0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
-      0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
-      0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
-      0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
-      0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
-      0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
-      0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
-      0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
-      0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
-      0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
-      0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
-      0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
-      0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
-      0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
-      0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
-      0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
-      0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
-      0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
-      0x76543210, 0x76543201, 0x76543210, 0x76543210};
-
-  // No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31.
-  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
-  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
-  // latency, it may be faster to use LoadDup128 and PSHUFB.
-  const auto packed = Set(d32, packed_array[mask_bits]);
-  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
-  return Indices256<uint32_t>{(packed >> Load(d32, shifts)).raw};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_INLINE Indices256<uint32_t> IndicesFromNotBits(Full256<T> d,
-                                                   uint64_t mask_bits) {
-  const Repartition<uint32_t, decltype(d)> d32;
-
-  // For 64-bit, we still need 32-bit indices because there is no 64-bit
-  // permutevar, but there are only 4 lanes, so we can afford to skip the
-  // unpacking and load the entire index vector directly.
-  alignas(32) constexpr uint32_t u32_indices[128] = {
-      // PrintCompressNot64x4PairTables
-      0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 0, 1, 4, 5, 6, 7,
-      2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 6, 7, 4, 5, 2, 3, 6, 7,
-      0, 1, 4, 5, 0, 1, 6, 7, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 0, 1,
-      2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 0, 1, 6, 7, 0, 1, 4, 5, 2, 3, 6, 7,
-      4, 5, 0, 1, 2, 3, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 0, 1, 4, 5,
-      6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
-  return Indices256<uint32_t>{Load(d32, u32_indices + 8 * mask_bits).raw};
-}
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
-  const Full256<T> d;
-  const Repartition<uint32_t, decltype(d)> du32;
-
-  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
-  const auto indices = IndicesFromBits(d, mask_bits);
-  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
-}
-
-// LUTs are infeasible for 2^16 possible masks, so splice together two
-// half-vector Compress.
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) {
-  const Full256<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
-  const Half<decltype(du)> duh;
-  const auto half0 = LowerHalf(duh, vu16);
-  const auto half1 = UpperHalf(duh, vu16);
-
-  const uint64_t mask_bits0 = mask_bits & 0xFF;
-  const uint64_t mask_bits1 = mask_bits >> 8;
-  const auto compressed0 = detail::CompressBits(half0, mask_bits0);
-  const auto compressed1 = detail::CompressBits(half1, mask_bits1);
-
-  alignas(32) uint16_t all_true[16] = {};
-  // Store mask=true lanes, left to right.
-  const size_t num_true0 = PopCount(mask_bits0);
-  Store(compressed0, duh, all_true);
-  StoreU(compressed1, duh, all_true + num_true0);
-
-  if (hwy::HWY_NAMESPACE::CompressIsPartition<T>::value) {
-    // Store mask=false lanes, right to left. The second vector fills the upper
-    // half with right-aligned false lanes. The first vector is shifted
-    // rightwards to overwrite the true lanes of the second.
-    alignas(32) uint16_t all_false[16] = {};
-    const size_t num_true1 = PopCount(mask_bits1);
-    Store(compressed1, duh, all_false + 8);
-    StoreU(compressed0, duh, all_false + num_true1);
-
-    const auto mask = FirstN(du, num_true0 + num_true1);
-    return BitCast(d,
-                   IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
-  } else {
-    // Only care about the mask=true lanes.
-    return BitCast(d, Load(du, all_true));
-  }
-}
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
-  const Full256<T> d;
-  const Repartition<uint32_t, decltype(d)> du32;
-
-  HWY_DASSERT(mask_bits < (1ull << (32 / sizeof(T))));
-  const auto indices = IndicesFromNotBits(d, mask_bits);
-  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
-}
-
-// LUTs are infeasible for 2^16 possible masks, so splice together two
-// half-vector Compress.
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_INLINE Vec256<T> CompressNot(Vec256<T> v, const uint64_t mask_bits) {
-  // Compress ensures only the lower 16 bits are set, so flip those.
-  return Compress(v, mask_bits ^ 0xFFFF);
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> m) {
-  return detail::Compress(v, detail::BitsFromMask(m));
-}
-
-template <typename T>
-HWY_API Vec256<T> CompressNot(Vec256<T> v, Mask256<T> m) {
-  return detail::CompressNot(v, detail::BitsFromMask(m));
-}
-
-HWY_API Vec256<uint64_t> CompressBlocksNot(Vec256<uint64_t> v,
-                                           Mask256<uint64_t> mask) {
-  return CompressNot(v, mask);
-}
-
-template <typename T>
-HWY_API Vec256<T> CompressBits(Vec256<T> v, const uint8_t* HWY_RESTRICT bits) {
-  constexpr size_t N = 32 / sizeof(T);
-  constexpr size_t kNumBytes = (N + 7) / 8;
-
-  uint64_t mask_bits = 0;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-
-  return detail::Compress(v, mask_bits);
-}
-
-// ------------------------------ CompressStore, CompressBitsStore
-
-template <typename T>
-HWY_API size_t CompressStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                             T* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  const size_t count = PopCount(mask_bits);
-  StoreU(detail::Compress(v, mask_bits), d, unaligned);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  const size_t count = PopCount(mask_bits);
-  BlendedStore(detail::Compress(v, mask_bits), FirstN(d, count), d, unaligned);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  const uint64_t mask_bits = detail::BitsFromMask(m);
-  const size_t count = PopCount(mask_bits);
-  const Vec256<T> compressed = detail::Compress(v, mask_bits);
-
-#if HWY_MEM_OPS_MIGHT_FAULT  // true if HWY_IS_MSAN
-  // BlendedStore tests mask for each lane, but we know that the mask is
-  // FirstN, so we can just copy.
-  alignas(32) T buf[16];
-  Store(compressed, d, buf);
-  memcpy(unaligned, buf, count * sizeof(T));
-#else
-  BlendedStore(compressed, FirstN(d, count), d, unaligned);
-#endif
-  return count;
-}
-
-template <typename T>
-HWY_API size_t CompressBitsStore(Vec256<T> v, const uint8_t* HWY_RESTRICT bits,
-                                 Full256<T> d, T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 32 / sizeof(T);
-  constexpr size_t kNumBytes = (N + 7) / 8;
-
-  uint64_t mask_bits = 0;
-  CopyBytes<kNumBytes>(bits, &mask_bits);
-
-  if (N < 8) {
-    mask_bits &= (1ull << N) - 1;
-  }
-  const size_t count = PopCount(mask_bits);
-
-  StoreU(detail::Compress(v, mask_bits), d, unaligned);
-  // Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-#endif  // HWY_TARGET <= HWY_AVX3
-
-// ------------------------------ LoadInterleaved3/4
-
-// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.
-
-namespace detail {
-
-// Input:
-// 1 0 (<- first block of unaligned)
-// 3 2
-// 5 4
-// Output:
-// 3 0
-// 4 1
-// 5 2
-template <typename T>
-HWY_API void LoadTransposedBlocks3(Full256<T> d,
-                                   const T* HWY_RESTRICT unaligned,
-                                   Vec256<T>& A, Vec256<T>& B, Vec256<T>& C) {
-  constexpr size_t N = 32 / sizeof(T);
-  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);  // 1 0
-  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
-  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
-
-  A = ConcatUpperLower(d, v32, v10);
-  B = ConcatLowerUpper(d, v54, v10);
-  C = ConcatUpperLower(d, v54, v32);
-}
-
-// Input (128-bit blocks):
-// 1 0 (first block of unaligned)
-// 3 2
-// 5 4
-// 7 6
-// Output:
-// 4 0 (LSB of A)
-// 5 1
-// 6 2
-// 7 3
-template <typename T>
-HWY_API void LoadTransposedBlocks4(Full256<T> d,
-                                   const T* HWY_RESTRICT unaligned,
-                                   Vec256<T>& A, Vec256<T>& B, Vec256<T>& C,
-                                   Vec256<T>& D) {
-  constexpr size_t N = 32 / sizeof(T);
-  const Vec256<T> v10 = LoadU(d, unaligned + 0 * N);
-  const Vec256<T> v32 = LoadU(d, unaligned + 1 * N);
-  const Vec256<T> v54 = LoadU(d, unaligned + 2 * N);
-  const Vec256<T> v76 = LoadU(d, unaligned + 3 * N);
-
-  A = ConcatLowerLower(d, v54, v10);
-  B = ConcatUpperUpper(d, v54, v10);
-  C = ConcatLowerLower(d, v76, v32);
-  D = ConcatUpperUpper(d, v76, v32);
-}
-
-}  // namespace detail
-
-// ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower)
-
-// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
-
-namespace detail {
-
-// Input (128-bit blocks):
-// 2 0 (LSB of i)
-// 3 1
-// Output:
-// 1 0
-// 3 2
-template <typename T>
-HWY_API void StoreTransposedBlocks2(const Vec256<T> i, const Vec256<T> j,
-                                    const Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 32 / sizeof(T);
-  const auto out0 = ConcatLowerLower(d, j, i);
-  const auto out1 = ConcatUpperUpper(d, j, i);
-  StoreU(out0, d, unaligned + 0 * N);
-  StoreU(out1, d, unaligned + 1 * N);
-}
-
-// Input (128-bit blocks):
-// 3 0 (LSB of i)
-// 4 1
-// 5 2
-// Output:
-// 1 0
-// 3 2
-// 5 4
-template <typename T>
-HWY_API void StoreTransposedBlocks3(const Vec256<T> i, const Vec256<T> j,
-                                    const Vec256<T> k, Full256<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 32 / sizeof(T);
-  const auto out0 = ConcatLowerLower(d, j, i);
-  const auto out1 = ConcatUpperLower(d, i, k);
-  const auto out2 = ConcatUpperUpper(d, k, j);
-  StoreU(out0, d, unaligned + 0 * N);
-  StoreU(out1, d, unaligned + 1 * N);
-  StoreU(out2, d, unaligned + 2 * N);
-}
-
-// Input (128-bit blocks):
-// 4 0 (LSB of i)
-// 5 1
-// 6 2
-// 7 3
-// Output:
-// 1 0
-// 3 2
-// 5 4
-// 7 6
-template <typename T>
-HWY_API void StoreTransposedBlocks4(const Vec256<T> i, const Vec256<T> j,
-                                    const Vec256<T> k, const Vec256<T> l,
-                                    Full256<T> d, T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 32 / sizeof(T);
-  // Write lower halves, then upper.
-  const auto out0 = ConcatLowerLower(d, j, i);
-  const auto out1 = ConcatLowerLower(d, l, k);
-  StoreU(out0, d, unaligned + 0 * N);
-  StoreU(out1, d, unaligned + 1 * N);
-  const auto out2 = ConcatUpperUpper(d, j, i);
-  const auto out3 = ConcatUpperUpper(d, l, k);
-  StoreU(out2, d, unaligned + 2 * N);
-  StoreU(out3, d, unaligned + 3 * N);
-}
-
-}  // namespace detail
-
-// ------------------------------ Reductions
-
-namespace detail {
-
-// Returns sum{lane[i]} in each lane. "v3210" is a replicated 128-bit block.
-// Same logic as x86/128.h, but with Vec256 arguments.
-template <typename T>
-HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec256<T> v3210) {
-  const auto v1032 = Shuffle1032(v3210);
-  const auto v31_20_31_20 = v3210 + v1032;
-  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return v20_31_20_31 + v31_20_31_20;
-}
-template <typename T>
-HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec256<T> v3210) {
-  const auto v1032 = Shuffle1032(v3210);
-  const auto v31_20_31_20 = Min(v3210, v1032);
-  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Min(v20_31_20_31, v31_20_31_20);
-}
-template <typename T>
-HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<4> /* tag */,
-                                const Vec256<T> v3210) {
-  const auto v1032 = Shuffle1032(v3210);
-  const auto v31_20_31_20 = Max(v3210, v1032);
-  const auto v20_31_20_31 = Shuffle0321(v31_20_31_20);
-  return Max(v20_31_20_31, v31_20_31_20);
-}
-
-template <typename T>
-HWY_INLINE Vec256<T> SumOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec256<T> v10) {
-  const auto v01 = Shuffle01(v10);
-  return v10 + v01;
-}
-template <typename T>
-HWY_INLINE Vec256<T> MinOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec256<T> v10) {
-  const auto v01 = Shuffle01(v10);
-  return Min(v10, v01);
-}
-template <typename T>
-HWY_INLINE Vec256<T> MaxOfLanes(hwy::SizeTag<8> /* tag */,
-                                const Vec256<T> v10) {
-  const auto v01 = Shuffle01(v10);
-  return Max(v10, v01);
-}
-
-HWY_API Vec256<uint16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                    Vec256<uint16_t> v) {
-  const Full256<uint16_t> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-HWY_API Vec256<int16_t> MinOfLanes(hwy::SizeTag<2> /* tag */,
-                                   Vec256<int16_t> v) {
-  const Full256<int16_t> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(hwy::SizeTag<4>(), Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-HWY_API Vec256<uint16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                    Vec256<uint16_t> v) {
-  const Full256<uint16_t> d;
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-HWY_API Vec256<int16_t> MaxOfLanes(hwy::SizeTag<2> /* tag */,
-                                   Vec256<int16_t> v) {
-  const Full256<int16_t> d;
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(hwy::SizeTag<4>(), Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-}  // namespace detail
-
-// Supported for {uif}32x8, {uif}64x4. Returns the sum in each lane.
-template <typename T>
-HWY_API Vec256<T> SumOfLanes(Full256<T> d, const Vec256<T> vHL) {
-  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
-  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), vLH + vHL);
-}
-template <typename T>
-HWY_API Vec256<T> MinOfLanes(Full256<T> d, const Vec256<T> vHL) {
-  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
-  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), Min(vLH, vHL));
-}
-template <typename T>
-HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) {
-  const Vec256<T> vLH = ConcatLowerUpper(d, vHL, vHL);
-  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL));
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
-// the warning seems to be issued at the call site of intrinsics, i.e. our code.
-HWY_DIAGNOSTICS(pop)
diff --git a/third_party/highway/hwy/ops/x86_512-inl.h b/third_party/highway/hwy/ops/x86_512-inl.h
deleted file mode 100644 (file)
index f2500e3..0000000
+++ /dev/null
@@ -1,4330 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// 512-bit AVX512 vectors and operations.
-// External include guard in highway.h - see comment there.
-
-// WARNING: most operations do not cross 128-bit block boundaries. In
-// particular, "Broadcast", pack and zip behavior may be surprising.
-
-// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
-#include "hwy/base.h"
-
-// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
-// https://github.com/google/highway/issues/710)
-HWY_DIAGNOSTICS(push)
-#if HWY_COMPILER_GCC_ACTUAL
-HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
-HWY_DIAGNOSTICS_OFF(disable : 4703 6001 26494, ignored "-Wmaybe-uninitialized")
-#endif
-
-#include <immintrin.h>  // AVX2+
-
-#if HWY_COMPILER_CLANGCL
-// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
-// including these headers when _MSC_VER is defined, like when using clang-cl.
-// Include these directly here.
-// clang-format off
-#include <smmintrin.h>
-
-#include <avxintrin.h>
-#include <avx2intrin.h>
-#include <f16cintrin.h>
-#include <fmaintrin.h>
-
-#include <avx512fintrin.h>
-#include <avx512vlintrin.h>
-#include <avx512bwintrin.h>
-#include <avx512dqintrin.h>
-#include <avx512vlbwintrin.h>
-#include <avx512vldqintrin.h>
-#include <avx512bitalgintrin.h>
-#include <avx512vlbitalgintrin.h>
-#include <avx512vpopcntdqintrin.h>
-#include <avx512vpopcntdqvlintrin.h>
-// clang-format on
-#endif  // HWY_COMPILER_CLANGCL
-
-#include <stddef.h>
-#include <stdint.h>
-
-#if HWY_IS_MSAN
-#include <sanitizer/msan_interface.h>
-#endif
-
-// For half-width vectors. Already includes base.h and shared-inl.h.
-#include "hwy/ops/x86_256-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-namespace detail {
-
-template <typename T>
-struct Raw512 {
-  using type = __m512i;
-};
-template <>
-struct Raw512<float> {
-  using type = __m512;
-};
-template <>
-struct Raw512<double> {
-  using type = __m512d;
-};
-
-// Template arg: sizeof(lane type)
-template <size_t size>
-struct RawMask512 {};
-template <>
-struct RawMask512<1> {
-  using type = __mmask64;
-};
-template <>
-struct RawMask512<2> {
-  using type = __mmask32;
-};
-template <>
-struct RawMask512<4> {
-  using type = __mmask16;
-};
-template <>
-struct RawMask512<8> {
-  using type = __mmask8;
-};
-
-}  // namespace detail
-
-template <typename T>
-class Vec512 {
-  using Raw = typename detail::Raw512<T>::type;
-
- public:
-  // Compound assignment. Only usable if there is a corresponding non-member
-  // binary operator overload. For example, only f32 and f64 support division.
-  HWY_INLINE Vec512& operator*=(const Vec512 other) {
-    return *this = (*this * other);
-  }
-  HWY_INLINE Vec512& operator/=(const Vec512 other) {
-    return *this = (*this / other);
-  }
-  HWY_INLINE Vec512& operator+=(const Vec512 other) {
-    return *this = (*this + other);
-  }
-  HWY_INLINE Vec512& operator-=(const Vec512 other) {
-    return *this = (*this - other);
-  }
-  HWY_INLINE Vec512& operator&=(const Vec512 other) {
-    return *this = (*this & other);
-  }
-  HWY_INLINE Vec512& operator|=(const Vec512 other) {
-    return *this = (*this | other);
-  }
-  HWY_INLINE Vec512& operator^=(const Vec512 other) {
-    return *this = (*this ^ other);
-  }
-
-  Raw raw;
-};
-
-// Mask register: one bit per lane.
-template <typename T>
-struct Mask512 {
-  typename detail::RawMask512<sizeof(T)>::type raw;
-};
-
-// ------------------------------ BitCast
-
-namespace detail {
-
-HWY_INLINE __m512i BitCastToInteger(__m512i v) { return v; }
-HWY_INLINE __m512i BitCastToInteger(__m512 v) { return _mm512_castps_si512(v); }
-HWY_INLINE __m512i BitCastToInteger(__m512d v) {
-  return _mm512_castpd_si512(v);
-}
-
-template <typename T>
-HWY_INLINE Vec512<uint8_t> BitCastToByte(Vec512<T> v) {
-  return Vec512<uint8_t>{BitCastToInteger(v.raw)};
-}
-
-// Cannot rely on function overloading because return types differ.
-template <typename T>
-struct BitCastFromInteger512 {
-  HWY_INLINE __m512i operator()(__m512i v) { return v; }
-};
-template <>
-struct BitCastFromInteger512<float> {
-  HWY_INLINE __m512 operator()(__m512i v) { return _mm512_castsi512_ps(v); }
-};
-template <>
-struct BitCastFromInteger512<double> {
-  HWY_INLINE __m512d operator()(__m512i v) { return _mm512_castsi512_pd(v); }
-};
-
-template <typename T>
-HWY_INLINE Vec512<T> BitCastFromByte(Full512<T> /* tag */, Vec512<uint8_t> v) {
-  return Vec512<T>{BitCastFromInteger512<T>()(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, typename FromT>
-HWY_API Vec512<T> BitCast(Full512<T> d, Vec512<FromT> v) {
-  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
-}
-
-// ------------------------------ Set
-
-// Returns an all-zero vector.
-template <typename T>
-HWY_API Vec512<T> Zero(Full512<T> /* tag */) {
-  return Vec512<T>{_mm512_setzero_si512()};
-}
-HWY_API Vec512<float> Zero(Full512<float> /* tag */) {
-  return Vec512<float>{_mm512_setzero_ps()};
-}
-HWY_API Vec512<double> Zero(Full512<double> /* tag */) {
-  return Vec512<double>{_mm512_setzero_pd()};
-}
-
-// Returns a vector with all lanes set to "t".
-HWY_API Vec512<uint8_t> Set(Full512<uint8_t> /* tag */, const uint8_t t) {
-  return Vec512<uint8_t>{_mm512_set1_epi8(static_cast<char>(t))};  // NOLINT
-}
-HWY_API Vec512<uint16_t> Set(Full512<uint16_t> /* tag */, const uint16_t t) {
-  return Vec512<uint16_t>{_mm512_set1_epi16(static_cast<short>(t))};  // NOLINT
-}
-HWY_API Vec512<uint32_t> Set(Full512<uint32_t> /* tag */, const uint32_t t) {
-  return Vec512<uint32_t>{_mm512_set1_epi32(static_cast<int>(t))};
-}
-HWY_API Vec512<uint64_t> Set(Full512<uint64_t> /* tag */, const uint64_t t) {
-  return Vec512<uint64_t>{
-      _mm512_set1_epi64(static_cast<long long>(t))};  // NOLINT
-}
-HWY_API Vec512<int8_t> Set(Full512<int8_t> /* tag */, const int8_t t) {
-  return Vec512<int8_t>{_mm512_set1_epi8(static_cast<char>(t))};  // NOLINT
-}
-HWY_API Vec512<int16_t> Set(Full512<int16_t> /* tag */, const int16_t t) {
-  return Vec512<int16_t>{_mm512_set1_epi16(static_cast<short>(t))};  // NOLINT
-}
-HWY_API Vec512<int32_t> Set(Full512<int32_t> /* tag */, const int32_t t) {
-  return Vec512<int32_t>{_mm512_set1_epi32(t)};
-}
-HWY_API Vec512<int64_t> Set(Full512<int64_t> /* tag */, const int64_t t) {
-  return Vec512<int64_t>{
-      _mm512_set1_epi64(static_cast<long long>(t))};  // NOLINT
-}
-HWY_API Vec512<float> Set(Full512<float> /* tag */, const float t) {
-  return Vec512<float>{_mm512_set1_ps(t)};
-}
-HWY_API Vec512<double> Set(Full512<double> /* tag */, const double t) {
-  return Vec512<double>{_mm512_set1_pd(t)};
-}
-
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
-
-// Returns a vector with uninitialized elements.
-template <typename T>
-HWY_API Vec512<T> Undefined(Full512<T> /* tag */) {
-  // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC
-  // generate an XOR instruction.
-  return Vec512<T>{_mm512_undefined_epi32()};
-}
-HWY_API Vec512<float> Undefined(Full512<float> /* tag */) {
-  return Vec512<float>{_mm512_undefined_ps()};
-}
-HWY_API Vec512<double> Undefined(Full512<double> /* tag */) {
-  return Vec512<double>{_mm512_undefined_pd()};
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// ================================================== LOGICAL
-
-// ------------------------------ Not
-
-template <typename T>
-HWY_API Vec512<T> Not(const Vec512<T> v) {
-  using TU = MakeUnsigned<T>;
-  const __m512i vu = BitCast(Full512<TU>(), v).raw;
-  return BitCast(Full512<T>(),
-                 Vec512<TU>{_mm512_ternarylogic_epi32(vu, vu, vu, 0x55)});
-}
-
-// ------------------------------ And
-
-template <typename T>
-HWY_API Vec512<T> And(const Vec512<T> a, const Vec512<T> b) {
-  return Vec512<T>{_mm512_and_si512(a.raw, b.raw)};
-}
-
-HWY_API Vec512<float> And(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_and_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> And(const Vec512<double> a, const Vec512<double> b) {
-  return Vec512<double>{_mm512_and_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ AndNot
-
-// Returns ~not_mask & mask.
-template <typename T>
-HWY_API Vec512<T> AndNot(const Vec512<T> not_mask, const Vec512<T> mask) {
-  return Vec512<T>{_mm512_andnot_si512(not_mask.raw, mask.raw)};
-}
-HWY_API Vec512<float> AndNot(const Vec512<float> not_mask,
-                             const Vec512<float> mask) {
-  return Vec512<float>{_mm512_andnot_ps(not_mask.raw, mask.raw)};
-}
-HWY_API Vec512<double> AndNot(const Vec512<double> not_mask,
-                              const Vec512<double> mask) {
-  return Vec512<double>{_mm512_andnot_pd(not_mask.raw, mask.raw)};
-}
-
-// ------------------------------ Or
-
-template <typename T>
-HWY_API Vec512<T> Or(const Vec512<T> a, const Vec512<T> b) {
-  return Vec512<T>{_mm512_or_si512(a.raw, b.raw)};
-}
-
-HWY_API Vec512<float> Or(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_or_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> Or(const Vec512<double> a, const Vec512<double> b) {
-  return Vec512<double>{_mm512_or_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Xor
-
-template <typename T>
-HWY_API Vec512<T> Xor(const Vec512<T> a, const Vec512<T> b) {
-  return Vec512<T>{_mm512_xor_si512(a.raw, b.raw)};
-}
-
-HWY_API Vec512<float> Xor(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_xor_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) {
-  return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Or3
-
-template <typename T>
-HWY_API Vec512<T> Or3(Vec512<T> o1, Vec512<T> o2, Vec512<T> o3) {
-  const Full512<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  const __m512i ret = _mm512_ternarylogic_epi64(
-      BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE);
-  return BitCast(d, VU{ret});
-}
-
-// ------------------------------ OrAnd
-
-template <typename T>
-HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) {
-  const Full512<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  const __m512i ret = _mm512_ternarylogic_epi64(
-      BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8);
-  return BitCast(d, VU{ret});
-}
-
-// ------------------------------ IfVecThenElse
-
-template <typename T>
-HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) {
-  const Full512<T> d;
-  const RebindToUnsigned<decltype(d)> du;
-  using VU = VFromD<decltype(du)>;
-  return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw,
-                                                 BitCast(du, yes).raw,
-                                                 BitCast(du, no).raw, 0xCA)});
-}
-
-// ------------------------------ Operator overloads (internal-only if float)
-
-template <typename T>
-HWY_API Vec512<T> operator&(const Vec512<T> a, const Vec512<T> b) {
-  return And(a, b);
-}
-
-template <typename T>
-HWY_API Vec512<T> operator|(const Vec512<T> a, const Vec512<T> b) {
-  return Or(a, b);
-}
-
-template <typename T>
-HWY_API Vec512<T> operator^(const Vec512<T> a, const Vec512<T> b) {
-  return Xor(a, b);
-}
-
-// ------------------------------ PopulationCount
-
-// 8/16 require BITALG, 32/64 require VPOPCNTDQ.
-#if HWY_TARGET == HWY_AVX3_DL
-
-#ifdef HWY_NATIVE_POPCNT
-#undef HWY_NATIVE_POPCNT
-#else
-#define HWY_NATIVE_POPCNT
-#endif
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec512<T> v) {
-  return Vec512<T>{_mm512_popcnt_epi8(v.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec512<T> v) {
-  return Vec512<T>{_mm512_popcnt_epi16(v.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec512<T> v) {
-  return Vec512<T>{_mm512_popcnt_epi32(v.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec512<T> v) {
-  return Vec512<T>{_mm512_popcnt_epi64(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec512<T> PopulationCount(Vec512<T> v) {
-  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
-}
-
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-// ================================================== SIGN
-
-// ------------------------------ CopySign
-
-template <typename T>
-HWY_API Vec512<T> CopySign(const Vec512<T> magn, const Vec512<T> sign) {
-  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
-
-  const Full512<T> d;
-  const auto msb = SignBit(d);
-
-  const Rebind<MakeUnsigned<T>, decltype(d)> du;
-  // Truth table for msb, magn, sign | bitwise msb ? sign : mag
-  //                  0    0     0   |  0
-  //                  0    0     1   |  0
-  //                  0    1     0   |  1
-  //                  0    1     1   |  1
-  //                  1    0     0   |  0
-  //                  1    0     1   |  1
-  //                  1    1     0   |  0
-  //                  1    1     1   |  1
-  // The lane size does not matter because we are not using predication.
-  const __m512i out = _mm512_ternarylogic_epi32(
-      BitCast(du, msb).raw, BitCast(du, magn).raw, BitCast(du, sign).raw, 0xAC);
-  return BitCast(d, decltype(Zero(du)){out});
-}
-
-template <typename T>
-HWY_API Vec512<T> CopySignToAbs(const Vec512<T> abs, const Vec512<T> sign) {
-  // AVX3 can also handle abs < 0, so no extra action needed.
-  return CopySign(abs, sign);
-}
-
-// ================================================== MASK
-
-// ------------------------------ FirstN
-
-// Possibilities for constructing a bitmask of N ones:
-// - kshift* only consider the lowest byte of the shift count, so they would
-//   not correctly handle large n.
-// - Scalar shifts >= 64 are UB.
-// - BZHI has the desired semantics; we assume AVX-512 implies BMI2. However,
-//   we need 64-bit masks for sizeof(T) == 1, so special-case 32-bit builds.
-
-#if HWY_ARCH_X86_32
-namespace detail {
-
-// 32 bit mask is sufficient for lane size >= 2.
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
-HWY_INLINE Mask512<T> FirstN(size_t n) {
-  Mask512<T> m;
-  const uint32_t all = ~uint32_t{0};
-  // BZHI only looks at the lower 8 bits of n!
-  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u32(all, n));
-  return m;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_INLINE Mask512<T> FirstN(size_t n) {
-  const uint64_t bits = n < 64 ? ((1ULL << n) - 1) : ~uint64_t{0};
-  return Mask512<T>{static_cast<__mmask64>(bits)};
-}
-
-}  // namespace detail
-#endif  // HWY_ARCH_X86_32
-
-template <typename T>
-HWY_API Mask512<T> FirstN(const Full512<T> /*tag*/, size_t n) {
-#if HWY_ARCH_X86_64
-  Mask512<T> m;
-  const uint64_t all = ~uint64_t{0};
-  // BZHI only looks at the lower 8 bits of n!
-  m.raw = static_cast<decltype(m.raw)>((n > 255) ? all : _bzhi_u64(all, n));
-  return m;
-#else
-  return detail::FirstN<T>(n);
-#endif  // HWY_ARCH_X86_64
-}
-
-// ------------------------------ IfThenElse
-
-// Returns mask ? b : a.
-
-namespace detail {
-
-// Templates for signed/unsigned integer of a particular size.
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<1> /* tag */,
-                                const Mask512<T> mask, const Vec512<T> yes,
-                                const Vec512<T> no) {
-  return Vec512<T>{_mm512_mask_mov_epi8(no.raw, mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<2> /* tag */,
-                                const Mask512<T> mask, const Vec512<T> yes,
-                                const Vec512<T> no) {
-  return Vec512<T>{_mm512_mask_mov_epi16(no.raw, mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<4> /* tag */,
-                                const Mask512<T> mask, const Vec512<T> yes,
-                                const Vec512<T> no) {
-  return Vec512<T>{_mm512_mask_mov_epi32(no.raw, mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElse(hwy::SizeTag<8> /* tag */,
-                                const Mask512<T> mask, const Vec512<T> yes,
-                                const Vec512<T> no) {
-  return Vec512<T>{_mm512_mask_mov_epi64(no.raw, mask.raw, yes.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec512<T> IfThenElse(const Mask512<T> mask, const Vec512<T> yes,
-                             const Vec512<T> no) {
-  return detail::IfThenElse(hwy::SizeTag<sizeof(T)>(), mask, yes, no);
-}
-HWY_API Vec512<float> IfThenElse(const Mask512<float> mask,
-                                 const Vec512<float> yes,
-                                 const Vec512<float> no) {
-  return Vec512<float>{_mm512_mask_mov_ps(no.raw, mask.raw, yes.raw)};
-}
-HWY_API Vec512<double> IfThenElse(const Mask512<double> mask,
-                                  const Vec512<double> yes,
-                                  const Vec512<double> no) {
-  return Vec512<double>{_mm512_mask_mov_pd(no.raw, mask.raw, yes.raw)};
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<1> /* tag */,
-                                    const Mask512<T> mask,
-                                    const Vec512<T> yes) {
-  return Vec512<T>{_mm512_maskz_mov_epi8(mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<2> /* tag */,
-                                    const Mask512<T> mask,
-                                    const Vec512<T> yes) {
-  return Vec512<T>{_mm512_maskz_mov_epi16(mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<4> /* tag */,
-                                    const Mask512<T> mask,
-                                    const Vec512<T> yes) {
-  return Vec512<T>{_mm512_maskz_mov_epi32(mask.raw, yes.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenElseZero(hwy::SizeTag<8> /* tag */,
-                                    const Mask512<T> mask,
-                                    const Vec512<T> yes) {
-  return Vec512<T>{_mm512_maskz_mov_epi64(mask.raw, yes.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec512<T> IfThenElseZero(const Mask512<T> mask, const Vec512<T> yes) {
-  return detail::IfThenElseZero(hwy::SizeTag<sizeof(T)>(), mask, yes);
-}
-HWY_API Vec512<float> IfThenElseZero(const Mask512<float> mask,
-                                     const Vec512<float> yes) {
-  return Vec512<float>{_mm512_maskz_mov_ps(mask.raw, yes.raw)};
-}
-HWY_API Vec512<double> IfThenElseZero(const Mask512<double> mask,
-                                      const Vec512<double> yes) {
-  return Vec512<double>{_mm512_maskz_mov_pd(mask.raw, yes.raw)};
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<1> /* tag */,
-                                    const Mask512<T> mask, const Vec512<T> no) {
-  // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16.
-  return Vec512<T>{_mm512_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<2> /* tag */,
-                                    const Mask512<T> mask, const Vec512<T> no) {
-  return Vec512<T>{_mm512_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<4> /* tag */,
-                                    const Mask512<T> mask, const Vec512<T> no) {
-  return Vec512<T>{_mm512_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> IfThenZeroElse(hwy::SizeTag<8> /* tag */,
-                                    const Mask512<T> mask, const Vec512<T> no) {
-  return Vec512<T>{_mm512_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Vec512<T> IfThenZeroElse(const Mask512<T> mask, const Vec512<T> no) {
-  return detail::IfThenZeroElse(hwy::SizeTag<sizeof(T)>(), mask, no);
-}
-HWY_API Vec512<float> IfThenZeroElse(const Mask512<float> mask,
-                                     const Vec512<float> no) {
-  return Vec512<float>{_mm512_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)};
-}
-HWY_API Vec512<double> IfThenZeroElse(const Mask512<double> mask,
-                                      const Vec512<double> no) {
-  return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)};
-}
-
-template <typename T>
-HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) {
-  static_assert(IsSigned<T>(), "Only works for signed/float");
-  // AVX3 MaskFromVec only looks at the MSB
-  return IfThenElse(MaskFromVec(v), yes, no);
-}
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) {
-  // AVX3 MaskFromVec only looks at the MSB
-  return IfThenZeroElse(MaskFromVec(v), v);
-}
-
-// ================================================== ARITHMETIC
-
-// ------------------------------ Addition
-
-// Unsigned
-HWY_API Vec512<uint8_t> operator+(const Vec512<uint8_t> a,
-                                  const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_add_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> operator+(const Vec512<uint16_t> a,
-                                   const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_add_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<uint32_t> operator+(const Vec512<uint32_t> a,
-                                   const Vec512<uint32_t> b) {
-  return Vec512<uint32_t>{_mm512_add_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> operator+(const Vec512<uint64_t> a,
-                                   const Vec512<uint64_t> b) {
-  return Vec512<uint64_t>{_mm512_add_epi64(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec512<int8_t> operator+(const Vec512<int8_t> a,
-                                 const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_add_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> operator+(const Vec512<int16_t> a,
-                                  const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_add_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<int32_t> operator+(const Vec512<int32_t> a,
-                                  const Vec512<int32_t> b) {
-  return Vec512<int32_t>{_mm512_add_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<int64_t> operator+(const Vec512<int64_t> a,
-                                  const Vec512<int64_t> b) {
-  return Vec512<int64_t>{_mm512_add_epi64(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec512<float> operator+(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_add_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> operator+(const Vec512<double> a,
-                                 const Vec512<double> b) {
-  return Vec512<double>{_mm512_add_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Subtraction
-
-// Unsigned
-HWY_API Vec512<uint8_t> operator-(const Vec512<uint8_t> a,
-                                  const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_sub_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> operator-(const Vec512<uint16_t> a,
-                                   const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_sub_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<uint32_t> operator-(const Vec512<uint32_t> a,
-                                   const Vec512<uint32_t> b) {
-  return Vec512<uint32_t>{_mm512_sub_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> operator-(const Vec512<uint64_t> a,
-                                   const Vec512<uint64_t> b) {
-  return Vec512<uint64_t>{_mm512_sub_epi64(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec512<int8_t> operator-(const Vec512<int8_t> a,
-                                 const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_sub_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> operator-(const Vec512<int16_t> a,
-                                  const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_sub_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<int32_t> operator-(const Vec512<int32_t> a,
-                                  const Vec512<int32_t> b) {
-  return Vec512<int32_t>{_mm512_sub_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<int64_t> operator-(const Vec512<int64_t> a,
-                                  const Vec512<int64_t> b) {
-  return Vec512<int64_t>{_mm512_sub_epi64(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec512<float> operator-(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_sub_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> operator-(const Vec512<double> a,
-                                 const Vec512<double> b) {
-  return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ SumsOf8
-HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) {
-  return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())};
-}
-
-// ------------------------------ SaturatedAdd
-
-// Returns a + b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec512<uint8_t> SaturatedAdd(const Vec512<uint8_t> a,
-                                     const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_adds_epu8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> SaturatedAdd(const Vec512<uint16_t> a,
-                                      const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_adds_epu16(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec512<int8_t> SaturatedAdd(const Vec512<int8_t> a,
-                                    const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_adds_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> SaturatedAdd(const Vec512<int16_t> a,
-                                     const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)};
-}
-
-// ------------------------------ SaturatedSub
-
-// Returns a - b clamped to the destination range.
-
-// Unsigned
-HWY_API Vec512<uint8_t> SaturatedSub(const Vec512<uint8_t> a,
-                                     const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_subs_epu8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> SaturatedSub(const Vec512<uint16_t> a,
-                                      const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_subs_epu16(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec512<int8_t> SaturatedSub(const Vec512<int8_t> a,
-                                    const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_subs_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> SaturatedSub(const Vec512<int16_t> a,
-                                     const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_subs_epi16(a.raw, b.raw)};
-}
-
-// ------------------------------ Average
-
-// Returns (a + b + 1) / 2
-
-// Unsigned
-HWY_API Vec512<uint8_t> AverageRound(const Vec512<uint8_t> a,
-                                     const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_avg_epu8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> AverageRound(const Vec512<uint16_t> a,
-                                      const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_avg_epu16(a.raw, b.raw)};
-}
-
-// ------------------------------ Abs (Sub)
-
-// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.
-HWY_API Vec512<int8_t> Abs(const Vec512<int8_t> v) {
-#if HWY_COMPILER_MSVC
-  // Workaround for incorrect codegen? (untested due to internal compiler error)
-  const auto zero = Zero(Full512<int8_t>());
-  return Vec512<int8_t>{_mm512_max_epi8(v.raw, (zero - v).raw)};
-#else
-  return Vec512<int8_t>{_mm512_abs_epi8(v.raw)};
-#endif
-}
-HWY_API Vec512<int16_t> Abs(const Vec512<int16_t> v) {
-  return Vec512<int16_t>{_mm512_abs_epi16(v.raw)};
-}
-HWY_API Vec512<int32_t> Abs(const Vec512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_abs_epi32(v.raw)};
-}
-HWY_API Vec512<int64_t> Abs(const Vec512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_abs_epi64(v.raw)};
-}
-
-// These aren't native instructions, they also involve AND with constant.
-HWY_API Vec512<float> Abs(const Vec512<float> v) {
-  return Vec512<float>{_mm512_abs_ps(v.raw)};
-}
-HWY_API Vec512<double> Abs(const Vec512<double> v) {
-  return Vec512<double>{_mm512_abs_pd(v.raw)};
-}
-// ------------------------------ ShiftLeft
-
-template <int kBits>
-HWY_API Vec512<uint16_t> ShiftLeft(const Vec512<uint16_t> v) {
-  return Vec512<uint16_t>{_mm512_slli_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<uint32_t> ShiftLeft(const Vec512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_slli_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<uint64_t> ShiftLeft(const Vec512<uint64_t> v) {
-  return Vec512<uint64_t>{_mm512_slli_epi64(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<int16_t> ShiftLeft(const Vec512<int16_t> v) {
-  return Vec512<int16_t>{_mm512_slli_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<int32_t> ShiftLeft(const Vec512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_slli_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<int64_t> ShiftLeft(const Vec512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_slli_epi64(v.raw, kBits)};
-}
-
-template <int kBits, typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec512<T> ShiftLeft(const Vec512<T> v) {
-  const Full512<T> d8;
-  const RepartitionToWide<decltype(d8)> d16;
-  const auto shifted = BitCast(d8, ShiftLeft<kBits>(BitCast(d16, v)));
-  return kBits == 1
-             ? (v + v)
-             : (shifted & Set(d8, static_cast<T>((0xFF << kBits) & 0xFF)));
-}
-
-// ------------------------------ ShiftRight
-
-template <int kBits>
-HWY_API Vec512<uint16_t> ShiftRight(const Vec512<uint16_t> v) {
-  return Vec512<uint16_t>{_mm512_srli_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<uint32_t> ShiftRight(const Vec512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_srli_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<uint64_t> ShiftRight(const Vec512<uint64_t> v) {
-  return Vec512<uint64_t>{_mm512_srli_epi64(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<uint8_t> ShiftRight(const Vec512<uint8_t> v) {
-  const Full512<uint8_t> d8;
-  // Use raw instead of BitCast to support N=1.
-  const Vec512<uint8_t> shifted{ShiftRight<kBits>(Vec512<uint16_t>{v.raw}).raw};
-  return shifted & Set(d8, 0xFF >> kBits);
-}
-
-template <int kBits>
-HWY_API Vec512<int16_t> ShiftRight(const Vec512<int16_t> v) {
-  return Vec512<int16_t>{_mm512_srai_epi16(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<int32_t> ShiftRight(const Vec512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_srai_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<int64_t> ShiftRight(const Vec512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<int8_t> ShiftRight(const Vec512<int8_t> v) {
-  const Full512<int8_t> di;
-  const Full512<uint8_t> du;
-  const auto shifted = BitCast(di, ShiftRight<kBits>(BitCast(du, v)));
-  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ------------------------------ RotateRight
-
-template <int kBits>
-HWY_API Vec512<uint32_t> RotateRight(const Vec512<uint32_t> v) {
-  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
-  return Vec512<uint32_t>{_mm512_ror_epi32(v.raw, kBits)};
-}
-
-template <int kBits>
-HWY_API Vec512<uint64_t> RotateRight(const Vec512<uint64_t> v) {
-  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
-  return Vec512<uint64_t>{_mm512_ror_epi64(v.raw, kBits)};
-}
-
-// ------------------------------ ShiftLeftSame
-
-HWY_API Vec512<uint16_t> ShiftLeftSame(const Vec512<uint16_t> v,
-                                       const int bits) {
-  return Vec512<uint16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec512<uint32_t> ShiftLeftSame(const Vec512<uint32_t> v,
-                                       const int bits) {
-  return Vec512<uint32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec512<uint64_t> ShiftLeftSame(const Vec512<uint64_t> v,
-                                       const int bits) {
-  return Vec512<uint64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec512<int16_t> ShiftLeftSame(const Vec512<int16_t> v, const int bits) {
-  return Vec512<int16_t>{_mm512_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec512<int32_t> ShiftLeftSame(const Vec512<int32_t> v, const int bits) {
-  return Vec512<int32_t>{_mm512_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec512<int64_t> ShiftLeftSame(const Vec512<int64_t> v, const int bits) {
-  return Vec512<int64_t>{_mm512_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec512<T> ShiftLeftSame(const Vec512<T> v, const int bits) {
-  const Full512<T> d8;
-  const RepartitionToWide<decltype(d8)> d16;
-  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
-  return shifted & Set(d8, static_cast<T>((0xFF << bits) & 0xFF));
-}
-
-// ------------------------------ ShiftRightSame
-
-HWY_API Vec512<uint16_t> ShiftRightSame(const Vec512<uint16_t> v,
-                                        const int bits) {
-  return Vec512<uint16_t>{_mm512_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec512<uint32_t> ShiftRightSame(const Vec512<uint32_t> v,
-                                        const int bits) {
-  return Vec512<uint32_t>{_mm512_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec512<uint64_t> ShiftRightSame(const Vec512<uint64_t> v,
-                                        const int bits) {
-  return Vec512<uint64_t>{_mm512_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec512<uint8_t> ShiftRightSame(Vec512<uint8_t> v, const int bits) {
-  const Full512<uint8_t> d8;
-  const RepartitionToWide<decltype(d8)> d16;
-  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
-  return shifted & Set(d8, static_cast<uint8_t>(0xFF >> bits));
-}
-
-HWY_API Vec512<int16_t> ShiftRightSame(const Vec512<int16_t> v,
-                                       const int bits) {
-  return Vec512<int16_t>{_mm512_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec512<int32_t> ShiftRightSame(const Vec512<int32_t> v,
-                                       const int bits) {
-  return Vec512<int32_t>{_mm512_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))};
-}
-HWY_API Vec512<int64_t> ShiftRightSame(const Vec512<int64_t> v,
-                                       const int bits) {
-  return Vec512<int64_t>{_mm512_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))};
-}
-
-HWY_API Vec512<int8_t> ShiftRightSame(Vec512<int8_t> v, const int bits) {
-  const Full512<int8_t> di;
-  const Full512<uint8_t> du;
-  const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits));
-  const auto shifted_sign =
-      BitCast(di, Set(du, static_cast<uint8_t>(0x80 >> bits)));
-  return (shifted ^ shifted_sign) - shifted_sign;
-}
-
-// ------------------------------ Shl
-
-HWY_API Vec512<uint16_t> operator<<(const Vec512<uint16_t> v,
-                                    const Vec512<uint16_t> bits) {
-  return Vec512<uint16_t>{_mm512_sllv_epi16(v.raw, bits.raw)};
-}
-
-HWY_API Vec512<uint32_t> operator<<(const Vec512<uint32_t> v,
-                                    const Vec512<uint32_t> bits) {
-  return Vec512<uint32_t>{_mm512_sllv_epi32(v.raw, bits.raw)};
-}
-
-HWY_API Vec512<uint64_t> operator<<(const Vec512<uint64_t> v,
-                                    const Vec512<uint64_t> bits) {
-  return Vec512<uint64_t>{_mm512_sllv_epi64(v.raw, bits.raw)};
-}
-
-// Signed left shift is the same as unsigned.
-template <typename T, HWY_IF_SIGNED(T)>
-HWY_API Vec512<T> operator<<(const Vec512<T> v, const Vec512<T> bits) {
-  const Full512<T> di;
-  const Full512<MakeUnsigned<T>> du;
-  return BitCast(di, BitCast(du, v) << BitCast(du, bits));
-}
-
-// ------------------------------ Shr
-
-HWY_API Vec512<uint16_t> operator>>(const Vec512<uint16_t> v,
-                                    const Vec512<uint16_t> bits) {
-  return Vec512<uint16_t>{_mm512_srlv_epi16(v.raw, bits.raw)};
-}
-
-HWY_API Vec512<uint32_t> operator>>(const Vec512<uint32_t> v,
-                                    const Vec512<uint32_t> bits) {
-  return Vec512<uint32_t>{_mm512_srlv_epi32(v.raw, bits.raw)};
-}
-
-HWY_API Vec512<uint64_t> operator>>(const Vec512<uint64_t> v,
-                                    const Vec512<uint64_t> bits) {
-  return Vec512<uint64_t>{_mm512_srlv_epi64(v.raw, bits.raw)};
-}
-
-HWY_API Vec512<int16_t> operator>>(const Vec512<int16_t> v,
-                                   const Vec512<int16_t> bits) {
-  return Vec512<int16_t>{_mm512_srav_epi16(v.raw, bits.raw)};
-}
-
-HWY_API Vec512<int32_t> operator>>(const Vec512<int32_t> v,
-                                   const Vec512<int32_t> bits) {
-  return Vec512<int32_t>{_mm512_srav_epi32(v.raw, bits.raw)};
-}
-
-HWY_API Vec512<int64_t> operator>>(const Vec512<int64_t> v,
-                                   const Vec512<int64_t> bits) {
-  return Vec512<int64_t>{_mm512_srav_epi64(v.raw, bits.raw)};
-}
-
-// ------------------------------ Minimum
-
-// Unsigned
-HWY_API Vec512<uint8_t> Min(const Vec512<uint8_t> a, const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_min_epu8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> Min(const Vec512<uint16_t> a,
-                             const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_min_epu16(a.raw, b.raw)};
-}
-HWY_API Vec512<uint32_t> Min(const Vec512<uint32_t> a,
-                             const Vec512<uint32_t> b) {
-  return Vec512<uint32_t>{_mm512_min_epu32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> Min(const Vec512<uint64_t> a,
-                             const Vec512<uint64_t> b) {
-  return Vec512<uint64_t>{_mm512_min_epu64(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec512<int8_t> Min(const Vec512<int8_t> a, const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_min_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> Min(const Vec512<int16_t> a, const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_min_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<int32_t> Min(const Vec512<int32_t> a, const Vec512<int32_t> b) {
-  return Vec512<int32_t>{_mm512_min_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<int64_t> Min(const Vec512<int64_t> a, const Vec512<int64_t> b) {
-  return Vec512<int64_t>{_mm512_min_epi64(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec512<float> Min(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_min_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> Min(const Vec512<double> a, const Vec512<double> b) {
-  return Vec512<double>{_mm512_min_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Maximum
-
-// Unsigned
-HWY_API Vec512<uint8_t> Max(const Vec512<uint8_t> a, const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_max_epu8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> Max(const Vec512<uint16_t> a,
-                             const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_max_epu16(a.raw, b.raw)};
-}
-HWY_API Vec512<uint32_t> Max(const Vec512<uint32_t> a,
-                             const Vec512<uint32_t> b) {
-  return Vec512<uint32_t>{_mm512_max_epu32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> Max(const Vec512<uint64_t> a,
-                             const Vec512<uint64_t> b) {
-  return Vec512<uint64_t>{_mm512_max_epu64(a.raw, b.raw)};
-}
-
-// Signed
-HWY_API Vec512<int8_t> Max(const Vec512<int8_t> a, const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_max_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> Max(const Vec512<int16_t> a, const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_max_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<int32_t> Max(const Vec512<int32_t> a, const Vec512<int32_t> b) {
-  return Vec512<int32_t>{_mm512_max_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<int64_t> Max(const Vec512<int64_t> a, const Vec512<int64_t> b) {
-  return Vec512<int64_t>{_mm512_max_epi64(a.raw, b.raw)};
-}
-
-// Float
-HWY_API Vec512<float> Max(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_max_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> Max(const Vec512<double> a, const Vec512<double> b) {
-  return Vec512<double>{_mm512_max_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ Integer multiplication
-
-// Unsigned
-HWY_API Vec512<uint16_t> operator*(Vec512<uint16_t> a, Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<uint32_t> operator*(Vec512<uint32_t> a, Vec512<uint32_t> b) {
-  return Vec512<uint32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> operator*(Vec512<uint64_t> a, Vec512<uint64_t> b) {
-  return Vec512<uint64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
-}
-HWY_API Vec256<uint64_t> operator*(Vec256<uint64_t> a, Vec256<uint64_t> b) {
-  return Vec256<uint64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
-}
-HWY_API Vec128<uint64_t> operator*(Vec128<uint64_t> a, Vec128<uint64_t> b) {
-  return Vec128<uint64_t>{_mm_mullo_epi64(a.raw, b.raw)};
-}
-
-// Per-target flag to prevent generic_ops-inl.h from defining i64 operator*.
-#ifdef HWY_NATIVE_I64MULLO
-#undef HWY_NATIVE_I64MULLO
-#else
-#define HWY_NATIVE_I64MULLO
-#endif
-
-// Signed
-HWY_API Vec512<int16_t> operator*(Vec512<int16_t> a, Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_mullo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<int32_t> operator*(Vec512<int32_t> a, Vec512<int32_t> b) {
-  return Vec512<int32_t>{_mm512_mullo_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<int64_t> operator*(Vec512<int64_t> a, Vec512<int64_t> b) {
-  return Vec512<int64_t>{_mm512_mullo_epi64(a.raw, b.raw)};
-}
-HWY_API Vec256<int64_t> operator*(Vec256<int64_t> a, Vec256<int64_t> b) {
-  return Vec256<int64_t>{_mm256_mullo_epi64(a.raw, b.raw)};
-}
-HWY_API Vec128<int64_t> operator*(Vec128<int64_t> a, Vec128<int64_t> b) {
-  return Vec128<int64_t>{_mm_mullo_epi64(a.raw, b.raw)};
-}
-// Returns the upper 16 bits of a * b in each lane.
-HWY_API Vec512<uint16_t> MulHigh(Vec512<uint16_t> a, Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_mulhi_epu16(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> MulHigh(Vec512<int16_t> a, Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_mulhi_epi16(a.raw, b.raw)};
-}
-
-HWY_API Vec512<int16_t> MulFixedPoint15(Vec512<int16_t> a, Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_mulhrs_epi16(a.raw, b.raw)};
-}
-
-// Multiplies even lanes (0, 2 ..) and places the double-wide result into
-// even and the upper half into its odd neighbor lane.
-HWY_API Vec512<int64_t> MulEven(Vec512<int32_t> a, Vec512<int32_t> b) {
-  return Vec512<int64_t>{_mm512_mul_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> MulEven(Vec512<uint32_t> a, Vec512<uint32_t> b) {
-  return Vec512<uint64_t>{_mm512_mul_epu32(a.raw, b.raw)};
-}
-
-// ------------------------------ Neg (Sub)
-
-template <typename T, HWY_IF_FLOAT(T)>
-HWY_API Vec512<T> Neg(const Vec512<T> v) {
-  return Xor(v, SignBit(Full512<T>()));
-}
-
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-HWY_API Vec512<T> Neg(const Vec512<T> v) {
-  return Zero(Full512<T>()) - v;
-}
-
-// ------------------------------ Floating-point mul / div
-
-HWY_API Vec512<float> operator*(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_mul_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> operator*(const Vec512<double> a,
-                                 const Vec512<double> b) {
-  return Vec512<double>{_mm512_mul_pd(a.raw, b.raw)};
-}
-
-HWY_API Vec512<float> operator/(const Vec512<float> a, const Vec512<float> b) {
-  return Vec512<float>{_mm512_div_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> operator/(const Vec512<double> a,
-                                 const Vec512<double> b) {
-  return Vec512<double>{_mm512_div_pd(a.raw, b.raw)};
-}
-
-// Approximate reciprocal
-HWY_API Vec512<float> ApproximateReciprocal(const Vec512<float> v) {
-  return Vec512<float>{_mm512_rcp14_ps(v.raw)};
-}
-
-// Absolute value of difference.
-HWY_API Vec512<float> AbsDiff(const Vec512<float> a, const Vec512<float> b) {
-  return Abs(a - b);
-}
-
-// ------------------------------ Floating-point multiply-add variants
-
-// Returns mul * x + add
-HWY_API Vec512<float> MulAdd(const Vec512<float> mul, const Vec512<float> x,
-                             const Vec512<float> add) {
-  return Vec512<float>{_mm512_fmadd_ps(mul.raw, x.raw, add.raw)};
-}
-HWY_API Vec512<double> MulAdd(const Vec512<double> mul, const Vec512<double> x,
-                              const Vec512<double> add) {
-  return Vec512<double>{_mm512_fmadd_pd(mul.raw, x.raw, add.raw)};
-}
-
-// Returns add - mul * x
-HWY_API Vec512<float> NegMulAdd(const Vec512<float> mul, const Vec512<float> x,
-                                const Vec512<float> add) {
-  return Vec512<float>{_mm512_fnmadd_ps(mul.raw, x.raw, add.raw)};
-}
-HWY_API Vec512<double> NegMulAdd(const Vec512<double> mul,
-                                 const Vec512<double> x,
-                                 const Vec512<double> add) {
-  return Vec512<double>{_mm512_fnmadd_pd(mul.raw, x.raw, add.raw)};
-}
-
-// Returns mul * x - sub
-HWY_API Vec512<float> MulSub(const Vec512<float> mul, const Vec512<float> x,
-                             const Vec512<float> sub) {
-  return Vec512<float>{_mm512_fmsub_ps(mul.raw, x.raw, sub.raw)};
-}
-HWY_API Vec512<double> MulSub(const Vec512<double> mul, const Vec512<double> x,
-                              const Vec512<double> sub) {
-  return Vec512<double>{_mm512_fmsub_pd(mul.raw, x.raw, sub.raw)};
-}
-
-// Returns -mul * x - sub
-HWY_API Vec512<float> NegMulSub(const Vec512<float> mul, const Vec512<float> x,
-                                const Vec512<float> sub) {
-  return Vec512<float>{_mm512_fnmsub_ps(mul.raw, x.raw, sub.raw)};
-}
-HWY_API Vec512<double> NegMulSub(const Vec512<double> mul,
-                                 const Vec512<double> x,
-                                 const Vec512<double> sub) {
-  return Vec512<double>{_mm512_fnmsub_pd(mul.raw, x.raw, sub.raw)};
-}
-
-// ------------------------------ Floating-point square root
-
-// Full precision square root
-HWY_API Vec512<float> Sqrt(const Vec512<float> v) {
-  return Vec512<float>{_mm512_sqrt_ps(v.raw)};
-}
-HWY_API Vec512<double> Sqrt(const Vec512<double> v) {
-  return Vec512<double>{_mm512_sqrt_pd(v.raw)};
-}
-
-// Approximate reciprocal square root
-HWY_API Vec512<float> ApproximateReciprocalSqrt(const Vec512<float> v) {
-  return Vec512<float>{_mm512_rsqrt14_ps(v.raw)};
-}
-
-// ------------------------------ Floating-point rounding
-
-// Work around warnings in the intrinsic definitions (passing -1 as a mask).
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-
-// Toward nearest integer, tie to even
-HWY_API Vec512<float> Round(const Vec512<float> v) {
-  return Vec512<float>{_mm512_roundscale_ps(
-      v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec512<double> Round(const Vec512<double> v) {
-  return Vec512<double>{_mm512_roundscale_pd(
-      v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)};
-}
-
-// Toward zero, aka truncate
-HWY_API Vec512<float> Trunc(const Vec512<float> v) {
-  return Vec512<float>{
-      _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec512<double> Trunc(const Vec512<double> v) {
-  return Vec512<double>{
-      _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)};
-}
-
-// Toward +infinity, aka ceiling
-HWY_API Vec512<float> Ceil(const Vec512<float> v) {
-  return Vec512<float>{
-      _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec512<double> Ceil(const Vec512<double> v) {
-  return Vec512<double>{
-      _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)};
-}
-
-// Toward -infinity, aka floor
-HWY_API Vec512<float> Floor(const Vec512<float> v) {
-  return Vec512<float>{
-      _mm512_roundscale_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
-}
-HWY_API Vec512<double> Floor(const Vec512<double> v) {
-  return Vec512<double>{
-      _mm512_roundscale_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// ================================================== COMPARE
-
-// Comparisons set a mask bit to 1 if the condition is true, else 0.
-
-template <typename TFrom, typename TTo>
-HWY_API Mask512<TTo> RebindMask(Full512<TTo> /*tag*/, Mask512<TFrom> m) {
-  static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size");
-  return Mask512<TTo>{m.raw};
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<1> /*tag*/, const Vec512<T> v,
-                              const Vec512<T> bit) {
-  return Mask512<T>{_mm512_test_epi8_mask(v.raw, bit.raw)};
-}
-template <typename T>
-HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<2> /*tag*/, const Vec512<T> v,
-                              const Vec512<T> bit) {
-  return Mask512<T>{_mm512_test_epi16_mask(v.raw, bit.raw)};
-}
-template <typename T>
-HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<4> /*tag*/, const Vec512<T> v,
-                              const Vec512<T> bit) {
-  return Mask512<T>{_mm512_test_epi32_mask(v.raw, bit.raw)};
-}
-template <typename T>
-HWY_INLINE Mask512<T> TestBit(hwy::SizeTag<8> /*tag*/, const Vec512<T> v,
-                              const Vec512<T> bit) {
-  return Mask512<T>{_mm512_test_epi64_mask(v.raw, bit.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Mask512<T> TestBit(const Vec512<T> v, const Vec512<T> bit) {
-  static_assert(!hwy::IsFloat<T>(), "Only integer vectors supported");
-  return detail::TestBit(hwy::SizeTag<sizeof(T)>(), v, bit);
-}
-
-// ------------------------------ Equality
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpeq_epi8_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpeq_epi16_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpeq_epi32_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Mask512<T> operator==(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpeq_epi64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask512<float> operator==(Vec512<float> a, Vec512<float> b) {
-  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-HWY_API Mask512<double> operator==(Vec512<double> a, Vec512<double> b) {
-  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
-}
-
-// ------------------------------ Inequality
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpneq_epi8_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpneq_epi16_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpneq_epi32_mask(a.raw, b.raw)};
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Mask512<T> operator!=(Vec512<T> a, Vec512<T> b) {
-  return Mask512<T>{_mm512_cmpneq_epi64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask512<float> operator!=(Vec512<float> a, Vec512<float> b) {
-  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-
-HWY_API Mask512<double> operator!=(Vec512<double> a, Vec512<double> b) {
-  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
-}
-
-// ------------------------------ Strict inequality
-
-HWY_API Mask512<uint8_t> operator>(Vec512<uint8_t> a, Vec512<uint8_t> b) {
-  return Mask512<uint8_t>{_mm512_cmpgt_epu8_mask(a.raw, b.raw)};
-}
-HWY_API Mask512<uint16_t> operator>(Vec512<uint16_t> a, Vec512<uint16_t> b) {
-  return Mask512<uint16_t>{_mm512_cmpgt_epu16_mask(a.raw, b.raw)};
-}
-HWY_API Mask512<uint32_t> operator>(Vec512<uint32_t> a, Vec512<uint32_t> b) {
-  return Mask512<uint32_t>{_mm512_cmpgt_epu32_mask(a.raw, b.raw)};
-}
-HWY_API Mask512<uint64_t> operator>(Vec512<uint64_t> a, Vec512<uint64_t> b) {
-  return Mask512<uint64_t>{_mm512_cmpgt_epu64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask512<int8_t> operator>(Vec512<int8_t> a, Vec512<int8_t> b) {
-  return Mask512<int8_t>{_mm512_cmpgt_epi8_mask(a.raw, b.raw)};
-}
-HWY_API Mask512<int16_t> operator>(Vec512<int16_t> a, Vec512<int16_t> b) {
-  return Mask512<int16_t>{_mm512_cmpgt_epi16_mask(a.raw, b.raw)};
-}
-HWY_API Mask512<int32_t> operator>(Vec512<int32_t> a, Vec512<int32_t> b) {
-  return Mask512<int32_t>{_mm512_cmpgt_epi32_mask(a.raw, b.raw)};
-}
-HWY_API Mask512<int64_t> operator>(Vec512<int64_t> a, Vec512<int64_t> b) {
-  return Mask512<int64_t>{_mm512_cmpgt_epi64_mask(a.raw, b.raw)};
-}
-
-HWY_API Mask512<float> operator>(Vec512<float> a, Vec512<float> b) {
-  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
-}
-HWY_API Mask512<double> operator>(Vec512<double> a, Vec512<double> b) {
-  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
-}
-
-// ------------------------------ Weak inequality
-
-HWY_API Mask512<float> operator>=(Vec512<float> a, Vec512<float> b) {
-  return Mask512<float>{_mm512_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
-}
-HWY_API Mask512<double> operator>=(Vec512<double> a, Vec512<double> b) {
-  return Mask512<double>{_mm512_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
-}
-
-// ------------------------------ Reversed comparisons
-
-template <typename T>
-HWY_API Mask512<T> operator<(Vec512<T> a, Vec512<T> b) {
-  return b > a;
-}
-
-template <typename T>
-HWY_API Mask512<T> operator<=(Vec512<T> a, Vec512<T> b) {
-  return b >= a;
-}
-
-// ------------------------------ Mask
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec512<T> v) {
-  return Mask512<T>{_mm512_movepi8_mask(v.raw)};
-}
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec512<T> v) {
-  return Mask512<T>{_mm512_movepi16_mask(v.raw)};
-}
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec512<T> v) {
-  return Mask512<T>{_mm512_movepi32_mask(v.raw)};
-}
-template <typename T>
-HWY_INLINE Mask512<T> MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec512<T> v) {
-  return Mask512<T>{_mm512_movepi64_mask(v.raw)};
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Mask512<T> MaskFromVec(const Vec512<T> v) {
-  return detail::MaskFromVec(hwy::SizeTag<sizeof(T)>(), v);
-}
-// There do not seem to be native floating-point versions of these instructions.
-HWY_API Mask512<float> MaskFromVec(const Vec512<float> v) {
-  return Mask512<float>{MaskFromVec(BitCast(Full512<int32_t>(), v)).raw};
-}
-HWY_API Mask512<double> MaskFromVec(const Vec512<double> v) {
-  return Mask512<double>{MaskFromVec(BitCast(Full512<int64_t>(), v)).raw};
-}
-
-HWY_API Vec512<uint8_t> VecFromMask(const Mask512<uint8_t> v) {
-  return Vec512<uint8_t>{_mm512_movm_epi8(v.raw)};
-}
-HWY_API Vec512<int8_t> VecFromMask(const Mask512<int8_t> v) {
-  return Vec512<int8_t>{_mm512_movm_epi8(v.raw)};
-}
-
-HWY_API Vec512<uint16_t> VecFromMask(const Mask512<uint16_t> v) {
-  return Vec512<uint16_t>{_mm512_movm_epi16(v.raw)};
-}
-HWY_API Vec512<int16_t> VecFromMask(const Mask512<int16_t> v) {
-  return Vec512<int16_t>{_mm512_movm_epi16(v.raw)};
-}
-
-HWY_API Vec512<uint32_t> VecFromMask(const Mask512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_movm_epi32(v.raw)};
-}
-HWY_API Vec512<int32_t> VecFromMask(const Mask512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_movm_epi32(v.raw)};
-}
-HWY_API Vec512<float> VecFromMask(const Mask512<float> v) {
-  return Vec512<float>{_mm512_castsi512_ps(_mm512_movm_epi32(v.raw))};
-}
-
-HWY_API Vec512<uint64_t> VecFromMask(const Mask512<uint64_t> v) {
-  return Vec512<uint64_t>{_mm512_movm_epi64(v.raw)};
-}
-HWY_API Vec512<int64_t> VecFromMask(const Mask512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_movm_epi64(v.raw)};
-}
-HWY_API Vec512<double> VecFromMask(const Mask512<double> v) {
-  return Vec512<double>{_mm512_castsi512_pd(_mm512_movm_epi64(v.raw))};
-}
-
-template <typename T>
-HWY_API Vec512<T> VecFromMask(Full512<T> /* tag */, const Mask512<T> v) {
-  return VecFromMask(v);
-}
-
-// ------------------------------ Mask logical
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Mask512<T> Not(hwy::SizeTag<1> /*tag*/, const Mask512<T> m) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_knot_mask64(m.raw)};
-#else
-  return Mask512<T>{~m.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Not(hwy::SizeTag<2> /*tag*/, const Mask512<T> m) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_knot_mask32(m.raw)};
-#else
-  return Mask512<T>{~m.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Not(hwy::SizeTag<4> /*tag*/, const Mask512<T> m) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_knot_mask16(m.raw)};
-#else
-  return Mask512<T>{static_cast<uint16_t>(~m.raw & 0xFFFF)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Not(hwy::SizeTag<8> /*tag*/, const Mask512<T> m) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_knot_mask8(m.raw)};
-#else
-  return Mask512<T>{static_cast<uint8_t>(~m.raw & 0xFF)};
-#endif
-}
-
-template <typename T>
-HWY_INLINE Mask512<T> And(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kand_mask64(a.raw, b.raw)};
-#else
-  return Mask512<T>{a.raw & b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> And(hwy::SizeTag<2> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kand_mask32(a.raw, b.raw)};
-#else
-  return Mask512<T>{a.raw & b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> And(hwy::SizeTag<4> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kand_mask16(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint16_t>(a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> And(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kand_mask8(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint8_t>(a.raw & b.raw)};
-#endif
-}
-
-template <typename T>
-HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
-                             const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kandn_mask64(a.raw, b.raw)};
-#else
-  return Mask512<T>{~a.raw & b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<2> /*tag*/, const Mask512<T> a,
-                             const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kandn_mask32(a.raw, b.raw)};
-#else
-  return Mask512<T>{~a.raw & b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<4> /*tag*/, const Mask512<T> a,
-                             const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kandn_mask16(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint16_t>(~a.raw & b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> AndNot(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
-                             const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kandn_mask8(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint8_t>(~a.raw & b.raw)};
-#endif
-}
-
-template <typename T>
-HWY_INLINE Mask512<T> Or(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
-                         const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kor_mask64(a.raw, b.raw)};
-#else
-  return Mask512<T>{a.raw | b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Or(hwy::SizeTag<2> /*tag*/, const Mask512<T> a,
-                         const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kor_mask32(a.raw, b.raw)};
-#else
-  return Mask512<T>{a.raw | b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Or(hwy::SizeTag<4> /*tag*/, const Mask512<T> a,
-                         const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kor_mask16(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint16_t>(a.raw | b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Or(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
-                         const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kor_mask8(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint8_t>(a.raw | b.raw)};
-#endif
-}
-
-template <typename T>
-HWY_INLINE Mask512<T> Xor(hwy::SizeTag<1> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kxor_mask64(a.raw, b.raw)};
-#else
-  return Mask512<T>{a.raw ^ b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Xor(hwy::SizeTag<2> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kxor_mask32(a.raw, b.raw)};
-#else
-  return Mask512<T>{a.raw ^ b.raw};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Xor(hwy::SizeTag<4> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kxor_mask16(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint16_t>(a.raw ^ b.raw)};
-#endif
-}
-template <typename T>
-HWY_INLINE Mask512<T> Xor(hwy::SizeTag<8> /*tag*/, const Mask512<T> a,
-                          const Mask512<T> b) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return Mask512<T>{_kxor_mask8(a.raw, b.raw)};
-#else
-  return Mask512<T>{static_cast<uint8_t>(a.raw ^ b.raw)};
-#endif
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API Mask512<T> Not(const Mask512<T> m) {
-  return detail::Not(hwy::SizeTag<sizeof(T)>(), m);
-}
-
-template <typename T>
-HWY_API Mask512<T> And(const Mask512<T> a, Mask512<T> b) {
-  return detail::And(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T>
-HWY_API Mask512<T> AndNot(const Mask512<T> a, Mask512<T> b) {
-  return detail::AndNot(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T>
-HWY_API Mask512<T> Or(const Mask512<T> a, Mask512<T> b) {
-  return detail::Or(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-template <typename T>
-HWY_API Mask512<T> Xor(const Mask512<T> a, Mask512<T> b) {
-  return detail::Xor(hwy::SizeTag<sizeof(T)>(), a, b);
-}
-
-// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)
-
-HWY_API Vec512<int8_t> BroadcastSignBit(const Vec512<int8_t> v) {
-  return VecFromMask(v < Zero(Full512<int8_t>()));
-}
-
-HWY_API Vec512<int16_t> BroadcastSignBit(const Vec512<int16_t> v) {
-  return ShiftRight<15>(v);
-}
-
-HWY_API Vec512<int32_t> BroadcastSignBit(const Vec512<int32_t> v) {
-  return ShiftRight<31>(v);
-}
-
-HWY_API Vec512<int64_t> BroadcastSignBit(const Vec512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_srai_epi64(v.raw, 63)};
-}
-
-// ------------------------------ Floating-point classification (Not)
-
-HWY_API Mask512<float> IsNaN(const Vec512<float> v) {
-  return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x81)};
-}
-HWY_API Mask512<double> IsNaN(const Vec512<double> v) {
-  return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x81)};
-}
-
-HWY_API Mask512<float> IsInf(const Vec512<float> v) {
-  return Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x18)};
-}
-HWY_API Mask512<double> IsInf(const Vec512<double> v) {
-  return Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x18)};
-}
-
-// Returns whether normal/subnormal/zero. fpclass doesn't have a flag for
-// positive, so we have to check for inf/NaN and negate.
-HWY_API Mask512<float> IsFinite(const Vec512<float> v) {
-  return Not(Mask512<float>{_mm512_fpclass_ps_mask(v.raw, 0x99)});
-}
-HWY_API Mask512<double> IsFinite(const Vec512<double> v) {
-  return Not(Mask512<double>{_mm512_fpclass_pd_mask(v.raw, 0x99)});
-}
-
-// ================================================== MEMORY
-
-// ------------------------------ Load
-
-template <typename T>
-HWY_API Vec512<T> Load(Full512<T> /* tag */, const T* HWY_RESTRICT aligned) {
-  return Vec512<T>{_mm512_load_si512(aligned)};
-}
-HWY_API Vec512<float> Load(Full512<float> /* tag */,
-                           const float* HWY_RESTRICT aligned) {
-  return Vec512<float>{_mm512_load_ps(aligned)};
-}
-HWY_API Vec512<double> Load(Full512<double> /* tag */,
-                            const double* HWY_RESTRICT aligned) {
-  return Vec512<double>{_mm512_load_pd(aligned)};
-}
-
-template <typename T>
-HWY_API Vec512<T> LoadU(Full512<T> /* tag */, const T* HWY_RESTRICT p) {
-  return Vec512<T>{_mm512_loadu_si512(p)};
-}
-HWY_API Vec512<float> LoadU(Full512<float> /* tag */,
-                            const float* HWY_RESTRICT p) {
-  return Vec512<float>{_mm512_loadu_ps(p)};
-}
-HWY_API Vec512<double> LoadU(Full512<double> /* tag */,
-                             const double* HWY_RESTRICT p) {
-  return Vec512<double>{_mm512_loadu_pd(p)};
-}
-
-// ------------------------------ MaskedLoad
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec512<T>{_mm512_maskz_loadu_epi8(m.raw, p)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec512<T>{_mm512_maskz_loadu_epi16(m.raw, p)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec512<T>{_mm512_maskz_loadu_epi32(m.raw, p)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> MaskedLoad(Mask512<T> m, Full512<T> /* tag */,
-                             const T* HWY_RESTRICT p) {
-  return Vec512<T>{_mm512_maskz_loadu_epi64(m.raw, p)};
-}
-
-HWY_API Vec512<float> MaskedLoad(Mask512<float> m, Full512<float> /* tag */,
-                                 const float* HWY_RESTRICT p) {
-  return Vec512<float>{_mm512_maskz_loadu_ps(m.raw, p)};
-}
-
-HWY_API Vec512<double> MaskedLoad(Mask512<double> m, Full512<double> /* tag */,
-                                  const double* HWY_RESTRICT p) {
-  return Vec512<double>{_mm512_maskz_loadu_pd(m.raw, p)};
-}
-
-// ------------------------------ LoadDup128
-
-// Loads 128 bit and duplicates into both 128-bit halves. This avoids the
-// 3-cycle cost of moving data between 128-bit halves and avoids port 5.
-template <typename T>
-HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */,
-                             const T* const HWY_RESTRICT p) {
-  const auto x4 = LoadU(Full128<T>(), p);
-  return Vec512<T>{_mm512_broadcast_i32x4(x4.raw)};
-}
-HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */,
-                                 const float* const HWY_RESTRICT p) {
-  const __m128 x4 = _mm_loadu_ps(p);
-  return Vec512<float>{_mm512_broadcast_f32x4(x4)};
-}
-
-HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */,
-                                  const double* const HWY_RESTRICT p) {
-  const __m128d x2 = _mm_loadu_pd(p);
-  return Vec512<double>{_mm512_broadcast_f64x2(x2)};
-}
-
-// ------------------------------ Store
-
-template <typename T>
-HWY_API void Store(const Vec512<T> v, Full512<T> /* tag */,
-                   T* HWY_RESTRICT aligned) {
-  _mm512_store_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
-}
-HWY_API void Store(const Vec512<float> v, Full512<float> /* tag */,
-                   float* HWY_RESTRICT aligned) {
-  _mm512_store_ps(aligned, v.raw);
-}
-HWY_API void Store(const Vec512<double> v, Full512<double> /* tag */,
-                   double* HWY_RESTRICT aligned) {
-  _mm512_store_pd(aligned, v.raw);
-}
-
-template <typename T>
-HWY_API void StoreU(const Vec512<T> v, Full512<T> /* tag */,
-                    T* HWY_RESTRICT p) {
-  _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), v.raw);
-}
-HWY_API void StoreU(const Vec512<float> v, Full512<float> /* tag */,
-                    float* HWY_RESTRICT p) {
-  _mm512_storeu_ps(p, v.raw);
-}
-HWY_API void StoreU(const Vec512<double> v, Full512<double>,
-                    double* HWY_RESTRICT p) {
-  _mm512_storeu_pd(p, v.raw);
-}
-
-// ------------------------------ BlendedStore
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm512_mask_storeu_epi8(p, m.raw, v.raw);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm512_mask_storeu_epi16(p, m.raw, v.raw);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm512_mask_storeu_epi32(p, m.raw, v.raw);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API void BlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> /* tag */,
-                          T* HWY_RESTRICT p) {
-  _mm512_mask_storeu_epi64(p, m.raw, v.raw);
-}
-
-HWY_API void BlendedStore(Vec512<float> v, Mask512<float> m,
-                          Full512<float> /* tag */, float* HWY_RESTRICT p) {
-  _mm512_mask_storeu_ps(p, m.raw, v.raw);
-}
-
-HWY_API void BlendedStore(Vec512<double> v, Mask512<double> m,
-                          Full512<double> /* tag */, double* HWY_RESTRICT p) {
-  _mm512_mask_storeu_pd(p, m.raw, v.raw);
-}
-
-// ------------------------------ Non-temporal stores
-
-template <typename T>
-HWY_API void Stream(const Vec512<T> v, Full512<T> /* tag */,
-                    T* HWY_RESTRICT aligned) {
-  _mm512_stream_si512(reinterpret_cast<__m512i*>(aligned), v.raw);
-}
-HWY_API void Stream(const Vec512<float> v, Full512<float> /* tag */,
-                    float* HWY_RESTRICT aligned) {
-  _mm512_stream_ps(aligned, v.raw);
-}
-HWY_API void Stream(const Vec512<double> v, Full512<double>,
-                    double* HWY_RESTRICT aligned) {
-  _mm512_stream_pd(aligned, v.raw);
-}
-
-// ------------------------------ Scatter
-
-// Work around warnings in the intrinsic definitions (passing -1 as a mask).
-HWY_DIAGNOSTICS(push)
-HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE void ScatterOffset(hwy::SizeTag<4> /* tag */, Vec512<T> v,
-                              Full512<T> /* tag */, T* HWY_RESTRICT base,
-                              const Vec512<int32_t> offset) {
-  _mm512_i32scatter_epi32(base, offset.raw, v.raw, 1);
-}
-template <typename T>
-HWY_INLINE void ScatterIndex(hwy::SizeTag<4> /* tag */, Vec512<T> v,
-                             Full512<T> /* tag */, T* HWY_RESTRICT base,
-                             const Vec512<int32_t> index) {
-  _mm512_i32scatter_epi32(base, index.raw, v.raw, 4);
-}
-
-template <typename T>
-HWY_INLINE void ScatterOffset(hwy::SizeTag<8> /* tag */, Vec512<T> v,
-                              Full512<T> /* tag */, T* HWY_RESTRICT base,
-                              const Vec512<int64_t> offset) {
-  _mm512_i64scatter_epi64(base, offset.raw, v.raw, 1);
-}
-template <typename T>
-HWY_INLINE void ScatterIndex(hwy::SizeTag<8> /* tag */, Vec512<T> v,
-                             Full512<T> /* tag */, T* HWY_RESTRICT base,
-                             const Vec512<int64_t> index) {
-  _mm512_i64scatter_epi64(base, index.raw, v.raw, 8);
-}
-
-}  // namespace detail
-
-template <typename T, typename Offset>
-HWY_API void ScatterOffset(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
-                           const Vec512<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  return detail::ScatterOffset(hwy::SizeTag<sizeof(T)>(), v, d, base, offset);
-}
-template <typename T, typename Index>
-HWY_API void ScatterIndex(Vec512<T> v, Full512<T> d, T* HWY_RESTRICT base,
-                          const Vec512<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  return detail::ScatterIndex(hwy::SizeTag<sizeof(T)>(), v, d, base, index);
-}
-
-HWY_API void ScatterOffset(Vec512<float> v, Full512<float> /* tag */,
-                           float* HWY_RESTRICT base,
-                           const Vec512<int32_t> offset) {
-  _mm512_i32scatter_ps(base, offset.raw, v.raw, 1);
-}
-HWY_API void ScatterIndex(Vec512<float> v, Full512<float> /* tag */,
-                          float* HWY_RESTRICT base,
-                          const Vec512<int32_t> index) {
-  _mm512_i32scatter_ps(base, index.raw, v.raw, 4);
-}
-
-HWY_API void ScatterOffset(Vec512<double> v, Full512<double> /* tag */,
-                           double* HWY_RESTRICT base,
-                           const Vec512<int64_t> offset) {
-  _mm512_i64scatter_pd(base, offset.raw, v.raw, 1);
-}
-HWY_API void ScatterIndex(Vec512<double> v, Full512<double> /* tag */,
-                          double* HWY_RESTRICT base,
-                          const Vec512<int64_t> index) {
-  _mm512_i64scatter_pd(base, index.raw, v.raw, 8);
-}
-
-// ------------------------------ Gather
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE Vec512<T> GatherOffset(hwy::SizeTag<4> /* tag */,
-                                  Full512<T> /* tag */,
-                                  const T* HWY_RESTRICT base,
-                                  const Vec512<int32_t> offset) {
-  return Vec512<T>{_mm512_i32gather_epi32(offset.raw, base, 1)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<4> /* tag */,
-                                 Full512<T> /* tag */,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec512<int32_t> index) {
-  return Vec512<T>{_mm512_i32gather_epi32(index.raw, base, 4)};
-}
-
-template <typename T>
-HWY_INLINE Vec512<T> GatherOffset(hwy::SizeTag<8> /* tag */,
-                                  Full512<T> /* tag */,
-                                  const T* HWY_RESTRICT base,
-                                  const Vec512<int64_t> offset) {
-  return Vec512<T>{_mm512_i64gather_epi64(offset.raw, base, 1)};
-}
-template <typename T>
-HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<8> /* tag */,
-                                 Full512<T> /* tag */,
-                                 const T* HWY_RESTRICT base,
-                                 const Vec512<int64_t> index) {
-  return Vec512<T>{_mm512_i64gather_epi64(index.raw, base, 8)};
-}
-
-}  // namespace detail
-
-template <typename T, typename Offset>
-HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base,
-                               const Vec512<Offset> offset) {
-  static_assert(sizeof(T) == sizeof(Offset), "Must match for portability");
-  return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset);
-}
-template <typename T, typename Index>
-HWY_API Vec512<T> GatherIndex(Full512<T> d, const T* HWY_RESTRICT base,
-                              const Vec512<Index> index) {
-  static_assert(sizeof(T) == sizeof(Index), "Must match for portability");
-  return detail::GatherIndex(hwy::SizeTag<sizeof(T)>(), d, base, index);
-}
-
-HWY_API Vec512<float> GatherOffset(Full512<float> /* tag */,
-                                   const float* HWY_RESTRICT base,
-                                   const Vec512<int32_t> offset) {
-  return Vec512<float>{_mm512_i32gather_ps(offset.raw, base, 1)};
-}
-HWY_API Vec512<float> GatherIndex(Full512<float> /* tag */,
-                                  const float* HWY_RESTRICT base,
-                                  const Vec512<int32_t> index) {
-  return Vec512<float>{_mm512_i32gather_ps(index.raw, base, 4)};
-}
-
-HWY_API Vec512<double> GatherOffset(Full512<double> /* tag */,
-                                    const double* HWY_RESTRICT base,
-                                    const Vec512<int64_t> offset) {
-  return Vec512<double>{_mm512_i64gather_pd(offset.raw, base, 1)};
-}
-HWY_API Vec512<double> GatherIndex(Full512<double> /* tag */,
-                                   const double* HWY_RESTRICT base,
-                                   const Vec512<int64_t> index) {
-  return Vec512<double>{_mm512_i64gather_pd(index.raw, base, 8)};
-}
-
-HWY_DIAGNOSTICS(pop)
-
-// ================================================== SWIZZLE
-
-// ------------------------------ LowerHalf
-
-template <typename T>
-HWY_API Vec256<T> LowerHalf(Full256<T> /* tag */, Vec512<T> v) {
-  return Vec256<T>{_mm512_castsi512_si256(v.raw)};
-}
-HWY_API Vec256<float> LowerHalf(Full256<float> /* tag */, Vec512<float> v) {
-  return Vec256<float>{_mm512_castps512_ps256(v.raw)};
-}
-HWY_API Vec256<double> LowerHalf(Full256<double> /* tag */, Vec512<double> v) {
-  return Vec256<double>{_mm512_castpd512_pd256(v.raw)};
-}
-
-template <typename T>
-HWY_API Vec256<T> LowerHalf(Vec512<T> v) {
-  return LowerHalf(Full256<T>(), v);
-}
-
-// ------------------------------ UpperHalf
-
-template <typename T>
-HWY_API Vec256<T> UpperHalf(Full256<T> /* tag */, Vec512<T> v) {
-  return Vec256<T>{_mm512_extracti32x8_epi32(v.raw, 1)};
-}
-HWY_API Vec256<float> UpperHalf(Full256<float> /* tag */, Vec512<float> v) {
-  return Vec256<float>{_mm512_extractf32x8_ps(v.raw, 1)};
-}
-HWY_API Vec256<double> UpperHalf(Full256<double> /* tag */, Vec512<double> v) {
-  return Vec256<double>{_mm512_extractf64x4_pd(v.raw, 1)};
-}
-
-// ------------------------------ ExtractLane (Store)
-template <typename T>
-HWY_API T ExtractLane(const Vec512<T> v, size_t i) {
-  const Full512<T> d;
-  HWY_DASSERT(i < Lanes(d));
-  alignas(64) T lanes[64 / sizeof(T)];
-  Store(v, d, lanes);
-  return lanes[i];
-}
-
-// ------------------------------ InsertLane (Store)
-template <typename T>
-HWY_API Vec512<T> InsertLane(const Vec512<T> v, size_t i, T t) {
-  const Full512<T> d;
-  HWY_DASSERT(i < Lanes(d));
-  alignas(64) T lanes[64 / sizeof(T)];
-  Store(v, d, lanes);
-  lanes[i] = t;
-  return Load(d, lanes);
-}
-
-// ------------------------------ GetLane (LowerHalf)
-template <typename T>
-HWY_API T GetLane(const Vec512<T> v) {
-  return GetLane(LowerHalf(v));
-}
-
-// ------------------------------ ZeroExtendVector
-
-template <typename T>
-HWY_API Vec512<T> ZeroExtendVector(Full512<T> /* tag */, Vec256<T> lo) {
-#if HWY_HAVE_ZEXT  // See definition/comment in x86_256-inl.h.
-  return Vec512<T>{_mm512_zextsi256_si512(lo.raw)};
-#else
-  return Vec512<T>{_mm512_inserti32x8(_mm512_setzero_si512(), lo.raw, 0)};
-#endif
-}
-HWY_API Vec512<float> ZeroExtendVector(Full512<float> /* tag */,
-                                       Vec256<float> lo) {
-#if HWY_HAVE_ZEXT
-  return Vec512<float>{_mm512_zextps256_ps512(lo.raw)};
-#else
-  return Vec512<float>{_mm512_insertf32x8(_mm512_setzero_ps(), lo.raw, 0)};
-#endif
-}
-HWY_API Vec512<double> ZeroExtendVector(Full512<double> /* tag */,
-                                        Vec256<double> lo) {
-#if HWY_HAVE_ZEXT
-  return Vec512<double>{_mm512_zextpd256_pd512(lo.raw)};
-#else
-  return Vec512<double>{_mm512_insertf64x4(_mm512_setzero_pd(), lo.raw, 0)};
-#endif
-}
-
-// ------------------------------ Combine
-
-template <typename T>
-HWY_API Vec512<T> Combine(Full512<T> d, Vec256<T> hi, Vec256<T> lo) {
-  const auto lo512 = ZeroExtendVector(d, lo);
-  return Vec512<T>{_mm512_inserti32x8(lo512.raw, hi.raw, 1)};
-}
-HWY_API Vec512<float> Combine(Full512<float> d, Vec256<float> hi,
-                              Vec256<float> lo) {
-  const auto lo512 = ZeroExtendVector(d, lo);
-  return Vec512<float>{_mm512_insertf32x8(lo512.raw, hi.raw, 1)};
-}
-HWY_API Vec512<double> Combine(Full512<double> d, Vec256<double> hi,
-                               Vec256<double> lo) {
-  const auto lo512 = ZeroExtendVector(d, lo);
-  return Vec512<double>{_mm512_insertf64x4(lo512.raw, hi.raw, 1)};
-}
-
-// ------------------------------ ShiftLeftBytes
-
-template <int kBytes, typename T>
-HWY_API Vec512<T> ShiftLeftBytes(Full512<T> /* tag */, const Vec512<T> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  return Vec512<T>{_mm512_bslli_epi128(v.raw, kBytes)};
-}
-
-template <int kBytes, typename T>
-HWY_API Vec512<T> ShiftLeftBytes(const Vec512<T> v) {
-  return ShiftLeftBytes<kBytes>(Full512<T>(), v);
-}
-
-// ------------------------------ ShiftLeftLanes
-
-template <int kLanes, typename T>
-HWY_API Vec512<T> ShiftLeftLanes(Full512<T> d, const Vec512<T> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(T)>(BitCast(d8, v)));
-}
-
-template <int kLanes, typename T>
-HWY_API Vec512<T> ShiftLeftLanes(const Vec512<T> v) {
-  return ShiftLeftLanes<kLanes>(Full512<T>(), v);
-}
-
-// ------------------------------ ShiftRightBytes
-template <int kBytes, typename T>
-HWY_API Vec512<T> ShiftRightBytes(Full512<T> /* tag */, const Vec512<T> v) {
-  static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes");
-  return Vec512<T>{_mm512_bsrli_epi128(v.raw, kBytes)};
-}
-
-// ------------------------------ ShiftRightLanes
-template <int kLanes, typename T>
-HWY_API Vec512<T> ShiftRightLanes(Full512<T> d, const Vec512<T> v) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v)));
-}
-
-// ------------------------------ CombineShiftRightBytes
-
-template <int kBytes, typename T, class V = Vec512<T>>
-HWY_API V CombineShiftRightBytes(Full512<T> d, V hi, V lo) {
-  const Repartition<uint8_t, decltype(d)> d8;
-  return BitCast(d, Vec512<uint8_t>{_mm512_alignr_epi8(
-                        BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)});
-}
-
-// ------------------------------ Broadcast/splat any lane
-
-// Unsigned
-template <int kLane>
-HWY_API Vec512<uint16_t> Broadcast(const Vec512<uint16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  if (kLane < 4) {
-    const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
-    return Vec512<uint16_t>{_mm512_unpacklo_epi64(lo, lo)};
-  } else {
-    const __m512i hi =
-        _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
-    return Vec512<uint16_t>{_mm512_unpackhi_epi64(hi, hi)};
-  }
-}
-template <int kLane>
-HWY_API Vec512<uint32_t> Broadcast(const Vec512<uint32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
-  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, perm)};
-}
-template <int kLane>
-HWY_API Vec512<uint64_t> Broadcast(const Vec512<uint64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
-  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, perm)};
-}
-
-// Signed
-template <int kLane>
-HWY_API Vec512<int16_t> Broadcast(const Vec512<int16_t> v) {
-  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
-  if (kLane < 4) {
-    const __m512i lo = _mm512_shufflelo_epi16(v.raw, (0x55 * kLane) & 0xFF);
-    return Vec512<int16_t>{_mm512_unpacklo_epi64(lo, lo)};
-  } else {
-    const __m512i hi =
-        _mm512_shufflehi_epi16(v.raw, (0x55 * (kLane - 4)) & 0xFF);
-    return Vec512<int16_t>{_mm512_unpackhi_epi64(hi, hi)};
-  }
-}
-template <int kLane>
-HWY_API Vec512<int32_t> Broadcast(const Vec512<int32_t> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
-  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, perm)};
-}
-template <int kLane>
-HWY_API Vec512<int64_t> Broadcast(const Vec512<int64_t> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  constexpr _MM_PERM_ENUM perm = kLane ? _MM_PERM_DCDC : _MM_PERM_BABA;
-  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, perm)};
-}
-
-// Float
-template <int kLane>
-HWY_API Vec512<float> Broadcast(const Vec512<float> v) {
-  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
-  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0x55 * kLane);
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, perm)};
-}
-template <int kLane>
-HWY_API Vec512<double> Broadcast(const Vec512<double> v) {
-  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
-  constexpr _MM_PERM_ENUM perm = static_cast<_MM_PERM_ENUM>(0xFF * kLane);
-  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, perm)};
-}
-
-// ------------------------------ Hard-coded shuffles
-
-// Notation: let Vec512<int32_t> have lanes 7,6,5,4,3,2,1,0 (0 is
-// least-significant). Shuffle0321 rotates four-lane blocks one lane to the
-// right (the previous least-significant lane is now most-significant =>
-// 47650321). These could also be implemented via CombineShiftRightBytes but
-// the shuffle_abcd notation is more convenient.
-
-// Swap 32-bit halves in 64-bit halves.
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Shuffle2301(const Vec512<T> v) {
-  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CDAB)};
-}
-HWY_API Vec512<float> Shuffle2301(const Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CDAB)};
-}
-
-namespace detail {
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Shuffle2301(const Vec512<T> a, const Vec512<T> b) {
-  const Full512<T> d;
-  const RebindToFloat<decltype(d)> df;
-  return BitCast(
-      d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
-                                         _MM_PERM_CDAB)});
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Shuffle1230(const Vec512<T> a, const Vec512<T> b) {
-  const Full512<T> d;
-  const RebindToFloat<decltype(d)> df;
-  return BitCast(
-      d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
-                                         _MM_PERM_BCDA)});
-}
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Shuffle3012(const Vec512<T> a, const Vec512<T> b) {
-  const Full512<T> d;
-  const RebindToFloat<decltype(d)> df;
-  return BitCast(
-      d, Vec512<float>{_mm512_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw,
-                                         _MM_PERM_DABC)});
-}
-
-}  // namespace detail
-
-// Swap 64-bit halves
-HWY_API Vec512<uint32_t> Shuffle1032(const Vec512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
-}
-HWY_API Vec512<int32_t> Shuffle1032(const Vec512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
-}
-HWY_API Vec512<float> Shuffle1032(const Vec512<float> v) {
-  // Shorter encoding than _mm512_permute_ps.
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_BADC)};
-}
-HWY_API Vec512<uint64_t> Shuffle01(const Vec512<uint64_t> v) {
-  return Vec512<uint64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
-}
-HWY_API Vec512<int64_t> Shuffle01(const Vec512<int64_t> v) {
-  return Vec512<int64_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_BADC)};
-}
-HWY_API Vec512<double> Shuffle01(const Vec512<double> v) {
-  // Shorter encoding than _mm512_permute_pd.
-  return Vec512<double>{_mm512_shuffle_pd(v.raw, v.raw, _MM_PERM_BBBB)};
-}
-
-// Rotate right 32 bits
-HWY_API Vec512<uint32_t> Shuffle0321(const Vec512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
-}
-HWY_API Vec512<int32_t> Shuffle0321(const Vec512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ADCB)};
-}
-HWY_API Vec512<float> Shuffle0321(const Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ADCB)};
-}
-// Rotate left 32 bits
-HWY_API Vec512<uint32_t> Shuffle2103(const Vec512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
-}
-HWY_API Vec512<int32_t> Shuffle2103(const Vec512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CBAD)};
-}
-HWY_API Vec512<float> Shuffle2103(const Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CBAD)};
-}
-
-// Reverse
-HWY_API Vec512<uint32_t> Shuffle0123(const Vec512<uint32_t> v) {
-  return Vec512<uint32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
-}
-HWY_API Vec512<int32_t> Shuffle0123(const Vec512<int32_t> v) {
-  return Vec512<int32_t>{_mm512_shuffle_epi32(v.raw, _MM_PERM_ABCD)};
-}
-HWY_API Vec512<float> Shuffle0123(const Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_ABCD)};
-}
-
-// ------------------------------ TableLookupLanes
-
-// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
-template <typename T>
-struct Indices512 {
-  __m512i raw;
-};
-
-template <typename T, typename TI>
-HWY_API Indices512<T> IndicesFromVec(Full512<T> /* tag */, Vec512<TI> vec) {
-  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
-#if HWY_IS_DEBUG_BUILD
-  const Full512<TI> di;
-  HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) &&
-              AllTrue(di, Lt(vec, Set(di, static_cast<TI>(64 / sizeof(T))))));
-#endif
-  return Indices512<T>{vec.raw};
-}
-
-template <typename T, typename TI>
-HWY_API Indices512<T> SetTableIndices(const Full512<T> d, const TI* idx) {
-  const Rebind<TI, decltype(d)> di;
-  return IndicesFromVec(d, LoadU(di, idx));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
-  return Vec512<T>{_mm512_permutexvar_epi32(idx.raw, v.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> TableLookupLanes(Vec512<T> v, Indices512<T> idx) {
-  return Vec512<T>{_mm512_permutexvar_epi64(idx.raw, v.raw)};
-}
-
-HWY_API Vec512<float> TableLookupLanes(Vec512<float> v, Indices512<float> idx) {
-  return Vec512<float>{_mm512_permutexvar_ps(idx.raw, v.raw)};
-}
-
-HWY_API Vec512<double> TableLookupLanes(Vec512<double> v,
-                                        Indices512<double> idx) {
-  return Vec512<double>{_mm512_permutexvar_pd(idx.raw, v.raw)};
-}
-
-// ------------------------------ Reverse
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
-  const RebindToSigned<decltype(d)> di;
-  alignas(64) constexpr int16_t kReverse[32] = {
-      31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16,
-      15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,  4,  3,  2,  1,  0};
-  const Vec512<int16_t> idx = Load(di, kReverse);
-  return BitCast(d, Vec512<int16_t>{
-                        _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
-  alignas(64) constexpr int32_t kReverse[16] = {15, 14, 13, 12, 11, 10, 9, 8,
-                                                7,  6,  5,  4,  3,  2,  1, 0};
-  return TableLookupLanes(v, SetTableIndices(d, kReverse));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) {
-  alignas(64) constexpr int64_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0};
-  return TableLookupLanes(v, SetTableIndices(d, kReverse));
-}
-
-// ------------------------------ Reverse2
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> Reverse2(Full512<T> d, const Vec512<T> v) {
-  const Full512<uint32_t> du32;
-  return BitCast(d, RotateRight<16>(BitCast(du32, v)));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
-  return Shuffle2301(v);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) {
-  return Shuffle01(v);
-}
-
-// ------------------------------ Reverse4
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> Reverse4(Full512<T> d, const Vec512<T> v) {
-  const RebindToSigned<decltype(d)> di;
-  alignas(64) constexpr int16_t kReverse4[32] = {
-      3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12,
-      19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
-  const Vec512<int16_t> idx = Load(di, kReverse4);
-  return BitCast(d, Vec512<int16_t>{
-                        _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
-  return Shuffle0123(v);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) {
-  return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
-}
-HWY_API Vec512<double> Reverse4(Full512<double> /* tag */, Vec512<double> v) {
-  return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))};
-}
-
-// ------------------------------ Reverse8
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
-  const RebindToSigned<decltype(d)> di;
-  alignas(64) constexpr int16_t kReverse8[32] = {
-      7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13, 12, 11, 10, 9,  8,
-      23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
-  const Vec512<int16_t> idx = Load(di, kReverse8);
-  return BitCast(d, Vec512<int16_t>{
-                        _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
-  const RebindToSigned<decltype(d)> di;
-  alignas(64) constexpr int32_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0,
-                                                 15, 14, 13, 12, 11, 10, 9, 8};
-  const Vec512<int32_t> idx = Load(di, kReverse8);
-  return BitCast(d, Vec512<int32_t>{
-                        _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) {
-  return Reverse(d, v);
-}
-
-// ------------------------------ InterleaveLower
-
-// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
-// the least-significant lane) and "b". To concatenate two half-width integers
-// into one, use ZipLower/Upper instead (also works with scalar).
-
-HWY_API Vec512<uint8_t> InterleaveLower(const Vec512<uint8_t> a,
-                                        const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> InterleaveLower(const Vec512<uint16_t> a,
-                                         const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<uint32_t> InterleaveLower(const Vec512<uint32_t> a,
-                                         const Vec512<uint32_t> b) {
-  return Vec512<uint32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> InterleaveLower(const Vec512<uint64_t> a,
-                                         const Vec512<uint64_t> b) {
-  return Vec512<uint64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec512<int8_t> InterleaveLower(const Vec512<int8_t> a,
-                                       const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_unpacklo_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> InterleaveLower(const Vec512<int16_t> a,
-                                        const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_unpacklo_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<int32_t> InterleaveLower(const Vec512<int32_t> a,
-                                        const Vec512<int32_t> b) {
-  return Vec512<int32_t>{_mm512_unpacklo_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<int64_t> InterleaveLower(const Vec512<int64_t> a,
-                                        const Vec512<int64_t> b) {
-  return Vec512<int64_t>{_mm512_unpacklo_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec512<float> InterleaveLower(const Vec512<float> a,
-                                      const Vec512<float> b) {
-  return Vec512<float>{_mm512_unpacklo_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> InterleaveLower(const Vec512<double> a,
-                                       const Vec512<double> b) {
-  return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)};
-}
-
-// ------------------------------ InterleaveUpper
-
-// All functions inside detail lack the required D parameter.
-namespace detail {
-
-HWY_API Vec512<uint8_t> InterleaveUpper(const Vec512<uint8_t> a,
-                                        const Vec512<uint8_t> b) {
-  return Vec512<uint8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<uint16_t> InterleaveUpper(const Vec512<uint16_t> a,
-                                         const Vec512<uint16_t> b) {
-  return Vec512<uint16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<uint32_t> InterleaveUpper(const Vec512<uint32_t> a,
-                                         const Vec512<uint32_t> b) {
-  return Vec512<uint32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<uint64_t> InterleaveUpper(const Vec512<uint64_t> a,
-                                         const Vec512<uint64_t> b) {
-  return Vec512<uint64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec512<int8_t> InterleaveUpper(const Vec512<int8_t> a,
-                                       const Vec512<int8_t> b) {
-  return Vec512<int8_t>{_mm512_unpackhi_epi8(a.raw, b.raw)};
-}
-HWY_API Vec512<int16_t> InterleaveUpper(const Vec512<int16_t> a,
-                                        const Vec512<int16_t> b) {
-  return Vec512<int16_t>{_mm512_unpackhi_epi16(a.raw, b.raw)};
-}
-HWY_API Vec512<int32_t> InterleaveUpper(const Vec512<int32_t> a,
-                                        const Vec512<int32_t> b) {
-  return Vec512<int32_t>{_mm512_unpackhi_epi32(a.raw, b.raw)};
-}
-HWY_API Vec512<int64_t> InterleaveUpper(const Vec512<int64_t> a,
-                                        const Vec512<int64_t> b) {
-  return Vec512<int64_t>{_mm512_unpackhi_epi64(a.raw, b.raw)};
-}
-
-HWY_API Vec512<float> InterleaveUpper(const Vec512<float> a,
-                                      const Vec512<float> b) {
-  return Vec512<float>{_mm512_unpackhi_ps(a.raw, b.raw)};
-}
-HWY_API Vec512<double> InterleaveUpper(const Vec512<double> a,
-                                       const Vec512<double> b) {
-  return Vec512<double>{_mm512_unpackhi_pd(a.raw, b.raw)};
-}
-
-}  // namespace detail
-
-template <typename T, class V = Vec512<T>>
-HWY_API V InterleaveUpper(Full512<T> /* tag */, V a, V b) {
-  return detail::InterleaveUpper(a, b);
-}
-
-// ------------------------------ ZipLower/ZipUpper (InterleaveLower)
-
-// Same as Interleave*, except that the return lanes are double-width integers;
-// this is necessary because the single-lane scalar cannot return two values.
-template <typename T, typename TW = MakeWide<T>>
-HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) {
-  return BitCast(Full512<TW>(), InterleaveLower(a, b));
-}
-template <typename T, typename TW = MakeWide<T>>
-HWY_API Vec512<TW> ZipLower(Full512<TW> /* d */, Vec512<T> a, Vec512<T> b) {
-  return BitCast(Full512<TW>(), InterleaveLower(a, b));
-}
-
-template <typename T, typename TW = MakeWide<T>>
-HWY_API Vec512<TW> ZipUpper(Full512<TW> d, Vec512<T> a, Vec512<T> b) {
-  return BitCast(Full512<TW>(), InterleaveUpper(d, a, b));
-}
-
-// ------------------------------ Concat* halves
-
-// hiH,hiL loH,loL |-> hiL,loL (= lower halves)
-template <typename T>
-HWY_API Vec512<T> ConcatLowerLower(Full512<T> /* tag */, const Vec512<T> hi,
-                                   const Vec512<T> lo) {
-  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
-}
-HWY_API Vec512<float> ConcatLowerLower(Full512<float> /* tag */,
-                                       const Vec512<float> hi,
-                                       const Vec512<float> lo) {
-  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BABA)};
-}
-HWY_API Vec512<double> ConcatLowerLower(Full512<double> /* tag */,
-                                        const Vec512<double> hi,
-                                        const Vec512<double> lo) {
-  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BABA)};
-}
-
-// hiH,hiL loH,loL |-> hiH,loH (= upper halves)
-template <typename T>
-HWY_API Vec512<T> ConcatUpperUpper(Full512<T> /* tag */, const Vec512<T> hi,
-                                   const Vec512<T> lo) {
-  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
-}
-HWY_API Vec512<float> ConcatUpperUpper(Full512<float> /* tag */,
-                                       const Vec512<float> hi,
-                                       const Vec512<float> lo) {
-  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_DCDC)};
-}
-HWY_API Vec512<double> ConcatUpperUpper(Full512<double> /* tag */,
-                                        const Vec512<double> hi,
-                                        const Vec512<double> lo) {
-  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_DCDC)};
-}
-
-// hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks)
-template <typename T>
-HWY_API Vec512<T> ConcatLowerUpper(Full512<T> /* tag */, const Vec512<T> hi,
-                                   const Vec512<T> lo) {
-  return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
-}
-HWY_API Vec512<float> ConcatLowerUpper(Full512<float> /* tag */,
-                                       const Vec512<float> hi,
-                                       const Vec512<float> lo) {
-  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)};
-}
-HWY_API Vec512<double> ConcatLowerUpper(Full512<double> /* tag */,
-                                        const Vec512<double> hi,
-                                        const Vec512<double> lo) {
-  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)};
-}
-
-// hiH,hiL loH,loL |-> hiH,loL (= outer halves)
-template <typename T>
-HWY_API Vec512<T> ConcatUpperLower(Full512<T> /* tag */, const Vec512<T> hi,
-                                   const Vec512<T> lo) {
-  // There are no imm8 blend in AVX512. Use blend16 because 32-bit masks
-  // are efficiently loaded from 32-bit regs.
-  const __mmask32 mask = /*_cvtu32_mask32 */ (0x0000FFFF);
-  return Vec512<T>{_mm512_mask_blend_epi16(mask, hi.raw, lo.raw)};
-}
-HWY_API Vec512<float> ConcatUpperLower(Full512<float> /* tag */,
-                                       const Vec512<float> hi,
-                                       const Vec512<float> lo) {
-  const __mmask16 mask = /*_cvtu32_mask16 */ (0x00FF);
-  return Vec512<float>{_mm512_mask_blend_ps(mask, hi.raw, lo.raw)};
-}
-HWY_API Vec512<double> ConcatUpperLower(Full512<double> /* tag */,
-                                        const Vec512<double> hi,
-                                        const Vec512<double> lo) {
-  const __mmask8 mask = /*_cvtu32_mask8 */ (0x0F);
-  return Vec512<double>{_mm512_mask_blend_pd(mask, hi.raw, lo.raw)};
-}
-
-// ------------------------------ ConcatOdd
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET == HWY_AVX3_DL
-  alignas(64) constexpr uint8_t kIdx[64] = {
-      1,   3,   5,   7,   9,   11,  13,  15,  17,  19,  21,  23,  25,
-      27,  29,  31,  33,  35,  37,  39,  41,  43,  45,  47,  49,  51,
-      53,  55,  57,  59,  61,  63,  65,  67,  69,  71,  73,  75,  77,
-      79,  81,  83,  85,  87,  89,  91,  93,  95,  97,  99,  101, 103,
-      105, 107, 109, 111, 113, 115, 117, 119, 121, 123, 125, 127};
-  return BitCast(d,
-                 Vec512<uint8_t>{_mm512_mask2_permutex2var_epi8(
-                     BitCast(du, lo).raw, Load(du, kIdx).raw,
-                     __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
-#else
-  const RepartitionToWide<decltype(du)> dw;
-  // Right-shift 8 bits per u16 so we can pack.
-  const Vec512<uint16_t> uH = ShiftRight<8>(BitCast(dw, hi));
-  const Vec512<uint16_t> uL = ShiftRight<8>(BitCast(dw, lo));
-  const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
-  // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
-  const Full512<uint64_t> du64;
-  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint16_t kIdx[32] = {
-      1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
-      33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
-  return BitCast(d, Vec512<uint16_t>{_mm512_mask2_permutex2var_epi16(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
-                                             17, 19, 21, 23, 25, 27, 29, 31};
-  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
-}
-
-HWY_API Vec512<float> ConcatOdd(Full512<float> d, Vec512<float> hi,
-                                Vec512<float> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint32_t kIdx[16] = {1,  3,  5,  7,  9,  11, 13, 15,
-                                             17, 19, 21, 23, 25, 27, 29, 31};
-  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
-                                                    __mmask16{0xFFFF}, hi.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> ConcatOdd(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
-  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
-                        BitCast(du, hi).raw)});
-}
-
-HWY_API Vec512<double> ConcatOdd(Full512<double> d, Vec512<double> hi,
-                                 Vec512<double> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint64_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
-  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
-                                                     __mmask8{0xFF}, hi.raw)};
-}
-
-// ------------------------------ ConcatEven
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-#if HWY_TARGET == HWY_AVX3_DL
-  alignas(64) constexpr uint8_t kIdx[64] = {
-      0,   2,   4,   6,   8,   10,  12,  14,  16,  18,  20,  22,  24,
-      26,  28,  30,  32,  34,  36,  38,  40,  42,  44,  46,  48,  50,
-      52,  54,  56,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,
-      78,  80,  82,  84,  86,  88,  90,  92,  94,  96,  98,  100, 102,
-      104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126};
-  return BitCast(d,
-                 Vec512<uint32_t>{_mm512_mask2_permutex2var_epi8(
-                     BitCast(du, lo).raw, Load(du, kIdx).raw,
-                     __mmask64{0xFFFFFFFFFFFFFFFFull}, BitCast(du, hi).raw)});
-#else
-  const RepartitionToWide<decltype(du)> dw;
-  // Isolate lower 8 bits per u16 so we can pack.
-  const Vec512<uint16_t> mask = Set(dw, 0x00FF);
-  const Vec512<uint16_t> uH = And(BitCast(dw, hi), mask);
-  const Vec512<uint16_t> uL = And(BitCast(dw, lo), mask);
-  const Vec512<uint64_t> u8{_mm512_packus_epi16(uL.raw, uH.raw)};
-  // Undo block interleave: lower half = even u64 lanes, upper = odd u64 lanes.
-  const Full512<uint64_t> du64;
-  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  return BitCast(d, TableLookupLanes(u8, SetTableIndices(du64, kIdx)));
-#endif
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint16_t kIdx[32] = {
-      0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-      32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
-  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi16(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask32{0xFFFFFFFFu}, BitCast(du, hi).raw)});
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
-                                             16, 18, 20, 22, 24, 26, 28, 30};
-  return BitCast(d, Vec512<uint32_t>{_mm512_mask2_permutex2var_epi32(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw,
-                        __mmask16{0xFFFF}, BitCast(du, hi).raw)});
-}
-
-HWY_API Vec512<float> ConcatEven(Full512<float> d, Vec512<float> hi,
-                                 Vec512<float> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint32_t kIdx[16] = {0,  2,  4,  6,  8,  10, 12, 14,
-                                             16, 18, 20, 22, 24, 26, 28, 30};
-  return Vec512<float>{_mm512_mask2_permutex2var_ps(lo.raw, Load(du, kIdx).raw,
-                                                    __mmask16{0xFFFF}, hi.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> ConcatEven(Full512<T> d, Vec512<T> hi, Vec512<T> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
-  return BitCast(d, Vec512<uint64_t>{_mm512_mask2_permutex2var_epi64(
-                        BitCast(du, lo).raw, Load(du, kIdx).raw, __mmask8{0xFF},
-                        BitCast(du, hi).raw)});
-}
-
-HWY_API Vec512<double> ConcatEven(Full512<double> d, Vec512<double> hi,
-                                  Vec512<double> lo) {
-  const RebindToUnsigned<decltype(d)> du;
-  alignas(64) constexpr uint64_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
-  return Vec512<double>{_mm512_mask2_permutex2var_pd(lo.raw, Load(du, kIdx).raw,
-                                                     __mmask8{0xFF}, hi.raw)};
-}
-
-// ------------------------------ DupEven (InterleaveLower)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> DupEven(Vec512<T> v) {
-  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)};
-}
-HWY_API Vec512<float> DupEven(Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> DupEven(const Vec512<T> v) {
-  return InterleaveLower(Full512<T>(), v, v);
-}
-
-// ------------------------------ DupOdd (InterleaveUpper)
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> DupOdd(Vec512<T> v) {
-  return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)};
-}
-HWY_API Vec512<float> DupOdd(Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> DupOdd(const Vec512<T> v) {
-  return InterleaveUpper(Full512<T>(), v, v);
-}
-
-// ------------------------------ OddEven
-
-template <typename T>
-HWY_API Vec512<T> OddEven(const Vec512<T> a, const Vec512<T> b) {
-  constexpr size_t s = sizeof(T);
-  constexpr int shift = s == 1 ? 0 : s == 2 ? 32 : s == 4 ? 48 : 56;
-  return IfThenElse(Mask512<T>{0x5555555555555555ull >> shift}, b, a);
-}
-
-// ------------------------------ OddEvenBlocks
-
-template <typename T>
-HWY_API Vec512<T> OddEvenBlocks(Vec512<T> odd, Vec512<T> even) {
-  return Vec512<T>{_mm512_mask_blend_epi64(__mmask8{0x33u}, odd.raw, even.raw)};
-}
-
-HWY_API Vec512<float> OddEvenBlocks(Vec512<float> odd, Vec512<float> even) {
-  return Vec512<float>{
-      _mm512_mask_blend_ps(__mmask16{0x0F0Fu}, odd.raw, even.raw)};
-}
-
-HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) {
-  return Vec512<double>{
-      _mm512_mask_blend_pd(__mmask8{0x33u}, odd.raw, even.raw)};
-}
-
-// ------------------------------ SwapAdjacentBlocks
-
-template <typename T>
-HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) {
-  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)};
-}
-
-HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)};
-}
-
-HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) {
-  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)};
-}
-
-// ------------------------------ ReverseBlocks
-
-template <typename T>
-HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) {
-  return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)};
-}
-HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */, Vec512<float> v) {
-  return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)};
-}
-HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */,
-                                     Vec512<double> v) {
-  return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)};
-}
-
-// ------------------------------ TableLookupBytes (ZeroExtendVector)
-
-// Both full
-template <typename T, typename TI>
-HWY_API Vec512<TI> TableLookupBytes(Vec512<T> bytes, Vec512<TI> indices) {
-  return Vec512<TI>{_mm512_shuffle_epi8(bytes.raw, indices.raw)};
-}
-
-// Partial index vector
-template <typename T, typename TI, size_t NI>
-HWY_API Vec128<TI, NI> TableLookupBytes(Vec512<T> bytes, Vec128<TI, NI> from) {
-  const Full512<TI> d512;
-  const Half<decltype(d512)> d256;
-  const Half<decltype(d256)> d128;
-  // First expand to full 128, then 256, then 512.
-  const Vec128<TI> from_full{from.raw};
-  const auto from_512 =
-      ZeroExtendVector(d512, ZeroExtendVector(d256, from_full));
-  const auto tbl_full = TableLookupBytes(bytes, from_512);
-  // Shrink to 256, then 128, then partial.
-  return Vec128<TI, NI>{LowerHalf(d128, LowerHalf(d256, tbl_full)).raw};
-}
-template <typename T, typename TI>
-HWY_API Vec256<TI> TableLookupBytes(Vec512<T> bytes, Vec256<TI> from) {
-  const auto from_512 = ZeroExtendVector(Full512<TI>(), from);
-  return LowerHalf(Full256<TI>(), TableLookupBytes(bytes, from_512));
-}
-
-// Partial table vector
-template <typename T, size_t N, typename TI>
-HWY_API Vec512<TI> TableLookupBytes(Vec128<T, N> bytes, Vec512<TI> from) {
-  const Full512<TI> d512;
-  const Half<decltype(d512)> d256;
-  const Half<decltype(d256)> d128;
-  // First expand to full 128, then 256, then 512.
-  const Vec128<T> bytes_full{bytes.raw};
-  const auto bytes_512 =
-      ZeroExtendVector(d512, ZeroExtendVector(d256, bytes_full));
-  return TableLookupBytes(bytes_512, from);
-}
-template <typename T, typename TI>
-HWY_API Vec512<TI> TableLookupBytes(Vec256<T> bytes, Vec512<TI> from) {
-  const auto bytes_512 = ZeroExtendVector(Full512<T>(), bytes);
-  return TableLookupBytes(bytes_512, from);
-}
-
-// Partial both are handled by x86_128/256.
-
-// ================================================== CONVERT
-
-// ------------------------------ Promotions (part w/ narrow lanes -> full)
-
-// Unsigned: zero-extend.
-// Note: these have 3 cycle latency; if inputs are already split across the
-// 128 bit blocks (in their upper/lower halves), then Zip* would be faster.
-HWY_API Vec512<uint16_t> PromoteTo(Full512<uint16_t> /* tag */,
-                                   Vec256<uint8_t> v) {
-  return Vec512<uint16_t>{_mm512_cvtepu8_epi16(v.raw)};
-}
-HWY_API Vec512<uint32_t> PromoteTo(Full512<uint32_t> /* tag */,
-                                   Vec128<uint8_t> v) {
-  return Vec512<uint32_t>{_mm512_cvtepu8_epi32(v.raw)};
-}
-HWY_API Vec512<int16_t> PromoteTo(Full512<int16_t> /* tag */,
-                                  Vec256<uint8_t> v) {
-  return Vec512<int16_t>{_mm512_cvtepu8_epi16(v.raw)};
-}
-HWY_API Vec512<int32_t> PromoteTo(Full512<int32_t> /* tag */,
-                                  Vec128<uint8_t> v) {
-  return Vec512<int32_t>{_mm512_cvtepu8_epi32(v.raw)};
-}
-HWY_API Vec512<uint32_t> PromoteTo(Full512<uint32_t> /* tag */,
-                                   Vec256<uint16_t> v) {
-  return Vec512<uint32_t>{_mm512_cvtepu16_epi32(v.raw)};
-}
-HWY_API Vec512<int32_t> PromoteTo(Full512<int32_t> /* tag */,
-                                  Vec256<uint16_t> v) {
-  return Vec512<int32_t>{_mm512_cvtepu16_epi32(v.raw)};
-}
-HWY_API Vec512<uint64_t> PromoteTo(Full512<uint64_t> /* tag */,
-                                   Vec256<uint32_t> v) {
-  return Vec512<uint64_t>{_mm512_cvtepu32_epi64(v.raw)};
-}
-
-// Signed: replicate sign bit.
-// Note: these have 3 cycle latency; if inputs are already split across the
-// 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by
-// signed shift would be faster.
-HWY_API Vec512<int16_t> PromoteTo(Full512<int16_t> /* tag */,
-                                  Vec256<int8_t> v) {
-  return Vec512<int16_t>{_mm512_cvtepi8_epi16(v.raw)};
-}
-HWY_API Vec512<int32_t> PromoteTo(Full512<int32_t> /* tag */,
-                                  Vec128<int8_t> v) {
-  return Vec512<int32_t>{_mm512_cvtepi8_epi32(v.raw)};
-}
-HWY_API Vec512<int32_t> PromoteTo(Full512<int32_t> /* tag */,
-                                  Vec256<int16_t> v) {
-  return Vec512<int32_t>{_mm512_cvtepi16_epi32(v.raw)};
-}
-HWY_API Vec512<int64_t> PromoteTo(Full512<int64_t> /* tag */,
-                                  Vec256<int32_t> v) {
-  return Vec512<int64_t>{_mm512_cvtepi32_epi64(v.raw)};
-}
-
-// Float
-HWY_API Vec512<float> PromoteTo(Full512<float> /* tag */,
-                                const Vec256<float16_t> v) {
-  return Vec512<float>{_mm512_cvtph_ps(v.raw)};
-}
-
-HWY_API Vec512<float> PromoteTo(Full512<float> df32,
-                                const Vec256<bfloat16_t> v) {
-  const Rebind<uint16_t, decltype(df32)> du16;
-  const RebindToSigned<decltype(df32)> di32;
-  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
-}
-
-HWY_API Vec512<double> PromoteTo(Full512<double> /* tag */, Vec256<float> v) {
-  return Vec512<double>{_mm512_cvtps_pd(v.raw)};
-}
-
-HWY_API Vec512<double> PromoteTo(Full512<double> /* tag */, Vec256<int32_t> v) {
-  return Vec512<double>{_mm512_cvtepi32_pd(v.raw)};
-}
-
-// ------------------------------ Demotions (full -> part w/ narrow lanes)
-
-HWY_API Vec256<uint16_t> DemoteTo(Full256<uint16_t> /* tag */,
-                                  const Vec512<int32_t> v) {
-  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
-
-  // Compress even u64 lanes into 256 bit.
-  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
-  const Vec512<uint16_t> even{_mm512_permutexvar_epi64(idx64.raw, u16.raw)};
-  return LowerHalf(even);
-}
-
-HWY_API Vec256<int16_t> DemoteTo(Full256<int16_t> /* tag */,
-                                 const Vec512<int32_t> v) {
-  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
-
-  // Compress even u64 lanes into 256 bit.
-  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
-  const Vec512<int16_t> even{_mm512_permutexvar_epi64(idx64.raw, i16.raw)};
-  return LowerHalf(even);
-}
-
-HWY_API Vec128<uint8_t, 16> DemoteTo(Full128<uint8_t> /* tag */,
-                                     const Vec512<int32_t> v) {
-  const Vec512<uint16_t> u16{_mm512_packus_epi32(v.raw, v.raw)};
-  // packus treats the input as signed; we want unsigned. Clear the MSB to get
-  // unsigned saturation to u8.
-  const Vec512<int16_t> i16{
-      _mm512_and_si512(u16.raw, _mm512_set1_epi16(0x7FFF))};
-  const Vec512<uint8_t> u8{_mm512_packus_epi16(i16.raw, i16.raw)};
-
-  alignas(16) static constexpr uint32_t kLanes[4] = {0, 4, 8, 12};
-  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
-  const Vec512<uint8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, u8.raw)};
-  return LowerHalf(LowerHalf(fixed));
-}
-
-HWY_API Vec256<uint8_t> DemoteTo(Full256<uint8_t> /* tag */,
-                                 const Vec512<int16_t> v) {
-  const Vec512<uint8_t> u8{_mm512_packus_epi16(v.raw, v.raw)};
-
-  // Compress even u64 lanes into 256 bit.
-  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
-  const Vec512<uint8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
-  return LowerHalf(even);
-}
-
-HWY_API Vec128<int8_t, 16> DemoteTo(Full128<int8_t> /* tag */,
-                                    const Vec512<int32_t> v) {
-  const Vec512<int16_t> i16{_mm512_packs_epi32(v.raw, v.raw)};
-  const Vec512<int8_t> i8{_mm512_packs_epi16(i16.raw, i16.raw)};
-
-  alignas(16) static constexpr uint32_t kLanes[16] = {0, 4, 8, 12, 0, 4, 8, 12,
-                                                      0, 4, 8, 12, 0, 4, 8, 12};
-  const auto idx32 = LoadDup128(Full512<uint32_t>(), kLanes);
-  const Vec512<int8_t> fixed{_mm512_permutexvar_epi32(idx32.raw, i8.raw)};
-  return LowerHalf(LowerHalf(fixed));
-}
-
-HWY_API Vec256<int8_t> DemoteTo(Full256<int8_t> /* tag */,
-                                const Vec512<int16_t> v) {
-  const Vec512<int8_t> u8{_mm512_packs_epi16(v.raw, v.raw)};
-
-  // Compress even u64 lanes into 256 bit.
-  alignas(64) static constexpr uint64_t kLanes[8] = {0, 2, 4, 6, 0, 2, 4, 6};
-  const auto idx64 = Load(Full512<uint64_t>(), kLanes);
-  const Vec512<int8_t> even{_mm512_permutexvar_epi64(idx64.raw, u8.raw)};
-  return LowerHalf(even);
-}
-
-HWY_API Vec256<float16_t> DemoteTo(Full256<float16_t> /* tag */,
-                                   const Vec512<float> v) {
-  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
-  HWY_DIAGNOSTICS(push)
-  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
-  return Vec256<float16_t>{_mm512_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)};
-  HWY_DIAGNOSTICS(pop)
-}
-
-HWY_API Vec256<bfloat16_t> DemoteTo(Full256<bfloat16_t> dbf16,
-                                    const Vec512<float> v) {
-  // TODO(janwas): _mm512_cvtneps_pbh once we have avx512bf16.
-  const Rebind<int32_t, decltype(dbf16)> di32;
-  const Rebind<uint32_t, decltype(dbf16)> du32;  // for logical shift right
-  const Rebind<uint16_t, decltype(dbf16)> du16;
-  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
-  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
-}
-
-HWY_API Vec512<bfloat16_t> ReorderDemote2To(Full512<bfloat16_t> dbf16,
-                                            Vec512<float> a, Vec512<float> b) {
-  // TODO(janwas): _mm512_cvtne2ps_pbh once we have avx512bf16.
-  const RebindToUnsigned<decltype(dbf16)> du16;
-  const Repartition<uint32_t, decltype(dbf16)> du32;
-  const Vec512<uint32_t> b_in_even = ShiftRight<16>(BitCast(du32, b));
-  return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even)));
-}
-
-HWY_API Vec256<float> DemoteTo(Full256<float> /* tag */,
-                               const Vec512<double> v) {
-  return Vec256<float>{_mm512_cvtpd_ps(v.raw)};
-}
-
-HWY_API Vec256<int32_t> DemoteTo(Full256<int32_t> /* tag */,
-                                 const Vec512<double> v) {
-  const auto clamped = detail::ClampF64ToI32Max(Full512<double>(), v);
-  return Vec256<int32_t>{_mm512_cvttpd_epi32(clamped.raw)};
-}
-
-// For already range-limited input [0, 255].
-HWY_API Vec128<uint8_t, 16> U8FromU32(const Vec512<uint32_t> v) {
-  const Full512<uint32_t> d32;
-  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
-  // lowest 4 bytes.
-  alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
-                                                       ~0u};
-  const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
-  // Gather the lowest 4 bytes of 4 128-bit blocks.
-  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
-  return LowerHalf(LowerHalf(bytes));
-}
-
-// ------------------------------ Truncations
-
-HWY_API Vec128<uint8_t, 8> TruncateTo(Simd<uint8_t, 8, 0> d,
-                                      const Vec512<uint64_t> v) {
-#if HWY_TARGET == HWY_AVX3_DL
-  (void)d;
-  const Full512<uint8_t> d8;
-  alignas(16) static constexpr uint8_t k8From64[16] = {
-    0, 8, 16, 24, 32, 40, 48, 56, 0, 8, 16, 24, 32, 40, 48, 56};
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi8(LoadDup128(d8, k8From64).raw, v.raw)};
-  return LowerHalf(LowerHalf(LowerHalf(bytes)));
-#else
-  const Full512<uint32_t> d32;
-  alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
-                                              0, 2, 4, 6, 8, 10, 12, 14};
-  const Vec512<uint32_t> even{
-      _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
-  return TruncateTo(d, LowerHalf(even));
-#endif
-}
-
-HWY_API Vec128<uint16_t, 8> TruncateTo(Simd<uint16_t, 8, 0> /* tag */,
-                                       const Vec512<uint64_t> v) {
-  const Full512<uint16_t> d16;
-  alignas(16) static constexpr uint16_t k16From64[8] = {
-      0, 4, 8, 12, 16, 20, 24, 28};
-  const Vec512<uint16_t> bytes{
-      _mm512_permutexvar_epi16(LoadDup128(d16, k16From64).raw, v.raw)};
-  return LowerHalf(LowerHalf(bytes));
-}
-
-HWY_API Vec256<uint32_t> TruncateTo(Simd<uint32_t, 8, 0> /* tag */,
-                                    const Vec512<uint64_t> v) {
-  const Full512<uint32_t> d32;
-  alignas(64) constexpr uint32_t kEven[16] = {0, 2, 4, 6, 8, 10, 12, 14,
-                                              0, 2, 4, 6, 8, 10, 12, 14};
-  const Vec512<uint32_t> even{
-      _mm512_permutexvar_epi32(Load(d32, kEven).raw, v.raw)};
-  return LowerHalf(even);
-}
-
-HWY_API Vec128<uint8_t, 16> TruncateTo(Simd<uint8_t, 16, 0> /* tag */,
-                                       const Vec512<uint32_t> v) {
-#if HWY_TARGET == HWY_AVX3_DL
-  const Full512<uint8_t> d8;
-  alignas(16) static constexpr uint8_t k8From32[16] = {
-    0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi32(LoadDup128(d8, k8From32).raw, v.raw)};
-#else
-  const Full512<uint32_t> d32;
-  // In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the
-  // lowest 4 bytes.
-  alignas(16) static constexpr uint32_t k8From32[4] = {0x0C080400u, ~0u, ~0u,
-                                                       ~0u};
-  const auto quads = TableLookupBytes(v, LoadDup128(d32, k8From32));
-  // Gather the lowest 4 bytes of 4 128-bit blocks.
-  alignas(16) static constexpr uint32_t kIndex32[4] = {0, 4, 8, 12};
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi32(LoadDup128(d32, kIndex32).raw, quads.raw)};
-#endif
-  return LowerHalf(LowerHalf(bytes));
-}
-
-HWY_API Vec256<uint16_t> TruncateTo(Simd<uint16_t, 16, 0> /* tag */,
-                                    const Vec512<uint32_t> v) {
-  const Full512<uint16_t> d16;
-  alignas(64) static constexpr uint16_t k16From32[32] = {
-      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
-  const Vec512<uint16_t> bytes{
-      _mm512_permutexvar_epi16(Load(d16, k16From32).raw, v.raw)};
-  return LowerHalf(bytes);
-}
-
-HWY_API Vec256<uint8_t> TruncateTo(Simd<uint8_t, 32, 0> /* tag */,
-                                   const Vec512<uint16_t> v) {
-#if HWY_TARGET == HWY_AVX3_DL
-  const Full512<uint8_t> d8;
-  alignas(64) static constexpr uint8_t k8From16[64] = {
-     0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
-     0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
-    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi8(Load(d8, k8From16).raw, v.raw)};
-#else
-  const Full512<uint32_t> d32;
-  alignas(16) static constexpr uint32_t k16From32[4] = {
-      0x06040200u, 0x0E0C0A08u, 0x06040200u, 0x0E0C0A08u};
-  const auto quads = TableLookupBytes(v, LoadDup128(d32, k16From32));
-  alignas(64) static constexpr uint32_t kIndex32[16] = {
-      0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
-  const Vec512<uint8_t> bytes{
-      _mm512_permutexvar_epi32(Load(d32, kIndex32).raw, quads.raw)};
-#endif
-  return LowerHalf(bytes);
-}
-
-// ------------------------------ Convert integer <=> floating point
-
-HWY_API Vec512<float> ConvertTo(Full512<float> /* tag */,
-                                const Vec512<int32_t> v) {
-  return Vec512<float>{_mm512_cvtepi32_ps(v.raw)};
-}
-
-HWY_API Vec512<double> ConvertTo(Full512<double> /* tag */,
-                                 const Vec512<int64_t> v) {
-  return Vec512<double>{_mm512_cvtepi64_pd(v.raw)};
-}
-
-HWY_API Vec512<float> ConvertTo(Full512<float> /* tag*/,
-                                const Vec512<uint32_t> v) {
-  return Vec512<float>{_mm512_cvtepu32_ps(v.raw)};
-}
-
-HWY_API Vec512<double> ConvertTo(Full512<double> /* tag*/,
-                                const Vec512<uint64_t> v) {
-  return Vec512<double>{_mm512_cvtepu64_pd(v.raw)};
-}
-
-// Truncates (rounds toward zero).
-HWY_API Vec512<int32_t> ConvertTo(Full512<int32_t> d, const Vec512<float> v) {
-  return detail::FixConversionOverflow(d, v, _mm512_cvttps_epi32(v.raw));
-}
-HWY_API Vec512<int64_t> ConvertTo(Full512<int64_t> di, const Vec512<double> v) {
-  return detail::FixConversionOverflow(di, v, _mm512_cvttpd_epi64(v.raw));
-}
-
-HWY_API Vec512<int32_t> NearestInt(const Vec512<float> v) {
-  const Full512<int32_t> di;
-  return detail::FixConversionOverflow(di, v, _mm512_cvtps_epi32(v.raw));
-}
-
-// ================================================== CRYPTO
-
-#if !defined(HWY_DISABLE_PCLMUL_AES)
-
-// Per-target flag to prevent generic_ops-inl.h from defining AESRound.
-#ifdef HWY_NATIVE_AES
-#undef HWY_NATIVE_AES
-#else
-#define HWY_NATIVE_AES
-#endif
-
-HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state,
-                                 Vec512<uint8_t> round_key) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)};
-#else
-  const Full512<uint8_t> d;
-  const Half<decltype(d)> d2;
-  return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
-                 AESRound(LowerHalf(state), LowerHalf(round_key)));
-#endif
-}
-
-HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state,
-                                     Vec512<uint8_t> round_key) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)};
-#else
-  const Full512<uint8_t> d;
-  const Half<decltype(d)> d2;
-  return Combine(d,
-                 AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)),
-                 AESLastRound(LowerHalf(state), LowerHalf(round_key)));
-#endif
-}
-
-HWY_API Vec512<uint64_t> CLMulLower(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x00)};
-#else
-  alignas(64) uint64_t a[8];
-  alignas(64) uint64_t b[8];
-  const Full512<uint64_t> d;
-  const Full128<uint64_t> d128;
-  Store(va, d, a);
-  Store(vb, d, b);
-  for (size_t i = 0; i < 8; i += 2) {
-    const auto mul = CLMulLower(Load(d128, a + i), Load(d128, b + i));
-    Store(mul, d128, a + i);
-  }
-  return Load(d, a);
-#endif
-}
-
-HWY_API Vec512<uint64_t> CLMulUpper(Vec512<uint64_t> va, Vec512<uint64_t> vb) {
-#if HWY_TARGET == HWY_AVX3_DL
-  return Vec512<uint64_t>{_mm512_clmulepi64_epi128(va.raw, vb.raw, 0x11)};
-#else
-  alignas(64) uint64_t a[8];
-  alignas(64) uint64_t b[8];
-  const Full512<uint64_t> d;
-  const Full128<uint64_t> d128;
-  Store(va, d, a);
-  Store(vb, d, b);
-  for (size_t i = 0; i < 8; i += 2) {
-    const auto mul = CLMulUpper(Load(d128, a + i), Load(d128, b + i));
-    Store(mul, d128, a + i);
-  }
-  return Load(d, a);
-#endif
-}
-
-#endif  // HWY_DISABLE_PCLMUL_AES
-
-// ================================================== MISC
-
-// Returns a vector with lane i=[0, N) set to "first" + i.
-template <typename T, typename T2>
-Vec512<T> Iota(const Full512<T> d, const T2 first) {
-  HWY_ALIGN T lanes[64 / sizeof(T)];
-  for (size_t i = 0; i < 64 / sizeof(T); ++i) {
-    lanes[i] = static_cast<T>(first + static_cast<T2>(i));
-  }
-  return Load(d, lanes);
-}
-
-// ------------------------------ Mask testing
-
-// Beware: the suffix indicates the number of mask bits, not lane size!
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestz_mask64_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestz_mask32_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestz_mask16_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestz_mask8_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0;
-#endif
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API bool AllFalse(const Full512<T> /* tag */, const Mask512<T> mask) {
-  return detail::AllFalse(hwy::SizeTag<sizeof(T)>(), mask);
-}
-
-namespace detail {
-
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestc_mask64_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0xFFFFFFFFFFFFFFFFull;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestc_mask32_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0xFFFFFFFFull;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestc_mask16_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0xFFFFull;
-#endif
-}
-template <typename T>
-HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask512<T> mask) {
-#if HWY_COMPILER_HAS_MASK_INTRINSICS
-  return _kortestc_mask8_u8(mask.raw, mask.raw);
-#else
-  return mask.raw == 0xFFull;
-#endif
-}
-
-}  // namespace detail
-
-template <typename T>
-HWY_API bool AllTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
-  return detail::AllTrue(hwy::SizeTag<sizeof(T)>(), mask);
-}
-
-// `p` points to at least 8 readable bytes, not all of which need be valid.
-template <typename T>
-HWY_API Mask512<T> LoadMaskBits(const Full512<T> /* tag */,
-                                const uint8_t* HWY_RESTRICT bits) {
-  Mask512<T> mask;
-  CopyBytes<8 / sizeof(T)>(bits, &mask.raw);
-  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
-  return mask;
-}
-
-// `p` points to at least 8 writable bytes.
-template <typename T>
-HWY_API size_t StoreMaskBits(const Full512<T> /* tag */, const Mask512<T> mask,
-                             uint8_t* bits) {
-  const size_t kNumBytes = 8 / sizeof(T);
-  CopyBytes<kNumBytes>(&mask.raw, bits);
-  // N >= 8 (= 512 / 64), so no need to mask invalid bits.
-  return kNumBytes;
-}
-
-template <typename T>
-HWY_API size_t CountTrue(const Full512<T> /* tag */, const Mask512<T> mask) {
-  return PopCount(static_cast<uint64_t>(mask.raw));
-}
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 1)>
-HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
-                               const Mask512<T> mask) {
-  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask.raw)) : -1;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 1)>
-HWY_API intptr_t FindFirstTrue(const Full512<T> /* tag */,
-                               const Mask512<T> mask) {
-  return mask.raw ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask.raw)) : -1;
-}
-
-// ------------------------------ Compress
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
-  return Vec512<T>{_mm512_maskz_compress_epi32(mask.raw, v.raw)};
-}
-
-HWY_API Vec512<float> Compress(Vec512<float> v, Mask512<float> mask) {
-  return Vec512<float>{_mm512_maskz_compress_ps(mask.raw, v.raw)};
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> Compress(Vec512<T> v, Mask512<T> mask) {
-  // See CompressIsPartition. u64 is faster than u32.
-  alignas(16) constexpr uint64_t packed_array[256] = {
-      // PrintCompress32x8Tables
-      0x76543210, 0x76543210, 0x76543201, 0x76543210, 0x76543102, 0x76543120,
-      0x76543021, 0x76543210, 0x76542103, 0x76542130, 0x76542031, 0x76542310,
-      0x76541032, 0x76541320, 0x76540321, 0x76543210, 0x76532104, 0x76532140,
-      0x76532041, 0x76532410, 0x76531042, 0x76531420, 0x76530421, 0x76534210,
-      0x76521043, 0x76521430, 0x76520431, 0x76524310, 0x76510432, 0x76514320,
-      0x76504321, 0x76543210, 0x76432105, 0x76432150, 0x76432051, 0x76432510,
-      0x76431052, 0x76431520, 0x76430521, 0x76435210, 0x76421053, 0x76421530,
-      0x76420531, 0x76425310, 0x76410532, 0x76415320, 0x76405321, 0x76453210,
-      0x76321054, 0x76321540, 0x76320541, 0x76325410, 0x76310542, 0x76315420,
-      0x76305421, 0x76354210, 0x76210543, 0x76215430, 0x76205431, 0x76254310,
-      0x76105432, 0x76154320, 0x76054321, 0x76543210, 0x75432106, 0x75432160,
-      0x75432061, 0x75432610, 0x75431062, 0x75431620, 0x75430621, 0x75436210,
-      0x75421063, 0x75421630, 0x75420631, 0x75426310, 0x75410632, 0x75416320,
-      0x75406321, 0x75463210, 0x75321064, 0x75321640, 0x75320641, 0x75326410,
-      0x75310642, 0x75316420, 0x75306421, 0x75364210, 0x75210643, 0x75216430,
-      0x75206431, 0x75264310, 0x75106432, 0x75164320, 0x75064321, 0x75643210,
-      0x74321065, 0x74321650, 0x74320651, 0x74326510, 0x74310652, 0x74316520,
-      0x74306521, 0x74365210, 0x74210653, 0x74216530, 0x74206531, 0x74265310,
-      0x74106532, 0x74165320, 0x74065321, 0x74653210, 0x73210654, 0x73216540,
-      0x73206541, 0x73265410, 0x73106542, 0x73165420, 0x73065421, 0x73654210,
-      0x72106543, 0x72165430, 0x72065431, 0x72654310, 0x71065432, 0x71654320,
-      0x70654321, 0x76543210, 0x65432107, 0x65432170, 0x65432071, 0x65432710,
-      0x65431072, 0x65431720, 0x65430721, 0x65437210, 0x65421073, 0x65421730,
-      0x65420731, 0x65427310, 0x65410732, 0x65417320, 0x65407321, 0x65473210,
-      0x65321074, 0x65321740, 0x65320741, 0x65327410, 0x65310742, 0x65317420,
-      0x65307421, 0x65374210, 0x65210743, 0x65217430, 0x65207431, 0x65274310,
-      0x65107432, 0x65174320, 0x65074321, 0x65743210, 0x64321075, 0x64321750,
-      0x64320751, 0x64327510, 0x64310752, 0x64317520, 0x64307521, 0x64375210,
-      0x64210753, 0x64217530, 0x64207531, 0x64275310, 0x64107532, 0x64175320,
-      0x64075321, 0x64753210, 0x63210754, 0x63217540, 0x63207541, 0x63275410,
-      0x63107542, 0x63175420, 0x63075421, 0x63754210, 0x62107543, 0x62175430,
-      0x62075431, 0x62754310, 0x61075432, 0x61754320, 0x60754321, 0x67543210,
-      0x54321076, 0x54321760, 0x54320761, 0x54327610, 0x54310762, 0x54317620,
-      0x54307621, 0x54376210, 0x54210763, 0x54217630, 0x54207631, 0x54276310,
-      0x54107632, 0x54176320, 0x54076321, 0x54763210, 0x53210764, 0x53217640,
-      0x53207641, 0x53276410, 0x53107642, 0x53176420, 0x53076421, 0x53764210,
-      0x52107643, 0x52176430, 0x52076431, 0x52764310, 0x51076432, 0x51764320,
-      0x50764321, 0x57643210, 0x43210765, 0x43217650, 0x43207651, 0x43276510,
-      0x43107652, 0x43176520, 0x43076521, 0x43765210, 0x42107653, 0x42176530,
-      0x42076531, 0x42765310, 0x41076532, 0x41765320, 0x40765321, 0x47653210,
-      0x32107654, 0x32176540, 0x32076541, 0x32765410, 0x31076542, 0x31765420,
-      0x30765421, 0x37654210, 0x21076543, 0x21765430, 0x20765431, 0x27654310,
-      0x10765432, 0x17654320, 0x07654321, 0x76543210};
-
-  // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
-  // _mm512_permutexvar_epi64 will ignore the upper bits.
-  const Full512<T> d;
-  const RebindToUnsigned<decltype(d)> du64;
-  const auto packed = Set(du64, packed_array[mask.raw]);
-  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
-  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
-  return TableLookupLanes(v, indices);
-}
-
-// 16-bit may use the 32-bit Compress and must be defined after it.
-//
-// Ignore IDE redefinition error - this is not actually defined in x86_256 if
-// we are including x86_512-inl.h.
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec256<T> Compress(Vec256<T> v, Mask256<T> mask) {
-  const Full256<T> d;
-  const Rebind<uint16_t, decltype(d)> du;
-  const auto vu = BitCast(du, v);  // (required for float16_t inputs)
-
-#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
-  const Vec256<uint16_t> cu{_mm256_maskz_compress_epi16(mask.raw, vu.raw)};
-#else
-  // Promote to i32 (512-bit vector!) so we can use the native Compress.
-  const auto vw = PromoteTo(Rebind<int32_t, decltype(d)>(), vu);
-  const Mask512<int32_t> mask32{static_cast<__mmask16>(mask.raw)};
-  const auto cu = DemoteTo(du, Compress(vw, mask32));
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-  return BitCast(d, cu);
-}
-
-// Expands to 32-bit, compresses, concatenate demoted halves.
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) {
-  const Full512<T> d;
-  const Rebind<uint16_t, decltype(d)> du;
-  const auto vu = BitCast(du, v);  // (required for float16_t inputs)
-
-#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
-  const Vec512<uint16_t> cu{_mm512_maskz_compress_epi16(mask.raw, vu.raw)};
-#else
-  const Repartition<int32_t, decltype(d)> dw;
-  const Half<decltype(du)> duh;
-  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
-  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
-
-  const uint32_t mask_bits{mask.raw};
-  const Mask512<int32_t> mask0{static_cast<__mmask16>(mask_bits & 0xFFFF)};
-  const Mask512<int32_t> mask1{static_cast<__mmask16>(mask_bits >> 16)};
-  const auto compressed0 = Compress(promoted0, mask0);
-  const auto compressed1 = Compress(promoted1, mask1);
-
-  const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0));
-  const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1));
-
-  // Concatenate into single vector by shifting upper with writemask.
-  const size_t num0 = CountTrue(dw, mask0);
-  const __mmask32 m_upper = ~((1u << num0) - 1);
-  alignas(64) uint16_t iota[64] = {
-      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
-      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
-  const auto idx = LoadU(du, iota + 32 - num0);
-  const Vec512<uint16_t> cu{_mm512_mask_permutexvar_epi16(
-      demoted0.raw, m_upper, idx.raw, demoted1.raw)};
-#endif  // HWY_TARGET == HWY_AVX3_DL
-
-  return BitCast(d, cu);
-}
-
-// ------------------------------ CompressNot
-
-template <typename T, HWY_IF_NOT_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> CompressNot(Vec512<T> v, const Mask512<T> mask) {
-  return Compress(v, Not(mask));
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API Vec512<T> CompressNot(Vec512<T> v, Mask512<T> mask) {
-  // See CompressIsPartition. u64 is faster than u32.
-  alignas(16) constexpr uint64_t packed_array[256] = {
-      // PrintCompressNot32x8Tables
-      0x76543210, 0x07654321, 0x17654320, 0x10765432, 0x27654310, 0x20765431,
-      0x21765430, 0x21076543, 0x37654210, 0x30765421, 0x31765420, 0x31076542,
-      0x32765410, 0x32076541, 0x32176540, 0x32107654, 0x47653210, 0x40765321,
-      0x41765320, 0x41076532, 0x42765310, 0x42076531, 0x42176530, 0x42107653,
-      0x43765210, 0x43076521, 0x43176520, 0x43107652, 0x43276510, 0x43207651,
-      0x43217650, 0x43210765, 0x57643210, 0x50764321, 0x51764320, 0x51076432,
-      0x52764310, 0x52076431, 0x52176430, 0x52107643, 0x53764210, 0x53076421,
-      0x53176420, 0x53107642, 0x53276410, 0x53207641, 0x53217640, 0x53210764,
-      0x54763210, 0x54076321, 0x54176320, 0x54107632, 0x54276310, 0x54207631,
-      0x54217630, 0x54210763, 0x54376210, 0x54307621, 0x54317620, 0x54310762,
-      0x54327610, 0x54320761, 0x54321760, 0x54321076, 0x67543210, 0x60754321,
-      0x61754320, 0x61075432, 0x62754310, 0x62075431, 0x62175430, 0x62107543,
-      0x63754210, 0x63075421, 0x63175420, 0x63107542, 0x63275410, 0x63207541,
-      0x63217540, 0x63210754, 0x64753210, 0x64075321, 0x64175320, 0x64107532,
-      0x64275310, 0x64207531, 0x64217530, 0x64210753, 0x64375210, 0x64307521,
-      0x64317520, 0x64310752, 0x64327510, 0x64320751, 0x64321750, 0x64321075,
-      0x65743210, 0x65074321, 0x65174320, 0x65107432, 0x65274310, 0x65207431,
-      0x65217430, 0x65210743, 0x65374210, 0x65307421, 0x65317420, 0x65310742,
-      0x65327410, 0x65320741, 0x65321740, 0x65321074, 0x65473210, 0x65407321,
-      0x65417320, 0x65410732, 0x65427310, 0x65420731, 0x65421730, 0x65421073,
-      0x65437210, 0x65430721, 0x65431720, 0x65431072, 0x65432710, 0x65432071,
-      0x65432170, 0x65432107, 0x76543210, 0x70654321, 0x71654320, 0x71065432,
-      0x72654310, 0x72065431, 0x72165430, 0x72106543, 0x73654210, 0x73065421,
-      0x73165420, 0x73106542, 0x73265410, 0x73206541, 0x73216540, 0x73210654,
-      0x74653210, 0x74065321, 0x74165320, 0x74106532, 0x74265310, 0x74206531,
-      0x74216530, 0x74210653, 0x74365210, 0x74306521, 0x74316520, 0x74310652,
-      0x74326510, 0x74320651, 0x74321650, 0x74321065, 0x75643210, 0x75064321,
-      0x75164320, 0x75106432, 0x75264310, 0x75206431, 0x75216430, 0x75210643,
-      0x75364210, 0x75306421, 0x75316420, 0x75310642, 0x75326410, 0x75320641,
-      0x75321640, 0x75321064, 0x75463210, 0x75406321, 0x75416320, 0x75410632,
-      0x75426310, 0x75420631, 0x75421630, 0x75421063, 0x75436210, 0x75430621,
-      0x75431620, 0x75431062, 0x75432610, 0x75432061, 0x75432160, 0x75432106,
-      0x76543210, 0x76054321, 0x76154320, 0x76105432, 0x76254310, 0x76205431,
-      0x76215430, 0x76210543, 0x76354210, 0x76305421, 0x76315420, 0x76310542,
-      0x76325410, 0x76320541, 0x76321540, 0x76321054, 0x76453210, 0x76405321,
-      0x76415320, 0x76410532, 0x76425310, 0x76420531, 0x76421530, 0x76421053,
-      0x76435210, 0x76430521, 0x76431520, 0x76431052, 0x76432510, 0x76432051,
-      0x76432150, 0x76432105, 0x76543210, 0x76504321, 0x76514320, 0x76510432,
-      0x76524310, 0x76520431, 0x76521430, 0x76521043, 0x76534210, 0x76530421,
-      0x76531420, 0x76531042, 0x76532410, 0x76532041, 0x76532140, 0x76532104,
-      0x76543210, 0x76540321, 0x76541320, 0x76541032, 0x76542310, 0x76542031,
-      0x76542130, 0x76542103, 0x76543210, 0x76543021, 0x76543120, 0x76543102,
-      0x76543210, 0x76543201, 0x76543210, 0x76543210};
-
-  // For lane i, shift the i-th 4-bit index down to bits [0, 3) -
-  // _mm512_permutexvar_epi64 will ignore the upper bits.
-  const Full512<T> d;
-  const RebindToUnsigned<decltype(d)> du64;
-  const auto packed = Set(du64, packed_array[mask.raw]);
-  alignas(64) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
-  const auto indices = Indices512<T>{(packed >> Load(du64, shifts)).raw};
-  return TableLookupLanes(v, indices);
-}
-
-HWY_API Vec512<uint64_t> CompressBlocksNot(Vec512<uint64_t> v,
-                                           Mask512<uint64_t> mask) {
-  return CompressNot(v, mask);
-}
-
-// ------------------------------ CompressBits
-template <typename T>
-HWY_API Vec512<T> CompressBits(Vec512<T> v, const uint8_t* HWY_RESTRICT bits) {
-  return Compress(v, LoadMaskBits(Full512<T>(), bits));
-}
-
-// ------------------------------ CompressStore
-
-template <typename T, HWY_IF_LANE_SIZE(T, 2)>
-HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> d,
-                             T* HWY_RESTRICT unaligned) {
-  const Rebind<uint16_t, decltype(d)> du;
-  const auto vu = BitCast(du, v);  // (required for float16_t inputs)
-
-  const uint64_t mask_bits{mask.raw};
-
-#if HWY_TARGET == HWY_AVX3_DL  // VBMI2
-  _mm512_mask_compressstoreu_epi16(unaligned, mask.raw, vu.raw);
-#else
-  const Repartition<int32_t, decltype(d)> dw;
-  const Half<decltype(du)> duh;
-  const auto promoted0 = PromoteTo(dw, LowerHalf(duh, vu));
-  const auto promoted1 = PromoteTo(dw, UpperHalf(duh, vu));
-
-  const uint64_t maskL = mask_bits & 0xFFFF;
-  const uint64_t maskH = mask_bits >> 16;
-  const Mask512<int32_t> mask0{static_cast<__mmask16>(maskL)};
-  const Mask512<int32_t> mask1{static_cast<__mmask16>(maskH)};
-  const auto compressed0 = Compress(promoted0, mask0);
-  const auto compressed1 = Compress(promoted1, mask1);
-
-  const Half<decltype(d)> dh;
-  const auto demoted0 = BitCast(dh, DemoteTo(duh, compressed0));
-  const auto demoted1 = BitCast(dh, DemoteTo(duh, compressed1));
-
-  // Store 256-bit halves
-  StoreU(demoted0, dh, unaligned);
-  StoreU(demoted1, dh, unaligned + PopCount(maskL));
-#endif
-
-  return PopCount(mask_bits);
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 4)>
-HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
-                             T* HWY_RESTRICT unaligned) {
-  _mm512_mask_compressstoreu_epi32(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw});
-// Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-template <typename T, HWY_IF_LANE_SIZE(T, 8)>
-HWY_API size_t CompressStore(Vec512<T> v, Mask512<T> mask, Full512<T> /* tag */,
-                             T* HWY_RESTRICT unaligned) {
-  _mm512_mask_compressstoreu_epi64(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw});
-// Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-  return count;
-}
-
-HWY_API size_t CompressStore(Vec512<float> v, Mask512<float> mask,
-                             Full512<float> /* tag */,
-                             float* HWY_RESTRICT unaligned) {
-  _mm512_mask_compressstoreu_ps(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw});
-// Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(float));
-#endif
-  return count;
-}
-
-HWY_API size_t CompressStore(Vec512<double> v, Mask512<double> mask,
-                             Full512<double> /* tag */,
-                             double* HWY_RESTRICT unaligned) {
-  _mm512_mask_compressstoreu_pd(unaligned, mask.raw, v.raw);
-  const size_t count = PopCount(uint64_t{mask.raw});
-// Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-  __msan_unpoison(unaligned, count * sizeof(double));
-#endif
-  return count;
-}
-
-// ------------------------------ CompressBlendedStore
-template <typename T>
-HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  // AVX-512 already does the blending at no extra cost (latency 11,
-  // rthroughput 2 - same as compress plus store).
-  if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) {
-    return CompressStore(v, m, d, unaligned);
-  } else {
-    const size_t count = CountTrue(d, m);
-    BlendedStore(Compress(v, m), FirstN(d, count), d, unaligned);
-// Workaround for MSAN not marking output as initialized (b/233326619)
-#if HWY_IS_MSAN
-    __msan_unpoison(unaligned, count * sizeof(T));
-#endif
-    return count;
-  }
-}
-
-// ------------------------------ CompressBitsStore
-template <typename T>
-HWY_API size_t CompressBitsStore(Vec512<T> v, const uint8_t* HWY_RESTRICT bits,
-                                 Full512<T> d, T* HWY_RESTRICT unaligned) {
-  return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
-}
-
-// ------------------------------ LoadInterleaved4
-
-// Actually implemented in generic_ops, we just overload LoadTransposedBlocks4.
-namespace detail {
-
-// Type-safe wrapper.
-template <_MM_PERM_ENUM kPerm, typename T>
-Vec512<T> Shuffle128(const Vec512<T> lo, const Vec512<T> hi) {
-  return Vec512<T>{_mm512_shuffle_i64x2(lo.raw, hi.raw, kPerm)};
-}
-template <_MM_PERM_ENUM kPerm>
-Vec512<float> Shuffle128(const Vec512<float> lo, const Vec512<float> hi) {
-  return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, kPerm)};
-}
-template <_MM_PERM_ENUM kPerm>
-Vec512<double> Shuffle128(const Vec512<double> lo, const Vec512<double> hi) {
-  return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, kPerm)};
-}
-
-// Input (128-bit blocks):
-// 3 2 1 0 (<- first block in unaligned)
-// 7 6 5 4
-// b a 9 8
-// Output:
-// 9 6 3 0 (LSB of A)
-// a 7 4 1
-// b 8 5 2
-template <typename T>
-HWY_API void LoadTransposedBlocks3(Full512<T> d,
-                                   const T* HWY_RESTRICT unaligned,
-                                   Vec512<T>& A, Vec512<T>& B, Vec512<T>& C) {
-  constexpr size_t N = 64 / sizeof(T);
-  const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
-  const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
-  const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
-
-  const Vec512<T> v5421 = detail::Shuffle128<_MM_PERM_BACB>(v3210, v7654);
-  const Vec512<T> va976 = detail::Shuffle128<_MM_PERM_CBDC>(v7654, vba98);
-
-  A = detail::Shuffle128<_MM_PERM_CADA>(v3210, va976);
-  B = detail::Shuffle128<_MM_PERM_DBCA>(v5421, va976);
-  C = detail::Shuffle128<_MM_PERM_DADB>(v5421, vba98);
-}
-
-// Input (128-bit blocks):
-// 3 2 1 0 (<- first block in unaligned)
-// 7 6 5 4
-// b a 9 8
-// f e d c
-// Output:
-// c 8 4 0 (LSB of A)
-// d 9 5 1
-// e a 6 2
-// f b 7 3
-template <typename T>
-HWY_API void LoadTransposedBlocks4(Full512<T> d,
-                                   const T* HWY_RESTRICT unaligned,
-                                   Vec512<T>& A, Vec512<T>& B, Vec512<T>& C,
-                                   Vec512<T>& D) {
-  constexpr size_t N = 64 / sizeof(T);
-  const Vec512<T> v3210 = LoadU(d, unaligned + 0 * N);
-  const Vec512<T> v7654 = LoadU(d, unaligned + 1 * N);
-  const Vec512<T> vba98 = LoadU(d, unaligned + 2 * N);
-  const Vec512<T> vfedc = LoadU(d, unaligned + 3 * N);
-
-  const Vec512<T> v5410 = detail::Shuffle128<_MM_PERM_BABA>(v3210, v7654);
-  const Vec512<T> vdc98 = detail::Shuffle128<_MM_PERM_BABA>(vba98, vfedc);
-  const Vec512<T> v7632 = detail::Shuffle128<_MM_PERM_DCDC>(v3210, v7654);
-  const Vec512<T> vfeba = detail::Shuffle128<_MM_PERM_DCDC>(vba98, vfedc);
-  A = detail::Shuffle128<_MM_PERM_CACA>(v5410, vdc98);
-  B = detail::Shuffle128<_MM_PERM_DBDB>(v5410, vdc98);
-  C = detail::Shuffle128<_MM_PERM_CACA>(v7632, vfeba);
-  D = detail::Shuffle128<_MM_PERM_DBDB>(v7632, vfeba);
-}
-
-}  // namespace detail
-
-// ------------------------------ StoreInterleaved2
-
-// Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4.
-
-namespace detail {
-
-// Input (128-bit blocks):
-// 6 4 2 0 (LSB of i)
-// 7 5 3 1
-// Output:
-// 3 2 1 0
-// 7 6 5 4
-template <typename T>
-HWY_API void StoreTransposedBlocks2(const Vec512<T> i, const Vec512<T> j,
-                                    const Full512<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 64 / sizeof(T);
-  const auto j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
-  const auto j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
-  const auto j1_i1_j0_i0 =
-      detail::Shuffle128<_MM_PERM_DBCA>(j1_j0_i1_i0, j1_j0_i1_i0);
-  const auto j3_i3_j2_i2 =
-      detail::Shuffle128<_MM_PERM_DBCA>(j3_j2_i3_i2, j3_j2_i3_i2);
-  StoreU(j1_i1_j0_i0, d, unaligned + 0 * N);
-  StoreU(j3_i3_j2_i2, d, unaligned + 1 * N);
-}
-
-// Input (128-bit blocks):
-// 9 6 3 0 (LSB of i)
-// a 7 4 1
-// b 8 5 2
-// Output:
-// 3 2 1 0
-// 7 6 5 4
-// b a 9 8
-template <typename T>
-HWY_API void StoreTransposedBlocks3(const Vec512<T> i, const Vec512<T> j,
-                                    const Vec512<T> k, Full512<T> d,
-                                    T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 64 / sizeof(T);
-  const Vec512<T> j2_j0_i2_i0 = detail::Shuffle128<_MM_PERM_CACA>(i, j);
-  const Vec512<T> i3_i1_k2_k0 = detail::Shuffle128<_MM_PERM_DBCA>(k, i);
-  const Vec512<T> j3_j1_k3_k1 = detail::Shuffle128<_MM_PERM_DBDB>(k, j);
-
-  const Vec512<T> out0 =  // i1 k0 j0 i0
-      detail::Shuffle128<_MM_PERM_CACA>(j2_j0_i2_i0, i3_i1_k2_k0);
-  const Vec512<T> out1 =  // j2 i2 k1 j1
-      detail::Shuffle128<_MM_PERM_DBAC>(j3_j1_k3_k1, j2_j0_i2_i0);
-  const Vec512<T> out2 =  // k3 j3 i3 k2
-      detail::Shuffle128<_MM_PERM_BDDB>(i3_i1_k2_k0, j3_j1_k3_k1);
-
-  StoreU(out0, d, unaligned + 0 * N);
-  StoreU(out1, d, unaligned + 1 * N);
-  StoreU(out2, d, unaligned + 2 * N);
-}
-
-// Input (128-bit blocks):
-// c 8 4 0 (LSB of i)
-// d 9 5 1
-// e a 6 2
-// f b 7 3
-// Output:
-// 3 2 1 0
-// 7 6 5 4
-// b a 9 8
-// f e d c
-template <typename T>
-HWY_API void StoreTransposedBlocks4(const Vec512<T> i, const Vec512<T> j,
-                                    const Vec512<T> k, const Vec512<T> l,
-                                    Full512<T> d, T* HWY_RESTRICT unaligned) {
-  constexpr size_t N = 64 / sizeof(T);
-  const Vec512<T> j1_j0_i1_i0 = detail::Shuffle128<_MM_PERM_BABA>(i, j);
-  const Vec512<T> l1_l0_k1_k0 = detail::Shuffle128<_MM_PERM_BABA>(k, l);
-  const Vec512<T> j3_j2_i3_i2 = detail::Shuffle128<_MM_PERM_DCDC>(i, j);
-  const Vec512<T> l3_l2_k3_k2 = detail::Shuffle128<_MM_PERM_DCDC>(k, l);
-  const Vec512<T> out0 =
-      detail::Shuffle128<_MM_PERM_CACA>(j1_j0_i1_i0, l1_l0_k1_k0);
-  const Vec512<T> out1 =
-      detail::Shuffle128<_MM_PERM_DBDB>(j1_j0_i1_i0, l1_l0_k1_k0);
-  const Vec512<T> out2 =
-      detail::Shuffle128<_MM_PERM_CACA>(j3_j2_i3_i2, l3_l2_k3_k2);
-  const Vec512<T> out3 =
-      detail::Shuffle128<_MM_PERM_DBDB>(j3_j2_i3_i2, l3_l2_k3_k2);
-  StoreU(out0, d, unaligned + 0 * N);
-  StoreU(out1, d, unaligned + 1 * N);
-  StoreU(out2, d, unaligned + 2 * N);
-  StoreU(out3, d, unaligned + 3 * N);
-}
-
-}  // namespace detail
-
-// ------------------------------ MulEven/Odd (Shuffle2301, InterleaveLower)
-
-HWY_INLINE Vec512<uint64_t> MulEven(const Vec512<uint64_t> a,
-                                    const Vec512<uint64_t> b) {
-  const DFromV<decltype(a)> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need the lower 32 bits
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need
-  // the even (lower 64 bits of every 128-bit block) results. See
-  // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveLower(mulL, mulH);
-}
-
-HWY_INLINE Vec512<uint64_t> MulOdd(const Vec512<uint64_t> a,
-                                   const Vec512<uint64_t> b) {
-  const DFromV<decltype(a)> du64;
-  const RepartitionToNarrow<decltype(du64)> du32;
-  const auto maskL = Set(du64, 0xFFFFFFFFULL);
-  const auto a32 = BitCast(du32, a);
-  const auto b32 = BitCast(du32, b);
-  // Inputs for MulEven: we only need bits [95:64] (= upper half of input)
-  const auto aH = Shuffle2301(a32);
-  const auto bH = Shuffle2301(b32);
-
-  // Same as above, but we're using the odd results (upper 64 bits per block).
-  const auto aLbL = MulEven(a32, b32);
-  const auto w3 = aLbL & maskL;
-
-  const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL);
-  const auto w2 = t2 & maskL;
-  const auto w1 = ShiftRight<32>(t2);
-
-  const auto t = MulEven(a32, bH) + w2;
-  const auto k = ShiftRight<32>(t);
-
-  const auto mulH = MulEven(aH, bH) + w1 + k;
-  const auto mulL = ShiftLeft<32>(t) + w3;
-  return InterleaveUpper(du64, mulL, mulH);
-}
-
-// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
-
-HWY_API Vec512<float> ReorderWidenMulAccumulate(Full512<float> df32,
-                                                Vec512<bfloat16_t> a,
-                                                Vec512<bfloat16_t> b,
-                                                const Vec512<float> sum0,
-                                                Vec512<float>& sum1) {
-  // TODO(janwas): _mm512_dpbf16_ps when available
-  const Repartition<uint16_t, decltype(df32)> du16;
-  const RebindToUnsigned<decltype(df32)> du32;
-  const Vec512<uint16_t> zero = Zero(du16);
-  // Lane order within sum0/1 is undefined, hence we can avoid the
-  // longer-latency lane-crossing PromoteTo.
-  const Vec512<uint32_t> a0 = ZipLower(du32, zero, BitCast(du16, a));
-  const Vec512<uint32_t> a1 = ZipUpper(du32, zero, BitCast(du16, a));
-  const Vec512<uint32_t> b0 = ZipLower(du32, zero, BitCast(du16, b));
-  const Vec512<uint32_t> b1 = ZipUpper(du32, zero, BitCast(du16, b));
-  sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1);
-  return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0);
-}
-
-// ------------------------------ Reductions
-
-// Returns the sum in each lane.
-HWY_API Vec512<int32_t> SumOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
-  return Set(d, _mm512_reduce_add_epi32(v.raw));
-}
-HWY_API Vec512<int64_t> SumOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
-  return Set(d, _mm512_reduce_add_epi64(v.raw));
-}
-HWY_API Vec512<uint32_t> SumOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
-  return Set(d, static_cast<uint32_t>(_mm512_reduce_add_epi32(v.raw)));
-}
-HWY_API Vec512<uint64_t> SumOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
-  return Set(d, static_cast<uint64_t>(_mm512_reduce_add_epi64(v.raw)));
-}
-HWY_API Vec512<float> SumOfLanes(Full512<float> d, Vec512<float> v) {
-  return Set(d, _mm512_reduce_add_ps(v.raw));
-}
-HWY_API Vec512<double> SumOfLanes(Full512<double> d, Vec512<double> v) {
-  return Set(d, _mm512_reduce_add_pd(v.raw));
-}
-
-// Returns the minimum in each lane.
-HWY_API Vec512<int32_t> MinOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
-  return Set(d, _mm512_reduce_min_epi32(v.raw));
-}
-HWY_API Vec512<int64_t> MinOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
-  return Set(d, _mm512_reduce_min_epi64(v.raw));
-}
-HWY_API Vec512<uint32_t> MinOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
-  return Set(d, _mm512_reduce_min_epu32(v.raw));
-}
-HWY_API Vec512<uint64_t> MinOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
-  return Set(d, _mm512_reduce_min_epu64(v.raw));
-}
-HWY_API Vec512<float> MinOfLanes(Full512<float> d, Vec512<float> v) {
-  return Set(d, _mm512_reduce_min_ps(v.raw));
-}
-HWY_API Vec512<double> MinOfLanes(Full512<double> d, Vec512<double> v) {
-  return Set(d, _mm512_reduce_min_pd(v.raw));
-}
-HWY_API Vec512<uint16_t> MinOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(d32, Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-HWY_API Vec512<int16_t> MinOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MinOfLanes(d32, Min(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-// Returns the maximum in each lane.
-HWY_API Vec512<int32_t> MaxOfLanes(Full512<int32_t> d, Vec512<int32_t> v) {
-  return Set(d, _mm512_reduce_max_epi32(v.raw));
-}
-HWY_API Vec512<int64_t> MaxOfLanes(Full512<int64_t> d, Vec512<int64_t> v) {
-  return Set(d, _mm512_reduce_max_epi64(v.raw));
-}
-HWY_API Vec512<uint32_t> MaxOfLanes(Full512<uint32_t> d, Vec512<uint32_t> v) {
-  return Set(d, _mm512_reduce_max_epu32(v.raw));
-}
-HWY_API Vec512<uint64_t> MaxOfLanes(Full512<uint64_t> d, Vec512<uint64_t> v) {
-  return Set(d, _mm512_reduce_max_epu64(v.raw));
-}
-HWY_API Vec512<float> MaxOfLanes(Full512<float> d, Vec512<float> v) {
-  return Set(d, _mm512_reduce_max_ps(v.raw));
-}
-HWY_API Vec512<double> MaxOfLanes(Full512<double> d, Vec512<double> v) {
-  return Set(d, _mm512_reduce_max_pd(v.raw));
-}
-HWY_API Vec512<uint16_t> MaxOfLanes(Full512<uint16_t> d, Vec512<uint16_t> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(d32, Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-HWY_API Vec512<int16_t> MaxOfLanes(Full512<int16_t> d, Vec512<int16_t> v) {
-  const RepartitionToWide<decltype(d)> d32;
-  // Sign-extend
-  const auto even = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, v)));
-  const auto odd = ShiftRight<16>(BitCast(d32, v));
-  const auto min = MaxOfLanes(d32, Max(even, odd));
-  // Also broadcast into odd lanes.
-  return OddEven(BitCast(d, ShiftLeft<16>(min)), BitCast(d, min));
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
-// the warning seems to be issued at the call site of intrinsics, i.e. our code.
-HWY_DIAGNOSTICS(pop)
diff --git a/third_party/highway/hwy/per_target.cc b/third_party/highway/hwy/per_target.cc
deleted file mode 100644 (file)
index 4cbf152..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/per_target.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "hwy/per_target.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-// On SVE, Lanes rounds down to a power of two, but we want to know the actual
-// size here. Otherwise, hypothetical SVE with 48 bytes would round down to 32
-// and we'd enable HWY_SVE_256, and then fail reverse_test because Reverse on
-// HWY_SVE_256 requires the actual vector to be a power of two.
-#if HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE_256
-size_t GetVectorBytes() { return detail::AllHardwareLanes(hwy::SizeTag<1>()); }
-#else
-size_t GetVectorBytes() { return Lanes(ScalableTag<uint8_t>()); }
-#endif
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace hwy {
-namespace {
-HWY_EXPORT(GetVectorBytes);  // Local function.
-}  // namespace
-
-size_t VectorBytes() { return HWY_DYNAMIC_DISPATCH(GetVectorBytes)(); }
-
-}  // namespace hwy
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/per_target.h b/third_party/highway/hwy/per_target.h
deleted file mode 100644 (file)
index da85de3..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_PER_TARGET_H_
-#define HIGHWAY_HWY_PER_TARGET_H_
-
-#include <stddef.h>
-
-// Per-target functions.
-
-namespace hwy {
-
-// Returns size in bytes of a vector, i.e. `Lanes(ScalableTag<uint8_t>())`.
-//
-// Do not cache the result, which may change after calling DisableTargets, or
-// if software requests a different vector size (e.g. when entering/exiting SME
-// streaming mode). Instead call this right before the code that depends on the
-// result, without any DisableTargets or SME transition in-between. Note that
-// this involves an indirect call, so prefer not to call this frequently nor
-// unnecessarily.
-size_t VectorBytes();
-
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_PER_TARGET_H_
diff --git a/third_party/highway/hwy/print-inl.h b/third_party/highway/hwy/print-inl.h
deleted file mode 100644 (file)
index d256657..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Print() function
-
-#include <stdint.h>
-
-#include "hwy/aligned_allocator.h"
-#include "hwy/highway.h"
-#include "hwy/print.h"
-
-// Per-target include guard
-#if defined(HIGHWAY_HWY_PRINT_INL_H_) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_PRINT_INL_H_
-#undef HIGHWAY_HWY_PRINT_INL_H_
-#else
-#define HIGHWAY_HWY_PRINT_INL_H_
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Prints lanes around `lane`, in memory order.
-template <class D, class V = Vec<D>>
-void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0,
-           size_t max_lanes = 7) {
-  const size_t N = Lanes(d);
-  using T = TFromD<D>;
-  auto lanes = AllocateAligned<T>(N);
-  Store(v, d, lanes.get());
-
-  const auto info = hwy::detail::MakeTypeInfo<T>();
-  hwy::detail::PrintArray(info, caption, lanes.get(), N, lane_u, max_lanes);
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // per-target include guard
diff --git a/third_party/highway/hwy/print.cc b/third_party/highway/hwy/print.cc
deleted file mode 100644 (file)
index 0b52cde..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/print.h"
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdio.h>
-
-#include "hwy/base.h"
-
-namespace hwy {
-namespace detail {
-
-HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100) {
-  const char prefix = info.is_float ? 'f' : (info.is_signed ? 'i' : 'u');
-  // Omit the xN suffix for scalars.
-  if (N == 1) {
-    // NOLINTNEXTLINE
-    snprintf(string100, 64, "%c%d", prefix,
-             static_cast<int>(info.sizeof_t * 8));
-  } else {
-    // NOLINTNEXTLINE
-    snprintf(string100, 64, "%c%dx%d", prefix,
-             static_cast<int>(info.sizeof_t * 8), static_cast<int>(N));
-  }
-}
-
-HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
-                            char* string100) {
-  if (info.sizeof_t == 1) {
-    uint8_t byte;
-    CopyBytes<1>(ptr, &byte);  // endian-safe: we ensured sizeof(T)=1.
-    snprintf(string100, 100, "0x%02X", byte);  // NOLINT
-  } else if (info.sizeof_t == 2) {
-    uint16_t bits;
-    CopyBytes<2>(ptr, &bits);
-    snprintf(string100, 100, "0x%04X", bits);  // NOLINT
-  } else if (info.sizeof_t == 4) {
-    if (info.is_float) {
-      float value;
-      CopyBytes<4>(ptr, &value);
-      snprintf(string100, 100, "%g", static_cast<double>(value));  // NOLINT
-    } else if (info.is_signed) {
-      int32_t value;
-      CopyBytes<4>(ptr, &value);
-      snprintf(string100, 100, "%d", value);  // NOLINT
-    } else {
-      uint32_t value;
-      CopyBytes<4>(ptr, &value);
-      snprintf(string100, 100, "%u", value);  // NOLINT
-    }
-  } else {
-    HWY_ASSERT(info.sizeof_t == 8);
-    if (info.is_float) {
-      double value;
-      CopyBytes<8>(ptr, &value);
-      snprintf(string100, 100, "%g", value);  // NOLINT
-    } else if (info.is_signed) {
-      int64_t value;
-      CopyBytes<8>(ptr, &value);
-      snprintf(string100, 100, "%" PRIi64 "", value);  // NOLINT
-    } else {
-      uint64_t value;
-      CopyBytes<8>(ptr, &value);
-      snprintf(string100, 100, "%" PRIu64 "", value);  // NOLINT
-    }
-  }
-}
-
-HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
-                              const void* array_void, size_t N, size_t lane_u,
-                              size_t max_lanes) {
-  const uint8_t* array_bytes = reinterpret_cast<const uint8_t*>(array_void);
-
-  char type_name[100];
-  TypeName(info, N, type_name);
-
-  const intptr_t lane = intptr_t(lane_u);
-  const size_t begin = static_cast<size_t>(HWY_MAX(0, lane - 2));
-  const size_t end = HWY_MIN(begin + max_lanes, N);
-  fprintf(stderr, "%s %s [%" PRIu64 "+ ->]:\n  ", type_name, caption,
-          static_cast<uint64_t>(begin));
-  for (size_t i = begin; i < end; ++i) {
-    const void* ptr = array_bytes + i * info.sizeof_t;
-    char str[100];
-    ToString(info, ptr, str);
-    fprintf(stderr, "%s,", str);
-  }
-  if (begin >= end) fprintf(stderr, "(out of bounds)");
-  fprintf(stderr, "\n");
-}
-
-}  // namespace detail
-}  // namespace hwy
diff --git a/third_party/highway/hwy/print.h b/third_party/highway/hwy/print.h
deleted file mode 100644 (file)
index 1379286..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HWY_PRINT_H_
-#define HWY_PRINT_H_
-
-// Helpers for printing vector lanes.
-
-#include <stddef.h>
-#include <stdio.h>
-
-#include "hwy/base.h"
-#include "hwy/highway_export.h"
-
-namespace hwy {
-
-namespace detail {
-
-// For implementing value comparisons etc. as type-erased functions to reduce
-// template bloat.
-struct TypeInfo {
-  size_t sizeof_t;
-  bool is_float;
-  bool is_signed;
-};
-
-template <typename T>
-HWY_INLINE TypeInfo MakeTypeInfo() {
-  TypeInfo info;
-  info.sizeof_t = sizeof(T);
-  info.is_float = IsFloat<T>();
-  info.is_signed = IsSigned<T>();
-  return info;
-}
-
-HWY_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100);
-HWY_DLLEXPORT void ToString(const TypeInfo& info, const void* ptr,
-                            char* string100);
-
-HWY_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption,
-                              const void* array_void, size_t N,
-                              size_t lane_u = 0, size_t max_lanes = 7);
-
-}  // namespace detail
-
-template <typename T>
-HWY_NOINLINE void PrintValue(T value) {
-  char str[100];
-  detail::ToString(hwy::detail::MakeTypeInfo<T>(), &value, str);
-  fprintf(stderr, "%s,", str);
-}
-
-template <typename T>
-HWY_NOINLINE void PrintArray(const T* value, size_t count) {
-  detail::PrintArray(hwy::detail::MakeTypeInfo<T>(), "", value, count, 0,
-                     count);
-}
-
-}  // namespace hwy
-
-#endif  // HWY_PRINT_H_
diff --git a/third_party/highway/hwy/targets.cc b/third_party/highway/hwy/targets.cc
deleted file mode 100644 (file)
index 2fde4db..0000000
+++ /dev/null
@@ -1,434 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/targets.h"
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>  // PRIx64
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-
-#include <atomic>
-
-#include "hwy/per_target.h"  // VectorBytes
-
-#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
-#include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
-#endif
-
-#include <stdlib.h>  // abort / exit
-
-#if HWY_ARCH_X86
-#include <xmmintrin.h>
-#if HWY_COMPILER_MSVC
-#include <intrin.h>
-#else  // !HWY_COMPILER_MSVC
-#include <cpuid.h>
-#endif  // HWY_COMPILER_MSVC
-
-#elif HWY_ARCH_ARM && HWY_OS_LINUX
-#include <asm/hwcap.h>
-#include <sys/auxv.h>
-#endif  // HWY_ARCH_*
-
-namespace hwy {
-namespace {
-
-#if HWY_ARCH_X86
-
-HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) {
-  return (reg & (1U << index)) != 0;
-}
-
-// Calls CPUID instruction with eax=level and ecx=count and returns the result
-// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
-HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count,
-                      uint32_t* HWY_RESTRICT abcd) {
-#if HWY_COMPILER_MSVC
-  int regs[4];
-  __cpuidex(regs, level, count);
-  for (int i = 0; i < 4; ++i) {
-    abcd[i] = regs[i];
-  }
-#else   // HWY_COMPILER_MSVC
-  uint32_t a;
-  uint32_t b;
-  uint32_t c;
-  uint32_t d;
-  __cpuid_count(level, count, a, b, c, d);
-  abcd[0] = a;
-  abcd[1] = b;
-  abcd[2] = c;
-  abcd[3] = d;
-#endif  // HWY_COMPILER_MSVC
-}
-
-// Returns the lower 32 bits of extended control register 0.
-// Requires CPU support for "OSXSAVE" (see below).
-uint32_t ReadXCR0() {
-#if HWY_COMPILER_MSVC
-  return static_cast<uint32_t>(_xgetbv(0));
-#else   // HWY_COMPILER_MSVC
-  uint32_t xcr0, xcr0_high;
-  const uint32_t index = 0;
-  asm volatile(".byte 0x0F, 0x01, 0xD0"
-               : "=a"(xcr0), "=d"(xcr0_high)
-               : "c"(index));
-  return xcr0;
-#endif  // HWY_COMPILER_MSVC
-}
-
-#endif  // HWY_ARCH_X86
-
-// When running tests, this value can be set to the mocked supported targets
-// mask. Only written to from a single thread before the test starts.
-int64_t supported_targets_for_test_ = 0;
-
-// Mask of targets disabled at runtime with DisableTargets.
-int64_t supported_mask_ = LimitsMax<int64_t>();
-
-#if HWY_ARCH_X86
-// Arbritrary bit indices indicating which instruction set extensions are
-// supported. Use enum to ensure values are distinct.
-enum class FeatureIndex : uint32_t {
-  kSSE = 0,
-  kSSE2,
-  kSSE3,
-  kSSSE3,
-
-  kSSE41,
-  kSSE42,
-  kCLMUL,
-  kAES,
-
-  kAVX,
-  kAVX2,
-  kF16C,
-  kFMA,
-  kLZCNT,
-  kBMI,
-  kBMI2,
-
-  kAVX512F,
-  kAVX512VL,
-  kAVX512DQ,
-  kAVX512BW,
-
-  kVNNI,
-  kVPCLMULQDQ,
-  kVBMI,
-  kVBMI2,
-  kVAES,
-  kPOPCNTDQ,
-  kBITALG,
-
-  kSentinel
-};
-static_assert(static_cast<size_t>(FeatureIndex::kSentinel) < 64,
-              "Too many bits for u64");
-
-HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) {
-  return 1ull << static_cast<size_t>(index);
-}
-
-constexpr uint64_t kGroupSSSE3 =
-    Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2) |
-    Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3);
-
-constexpr uint64_t kGroupSSE4 =
-    Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) |
-    Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3;
-
-// We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to
-// use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them
-// [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of
-// avoiding using and requiring these so AVX2 can still be used.
-#ifdef HWY_DISABLE_BMI2_FMA
-constexpr uint64_t kGroupBMI2_FMA = 0;
-#else
-constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) |
-                                    Bit(FeatureIndex::kBMI2) |
-                                    Bit(FeatureIndex::kFMA);
-#endif
-
-#ifdef HWY_DISABLE_F16C
-constexpr uint64_t kGroupF16C = 0;
-#else
-constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C);
-#endif
-
-constexpr uint64_t kGroupAVX2 =
-    Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) |
-    Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4;
-
-constexpr uint64_t kGroupAVX3 =
-    Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) |
-    Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | kGroupAVX2;
-
-constexpr uint64_t kGroupAVX3_DL =
-    Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) |
-    Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) |
-    Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) |
-    Bit(FeatureIndex::kBITALG) | kGroupAVX3;
-
-#endif  // HWY_ARCH_X86
-
-// Returns targets supported by the CPU, independently of DisableTargets.
-// Factored out of SupportedTargets to make its structure more obvious. Note
-// that x86 CPUID may take several hundred cycles.
-int64_t DetectTargets() {
-  // Apps will use only one of these (the default is EMU128), but compile flags
-  // for this TU may differ from that of the app, so allow both.
-  int64_t bits = HWY_SCALAR | HWY_EMU128;
-
-#if HWY_ARCH_X86
-  bool has_osxsave = false;
-  {  // ensures we do not accidentally use flags outside this block
-    uint64_t flags = 0;
-    uint32_t abcd[4];
-
-    Cpuid(0, 0, abcd);
-    const uint32_t max_level = abcd[0];
-
-    // Standard feature flags
-    Cpuid(1, 0, abcd);
-    flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0;
-    flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0;
-    flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0;
-    flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0;
-    flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0;
-    flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0;
-    flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0;
-    flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0;
-    flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0;
-    flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0;
-    flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0;
-    has_osxsave = IsBitSet(abcd[2], 27);
-
-    // Extended feature flags
-    Cpuid(0x80000001U, 0, abcd);
-    flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0;
-
-    // Extended features
-    if (max_level >= 7) {
-      Cpuid(7, 0, abcd);
-      flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0;
-      flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0;
-      flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0;
-
-      flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0;
-      flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0;
-      flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0;
-      flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0;
-
-      flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0;
-      flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0;
-      flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0;
-      flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0;
-      flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0;
-      flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0;
-      flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0;
-    }
-
-    // Set target bit(s) if all their group's flags are all set.
-    if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) {
-      bits |= HWY_AVX3_DL;
-    }
-    if ((flags & kGroupAVX3) == kGroupAVX3) {
-      bits |= HWY_AVX3;
-    }
-    if ((flags & kGroupAVX2) == kGroupAVX2) {
-      bits |= HWY_AVX2;
-    }
-    if ((flags & kGroupSSE4) == kGroupSSE4) {
-      bits |= HWY_SSE4;
-    }
-    if ((flags & kGroupSSSE3) == kGroupSSSE3) {
-      bits |= HWY_SSSE3;
-    }
-  }
-
-  // Clear bits if the OS does not support XSAVE - otherwise, registers
-  // are not preserved across context switches.
-  if (has_osxsave) {
-    const uint32_t xcr0 = ReadXCR0();
-    const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL;
-    const int64_t min_avx2 = HWY_AVX2 | min_avx3;
-    // XMM
-    if (!IsBitSet(xcr0, 1)) {
-      bits &= ~(HWY_SSSE3 | HWY_SSE4 | min_avx2);
-    }
-    // YMM
-    if (!IsBitSet(xcr0, 2)) {
-      bits &= ~min_avx2;
-    }
-    // opmask, ZMM lo/hi
-    if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) {
-      bits &= ~min_avx3;
-    }
-  }
-
-  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
-    fprintf(stderr,
-            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
-            "\n",
-            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
-  }
-
-#elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
-  using CapBits = unsigned long;  // NOLINT
-  const CapBits hw = getauxval(AT_HWCAP);
-  (void)hw;
-
-#if HWY_ARCH_ARM_A64
-
-#if defined(HWCAP_AES)
-  // aarch64 always has NEON and VFPv4, but not necessarily AES, which we
-  // require and thus must still check for.
-  if (hw & HWCAP_AES) {
-    bits |= HWY_NEON;
-  }
-#endif  // HWCAP_AES
-
-#if defined(HWCAP_SVE)
-  if (hw & HWCAP_SVE) {
-    bits |= HWY_SVE;
-  }
-#endif
-
-#if defined(HWCAP2_SVE2) && defined(HWCAP2_SVEAES)
-  const CapBits hw2 = getauxval(AT_HWCAP2);
-  if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) {
-    bits |= HWY_SVE2;
-  }
-#endif
-
-#else  // HWY_ARCH_ARM_A64
-
-// Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported.
-// Note that AES has a different HWCAP bit compared to aarch64.
-#if defined(HWCAP_NEON) && defined(HWCAP_VFPv4)
-  if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) {
-    bits |= HWY_NEON;
-  }
-#endif
-
-#endif  // HWY_ARCH_ARM_A64
-  if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) {
-    fprintf(stderr,
-            "WARNING: CPU supports %" PRIx64 " but software requires %" PRIx64
-            "\n",
-            bits, static_cast<int64_t>(HWY_ENABLED_BASELINE));
-  }
-#else   // HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH
-  // TODO(janwas): detect for other platforms and check for baseline
-  // This file is typically compiled without HWY_IS_TEST, but targets_test has
-  // it set, and will expect all of its HWY_TARGETS (= all attainable) to be
-  // supported.
-  bits |= HWY_ENABLED_BASELINE;
-#endif  // HWY_ARCH_X86
-
-  return bits;
-}
-
-}  // namespace
-
-HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
-    Abort(const char* file, int line, const char* format, ...) {
-  char buf[2000];
-  va_list args;
-  va_start(args, format);
-  vsnprintf(buf, sizeof(buf), format, args);
-  va_end(args);
-
-  fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf);
-
-// If compiled with any sanitizer, they can also print a stack trace.
-#if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN
-  __sanitizer_print_stack_trace();
-#endif  // HWY_IS_*
-  fflush(stderr);
-
-// Now terminate the program:
-#if HWY_ARCH_RVV
-  exit(1);  // trap/abort just freeze Spike.
-#elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC
-  // Facilitates breaking into a debugger, but don't use this in non-debug
-  // builds because it looks like "illegal instruction", which is misleading.
-  __builtin_trap();
-#else
-  abort();  // Compile error without this due to HWY_NORETURN.
-#endif
-}
-
-HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) {
-  supported_mask_ = static_cast<int64_t>(~disabled_targets);
-  // This will take effect on the next call to SupportedTargets, which is
-  // called right before GetChosenTarget::Update. However, calling Update here
-  // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want
-  // to check in tests. We instead de-initialize such that the next
-  // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache.
-  GetChosenTarget().DeInit();
-}
-
-HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) {
-  supported_targets_for_test_ = targets;
-  GetChosenTarget().DeInit();  // see comment above
-}
-
-HWY_DLLEXPORT int64_t SupportedTargets() {
-  int64_t targets = supported_targets_for_test_;
-  if (HWY_LIKELY(targets == 0)) {
-    // Mock not active. Re-detect instead of caching just in case we're on a
-    // heterogeneous ISA (also requires some app support to pin threads). This
-    // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to
-    // DisableTargets or SetSupportedTargetsForTest.
-    targets = DetectTargets();
-
-    // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion,
-    // first set up ChosenTarget. No need to Update() again afterwards with the
-    // final targets - that will be done by a caller of this function.
-    GetChosenTarget().Update(targets);
-
-    // Now that we can call VectorBytes, check for targets with specific sizes.
-    if (HWY_ARCH_ARM_A64) {
-      const size_t vec_bytes = VectorBytes();  // uncached, see declaration
-      if ((targets & HWY_SVE) && vec_bytes == 32) {
-        targets = static_cast<int64_t>(targets | HWY_SVE_256);
-      } else {
-        targets = static_cast<int64_t>(targets & ~HWY_SVE_256);
-      }
-      if ((targets & HWY_SVE2) && vec_bytes == 16) {
-        targets = static_cast<int64_t>(targets | HWY_SVE2_128);
-      } else {
-        targets = static_cast<int64_t>(targets & ~HWY_SVE2_128);
-      }
-    }  // HWY_ARCH_ARM_A64
-  }
-
-  targets &= supported_mask_;
-  return targets == 0 ? HWY_STATIC_TARGET : targets;
-}
-
-HWY_DLLEXPORT ChosenTarget& GetChosenTarget() {
-  static ChosenTarget chosen_target;
-  return chosen_target;
-}
-
-}  // namespace hwy
diff --git a/third_party/highway/hwy/targets.h b/third_party/highway/hwy/targets.h
deleted file mode 100644 (file)
index 2d9afbf..0000000
+++ /dev/null
@@ -1,318 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HIGHWAY_HWY_TARGETS_H_
-#define HIGHWAY_HWY_TARGETS_H_
-
-#include <vector>
-
-// For SIMD module implementations and their callers. Defines which targets to
-// generate and call.
-
-#include "hwy/base.h"
-#include "hwy/detect_targets.h"
-#include "hwy/highway_export.h"
-
-#if !HWY_ARCH_RVV
-#include <atomic>
-#endif
-
-namespace hwy {
-
-// Returns bitfield of enabled targets that are supported on this CPU; there is
-// always at least one such target, hence the return value is never 0. The
-// targets returned may change after calling DisableTargets. This function is
-// always defined, but the HWY_SUPPORTED_TARGETS wrapper may allow eliding
-// calls to it if there is only a single target enabled.
-HWY_DLLEXPORT int64_t SupportedTargets();
-
-// Evaluates to a function call, or literal if there is a single target.
-#if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0
-#define HWY_SUPPORTED_TARGETS HWY_TARGETS
-#else
-#define HWY_SUPPORTED_TARGETS hwy::SupportedTargets()
-#endif
-
-// Subsequent SupportedTargets will not return targets whose bit(s) are set in
-// `disabled_targets`. Exception: if SupportedTargets would return 0, it will
-// instead return HWY_STATIC_TARGET (there must always be one target to call).
-//
-// This function is useful for disabling targets known to be buggy, or if the
-// best available target is undesirable (perhaps due to throttling or memory
-// bandwidth limitations). Use SetSupportedTargetsForTest instead of this
-// function for iteratively enabling specific targets for testing.
-HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets);
-
-// Subsequent SupportedTargets will return the given set of targets, except
-// those disabled via DisableTargets. Call with a mask of 0 to disable the mock
-// and return to the normal SupportedTargets behavior. Used to run tests for
-// all targets.
-HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets);
-
-// Return the list of targets in HWY_TARGETS supported by the CPU as a list of
-// individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
-// is affected by the current SetSupportedTargetsForTest() mock if any.
-HWY_INLINE std::vector<int64_t> SupportedAndGeneratedTargets() {
-  std::vector<int64_t> ret;
-  for (int64_t targets = SupportedTargets() & HWY_TARGETS; targets != 0;
-       targets = targets & (targets - 1)) {
-    int64_t current_target = targets & ~(targets - 1);
-    ret.push_back(current_target);
-  }
-  return ret;
-}
-
-static inline HWY_MAYBE_UNUSED const char* TargetName(int64_t target) {
-  switch (target) {
-#if HWY_ARCH_X86
-    case HWY_SSSE3:
-      return "SSSE3";
-    case HWY_SSE4:
-      return "SSE4";
-    case HWY_AVX2:
-      return "AVX2";
-    case HWY_AVX3:
-      return "AVX3";
-    case HWY_AVX3_DL:
-      return "AVX3_DL";
-#endif
-
-#if HWY_ARCH_ARM
-    case HWY_SVE2_128:
-      return "SVE2_128";
-    case HWY_SVE_256:
-      return "SVE_256";
-    case HWY_SVE2:
-      return "SVE2";
-    case HWY_SVE:
-      return "SVE";
-    case HWY_NEON:
-      return "NEON";
-#endif
-
-#if HWY_ARCH_PPC
-    case HWY_PPC8:
-      return "PPC8";
-#endif
-
-#if HWY_ARCH_WASM
-    case HWY_WASM:
-      return "WASM";
-    case HWY_WASM_EMU256:
-      return "WASM_EMU256";
-#endif
-
-#if HWY_ARCH_RVV
-    case HWY_RVV:
-      return "RVV";
-#endif
-
-    case HWY_EMU128:
-      return "EMU128";
-    case HWY_SCALAR:
-      return "SCALAR";
-
-    default:
-      return "Unknown";  // must satisfy gtest IsValidParamName()
-  }
-}
-
-// The maximum number of dynamic targets on any architecture is defined by
-// HWY_MAX_DYNAMIC_TARGETS and depends on the arch.
-
-// For the ChosenTarget mask and index we use a different bit arrangement than
-// in the HWY_TARGETS mask. Only the targets involved in the current
-// architecture are used in this mask, and therefore only the least significant
-// (HWY_MAX_DYNAMIC_TARGETS + 2) bits of the int64_t mask are used. The least
-// significant bit is set when the mask is not initialized, the next
-// HWY_MAX_DYNAMIC_TARGETS more significant bits are a range of bits from the
-// HWY_TARGETS or SupportedTargets() mask for the given architecture shifted to
-// that position and the next more significant bit is used for HWY_SCALAR (if
-// HWY_COMPILE_ONLY_SCALAR is defined) or HWY_EMU128. Because of this we need to
-// define equivalent values for HWY_TARGETS in this representation.
-// This mask representation allows to use ctz() on this mask and obtain a small
-// number that's used as an index of the table for dynamic dispatch. In this
-// way the first entry is used when the mask is uninitialized, the following
-// HWY_MAX_DYNAMIC_TARGETS are for dynamic dispatch and the last one is for
-// scalar.
-
-// The HWY_SCALAR/HWY_EMU128 bit in the ChosenTarget mask format.
-#define HWY_CHOSEN_TARGET_MASK_SCALAR (1LL << (HWY_MAX_DYNAMIC_TARGETS + 1))
-
-// Converts from a HWY_TARGETS mask to a ChosenTarget mask format for the
-// current architecture.
-#define HWY_CHOSEN_TARGET_SHIFT(X)                                    \
-  ((((X) >> (HWY_HIGHEST_TARGET_BIT + 1 - HWY_MAX_DYNAMIC_TARGETS)) & \
-    ((1LL << HWY_MAX_DYNAMIC_TARGETS) - 1))                           \
-   << 1)
-
-// The HWY_TARGETS mask in the ChosenTarget mask format.
-#define HWY_CHOSEN_TARGET_MASK_TARGETS \
-  (HWY_CHOSEN_TARGET_SHIFT(HWY_TARGETS) | HWY_CHOSEN_TARGET_MASK_SCALAR | 1LL)
-
-#if HWY_ARCH_X86
-// Maximum number of dynamic targets, changing this value is an ABI incompatible
-// change
-#define HWY_MAX_DYNAMIC_TARGETS 15
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_X86
-// These must match the order in which the HWY_TARGETS are defined
-// starting by the least significant (HWY_HIGHEST_TARGET_BIT + 1 -
-// HWY_MAX_DYNAMIC_TARGETS) bit. This list must contain exactly
-// HWY_MAX_DYNAMIC_TARGETS elements and does not include SCALAR. The first entry
-// corresponds to the best target. Don't include a "," at the end of the list.
-#define HWY_CHOOSE_TARGET_LIST(func_name)                   \
-  nullptr,                           /* reserved */         \
-      nullptr,                       /* reserved */         \
-      nullptr,                       /* reserved */         \
-      nullptr,                       /* reserved */         \
-      nullptr,                       /* reserved */         \
-      nullptr,                       /* reserved */         \
-      nullptr,                       /* reserved */         \
-      HWY_CHOOSE_AVX3_DL(func_name), /* AVX3_DL */          \
-      HWY_CHOOSE_AVX3(func_name),    /* AVX3 */             \
-      HWY_CHOOSE_AVX2(func_name),    /* AVX2 */             \
-      nullptr,                       /* AVX */              \
-      HWY_CHOOSE_SSE4(func_name),    /* SSE4 */             \
-      HWY_CHOOSE_SSSE3(func_name),   /* SSSE3 */            \
-      nullptr ,                       /* reserved - SSE3? */ \
-      nullptr                        /* reserved - SSE2? */
-
-#elif HWY_ARCH_ARM
-// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 15
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_ARM
-#define HWY_CHOOSE_TARGET_LIST(func_name)                \
-  nullptr,                            /* reserved */     \
-      nullptr,                        /* reserved */     \
-      nullptr,                        /* reserved */     \
-      nullptr,                        /* reserved */     \
-      nullptr,                        /* reserved */     \
-      nullptr,                        /* reserved */     \
-      nullptr,                        /* reserved */     \
-      nullptr,                        /* reserved */     \
-      nullptr,                        /* reserved */     \
-      HWY_CHOOSE_SVE2_128(func_name), /* SVE2 128-bit */ \
-      HWY_CHOOSE_SVE_256(func_name),  /* SVE 256-bit */  \
-      HWY_CHOOSE_SVE2(func_name),     /* SVE2 */         \
-      HWY_CHOOSE_SVE(func_name),      /* SVE */          \
-      HWY_CHOOSE_NEON(func_name),     /* NEON */         \
-      nullptr                         /* reserved - Helium? */
-
-#elif HWY_ARCH_RVV
-// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 9
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_RVV
-#define HWY_CHOOSE_TARGET_LIST(func_name)       \
-  nullptr,                       /* reserved */ \
-      nullptr,                   /* reserved */ \
-      nullptr,                   /* reserved */ \
-      nullptr,                   /* reserved */ \
-      nullptr,                   /* reserved */ \
-      nullptr,                   /* reserved */ \
-      nullptr,                   /* reserved */ \
-      HWY_CHOOSE_RVV(func_name), /* RVV */      \
-      nullptr                    /* reserved */
-
-#elif HWY_ARCH_PPC
-// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 9
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_PPC
-#define HWY_CHOOSE_TARGET_LIST(func_name)                         \
-  nullptr,                        /* reserved */                  \
-      nullptr,                    /* reserved */                  \
-      nullptr,                    /* reserved */                  \
-      nullptr,                    /* reserved */                  \
-      nullptr,                    /* reserved */                  \
-      nullptr,                    /* reserved */                  \
-      HWY_CHOOSE_PPC8(func_name), /* PPC8 */                      \
-      nullptr,                    /* reserved (VSX or AltiVec) */ \
-      nullptr                     /* reserved (VSX or AltiVec) */
-
-#elif HWY_ARCH_WASM
-// See HWY_ARCH_X86 above for details.
-#define HWY_MAX_DYNAMIC_TARGETS 9
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_WASM
-#define HWY_CHOOSE_TARGET_LIST(func_name)                  \
-  nullptr,                               /* reserved */    \
-      nullptr,                           /* reserved */    \
-      nullptr,                           /* reserved */    \
-      nullptr,                           /* reserved */    \
-      nullptr,                           /* reserved */    \
-      nullptr,                           /* reserved */    \
-      HWY_CHOOSE_WASM_EMU256(func_name), /* WASM_EMU256 */ \
-      HWY_CHOOSE_WASM(func_name),        /* WASM */        \
-      nullptr                            /* reserved */
-
-#else
-// Unknown architecture, will use HWY_SCALAR without dynamic dispatch, though
-// still creating single-entry tables in HWY_EXPORT to ensure portability.
-#define HWY_MAX_DYNAMIC_TARGETS 1
-#define HWY_HIGHEST_TARGET_BIT HWY_HIGHEST_TARGET_BIT_SCALAR
-#endif
-
-// Bitfield of supported and enabled targets. The format differs from that of
-// HWY_TARGETS; the lowest bit governs the first function pointer (which is
-// special in that it calls FunctionCache, then Update, then dispatches to the
-// actual implementation) in the tables created by HWY_EXPORT. Monostate (see
-// GetChosenTarget), thread-safe except on RVV.
-struct ChosenTarget {
- public:
-  // Reset bits according to `targets` (typically the return value of
-  // SupportedTargets()). Postcondition: IsInitialized() == true.
-  void Update(int64_t targets) {
-    // These are `targets` shifted downwards, see above. Also include SCALAR
-    // (corresponds to the last entry in the function table) as fallback.
-    StoreMask(HWY_CHOSEN_TARGET_SHIFT(targets) | HWY_CHOSEN_TARGET_MASK_SCALAR);
-  }
-
-  // Reset to the uninitialized state, so that FunctionCache will call Update
-  // during the next HWY_DYNAMIC_DISPATCH, and IsInitialized returns false.
-  void DeInit() { StoreMask(1); }
-
-  // Whether Update was called. This indicates whether any HWY_DYNAMIC_DISPATCH
-  // function was called, which we check in tests.
-  bool IsInitialized() const { return LoadMask() != 1; }
-
-  // Return the index in the dynamic dispatch table to be used by the current
-  // CPU. Note that this method must be in the header file so it uses the value
-  // of HWY_CHOSEN_TARGET_MASK_TARGETS defined in the translation unit that
-  // calls it, which may be different from others. This means we only enable
-  // those targets that were actually compiled in this module.
-  size_t HWY_INLINE GetIndex() const {
-    return hwy::Num0BitsBelowLS1Bit_Nonzero64(
-        static_cast<uint64_t>(LoadMask() & HWY_CHOSEN_TARGET_MASK_TARGETS));
-  }
-
- private:
-  // TODO(janwas): remove #if once <atomic> is available
-#if HWY_ARCH_RVV
-  int64_t LoadMask() const { return mask_; }
-  void StoreMask(int64_t mask) { mask_ = mask; }
-
-  int64_t mask_{1};  // Initialized to 1 so GetIndex() returns 0.
-#else
-  int64_t LoadMask() const { return mask_.load(); }
-  void StoreMask(int64_t mask) { mask_.store(mask); }
-
-  std::atomic<int64_t> mask_{1};  // Initialized to 1 so GetIndex() returns 0.
-#endif  // HWY_ARCH_RVV
-};
-
-// For internal use (e.g. by FunctionCache and DisableTargets).
-HWY_DLLEXPORT ChosenTarget& GetChosenTarget();
-
-}  // namespace hwy
-
-#endif  // HIGHWAY_HWY_TARGETS_H_
diff --git a/third_party/highway/hwy/targets_test.cc b/third_party/highway/hwy/targets_test.cc
deleted file mode 100644 (file)
index e58a6fa..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/targets.h"
-
-#include "hwy/tests/test_util-inl.h"
-
-namespace fake {
-
-#define DECLARE_FUNCTION(TGT)                                                \
-  namespace N_##TGT {                                                        \
-    /* Function argument is just to ensure/demonstrate they are possible. */ \
-    int64_t FakeFunction(int) { return HWY_##TGT; }                          \
-  }
-
-DECLARE_FUNCTION(AVX3_DL)
-DECLARE_FUNCTION(AVX3)
-DECLARE_FUNCTION(AVX2)
-DECLARE_FUNCTION(SSE4)
-DECLARE_FUNCTION(SSSE3)
-DECLARE_FUNCTION(NEON)
-DECLARE_FUNCTION(SVE)
-DECLARE_FUNCTION(SVE2)
-DECLARE_FUNCTION(SVE_256)
-DECLARE_FUNCTION(SVE2_128)
-DECLARE_FUNCTION(PPC8)
-DECLARE_FUNCTION(WASM)
-DECLARE_FUNCTION(RVV)
-DECLARE_FUNCTION(SCALAR)
-DECLARE_FUNCTION(EMU128)
-
-HWY_EXPORT(FakeFunction);
-
-void CallFunctionForTarget(int64_t target, int line) {
-  if ((HWY_TARGETS & target) == 0) return;
-  hwy::SetSupportedTargetsForTest(target);
-
-  // Call Update() first to make &HWY_DYNAMIC_DISPATCH() return
-  // the pointer to the already cached function.
-  hwy::GetChosenTarget().Update(hwy::SupportedTargets());
-
-  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
-
-  // Calling DeInit() will test that the initializer function
-  // also calls the right function.
-  hwy::GetChosenTarget().DeInit();
-
-#if HWY_DISPATCH_WORKAROUND
-  EXPECT_EQ(HWY_STATIC_TARGET, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
-#else
-  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
-#endif
-
-  // Second call uses the cached value from the previous call.
-  EXPECT_EQ(target, HWY_DYNAMIC_DISPATCH(FakeFunction)(42)) << line;
-}
-
-void CheckFakeFunction() {
-  // When adding a target, also add to DECLARE_FUNCTION above.
-  CallFunctionForTarget(HWY_AVX3_DL, __LINE__);
-  CallFunctionForTarget(HWY_AVX3, __LINE__);
-  CallFunctionForTarget(HWY_AVX2, __LINE__);
-  CallFunctionForTarget(HWY_SSE4, __LINE__);
-  CallFunctionForTarget(HWY_SSSE3, __LINE__);
-  CallFunctionForTarget(HWY_NEON, __LINE__);
-  CallFunctionForTarget(HWY_SVE, __LINE__);
-  CallFunctionForTarget(HWY_SVE2, __LINE__);
-  CallFunctionForTarget(HWY_SVE_256, __LINE__);
-  CallFunctionForTarget(HWY_SVE2_128, __LINE__);
-  CallFunctionForTarget(HWY_PPC8, __LINE__);
-  CallFunctionForTarget(HWY_WASM, __LINE__);
-  CallFunctionForTarget(HWY_RVV, __LINE__);
-  // The tables only have space for either HWY_SCALAR or HWY_EMU128; the former
-  // is opt-in only.
-#if defined(HWY_COMPILE_ONLY_SCALAR) || HWY_BROKEN_EMU128
-  CallFunctionForTarget(HWY_SCALAR, __LINE__);
-#else
-  CallFunctionForTarget(HWY_EMU128, __LINE__);
-#endif
-}
-
-}  // namespace fake
-
-namespace hwy {
-
-class HwyTargetsTest : public testing::Test {
- protected:
-  void TearDown() override {
-    SetSupportedTargetsForTest(0);
-    DisableTargets(0);  // Reset the mask.
-  }
-};
-
-// Test that the order in the HWY_EXPORT static array matches the expected
-// value of the target bits. This is only checked for the targets that are
-// enabled in the current compilation.
-TEST_F(HwyTargetsTest, ChosenTargetOrderTest) { fake::CheckFakeFunction(); }
-
-TEST_F(HwyTargetsTest, DisabledTargetsTest) {
-  DisableTargets(~0LL);
-  // Check that disabling everything at least leaves the static target.
-  HWY_ASSERT(HWY_STATIC_TARGET == SupportedTargets());
-
-  DisableTargets(0);  // Reset the mask.
-  const int64_t current_targets = SupportedTargets();
-  const int64_t enabled_baseline = static_cast<int64_t>(HWY_ENABLED_BASELINE);
-  // Exclude these two because they are always returned by SupportedTargets.
-  const int64_t fallback = HWY_SCALAR | HWY_EMU128;
-  if ((current_targets & ~enabled_baseline & ~fallback) == 0) {
-    // We can't test anything else if the only compiled target is the baseline.
-    return;
-  }
-
-  // Get the lowest bit in the mask (the best target) and disable that one.
-  const int64_t best_target = current_targets & (~current_targets + 1);
-  DisableTargets(best_target);
-
-  // Check that the other targets are still enabled.
-  HWY_ASSERT((best_target ^ current_targets) == SupportedTargets());
-  DisableTargets(0);  // Reset the mask.
-}
-
-}  // namespace hwy
diff --git a/third_party/highway/hwy/tests/arithmetic_test.cc b/third_party/highway/hwy/tests/arithmetic_test.cc
deleted file mode 100644 (file)
index 1fbbd29..0000000
+++ /dev/null
@@ -1,445 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/arithmetic_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestPlusMinus {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v2 = Iota(d, T(2));
-    const auto v3 = Iota(d, T(3));
-    const auto v4 = Iota(d, T(4));
-
-    const size_t N = Lanes(d);
-    auto lanes = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      lanes[i] = static_cast<T>((2 + i) + (3 + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, lanes.get(), Add(v2, v3));
-    HWY_ASSERT_VEC_EQ(d, Set(d, 2), Sub(v4, v2));
-
-    for (size_t i = 0; i < N; ++i) {
-      lanes[i] = static_cast<T>((2 + i) + (4 + i));
-    }
-    auto sum = v2;
-    sum = Add(sum, v4);  // sum == 6,8..
-    HWY_ASSERT_VEC_EQ(d, Load(d, lanes.get()), sum);
-
-    sum = Sub(sum, v4);
-    HWY_ASSERT_VEC_EQ(d, v2, sum);
-  }
-};
-
-HWY_NOINLINE void TestAllPlusMinus() {
-  ForAllTypes(ForPartialVectors<TestPlusMinus>());
-}
-
-struct TestUnsignedSaturatingArithmetic {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vi = Iota(d, 1);
-    const auto vm = Set(d, LimitsMax<T>());
-
-    HWY_ASSERT_VEC_EQ(d, Add(v0, v0), SaturatedAdd(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, Add(v0, vi), SaturatedAdd(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, Add(v0, vm), SaturatedAdd(v0, vm));
-    HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vi, vm));
-    HWY_ASSERT_VEC_EQ(d, vm, SaturatedAdd(vm, vm));
-
-    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vi));
-    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(vi, vm));
-    HWY_ASSERT_VEC_EQ(d, Sub(vm, vi), SaturatedSub(vm, vi));
-  }
-};
-
-struct TestSignedSaturatingArithmetic {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vpm = Set(d, LimitsMax<T>());
-    // Ensure all lanes are positive, even if Iota wraps around
-    const auto vi = Or(And(Iota(d, 0), vpm), Set(d, 1));
-    const auto vn = Sub(v0, vi);
-    const auto vnm = Set(d, LimitsMin<T>());
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Gt(vi, v0));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), Lt(vn, v0));
-
-    HWY_ASSERT_VEC_EQ(d, v0, SaturatedAdd(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, SaturatedAdd(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(v0, vpm));
-    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vi, vpm));
-    HWY_ASSERT_VEC_EQ(d, vpm, SaturatedAdd(vpm, vpm));
-
-    HWY_ASSERT_VEC_EQ(d, v0, SaturatedSub(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, Sub(v0, vi), SaturatedSub(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vn, SaturatedSub(vn, v0));
-    HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vi));
-    HWY_ASSERT_VEC_EQ(d, vnm, SaturatedSub(vnm, vpm));
-  }
-};
-
-HWY_NOINLINE void TestAllSaturatingArithmetic() {
-  const ForPartialVectors<TestUnsignedSaturatingArithmetic> test_unsigned;
-  test_unsigned(uint8_t());
-  test_unsigned(uint16_t());
-
-  const ForPartialVectors<TestSignedSaturatingArithmetic> test_signed;
-  test_signed(int8_t());
-  test_signed(int16_t());
-}
-
-struct TestAverage {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto v1 = Set(d, T(1));
-    const auto v2 = Set(d, T(2));
-
-    HWY_ASSERT_VEC_EQ(d, v0, AverageRound(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v0, v1));
-    HWY_ASSERT_VEC_EQ(d, v1, AverageRound(v1, v1));
-    HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v2, AverageRound(v2, v2));
-  }
-};
-
-HWY_NOINLINE void TestAllAverage() {
-  const ForPartialVectors<TestAverage> test;
-  test(uint8_t());
-  test(uint16_t());
-}
-
-struct TestAbs {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vp1 = Set(d, T(1));
-    const auto vn1 = Set(d, T(-1));
-    const auto vpm = Set(d, LimitsMax<T>());
-    const auto vnm = Set(d, LimitsMin<T>());
-
-    HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
-    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
-    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
-    HWY_ASSERT_VEC_EQ(d, vpm, Abs(vpm));
-    HWY_ASSERT_VEC_EQ(d, vnm, Abs(vnm));
-  }
-};
-
-struct TestFloatAbs {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vp1 = Set(d, T(1));
-    const auto vn1 = Set(d, T(-1));
-    const auto vp2 = Set(d, T(0.01));
-    const auto vn2 = Set(d, T(-0.01));
-
-    HWY_ASSERT_VEC_EQ(d, v0, Abs(v0));
-    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vp1));
-    HWY_ASSERT_VEC_EQ(d, vp1, Abs(vn1));
-    HWY_ASSERT_VEC_EQ(d, vp2, Abs(vp2));
-    HWY_ASSERT_VEC_EQ(d, vp2, Abs(vn2));
-  }
-};
-
-HWY_NOINLINE void TestAllAbs() {
-  ForSignedTypes(ForPartialVectors<TestAbs>());
-  ForFloatTypes(ForPartialVectors<TestFloatAbs>());
-}
-
-struct TestNeg {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vn = Set(d, T(-3));
-    const auto vp = Set(d, T(3));
-    HWY_ASSERT_VEC_EQ(d, v0, Neg(v0));
-    HWY_ASSERT_VEC_EQ(d, vp, Neg(vn));
-    HWY_ASSERT_VEC_EQ(d, vn, Neg(vp));
-  }
-};
-
-HWY_NOINLINE void TestAllNeg() {
-  ForSignedTypes(ForPartialVectors<TestNeg>());
-  ForFloatTypes(ForPartialVectors<TestNeg>());
-}
-
-struct TestUnsignedMinMax {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    // Leave headroom such that v1 < v2 even after wraparound.
-    const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
-    const auto v1 = Add(mod, Set(d, 1));
-    const auto v2 = Add(mod, Set(d, 2));
-    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v0, Min(v1, v0));
-    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v0));
-
-    const auto vmin = Set(d, LimitsMin<T>());
-    const auto vmax = Set(d, LimitsMax<T>());
-
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
-
-    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
-    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
-  }
-};
-
-struct TestSignedMinMax {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // Leave headroom such that v1 < v2 even after wraparound.
-    const auto mod = And(Iota(d, 0), Set(d, LimitsMax<T>() >> 1));
-    const auto v1 = Add(mod, Set(d, 1));
-    const auto v2 = Add(mod, Set(d, 2));
-    const auto v_neg = Sub(Zero(d), v1);
-    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
-    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
-
-    const auto v0 = Zero(d);
-    const auto vmin = Set(d, LimitsMin<T>());
-    const auto vmax = Set(d, LimitsMax<T>());
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
-    HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
-
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
-
-    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
-    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
-  }
-};
-
-struct TestFloatMinMax {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v1 = Iota(d, 1);
-    const auto v2 = Iota(d, 2);
-    const auto v_neg = Iota(d, -T(Lanes(d)));
-    HWY_ASSERT_VEC_EQ(d, v1, Min(v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v2, Max(v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v_neg, Min(v1, v_neg));
-    HWY_ASSERT_VEC_EQ(d, v1, Max(v1, v_neg));
-
-    const auto v0 = Zero(d);
-    const auto vmin = Set(d, T(-1E30));
-    const auto vmax = Set(d, T(1E30));
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(v0, vmin));
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, Max(v0, vmin));
-    HWY_ASSERT_VEC_EQ(d, v0, Max(vmin, v0));
-
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmin, vmax));
-    HWY_ASSERT_VEC_EQ(d, vmin, Min(vmax, vmin));
-
-    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmin, vmax));
-    HWY_ASSERT_VEC_EQ(d, vmax, Max(vmax, vmin));
-  }
-};
-
-HWY_NOINLINE void TestAllMinMax() {
-  ForUnsignedTypes(ForPartialVectors<TestUnsignedMinMax>());
-  ForSignedTypes(ForPartialVectors<TestSignedMinMax>());
-  ForFloatTypes(ForPartialVectors<TestFloatMinMax>());
-}
-
-template <class D>
-static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
-  alignas(16) uint64_t in[2];
-  in[0] = lo;
-  in[1] = hi;
-  return LoadDup128(d, in);
-}
-
-struct TestMinMax128 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using V = Vec<D>;
-    const size_t N = Lanes(d);
-    auto a_lanes = AllocateAligned<T>(N);
-    auto b_lanes = AllocateAligned<T>(N);
-    auto min_lanes = AllocateAligned<T>(N);
-    auto max_lanes = AllocateAligned<T>(N);
-    RandomState rng;
-
-    const V v00 = Zero(d);
-    const V v01 = Make128(d, 0, 1);
-    const V v10 = Make128(d, 1, 0);
-    const V v11 = Add(v01, v10);
-
-    // Same arg
-    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00));
-    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01));
-    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10));
-    HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11));
-    HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00));
-    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01));
-    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10));
-    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11));
-
-    // First arg less
-    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01));
-    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10));
-    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11));
-    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01));
-    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10));
-    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11));
-
-    // Second arg less
-    HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00));
-    HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01));
-    HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10));
-    HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00));
-    HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01));
-    HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10));
-
-    // Also check 128-bit blocks are independent
-    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        a_lanes[i] = Random64(&rng);
-        b_lanes[i] = Random64(&rng);
-      }
-      const V a = Load(d, a_lanes.get());
-      const V b = Load(d, b_lanes.get());
-      for (size_t i = 0; i < N; i += 2) {
-        const bool lt = a_lanes[i + 1] == b_lanes[i + 1]
-                            ? (a_lanes[i] < b_lanes[i])
-                            : (a_lanes[i + 1] < b_lanes[i + 1]);
-        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
-        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
-        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
-        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
-      }
-      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b));
-      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllMinMax128() {
-  ForGEVectors<128, TestMinMax128>()(uint64_t());
-}
-
-struct TestMinMax128Upper {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using V = Vec<D>;
-    const size_t N = Lanes(d);
-    auto a_lanes = AllocateAligned<T>(N);
-    auto b_lanes = AllocateAligned<T>(N);
-    auto min_lanes = AllocateAligned<T>(N);
-    auto max_lanes = AllocateAligned<T>(N);
-    RandomState rng;
-
-    const V v00 = Zero(d);
-    const V v01 = Make128(d, 0, 1);
-    const V v10 = Make128(d, 1, 0);
-    const V v11 = Add(v01, v10);
-
-    // Same arg
-    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v00, v00));
-    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v01));
-    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v10, v10));
-    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v11, v11));
-    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v00, v00));
-    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v01, v01));
-    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v10));
-    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v11, v11));
-
-    // Equivalent but not equal (chooses second arg)
-    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v00, v01));
-    HWY_ASSERT_VEC_EQ(d, v11, Min128Upper(d, v10, v11));
-    HWY_ASSERT_VEC_EQ(d, v00, Min128Upper(d, v01, v00));
-    HWY_ASSERT_VEC_EQ(d, v10, Min128Upper(d, v11, v10));
-    HWY_ASSERT_VEC_EQ(d, v00, Max128Upper(d, v01, v00));
-    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v11, v10));
-    HWY_ASSERT_VEC_EQ(d, v01, Max128Upper(d, v00, v01));
-    HWY_ASSERT_VEC_EQ(d, v11, Max128Upper(d, v10, v11));
-
-    // First arg less
-    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v01, v10));
-    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v01, v10));
-
-    // Second arg less
-    HWY_ASSERT_VEC_EQ(d, v01, Min128Upper(d, v10, v01));
-    HWY_ASSERT_VEC_EQ(d, v10, Max128Upper(d, v10, v01));
-
-    // Also check 128-bit blocks are independent
-    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        a_lanes[i] = Random64(&rng);
-        b_lanes[i] = Random64(&rng);
-      }
-      const V a = Load(d, a_lanes.get());
-      const V b = Load(d, b_lanes.get());
-      for (size_t i = 0; i < N; i += 2) {
-        const bool lt = a_lanes[i + 1] < b_lanes[i + 1];
-        min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0];
-        min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1];
-        max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0];
-        max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1];
-      }
-      HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128Upper(d, a, b));
-      HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128Upper(d, a, b));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllMinMax128Upper() {
-  ForGEVectors<128, TestMinMax128Upper>()(uint64_t());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyArithmeticTest);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128);
-HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128Upper);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/blockwise_shift_test.cc b/third_party/highway/hwy/tests/blockwise_shift_test.cc
deleted file mode 100644 (file)
index d14fb86..0000000
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcpy
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/blockwise_shift_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestShiftBytes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // Scalar does not define Shift*Bytes.
-#if HWY_TARGET != HWY_SCALAR || HWY_IDE
-    const Repartition<uint8_t, D> du8;
-    const size_t N8 = Lanes(du8);
-
-    // Zero remains zero
-    const auto v0 = Zero(d);
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(v0));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, v0));
-
-    // Zero after shifting out the high/low byte
-    auto bytes = AllocateAligned<uint8_t>(N8);
-    std::fill(bytes.get(), bytes.get() + N8, 0);
-    bytes[N8 - 1] = 0x7F;
-    const auto vhi = BitCast(d, Load(du8, bytes.get()));
-    bytes[N8 - 1] = 0;
-    bytes[0] = 0x7F;
-    const auto vlo = BitCast(d, Load(du8, bytes.get()));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(vhi));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftLeftBytes<1>(d, vhi));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightBytes<1>(d, vlo));
-
-    // Check expected result with Iota
-    const size_t N = Lanes(d);
-    auto in = AllocateAligned<T>(N);
-    const uint8_t* in_bytes = reinterpret_cast<const uint8_t*>(in.get());
-    const auto v = BitCast(d, Iota(du8, 1));
-    Store(v, d, in.get());
-
-    auto expected = AllocateAligned<T>(N);
-    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
-
-    const size_t block_size = HWY_MIN(N8, 16);
-    for (size_t block = 0; block < N8; block += block_size) {
-      expected_bytes[block] = 0;
-      memcpy(expected_bytes + block + 1, in_bytes + block, block_size - 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(v));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftBytes<1>(d, v));
-
-    for (size_t block = 0; block < N8; block += block_size) {
-      memcpy(expected_bytes + block, in_bytes + block + 1, block_size - 1);
-      expected_bytes[block + block_size - 1] = 0;
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightBytes<1>(d, v));
-#else
-    (void)d;
-#endif  // #if HWY_TARGET != HWY_SCALAR
-  }
-};
-
-HWY_NOINLINE void TestAllShiftBytes() {
-  ForIntegerTypes(ForPartialVectors<TestShiftBytes>());
-}
-
-struct TestShiftLeftLanes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // Scalar does not define Shift*Lanes.
-#if HWY_TARGET != HWY_SCALAR || HWY_IDE
-    const auto v = Iota(d, T(1));
-    const size_t N = Lanes(d);
-    if (N == 1) return;
-    auto expected = AllocateAligned<T>(N);
-
-    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(v));
-    HWY_ASSERT_VEC_EQ(d, v, ShiftLeftLanes<0>(d, v));
-
-    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = (i % kLanesPerBlock) == 0 ? T(0) : T(i);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(v));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftLanes<1>(d, v));
-#else
-    (void)d;
-#endif  // #if HWY_TARGET != HWY_SCALAR
-  }
-};
-
-struct TestShiftRightLanes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // Scalar does not define Shift*Lanes.
-#if HWY_TARGET != HWY_SCALAR || HWY_IDE
-    const auto v = Iota(d, T(1));
-    const size_t N = Lanes(d);
-    if (N == 1) return;
-    auto expected = AllocateAligned<T>(N);
-
-    HWY_ASSERT_VEC_EQ(d, v, ShiftRightLanes<0>(d, v));
-
-    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
-
-    for (size_t i = 0; i < N; ++i) {
-      const size_t mod = i % kLanesPerBlock;
-      expected[i] = mod == (kLanesPerBlock - 1) || i >= N - 1 ? T(0) : T(2 + i);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightLanes<1>(d, v));
-#else
-    (void)d;
-#endif  // #if HWY_TARGET != HWY_SCALAR
-  }
-};
-
-HWY_NOINLINE void TestAllShiftLeftLanes() {
-  ForAllTypes(ForPartialVectors<TestShiftLeftLanes>());
-}
-
-HWY_NOINLINE void TestAllShiftRightLanes() {
-  ForAllTypes(ForPartialVectors<TestShiftRightLanes>());
-}
-
-// Scalar does not define CombineShiftRightBytes.
-#if HWY_TARGET != HWY_SCALAR || HWY_IDE
-
-template <int kBytes>
-struct TestCombineShiftRightBytes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T, D d) {
-    constexpr size_t kBlockSize = 16;
-    static_assert(kBytes < kBlockSize, "Shift count is per block");
-    const Repartition<uint8_t, D> d8;
-    const size_t N8 = Lanes(d8);
-    if (N8 < 16) return;
-    auto hi_bytes = AllocateAligned<uint8_t>(N8);
-    auto lo_bytes = AllocateAligned<uint8_t>(N8);
-    auto expected_bytes = AllocateAligned<uint8_t>(N8);
-    uint8_t combined[2 * kBlockSize];
-
-    // Random inputs in each lane
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
-      for (size_t i = 0; i < N8; ++i) {
-        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
-        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
-      }
-      for (size_t i = 0; i < N8; i += kBlockSize) {
-        // Arguments are not the same size.
-        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
-        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
-        CopyBytes<kBlockSize>(combined + kBytes, &expected_bytes[i]);
-      }
-
-      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
-      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
-      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
-      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo));
-    }
-  }
-};
-
-template <int kLanes>
-struct TestCombineShiftRightLanes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T, D d) {
-    const Repartition<uint8_t, D> d8;
-    const size_t N8 = Lanes(d8);
-    if (N8 < 16) return;
-
-    auto hi_bytes = AllocateAligned<uint8_t>(N8);
-    auto lo_bytes = AllocateAligned<uint8_t>(N8);
-    auto expected_bytes = AllocateAligned<uint8_t>(N8);
-    constexpr size_t kBlockSize = 16;
-    uint8_t combined[2 * kBlockSize];
-
-    // Random inputs in each lane
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(100); ++rep) {
-      for (size_t i = 0; i < N8; ++i) {
-        hi_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
-        lo_bytes[i] = static_cast<uint8_t>(Random64(&rng) & 0xFF);
-      }
-      for (size_t i = 0; i < N8; i += kBlockSize) {
-        // Arguments are not the same size.
-        CopyBytes<kBlockSize>(&lo_bytes[i], combined);
-        CopyBytes<kBlockSize>(&hi_bytes[i], combined + kBlockSize);
-        CopyBytes<kBlockSize>(combined + kLanes * sizeof(T),
-                              &expected_bytes[i]);
-      }
-
-      const auto hi = BitCast(d, Load(d8, hi_bytes.get()));
-      const auto lo = BitCast(d, Load(d8, lo_bytes.get()));
-      const auto expected = BitCast(d, Load(d8, expected_bytes.get()));
-      HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo));
-    }
-  }
-};
-
-#endif  // #if HWY_TARGET != HWY_SCALAR
-
-struct TestCombineShiftRight {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-// Scalar does not define CombineShiftRightBytes.
-#if HWY_TARGET != HWY_SCALAR || HWY_IDE
-    constexpr int kMaxBytes =
-        HWY_MIN(16, static_cast<int>(MaxLanes(d) * sizeof(T)));
-    constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T));
-    TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d);
-    TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d);
-    TestCombineShiftRightBytes<1>()(t, d);
-
-    TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d);
-    TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, -1)>()(t, d);
-    TestCombineShiftRightLanes<1>()(t, d);
-#else
-    (void)t;
-    (void)d;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllCombineShiftRight() {
-  // Need at least 2 lanes.
-  ForAllTypes(ForShrinkableVectors<TestCombineShiftRight>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyBlockwiseShiftTest);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftBytes);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftLeftLanes);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllShiftRightLanes);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseShiftTest, TestAllCombineShiftRight);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/blockwise_test.cc b/third_party/highway/hwy/tests/blockwise_test.cc
deleted file mode 100644 (file)
index 41097ee..0000000
+++ /dev/null
@@ -1,452 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <typename D, int kLane>
-struct TestBroadcastR {
-  HWY_NOINLINE void operator()() const {
-    using T = typename D::T;
-    const D d;
-    const size_t N = Lanes(d);
-    if (kLane >= N) return;
-    auto in_lanes = AllocateAligned<T>(N);
-    std::fill(in_lanes.get(), in_lanes.get() + N, T(0));
-    const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
-    // Need to set within each 128-bit block
-    for (size_t block = 0; block < N; block += blockN) {
-      in_lanes[block + kLane] = static_cast<T>(block + 1);
-    }
-    const auto in = Load(d, in_lanes.get());
-    auto expected = AllocateAligned<T>(N);
-    for (size_t block = 0; block < N; block += blockN) {
-      for (size_t i = 0; i < blockN; ++i) {
-        expected[block + i] = T(block + 1);
-      }
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));
-
-    TestBroadcastR<D, kLane - 1>()();
-  }
-};
-
-template <class D>
-struct TestBroadcastR<D, -1> {
-  void operator()() const {}
-};
-
-struct TestBroadcast {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
-  }
-};
-
-HWY_NOINLINE void TestAllBroadcast() {
-  const ForPartialVectors<TestBroadcast> test;
-  // No u/i8.
-  test(uint16_t());
-  test(int16_t());
-  ForUIF3264(test);
-}
-
-template <bool kFull>
-struct ChooseTableSize {
-  template <typename T, typename DIdx>
-  using type = DIdx;
-};
-template <>
-struct ChooseTableSize<true> {
-  template <typename T, typename DIdx>
-  using type = ScalableTag<T>;
-};
-
-template <bool kFull>
-struct TestTableLookupBytes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-#if HWY_TARGET != HWY_SCALAR
-    RandomState rng;
-
-    const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
-    const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
-    const size_t NT8 = Lanes(d_tbl8);
-
-    const Repartition<uint8_t, D> d8;
-    const size_t N8 = Lanes(d8);
-
-    // Random input bytes
-    auto in_bytes = AllocateAligned<uint8_t>(NT8);
-    for (size_t i = 0; i < NT8; ++i) {
-      in_bytes[i] = Random32(&rng) & 0xFF;
-    }
-    const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));
-
-    // Enough test data; for larger vectors, upper lanes will be zero.
-    const uint8_t index_bytes_source[64] = {
-        // Same index as source, multiple outputs from same input,
-        // unused input (9), ascending/descending and nonconsecutive neighbors.
-        0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
-        11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
-        4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
-    auto index_bytes = AllocateAligned<uint8_t>(N8);
-    const size_t max_index = HWY_MIN(NT8, 16) - 1;
-    for (size_t i = 0; i < N8; ++i) {
-      index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
-      // Avoid asan error for partial vectors.
-      index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
-    }
-    const auto indices = Load(d, reinterpret_cast<const T*>(index_bytes.get()));
-
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
-
-    for (size_t block = 0; block < N8; block += 16) {
-      for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
-        const uint8_t index = index_bytes[block + i];
-        HWY_ASSERT(index <= max_index);
-        // Note that block + index may exceed NT8 on RVV, which is fine because
-        // the operation uses the larger of the table and index vector size.
-        HWY_ASSERT(block + index < HWY_MAX(N8, NT8));
-        // For large vectors, the lane index may wrap around due to block,
-        // also wrap around after 8-bit overflow.
-        expected_bytes[block + i] =
-            in_bytes[(block + index) % HWY_MIN(NT8, 256)];
-      }
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices));
-
-    // Individually test zeroing each byte position.
-    for (size_t i = 0; i < N8; ++i) {
-      const uint8_t prev_expected = expected_bytes[i];
-      const uint8_t prev_index = index_bytes[i];
-      expected_bytes[i] = 0;
-
-      const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
-      HWY_ASSERT(0x80 <= idx && idx < 256);
-      index_bytes[i] = static_cast<uint8_t>(idx);
-
-      const auto indices =
-          Load(d, reinterpret_cast<const T*>(index_bytes.get()));
-      HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices));
-      expected_bytes[i] = prev_expected;
-      index_bytes[i] = prev_index;
-    }
-#else
-    (void)d;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllTableLookupBytesSame() {
-  // Partial index, same-sized table.
-  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
-}
-
-HWY_NOINLINE void TestAllTableLookupBytesMixed() {
-  // Partial index, full-size table.
-  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
-}
-
-struct TestInterleaveLower {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using TU = MakeUnsigned<T>;
-    const size_t N = Lanes(d);
-    auto even_lanes = AllocateAligned<T>(N);
-    auto odd_lanes = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      even_lanes[i] = static_cast<T>(2 * i + 0);
-      odd_lanes[i] = static_cast<T>(2 * i + 1);
-    }
-    const auto even = Load(d, even_lanes.get());
-    const auto odd = Load(d, odd_lanes.get());
-
-    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
-    for (size_t i = 0; i < Lanes(d); ++i) {
-      const size_t block = i / blockN;
-      const size_t index = (i % blockN) + block * 2 * blockN;
-      expected[i] = static_cast<T>(index & LimitsMax<TU>());
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
-  }
-};
-
-struct TestInterleaveUpper {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    if (N == 1) return;
-    auto even_lanes = AllocateAligned<T>(N);
-    auto odd_lanes = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      even_lanes[i] = static_cast<T>(2 * i + 0);
-      odd_lanes[i] = static_cast<T>(2 * i + 1);
-    }
-    const auto even = Load(d, even_lanes.get());
-    const auto odd = Load(d, odd_lanes.get());
-
-    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
-    for (size_t i = 0; i < Lanes(d); ++i) {
-      const size_t block = i / blockN;
-      expected[i] = T((i % blockN) + block * 2 * blockN + blockN);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
-  }
-};
-
-HWY_NOINLINE void TestAllInterleave() {
-  // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
-  ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
-  ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
-}
-
-struct TestZipLower {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using WideT = MakeWide<T>;
-    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
-    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
-    const size_t N = Lanes(d);
-    auto even_lanes = AllocateAligned<T>(N);
-    auto odd_lanes = AllocateAligned<T>(N);
-    // At least 2 lanes for HWY_SCALAR
-    auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2));
-    const T kMaxT = LimitsMax<T>();
-    for (size_t i = 0; i < N; ++i) {
-      even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
-      odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
-    }
-    const auto even = Load(d, even_lanes.get());
-    const auto odd = Load(d, odd_lanes.get());
-
-    const Repartition<WideT, D> dw;
-#if HWY_TARGET == HWY_SCALAR
-    // Safely handle big-endian
-    const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
-#else
-    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
-    for (size_t i = 0; i < N; i += 2) {
-      const size_t base = (i / blockN) * blockN;
-      const size_t mod = i % blockN;
-      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
-      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
-    }
-    const auto expected =
-        Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
-#endif  // HWY_TARGET == HWY_SCALAR
-    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
-    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
-  }
-};
-
-HWY_NOINLINE void TestAllZipLower() {
-  const ForDemoteVectors<TestZipLower> lower_unsigned;
-  lower_unsigned(uint8_t());
-  lower_unsigned(uint16_t());
-#if HWY_HAVE_INTEGER64
-  lower_unsigned(uint32_t());  // generates u64
-#endif
-
-  const ForDemoteVectors<TestZipLower> lower_signed;
-  lower_signed(int8_t());
-  lower_signed(int16_t());
-#if HWY_HAVE_INTEGER64
-  lower_signed(int32_t());  // generates i64
-#endif
-
-  // No float - concatenating f32 does not result in a f64
-}
-
-// Remove this test (so it does not show as having run) if the only target is
-// HWY_SCALAR, which does not support this op.
-#if HWY_TARGETS != HWY_SCALAR
-
-struct TestZipUpper {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-#if HWY_TARGET == HWY_SCALAR
-    (void)d;
-#else
-    using WideT = MakeWide<T>;
-    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
-    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
-    const size_t N = Lanes(d);
-    if (N < 16 / sizeof(T)) return;
-    auto even_lanes = AllocateAligned<T>(N);
-    auto odd_lanes = AllocateAligned<T>(N);
-    auto zip_lanes = AllocateAligned<T>(N);
-    const T kMaxT = LimitsMax<T>();
-    for (size_t i = 0; i < N; ++i) {
-      even_lanes[i] = static_cast<T>((2 * i + 0) & kMaxT);
-      odd_lanes[i] = static_cast<T>((2 * i + 1) & kMaxT);
-    }
-    const auto even = Load(d, even_lanes.get());
-    const auto odd = Load(d, odd_lanes.get());
-
-    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
-
-    for (size_t i = 0; i < N; i += 2) {
-      const size_t base = (i / blockN) * blockN + blockN / 2;
-      const size_t mod = i % blockN;
-      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
-      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
-    }
-    const Repartition<WideT, D> dw;
-    const auto expected =
-        Load(dw, reinterpret_cast<const WideT*>(zip_lanes.get()));
-    HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
-#endif  // HWY_TARGET == HWY_SCALAR
-  }
-};
-
-HWY_NOINLINE void TestAllZipUpper() {
-  const ForShrinkableVectors<TestZipUpper> upper_unsigned;
-  upper_unsigned(uint8_t());
-  upper_unsigned(uint16_t());
-#if HWY_HAVE_INTEGER64
-  upper_unsigned(uint32_t());  // generates u64
-#endif
-
-  const ForShrinkableVectors<TestZipUpper> upper_signed;
-  upper_signed(int8_t());
-  upper_signed(int16_t());
-#if HWY_HAVE_INTEGER64
-  upper_signed(int32_t());  // generates i64
-#endif
-
-  // No float - concatenating f32 does not result in a f64
-}
-
-#endif  // HWY_TARGETS != HWY_SCALAR
-
-class TestSpecialShuffle32 {
- public:
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Iota(d, 0);
-    VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
-    VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
-    VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
-    VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
-    VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
-  }
-
- private:
-  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
-  // 128 bits (the NEON register) of actual are zero.
-  template <class D, class V>
-  HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3,
-                                const size_t i2, const size_t i1,
-                                const size_t i0, const char* filename,
-                                const int line) {
-    using T = TFromD<D>;
-    constexpr size_t kBlockN = 16 / sizeof(T);
-    const size_t N = Lanes(d);
-    if (N < 4) return;
-    auto expected = AllocateAligned<T>(N);
-    for (size_t block = 0; block < N; block += kBlockN) {
-      expected[block + 3] = static_cast<T>(block + i3);
-      expected[block + 2] = static_cast<T>(block + i2);
-      expected[block + 1] = static_cast<T>(block + i1);
-      expected[block + 0] = static_cast<T>(block + i0);
-    }
-    AssertVecEqual(d, expected.get(), actual, filename, line);
-  }
-};
-
-class TestSpecialShuffle64 {
- public:
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Iota(d, 0);
-    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
-  }
-
- private:
-  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
-  // 128 bits (the NEON register) of actual are zero.
-  template <class D, class V>
-  HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1,
-                                const size_t i0, const char* filename,
-                                const int line) {
-    using T = TFromD<D>;
-    constexpr size_t kBlockN = 16 / sizeof(T);
-    const size_t N = Lanes(d);
-    if (N < 2) return;
-    auto expected = AllocateAligned<T>(N);
-    for (size_t block = 0; block < N; block += kBlockN) {
-      expected[block + 1] = static_cast<T>(block + i1);
-      expected[block + 0] = static_cast<T>(block + i0);
-    }
-    AssertVecEqual(d, expected.get(), actual, filename, line);
-  }
-};
-
-HWY_NOINLINE void TestAllSpecialShuffles() {
-  const ForGEVectors<128, TestSpecialShuffle32> test32;
-  test32(uint32_t());
-  test32(int32_t());
-  test32(float());
-
-#if HWY_HAVE_INTEGER64
-  const ForGEVectors<128, TestSpecialShuffle64> test64;
-  test64(uint64_t());
-  test64(int64_t());
-#endif
-
-#if HWY_HAVE_FLOAT64
-  const ForGEVectors<128, TestSpecialShuffle64> test_d;
-  test_d(double());
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyBlockwiseTest);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
-#if HWY_TARGETS != HWY_SCALAR
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
-#endif
-HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/combine_test.cc b/third_party/highway/hwy/tests/combine_test.cc
deleted file mode 100644 (file)
index b99f07a..0000000
+++ /dev/null
@@ -1,273 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcpy
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/combine_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestLowerHalf {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const Half<D> d2;
-
-    const size_t N = Lanes(d);
-    auto lanes = AllocateAligned<T>(N);
-    auto lanes2 = AllocateAligned<T>(N);
-    std::fill(lanes.get(), lanes.get() + N, T(0));
-    std::fill(lanes2.get(), lanes2.get() + N, T(0));
-    const auto v = Iota(d, 1);
-    Store(LowerHalf(d2, v), d2, lanes.get());
-    Store(LowerHalf(v), d2, lanes2.get());  // optionally without D
-    size_t i = 0;
-    for (; i < Lanes(d2); ++i) {
-      HWY_ASSERT_EQ(T(1 + i), lanes[i]);
-      HWY_ASSERT_EQ(T(1 + i), lanes2[i]);
-    }
-    // Other half remains unchanged
-    for (; i < N; ++i) {
-      HWY_ASSERT_EQ(T(0), lanes[i]);
-      HWY_ASSERT_EQ(T(0), lanes2[i]);
-    }
-  }
-};
-
-struct TestLowerQuarter {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const Half<D> d2;
-    const Half<decltype(d2)> d4;
-
-    const size_t N = Lanes(d);
-    auto lanes = AllocateAligned<T>(N);
-    auto lanes2 = AllocateAligned<T>(N);
-    std::fill(lanes.get(), lanes.get() + N, T(0));
-    std::fill(lanes2.get(), lanes2.get() + N, T(0));
-    const auto v = Iota(d, 1);
-    const auto lo = LowerHalf(d4, LowerHalf(d2, v));
-    const auto lo2 = LowerHalf(LowerHalf(v));  // optionally without D
-    Store(lo, d4, lanes.get());
-    Store(lo2, d4, lanes2.get());
-    size_t i = 0;
-    for (; i < Lanes(d4); ++i) {
-      HWY_ASSERT_EQ(T(i + 1), lanes[i]);
-      HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
-    }
-    // Upper 3/4 remain unchanged
-    for (; i < N; ++i) {
-      HWY_ASSERT_EQ(T(0), lanes[i]);
-      HWY_ASSERT_EQ(T(0), lanes2[i]);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLowerHalf() {
-  ForAllTypes(ForHalfVectors<TestLowerHalf>());
-
-  // The minimum vector size is 128 bits, so there's no guarantee we can have
-  // quarters of 64-bit lanes, hence test 'all' other types.
-  ForHalfVectors<TestLowerQuarter, 2> test_quarter;
-  ForUI8(test_quarter);
-  ForUI16(test_quarter);  // exclude float16_t - cannot compare
-  ForUIF32(test_quarter);
-}
-
-struct TestUpperHalf {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // Scalar does not define UpperHalf.
-#if HWY_TARGET != HWY_SCALAR
-    const Half<D> d2;
-    const size_t N2 = Lanes(d2);
-    HWY_ASSERT(N2 * 2 == Lanes(d));
-    auto expected = AllocateAligned<T>(N2);
-    size_t i = 0;
-    for (; i < N2; ++i) {
-      expected[i] = static_cast<T>(N2 + 1 + i);
-    }
-    HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1)));
-#else
-    (void)d;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllUpperHalf() {
-  ForAllTypes(ForHalfVectors<TestUpperHalf>());
-}
-
-struct TestZeroExtendVector {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const Twice<D> d2;
-
-    const auto v = Iota(d, 1);
-    const size_t N = Lanes(d);
-    const size_t N2 = Lanes(d2);
-    // If equal, then N was already MaxLanes(d) and it's not clear what
-    // Combine or ZeroExtendVector should return.
-    if (N2 == N) return;
-    HWY_ASSERT(N2 == 2 * N);
-    auto lanes = AllocateAligned<T>(N2);
-    Store(v, d, &lanes[0]);
-    Store(v, d, &lanes[N]);
-
-    const auto ext = ZeroExtendVector(d2, v);
-    Store(ext, d2, lanes.get());
-
-    // Lower half is unchanged
-    HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0]));
-    // Upper half is zero
-    HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N]));
-  }
-};
-
-HWY_NOINLINE void TestAllZeroExtendVector() {
-  ForAllTypes(ForExtendableVectors<TestZeroExtendVector>());
-}
-
-struct TestCombine {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const Twice<D> d2;
-    const size_t N2 = Lanes(d2);
-    auto lanes = AllocateAligned<T>(N2);
-
-    const auto lo = Iota(d, 1);
-    const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1));
-    const auto combined = Combine(d2, hi, lo);
-    Store(combined, d2, lanes.get());
-
-    const auto expected = Iota(d2, 1);
-    HWY_ASSERT_VEC_EQ(d2, expected, combined);
-  }
-};
-
-HWY_NOINLINE void TestAllCombine() {
-  ForAllTypes(ForExtendableVectors<TestCombine>());
-}
-
-struct TestConcat {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    if (N == 1) return;
-    const size_t half_bytes = N * sizeof(T) / 2;
-
-    auto hi = AllocateAligned<T>(N);
-    auto lo = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<T>(N);
-    RandomState rng;
-    for (size_t rep = 0; rep < 10; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        hi[i] = static_cast<T>(Random64(&rng) & 0xFF);
-        lo[i] = static_cast<T>(Random64(&rng) & 0xFF);
-      }
-
-      {
-        memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
-        memcpy(&expected[0], &lo[0], half_bytes);
-        const auto vhi = Load(d, hi.get());
-        const auto vlo = Load(d, lo.get());
-        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperLower(d, vhi, vlo));
-      }
-
-      {
-        memcpy(&expected[N / 2], &hi[N / 2], half_bytes);
-        memcpy(&expected[0], &lo[N / 2], half_bytes);
-        const auto vhi = Load(d, hi.get());
-        const auto vlo = Load(d, lo.get());
-        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatUpperUpper(d, vhi, vlo));
-      }
-
-      {
-        memcpy(&expected[N / 2], &hi[0], half_bytes);
-        memcpy(&expected[0], &lo[N / 2], half_bytes);
-        const auto vhi = Load(d, hi.get());
-        const auto vlo = Load(d, lo.get());
-        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerUpper(d, vhi, vlo));
-      }
-
-      {
-        memcpy(&expected[N / 2], &hi[0], half_bytes);
-        memcpy(&expected[0], &lo[0], half_bytes);
-        const auto vhi = Load(d, hi.get());
-        const auto vlo = Load(d, lo.get());
-        HWY_ASSERT_VEC_EQ(d, expected.get(), ConcatLowerLower(d, vhi, vlo));
-      }
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllConcat() {
-  ForAllTypes(ForShrinkableVectors<TestConcat>());
-}
-
-struct TestConcatOddEven {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-#if HWY_TARGET != HWY_SCALAR
-    const size_t N = Lanes(d);
-    const auto hi = Iota(d, static_cast<T>(N));
-    const auto lo = Iota(d, 0);
-    const auto even = Add(Iota(d, 0), Iota(d, 0));
-    const auto odd = Add(even, Set(d, 1));
-    HWY_ASSERT_VEC_EQ(d, odd, ConcatOdd(d, hi, lo));
-    HWY_ASSERT_VEC_EQ(d, even, ConcatEven(d, hi, lo));
-
-    // This test catches inadvertent saturation.
-    const auto min = Set(d, LowestValue<T>());
-    const auto max = Set(d, HighestValue<T>());
-    HWY_ASSERT_VEC_EQ(d, max, ConcatOdd(d, max, max));
-    HWY_ASSERT_VEC_EQ(d, max, ConcatEven(d, max, max));
-    HWY_ASSERT_VEC_EQ(d, min, ConcatOdd(d, min, min));
-    HWY_ASSERT_VEC_EQ(d, min, ConcatEven(d, min, min));
-#else
-    (void)d;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllConcatOddEven() {
-  ForAllTypes(ForShrinkableVectors<TestConcatOddEven>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyCombineTest);
-HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllLowerHalf);
-HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllUpperHalf);
-HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllZeroExtendVector);
-HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllCombine);
-HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcat);
-HWY_EXPORT_AND_TEST_P(HwyCombineTest, TestAllConcatOddEven);
-}  // namespace hwy
-
-#endif  // HWY_ONCE
diff --git a/third_party/highway/hwy/tests/compare_test.cc b/third_party/highway/hwy/tests/compare_test.cc
deleted file mode 100644 (file)
index a159ecb..0000000
+++ /dev/null
@@ -1,459 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memset
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/compare_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// All types.
-struct TestEquality {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v2 = Iota(d, 2);
-    const auto v2b = Iota(d, 2);
-    const auto v3 = Iota(d, 3);
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v2, v3));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq(v3, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq(v2, v2b));
-
-    HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v2, v3));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Ne(v3, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Ne(v2, v2b));
-  }
-};
-
-HWY_NOINLINE void TestAllEquality() {
-  ForAllTypes(ForPartialVectors<TestEquality>());
-}
-
-// a > b should be true, verify that for Gt/Lt and with swapped args.
-template <class D>
-void EnsureGreater(D d, TFromD<D> a, TFromD<D> b, const char* file, int line) {
-  const auto mask_false = MaskFalse(d);
-  const auto mask_true = MaskTrue(d);
-
-  const auto va = Set(d, a);
-  const auto vb = Set(d, b);
-  AssertMaskEqual(d, mask_true, Gt(va, vb), file, line);
-  AssertMaskEqual(d, mask_false, Lt(va, vb), file, line);
-
-  // Swapped order
-  AssertMaskEqual(d, mask_false, Gt(vb, va), file, line);
-  AssertMaskEqual(d, mask_true, Lt(vb, va), file, line);
-
-  // Also ensure irreflexive
-  AssertMaskEqual(d, mask_false, Gt(va, va), file, line);
-  AssertMaskEqual(d, mask_false, Gt(vb, vb), file, line);
-  AssertMaskEqual(d, mask_false, Lt(va, va), file, line);
-  AssertMaskEqual(d, mask_false, Lt(vb, vb), file, line);
-}
-
-#define HWY_ENSURE_GREATER(d, a, b) EnsureGreater(d, a, b, __FILE__, __LINE__)
-
-struct TestStrictUnsigned {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const T max = LimitsMax<T>();
-    const auto v0 = Zero(d);
-    const auto v2 = And(Iota(d, T(2)), Set(d, 255));  // 0..255
-
-    const auto mask_false = MaskFalse(d);
-
-    // Individual values of interest
-    HWY_ENSURE_GREATER(d, 2, 1);
-    HWY_ENSURE_GREATER(d, 1, 0);
-    HWY_ENSURE_GREATER(d, 128, 127);
-    HWY_ENSURE_GREATER(d, max, max / 2);
-    HWY_ENSURE_GREATER(d, max, 1);
-    HWY_ENSURE_GREATER(d, max, 0);
-
-    // Also use Iota to ensure lanes are independent
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v0));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
-  }
-};
-
-HWY_NOINLINE void TestAllStrictUnsigned() {
-  ForUnsignedTypes(ForPartialVectors<TestStrictUnsigned>());
-}
-
-struct TestStrictInt {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const T min = LimitsMin<T>();
-    const T max = LimitsMax<T>();
-    const auto v0 = Zero(d);
-    const auto v2 = And(Iota(d, T(2)), Set(d, 127));  // 0..127
-    const auto vn = Sub(Neg(v2), Set(d, 1));          // -1..-128
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    // Individual values of interest
-    HWY_ENSURE_GREATER(d, 2, 1);
-    HWY_ENSURE_GREATER(d, 1, 0);
-    HWY_ENSURE_GREATER(d, 0, -1);
-    HWY_ENSURE_GREATER(d, -1, -2);
-    HWY_ENSURE_GREATER(d, max, max / 2);
-    HWY_ENSURE_GREATER(d, max, 1);
-    HWY_ENSURE_GREATER(d, max, 0);
-    HWY_ENSURE_GREATER(d, max, -1);
-    HWY_ENSURE_GREATER(d, max, min);
-    HWY_ENSURE_GREATER(d, 0, min);
-    HWY_ENSURE_GREATER(d, min / 2, min);
-
-    // Also use Iota to ensure lanes are independent
-    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
-  }
-};
-
-// S-SSE3 bug (#795): same upper, differing MSB in lower
-struct TestStrictInt64 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto m0 = MaskFalse(d);
-    const auto m1 = MaskTrue(d);
-    HWY_ASSERT_MASK_EQ(d, m0, Lt(Set(d, 0x380000000LL), Set(d, 0x300000001LL)));
-    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000000LL)));
-    HWY_ASSERT_MASK_EQ(d, m1, Lt(Set(d, 0xF00000000LL), Set(d, 0xF80000001LL)));
-  }
-};
-
-HWY_NOINLINE void TestAllStrictInt() {
-  ForSignedTypes(ForPartialVectors<TestStrictInt>());
-  ForPartialVectors<TestStrictInt64>()(int64_t());
-}
-
-struct TestStrictFloat {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const T huge_neg = T(-1E35);
-    const T huge_pos = T(1E36);
-    const auto v0 = Zero(d);
-    const auto v2 = Iota(d, T(2));
-    const auto vn = Neg(v2);
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    // Individual values of interest
-    HWY_ENSURE_GREATER(d, 2, 1);
-    HWY_ENSURE_GREATER(d, 1, 0);
-    HWY_ENSURE_GREATER(d, 0, -1);
-    HWY_ENSURE_GREATER(d, -1, -2);
-    HWY_ENSURE_GREATER(d, huge_pos, 1);
-    HWY_ENSURE_GREATER(d, huge_pos, 0);
-    HWY_ENSURE_GREATER(d, huge_pos, -1);
-    HWY_ENSURE_GREATER(d, huge_pos, huge_neg);
-    HWY_ENSURE_GREATER(d, 0, huge_neg);
-
-    // Also use Iota to ensure lanes are independent
-    HWY_ASSERT_MASK_EQ(d, mask_true, Gt(v2, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt(vn, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, v2));
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v0, v0));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt(vn, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v0, v0));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Gt(vn, vn));
-  }
-};
-
-HWY_NOINLINE void TestAllStrictFloat() {
-  ForFloatTypes(ForPartialVectors<TestStrictFloat>());
-}
-
-struct TestWeakFloat {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v2 = Iota(d, T(2));
-    const auto vn = Iota(d, -T(Lanes(d)));
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, v2));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, vn));
-
-    HWY_ASSERT_MASK_EQ(d, mask_true, Ge(v2, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Le(vn, v2));
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Le(v2, vn));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Ge(vn, v2));
-  }
-};
-
-HWY_NOINLINE void TestAllWeakFloat() {
-  ForFloatTypes(ForPartialVectors<TestWeakFloat>());
-}
-
-template <class D>
-static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) {
-  alignas(16) uint64_t in[2];
-  in[0] = lo;
-  in[1] = hi;
-  return LoadDup128(d, in);
-}
-
-struct TestLt128 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using V = Vec<D>;
-    const V v00 = Zero(d);
-    const V v01 = Make128(d, 0, 1);
-    const V v10 = Make128(d, 1, 0);
-    const V v11 = Add(v01, v10);
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10));
-
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11));
-
-    // Reversed order
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01));
-
-    // Also check 128-bit blocks are independent
-    const V iota = Iota(d, 1);
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01)));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10)));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota));
-
-    // Max value
-    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm));
-  }
-};
-
-HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); }
-
-struct TestLt128Upper {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using V = Vec<D>;
-    const V v00 = Zero(d);
-    const V v01 = Make128(d, 0, 1);
-    const V v10 = Make128(d, 1, 0);
-    const V v11 = Add(v01, v10);
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v10));
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v00, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, v11));
-
-    // Reversed order
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v01, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v10, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, v11, v01));
-
-    // Also check 128-bit blocks are independent
-    const V iota = Iota(d, 1);
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, iota, Add(iota, v01)));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, iota, Add(iota, v10)));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v01), iota));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, Add(iota, v10), iota));
-
-    // Max value
-    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Lt128Upper(d, vm, v11));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v00, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v01, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v10, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Lt128Upper(d, v11, vm));
-  }
-};
-
-HWY_NOINLINE void TestAllLt128Upper() {
-  ForGEVectors<128, TestLt128Upper>()(uint64_t());
-}
-
-struct TestEq128 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using V = Vec<D>;
-    const V v00 = Zero(d);
-    const V v01 = Make128(d, 0, 1);
-    const V v10 = Make128(d, 1, 0);
-    const V v11 = Add(v01, v10);
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v00, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v01, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, v10, v10));
-
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v11));
-
-    // Reversed order
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, v01));
-
-    // Also check 128-bit blocks are independent
-    const V iota = Iota(d, 1);
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v01)));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, iota, Add(iota, v10)));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v01), iota));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, Add(iota, v10), iota));
-
-    // Max value
-    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128(d, vm, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, vm, v11));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v00, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v01, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v10, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128(d, v11, vm));
-  }
-};
-
-HWY_NOINLINE void TestAllEq128() { ForGEVectors<128, TestEq128>()(uint64_t()); }
-
-struct TestEq128Upper {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using V = Vec<D>;
-    const V v00 = Zero(d);
-    const V v01 = Make128(d, 0, 1);
-    const V v10 = Make128(d, 1, 0);
-    const V v11 = Add(v01, v10);
-
-    const auto mask_false = MaskFalse(d);
-    const auto mask_true = MaskTrue(d);
-
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v10, v10));
-
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v00, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, v11));
-
-    // Reversed order
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, v01, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, v01));
-
-    // Also check 128-bit blocks are independent
-    const V iota = Iota(d, 1);
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, iota, Add(iota, v01)));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, iota, Add(iota, v10)));
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, Add(iota, v01), iota));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, Add(iota, v10), iota));
-
-    // Max value
-    const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>());
-    HWY_ASSERT_MASK_EQ(d, mask_true, Eq128Upper(d, vm, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v00));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v01));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v10));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, vm, v11));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v00, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v01, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v10, vm));
-    HWY_ASSERT_MASK_EQ(d, mask_false, Eq128Upper(d, v11, vm));
-  }
-};
-
-HWY_NOINLINE void TestAllEq128Upper() {
-  ForGEVectors<128, TestEq128Upper>()(uint64_t());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyCompareTest);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEquality);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128Upper);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128);
-HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllEq128Upper);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/compress_test.cc b/third_party/highway/hwy/tests/compress_test.cc
deleted file mode 100644 (file)
index bcb370a..0000000
+++ /dev/null
@@ -1,739 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memset
-
-#include <array>  // IWYU pragma: keep
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/compress_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Regenerate tables used in the implementation, instead of testing.
-#define HWY_PRINT_TABLES 0
-
-#if !HWY_PRINT_TABLES || HWY_IDE
-
-template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>>
-void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos,
-                 size_t num_to_check, const AlignedFreeUniquePtr<T[]>& in,
-                 const AlignedFreeUniquePtr<TI[]>& mask_lanes,
-                 const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u,
-                 int line) {
-  if (expected_pos != actual_pos) {
-    hwy::Abort(__FILE__, line, "Size mismatch for %s: expected %d, actual %d\n",
-               TypeName(T(), Lanes(d)).c_str(), static_cast<int>(expected_pos),
-               static_cast<int>(actual_pos));
-  }
-  // Modified from AssertVecEqual - we may not be checking all lanes.
-  for (size_t i = 0; i < num_to_check; ++i) {
-    if (!IsEqual(expected[i], actual_u[i])) {
-      const size_t N = Lanes(d);
-      fprintf(stderr, "Mismatch at i=%d of %d, line %d:\n\n",
-              static_cast<int>(i), static_cast<int>(num_to_check), line);
-      Print(di, "mask", Load(di, mask_lanes.get()), 0, N);
-      Print(d, "in", Load(d, in.get()), 0, N);
-      Print(d, "expect", Load(d, expected.get()), 0, N);
-      Print(d, "actual", Load(d, actual_u), 0, N);
-      HWY_ASSERT(false);
-    }
-  }
-}
-
-struct TestCompress {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-
-    const T zero{0};
-
-    for (int frac : {0, 2, 3}) {
-      // For CompressStore
-      const size_t misalign = static_cast<size_t>(frac) * N / 4;
-
-      auto in_lanes = AllocateAligned<T>(N);
-      auto mask_lanes = AllocateAligned<TI>(N);
-      auto expected = AllocateAligned<T>(N);
-      auto actual_a = AllocateAligned<T>(misalign + N);
-      T* actual_u = actual_a.get() + misalign;
-
-      const size_t bits_size = RoundUpTo((N + 7) / 8, 8);
-      auto bits = AllocateAligned<uint8_t>(bits_size);
-      memset(bits.get(), 0, bits_size);  // for MSAN
-
-      // Each lane should have a chance of having mask=true.
-      for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-        size_t expected_pos = 0;
-        for (size_t i = 0; i < N; ++i) {
-          const uint64_t bits = Random32(&rng);
-          in_lanes[i] = T();  // cannot initialize float16_t directly.
-          CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);  // not same size
-          mask_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
-          if (mask_lanes[i] > 0) {
-            expected[expected_pos++] = in_lanes[i];
-          }
-        }
-        size_t num_to_check;
-        if (CompressIsPartition<T>::value) {
-          // For non-native Compress, also check that mask=false lanes were
-          // moved to the back of the vector (highest indices).
-          size_t extra = expected_pos;
-          for (size_t i = 0; i < N; ++i) {
-            if (mask_lanes[i] == 0) {
-              expected[extra++] = in_lanes[i];
-            }
-          }
-          HWY_ASSERT(extra == N);
-          num_to_check = N;
-        } else {
-          // For native Compress, only the mask=true lanes are defined.
-          num_to_check = expected_pos;
-        }
-
-        const auto in = Load(d, in_lanes.get());
-        const auto mask =
-            RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
-        StoreMaskBits(d, mask, bits.get());
-
-        // Compress
-        memset(actual_u, 0, N * sizeof(T));
-        StoreU(Compress(in, mask), d, actual_u);
-        CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
-                    mask_lanes, expected, actual_u, __LINE__);
-
-        // CompressNot
-        memset(actual_u, 0, N * sizeof(T));
-        StoreU(CompressNot(in, Not(mask)), d, actual_u);
-        CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
-                    mask_lanes, expected, actual_u, __LINE__);
-
-        // CompressStore
-        memset(actual_u, 0, N * sizeof(T));
-        const size_t size1 = CompressStore(in, mask, d, actual_u);
-        // expected_pos instead of num_to_check because this op is not
-        // affected by CompressIsPartition.
-        CheckStored(d, di, expected_pos, size1, expected_pos, in_lanes,
-                    mask_lanes, expected, actual_u, __LINE__);
-
-        // CompressBlendedStore
-        memset(actual_u, 0, N * sizeof(T));
-        const size_t size2 = CompressBlendedStore(in, mask, d, actual_u);
-        // expected_pos instead of num_to_check because this op only writes
-        // the mask=true lanes.
-        CheckStored(d, di, expected_pos, size2, expected_pos, in_lanes,
-                    mask_lanes, expected, actual_u, __LINE__);
-        // Subsequent lanes are untouched.
-        for (size_t i = size2; i < N; ++i) {
-          HWY_ASSERT_EQ(zero, actual_u[i]);
-        }
-
-        // CompressBits
-        memset(actual_u, 0, N * sizeof(T));
-        StoreU(CompressBits(in, bits.get()), d, actual_u);
-        CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
-                    mask_lanes, expected, actual_u, __LINE__);
-
-        // CompressBitsStore
-        memset(actual_u, 0, N * sizeof(T));
-        const size_t size3 = CompressBitsStore(in, bits.get(), d, actual_u);
-        // expected_pos instead of num_to_check because this op is not
-        // affected by CompressIsPartition.
-        CheckStored(d, di, expected_pos, size3, expected_pos, in_lanes,
-                    mask_lanes, expected, actual_u, __LINE__);
-      }  // rep
-    }    // frac
-  }      // operator()
-};
-
-HWY_NOINLINE void TestAllCompress() {
-  ForUIF163264(ForPartialVectors<TestCompress>());
-}
-
-struct TestCompressBlocks {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-#if HWY_TARGET == HWY_SCALAR
-    (void)d;
-#else
-    static_assert(sizeof(T) == 8 && !IsSigned<T>(), "Should be u64");
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-
-    auto in_lanes = AllocateAligned<T>(N);
-    auto mask_lanes = AllocateAligned<TI>(N);
-    auto expected = AllocateAligned<T>(N);
-    auto actual = AllocateAligned<T>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      size_t expected_pos = 0;
-      for (size_t i = 0; i < N; i += 2) {
-        const uint64_t bits = Random32(&rng);
-        in_lanes[i + 1] = in_lanes[i] = T();  // cannot set float16_t directly.
-        CopyBytes<sizeof(T)>(&bits, &in_lanes[i]);      // not same size
-        CopyBytes<sizeof(T)>(&bits, &in_lanes[i + 1]);  // not same size
-        mask_lanes[i + 1] = mask_lanes[i] = TI{(Random32(&rng) & 8) ? 1 : 0};
-        if (mask_lanes[i] > 0) {
-          expected[expected_pos++] = in_lanes[i];
-          expected[expected_pos++] = in_lanes[i + 1];
-        }
-      }
-      size_t num_to_check;
-      if (CompressIsPartition<T>::value) {
-        // For non-native Compress, also check that mask=false lanes were
-        // moved to the back of the vector (highest indices).
-        size_t extra = expected_pos;
-        for (size_t i = 0; i < N; ++i) {
-          if (mask_lanes[i] == 0) {
-            expected[extra++] = in_lanes[i];
-          }
-        }
-        HWY_ASSERT(extra == N);
-        num_to_check = N;
-      } else {
-        // For native Compress, only the mask=true lanes are defined.
-        num_to_check = expected_pos;
-      }
-
-      const auto in = Load(d, in_lanes.get());
-      const auto mask = RebindMask(d, Gt(Load(di, mask_lanes.get()), Zero(di)));
-
-      // CompressBlocksNot
-      memset(actual.get(), 0, N * sizeof(T));
-      StoreU(CompressBlocksNot(in, Not(mask)), d, actual.get());
-      CheckStored(d, di, expected_pos, expected_pos, num_to_check, in_lanes,
-                  mask_lanes, expected, actual.get(), __LINE__);
-    }  // rep
-#endif  // HWY_TARGET == HWY_SCALAR
-  }     // operator()
-};
-
-HWY_NOINLINE void TestAllCompressBlocks() {
-  ForGE128Vectors<TestCompressBlocks>()(uint64_t());
-}
-
-#endif  // !HWY_PRINT_TABLES
-
-#if HWY_PRINT_TABLES || HWY_IDE
-namespace detail {  // for code folding
-
-void PrintCompress16x8Tables() {
-  printf("======================================= 16x8\n");
-  constexpr size_t N = 8;  // 128-bit SIMD
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<uint8_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Doubled (for converting lane to byte indices)
-    for (size_t i = 0; i < N; ++i) {
-      printf("%d,", 2 * indices[i]);
-    }
-    printf(code & 1 ? "//\n" : "/**/");
-  }
-  printf("\n");
-}
-
-void PrintCompressNot16x8Tables() {
-  printf("======================================= Not 16x8\n");
-  constexpr size_t N = 8;  // 128-bit SIMD
-  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
-    const uint64_t code = ~not_code;
-    std::array<uint8_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Doubled (for converting lane to byte indices)
-    for (size_t i = 0; i < N; ++i) {
-      printf("%d,", 2 * indices[i]);
-    }
-    printf(not_code & 1 ? "//\n" : "/**/");
-  }
-  printf("\n");
-}
-
-// Compressed to nibbles, unpacked via variable right shift
-void PrintCompress32x8Tables() {
-  printf("======================================= 32/64x8\n");
-  constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Convert to nibbles
-    uint64_t packed = 0;
-    for (size_t i = 0; i < N; ++i) {
-      HWY_ASSERT(indices[i] < N);
-      packed += indices[i] << (i * 4);
-    }
-
-    HWY_ASSERT(packed < (1ull << (N * 4)));
-    printf("0x%08x,", static_cast<uint32_t>(packed));
-  }
-  printf("\n");
-}
-
-void PrintCompressNot32x8Tables() {
-  printf("======================================= Not 32/64x8\n");
-  constexpr size_t N = 8;  // AVX2 or 64-bit AVX3
-  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
-    const uint64_t code = ~not_code;
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Convert to nibbles
-    uint64_t packed = 0;
-    for (size_t i = 0; i < N; ++i) {
-      HWY_ASSERT(indices[i] < N);
-      packed += indices[i] << (i * 4);
-    }
-
-    HWY_ASSERT(packed < (1ull << (N * 4)));
-    printf("0x%08x,", static_cast<uint32_t>(packed));
-  }
-  printf("\n");
-}
-
-// Compressed to nibbles (for AVX3 64x4)
-void PrintCompress64x4NibbleTables() {
-  printf("======================================= 64x4Nibble\n");
-  constexpr size_t N = 4;  // AVX2
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Convert to nibbles
-    uint64_t packed = 0;
-    for (size_t i = 0; i < N; ++i) {
-      HWY_ASSERT(indices[i] < N);
-      packed += indices[i] << (i * 4);
-    }
-
-    HWY_ASSERT(packed < (1ull << (N * 4)));
-    printf("0x%08x,", static_cast<uint32_t>(packed));
-  }
-  printf("\n");
-}
-
-void PrintCompressNot64x4NibbleTables() {
-  printf("======================================= Not 64x4Nibble\n");
-  constexpr size_t N = 4;  // AVX2
-  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
-    const uint64_t code = ~not_code;
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Convert to nibbles
-    uint64_t packed = 0;
-    for (size_t i = 0; i < N; ++i) {
-      HWY_ASSERT(indices[i] < N);
-      packed += indices[i] << (i * 4);
-    }
-
-    HWY_ASSERT(packed < (1ull << (N * 4)));
-    printf("0x%08x,", static_cast<uint32_t>(packed));
-  }
-  printf("\n");
-}
-
-void PrintCompress64x4Tables() {
-  printf("======================================= 64x4 uncompressed\n");
-  constexpr size_t N = 4;  // SVE_256
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<size_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Store uncompressed indices because SVE TBL returns 0 if an index is out
-    // of bounds. On AVX3 we simply variable-shift because permute indices are
-    // interpreted modulo N. Compression is not worth the extra shift+AND
-    // because the table is anyway only 512 bytes.
-    for (size_t i = 0; i < N; ++i) {
-      printf("%d,", static_cast<int>(indices[i]));
-    }
-  }
-  printf("\n");
-}
-
-void PrintCompressNot64x4Tables() {
-  printf("======================================= Not 64x4 uncompressed\n");
-  constexpr size_t N = 4;  // SVE_256
-  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
-    const uint64_t code = ~not_code;
-    std::array<size_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Store uncompressed indices because SVE TBL returns 0 if an index is out
-    // of bounds. On AVX3 we simply variable-shift because permute indices are
-    // interpreted modulo N. Compression is not worth the extra shift+AND
-    // because the table is anyway only 512 bytes.
-    for (size_t i = 0; i < N; ++i) {
-      printf("%d,", static_cast<int>(indices[i]));
-    }
-  }
-  printf("\n");
-}
-
-// Same as above, but prints pairs of u32 indices (for AVX2)
-void PrintCompress64x4PairTables() {
-  printf("======================================= 64x4 u32 index\n");
-  constexpr size_t N = 4;  // AVX2
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<size_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Store uncompressed indices because SVE TBL returns 0 if an index is out
-    // of bounds. On AVX3 we simply variable-shift because permute indices are
-    // interpreted modulo N. Compression is not worth the extra shift+AND
-    // because the table is anyway only 512 bytes.
-    for (size_t i = 0; i < N; ++i) {
-      printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
-             static_cast<int>(2 * indices[i]) + 1);
-    }
-  }
-  printf("\n");
-}
-
-void PrintCompressNot64x4PairTables() {
-  printf("======================================= Not 64x4 u32 index\n");
-  constexpr size_t N = 4;  // AVX2
-  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
-    const uint64_t code = ~not_code;
-    std::array<size_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    // Store uncompressed indices because SVE TBL returns 0 if an index is out
-    // of bounds. On AVX3 we simply variable-shift because permute indices are
-    // interpreted modulo N. Compression is not worth the extra shift+AND
-    // because the table is anyway only 512 bytes.
-    for (size_t i = 0; i < N; ++i) {
-      printf("%d, %d, ", static_cast<int>(2 * indices[i] + 0),
-             static_cast<int>(2 * indices[i]) + 1);
-    }
-  }
-  printf("\n");
-}
-
-// 4-tuple of byte indices
-void PrintCompress32x4Tables() {
-  printf("======================================= 32x4\n");
-  using T = uint32_t;
-  constexpr size_t N = 4;  // SSE4
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    for (size_t i = 0; i < N; ++i) {
-      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
-        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
-      }
-    }
-  }
-  printf("\n");
-}
-
-void PrintCompressNot32x4Tables() {
-  printf("======================================= Not 32x4\n");
-  using T = uint32_t;
-  constexpr size_t N = 4;  // SSE4
-  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
-    const uint64_t code = ~not_code;
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    for (size_t i = 0; i < N; ++i) {
-      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
-        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
-      }
-    }
-  }
-  printf("\n");
-}
-
-// 8-tuple of byte indices
-void PrintCompress64x2Tables() {
-  printf("======================================= 64x2\n");
-  using T = uint64_t;
-  constexpr size_t N = 2;  // SSE4
-  for (uint64_t code = 0; code < (1ull << N); ++code) {
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    for (size_t i = 0; i < N; ++i) {
-      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
-        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
-      }
-    }
-  }
-  printf("\n");
-}
-
-void PrintCompressNot64x2Tables() {
-  printf("======================================= Not 64x2\n");
-  using T = uint64_t;
-  constexpr size_t N = 2;  // SSE4
-  for (uint64_t not_code = 0; not_code < (1ull << N); ++not_code) {
-    const uint64_t code = ~not_code;
-    std::array<uint32_t, N> indices{0};
-    size_t pos = 0;
-    // All lanes where mask = true
-    for (size_t i = 0; i < N; ++i) {
-      if (code & (1ull << i)) {
-        indices[pos++] = i;
-      }
-    }
-    // All lanes where mask = false
-    for (size_t i = 0; i < N; ++i) {
-      if (!(code & (1ull << i))) {
-        indices[pos++] = i;
-      }
-    }
-    HWY_ASSERT(pos == N);
-
-    for (size_t i = 0; i < N; ++i) {
-      for (size_t idx_byte = 0; idx_byte < sizeof(T); ++idx_byte) {
-        printf("%d,", static_cast<int>(sizeof(T) * indices[i] + idx_byte));
-      }
-    }
-  }
-  printf("\n");
-}
-
-}  // namespace detail
-
-HWY_NOINLINE void PrintTables() {
-  // Only print once.
-#if HWY_TARGET == HWY_STATIC_TARGET
-  detail::PrintCompress32x8Tables();
-  detail::PrintCompressNot32x8Tables();
-  detail::PrintCompress64x4NibbleTables();
-  detail::PrintCompressNot64x4NibbleTables();
-  detail::PrintCompress64x4Tables();
-  detail::PrintCompressNot64x4Tables();
-  detail::PrintCompress32x4Tables();
-  detail::PrintCompressNot32x4Tables();
-  detail::PrintCompress64x2Tables();
-  detail::PrintCompressNot64x2Tables();
-  detail::PrintCompress64x4PairTables();
-  detail::PrintCompressNot64x4PairTables();
-  detail::PrintCompress16x8Tables();
-  detail::PrintCompressNot16x8Tables();
-#endif
-}
-
-#endif  // HWY_PRINT_TABLES
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyCompressTest);
-#if HWY_PRINT_TABLES
-// Only print instead of running tests; this will be visible in the log.
-HWY_EXPORT_AND_TEST_P(HwyCompressTest, PrintTables);
-#else
-HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompress);
-HWY_EXPORT_AND_TEST_P(HwyCompressTest, TestAllCompressBlocks);
-#endif
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/convert_test.cc b/third_party/highway/hwy/tests/convert_test.cc
deleted file mode 100644 (file)
index a7aea5f..0000000
+++ /dev/null
@@ -1,643 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <cmath>  // std::isfinite
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/convert_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Cast and ensure bytes are the same. Called directly from TestAllBitCast or
-// via TestBitCastFrom.
-template <typename ToT>
-struct TestBitCast {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const Repartition<ToT, D> dto;
-    const size_t N = Lanes(d);
-    const size_t Nto = Lanes(dto);
-    if (N == 0 || Nto == 0) return;
-    HWY_ASSERT_EQ(N * sizeof(T), Nto * sizeof(ToT));
-    const auto vf = Iota(d, 1);
-    const auto vt = BitCast(dto, vf);
-    // Must return the same bits
-    auto from_lanes = AllocateAligned<T>(Lanes(d));
-    auto to_lanes = AllocateAligned<ToT>(Lanes(dto));
-    Store(vf, d, from_lanes.get());
-    Store(vt, dto, to_lanes.get());
-    HWY_ASSERT(
-        BytesEqual(from_lanes.get(), to_lanes.get(), Lanes(d) * sizeof(T)));
-  }
-};
-
-// From D to all types.
-struct TestBitCastFrom {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    TestBitCast<uint8_t>()(t, d);
-    TestBitCast<uint16_t>()(t, d);
-    TestBitCast<uint32_t>()(t, d);
-#if HWY_HAVE_INTEGER64
-    TestBitCast<uint64_t>()(t, d);
-#endif
-    TestBitCast<int8_t>()(t, d);
-    TestBitCast<int16_t>()(t, d);
-    TestBitCast<int32_t>()(t, d);
-#if HWY_HAVE_INTEGER64
-    TestBitCast<int64_t>()(t, d);
-#endif
-    TestBitCast<float>()(t, d);
-#if HWY_HAVE_FLOAT64
-    TestBitCast<double>()(t, d);
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllBitCast() {
-  // For HWY_SCALAR and partial vectors, we can only cast to same-sized types:
-  // the former can't partition its single lane, and the latter can be smaller
-  // than a destination type.
-  const ForPartialVectors<TestBitCast<uint8_t>> to_u8;
-  to_u8(uint8_t());
-  to_u8(int8_t());
-
-  const ForPartialVectors<TestBitCast<int8_t>> to_i8;
-  to_i8(uint8_t());
-  to_i8(int8_t());
-
-  const ForPartialVectors<TestBitCast<uint16_t>> to_u16;
-  to_u16(uint16_t());
-  to_u16(int16_t());
-
-  const ForPartialVectors<TestBitCast<int16_t>> to_i16;
-  to_i16(uint16_t());
-  to_i16(int16_t());
-
-  const ForPartialVectors<TestBitCast<uint32_t>> to_u32;
-  to_u32(uint32_t());
-  to_u32(int32_t());
-  to_u32(float());
-
-  const ForPartialVectors<TestBitCast<int32_t>> to_i32;
-  to_i32(uint32_t());
-  to_i32(int32_t());
-  to_i32(float());
-
-#if HWY_HAVE_INTEGER64
-  const ForPartialVectors<TestBitCast<uint64_t>> to_u64;
-  to_u64(uint64_t());
-  to_u64(int64_t());
-#if HWY_HAVE_FLOAT64
-  to_u64(double());
-#endif
-
-  const ForPartialVectors<TestBitCast<int64_t>> to_i64;
-  to_i64(uint64_t());
-  to_i64(int64_t());
-#if HWY_HAVE_FLOAT64
-  to_i64(double());
-#endif
-#endif  // HWY_HAVE_INTEGER64
-
-  const ForPartialVectors<TestBitCast<float>> to_float;
-  to_float(uint32_t());
-  to_float(int32_t());
-  to_float(float());
-
-#if HWY_HAVE_FLOAT64
-  const ForPartialVectors<TestBitCast<double>> to_double;
-  to_double(double());
-#if HWY_HAVE_INTEGER64
-  to_double(uint64_t());
-  to_double(int64_t());
-#endif  // HWY_HAVE_INTEGER64
-#endif  // HWY_HAVE_FLOAT64
-
-#if HWY_TARGET != HWY_SCALAR
-  // For non-scalar vectors, we can cast all types to all.
-  ForAllTypes(ForGEVectors<64, TestBitCastFrom>());
-#endif
-}
-
-template <typename ToT>
-struct TestPromoteTo {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
-    static_assert(sizeof(T) < sizeof(ToT), "Input type must be narrower");
-    const Rebind<ToT, D> to_d;
-
-    const size_t N = Lanes(from_d);
-    auto from = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<ToT>(N);
-
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        const uint64_t bits = rng();
-        CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
-        expected[i] = from[i];
-      }
-
-      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
-                        PromoteTo(to_d, Load(from_d, from.get())));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllPromoteTo() {
-  const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2;
-  to_u16div2(uint8_t());
-
-  const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4;
-  to_u32div4(uint8_t());
-
-  const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2;
-  to_u32div2(uint16_t());
-
-  const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2;
-  to_i16div2(uint8_t());
-  to_i16div2(int8_t());
-
-  const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2;
-  to_i32div2(uint16_t());
-  to_i32div2(int16_t());
-
-  const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4;
-  to_i32div4(uint8_t());
-  to_i32div4(int8_t());
-
-  // Must test f16/bf16 separately because we can only load/store/convert them.
-
-#if HWY_HAVE_INTEGER64
-  const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2;
-  to_u64div2(uint32_t());
-
-  const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2;
-  to_i64div2(int32_t());
-#endif
-
-#if HWY_HAVE_FLOAT64
-  const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2;
-  to_f64div2(int32_t());
-  to_f64div2(float());
-#endif
-}
-
-template <typename T, HWY_IF_FLOAT(T)>
-bool IsFinite(T t) {
-  return std::isfinite(t);
-}
-// Wrapper avoids calling std::isfinite for integer types (ambiguous).
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-bool IsFinite(T /*unused*/) {
-  return true;
-}
-
-template <class D>
-AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) {
-  const float test_cases[] = {
-      // +/- 1
-      1.0f, -1.0f,
-      // +/- 0
-      0.0f, -0.0f,
-      // near 0
-      0.25f, -0.25f,
-      // +/- integer
-      4.0f, -32.0f,
-      // positive near limit
-      65472.0f, 65504.0f,
-      // negative near limit
-      -65472.0f, -65504.0f,
-      // positive +/- delta
-      2.00390625f, 3.99609375f,
-      // negative +/- delta
-      -2.00390625f, -3.99609375f,
-      // No infinity/NaN - implementation-defined due to ARM.
-  };
-  constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
-  const size_t N = Lanes(d);
-  HWY_ASSERT(N != 0);
-  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
-  auto in = AllocateAligned<float>(padded);
-  auto expected = AllocateAligned<float>(padded);
-  size_t i = 0;
-  for (; i < kNumTestCases; ++i) {
-    in[i] = test_cases[i];
-  }
-  for (; i < padded; ++i) {
-    in[i] = 0.0f;
-  }
-  return in;
-}
-
-struct TestF16 {
-  template <typename TF32, class DF32>
-  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
-#if HWY_HAVE_FLOAT16
-    size_t padded;
-    const size_t N = Lanes(d32);  // same count for f16
-    HWY_ASSERT(N != 0);
-    auto in = F16TestCases(d32, padded);
-    using TF16 = float16_t;
-    const Rebind<TF16, DF32> d16;
-    auto temp16 = AllocateAligned<TF16>(N);
-
-    for (size_t i = 0; i < padded; i += N) {
-      const auto loaded = Load(d32, &in[i]);
-      Store(DemoteTo(d16, loaded), d16, temp16.get());
-      HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, Load(d16, temp16.get())));
-    }
-#else
-    (void)d32;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllF16() { ForDemoteVectors<TestF16>()(float()); }
-
-template <class D>
-AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) {
-  const float test_cases[] = {
-      // +/- 1
-      1.0f, -1.0f,
-      // +/- 0
-      0.0f, -0.0f,
-      // near 0
-      0.25f, -0.25f,
-      // +/- integer
-      4.0f, -32.0f,
-      // positive near limit
-      3.389531389251535E38f, 1.99384199368e+38f,
-      // negative near limit
-      -3.389531389251535E38f, -1.99384199368e+38f,
-      // positive +/- delta
-      2.015625f, 3.984375f,
-      // negative +/- delta
-      -2.015625f, -3.984375f,
-  };
-  constexpr size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
-  const size_t N = Lanes(d);
-  HWY_ASSERT(N != 0);
-  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
-  auto in = AllocateAligned<float>(padded);
-  auto expected = AllocateAligned<float>(padded);
-  size_t i = 0;
-  for (; i < kNumTestCases; ++i) {
-    in[i] = test_cases[i];
-  }
-  for (; i < padded; ++i) {
-    in[i] = 0.0f;
-  }
-  return in;
-}
-
-struct TestBF16 {
-  template <typename TF32, class DF32>
-  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
-#if !defined(HWY_EMULATE_SVE)
-    size_t padded;
-    auto in = BF16TestCases(d32, padded);
-    using TBF16 = bfloat16_t;
-#if HWY_TARGET == HWY_SCALAR
-    const Rebind<TBF16, DF32> dbf16;  // avoid 4/2 = 2 lanes
-#else
-    const Repartition<TBF16, DF32> dbf16;
-#endif
-    const Half<decltype(dbf16)> dbf16_half;
-    const size_t N = Lanes(d32);
-    HWY_ASSERT(Lanes(dbf16_half) <= N);
-    auto temp16 = AllocateAligned<TBF16>(N);
-
-    for (size_t i = 0; i < padded; i += N) {
-      const auto loaded = Load(d32, &in[i]);
-      const auto v16 = DemoteTo(dbf16_half, loaded);
-      Store(v16, dbf16_half, temp16.get());
-      const auto v16_loaded = Load(dbf16_half, temp16.get());
-      HWY_ASSERT_VEC_EQ(d32, loaded, PromoteTo(d32, v16_loaded));
-    }
-#else
-    (void)d32;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); }
-
-struct TestConvertU8 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, const D du32) {
-    const Rebind<uint8_t, D> du8;
-    const auto wrap = Set(du32, 0xFF);
-    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0), U8FromU32(And(Iota(du32, 0), wrap)));
-    HWY_ASSERT_VEC_EQ(du8, Iota(du8, 0x7F),
-                      U8FromU32(And(Iota(du32, 0x7F), wrap)));
-  }
-};
-
-HWY_NOINLINE void TestAllConvertU8() {
-  ForDemoteVectors<TestConvertU8, 2>()(uint32_t());
-}
-
-template <typename From, typename To, class D>
-constexpr bool IsSupportedTruncation() {
-  return (sizeof(To) < sizeof(From)) &&
-         (Pow2(Rebind<To, D>()) + 3 >= static_cast<int>(CeilLog2(sizeof(To))));
-}
-
-struct TestTruncateTo {
-  template <typename From, typename To, class D,
-            hwy::EnableIf<!IsSupportedTruncation<From, To, D>()>* = nullptr>
-  HWY_NOINLINE void testTo(From, To, const D) {
-    // do nothing
-  }
-
-  template <typename From, typename To, class D,
-            hwy::EnableIf<IsSupportedTruncation<From, To, D>()>* = nullptr>
-  HWY_NOINLINE void testTo(From, To, const D d) {
-    constexpr uint32_t base = 0xFA578D00;
-    const Rebind<To, D> dTo;
-    const auto src = Iota(d, static_cast<From>(base));
-    const auto expected = Iota(dTo, static_cast<To>(base));
-    const VFromD<decltype(dTo)> actual = TruncateTo(dTo, src);
-    HWY_ASSERT_VEC_EQ(dTo, expected, actual);
-  }
-
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T from, const D d) {
-    testTo<T, uint8_t, D>(from, uint8_t(), d);
-    testTo<T, uint16_t, D>(from, uint16_t(), d);
-    testTo<T, uint32_t, D>(from, uint32_t(), d);
-  }
-};
-
-HWY_NOINLINE void TestAllTruncate() {
-  ForUnsignedTypes(ForPartialVectors<TestTruncateTo>());
-}
-
-// Separate function to attempt to work around a compiler bug on ARM: when this
-// is merged with TestIntFromFloat, outputs match a previous Iota(-(N+1)) input.
-struct TestIntFromFloatHuge {
-  template <typename TF, class DF>
-  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
-    // The ARMv7 manual says that float->int saturates, i.e. chooses the
-    // nearest representable value. This works correctly on armhf with GCC, but
-    // not with clang. For reasons unknown, MSVC also runs into an out-of-memory
-    // error here.
-#if HWY_COMPILER_CLANG || HWY_COMPILER_MSVC
-    (void)df;
-#else
-    using TI = MakeSigned<TF>;
-    const Rebind<TI, DF> di;
-
-    // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing
-    // the expected lvalue also seems to prevent the issue.
-    const size_t N = Lanes(df);
-    auto expected = AllocateAligned<TI>(N);
-
-    // Huge positive
-    Store(Set(di, LimitsMax<TI>()), di, expected.get());
-    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20))));
-
-    // Huge negative
-    Store(Set(di, LimitsMin<TI>()), di, expected.get());
-    HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20))));
-#endif
-  }
-};
-
-class TestIntFromFloat {
-  template <typename TF, class DF>
-  static HWY_NOINLINE void TestPowers(TF /*unused*/, const DF df) {
-    using TI = MakeSigned<TF>;
-    const Rebind<TI, DF> di;
-    constexpr size_t kBits = sizeof(TF) * 8;
-
-    // Powers of two, plus offsets to set some mantissa bits.
-    const int64_t ofs_table[3] = {0LL, 3LL << (kBits / 2), 1LL << (kBits - 15)};
-    for (int sign = 0; sign < 2; ++sign) {
-      for (size_t shift = 0; shift < kBits - 1; ++shift) {
-        for (int64_t ofs : ofs_table) {
-          const int64_t mag = (int64_t{1} << shift) + ofs;
-          const int64_t val = sign ? mag : -mag;
-          HWY_ASSERT_VEC_EQ(di, Set(di, static_cast<TI>(val)),
-                            ConvertTo(di, Set(df, static_cast<TF>(val))));
-        }
-      }
-    }
-  }
-
-  template <typename TF, class DF>
-  static HWY_NOINLINE void TestRandom(TF /*unused*/, const DF df) {
-    using TI = MakeSigned<TF>;
-    const Rebind<TI, DF> di;
-    const size_t N = Lanes(df);
-
-    // TF does not have enough precision to represent TI.
-    const double min = static_cast<double>(LimitsMin<TI>());
-    const double max = static_cast<double>(LimitsMax<TI>());
-
-    // Also check random values.
-    auto from = AllocateAligned<TF>(N);
-    auto expected = AllocateAligned<TI>(N);
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        do {
-          const uint64_t bits = rng();
-          CopyBytes<sizeof(TF)>(&bits, &from[i]);  // not same size
-        } while (!std::isfinite(from[i]));
-        if (from[i] >= max) {
-          expected[i] = LimitsMax<TI>();
-        } else if (from[i] <= min) {
-          expected[i] = LimitsMin<TI>();
-        } else {
-          expected[i] = static_cast<TI>(from[i]);
-        }
-      }
-
-      HWY_ASSERT_VEC_EQ(di, expected.get(),
-                        ConvertTo(di, Load(df, from.get())));
-    }
-  }
-
- public:
-  template <typename TF, class DF>
-  HWY_NOINLINE void operator()(TF tf, const DF df) {
-    using TI = MakeSigned<TF>;
-    const Rebind<TI, DF> di;
-    const size_t N = Lanes(df);
-
-    // Integer positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), ConvertTo(di, Iota(df, TF(4.0))));
-
-    // Integer negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), ConvertTo(di, Iota(df, -TF(N))));
-
-    // Above positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), ConvertTo(di, Iota(df, TF(2.001))));
-
-    // Below positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), ConvertTo(di, Iota(df, TF(3.9999))));
-
-    const TF eps = static_cast<TF>(0.0001);
-    // Above negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
-                      ConvertTo(di, Iota(df, -TF(N + 1) + eps)));
-
-    // Below negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
-                      ConvertTo(di, Iota(df, -TF(N + 1) - eps)));
-
-    TestPowers(tf, df);
-    TestRandom(tf, df);
-  }
-};
-
-HWY_NOINLINE void TestAllIntFromFloat() {
-  ForFloatTypes(ForPartialVectors<TestIntFromFloatHuge>());
-  ForFloatTypes(ForPartialVectors<TestIntFromFloat>());
-}
-
-struct TestFloatFromInt {
-  template <typename TF, class DF>
-  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
-    using TI = MakeSigned<TF>;
-    const RebindToSigned<DF> di;
-    const size_t N = Lanes(df);
-
-    // Integer positive
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(di, TI(4))));
-
-    // Integer negative
-    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), ConvertTo(df, Iota(di, -TI(N))));
-
-    // Max positive
-    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
-                      ConvertTo(df, Set(di, LimitsMax<TI>())));
-
-    // Min negative
-    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
-                      ConvertTo(df, Set(di, LimitsMin<TI>())));
-  }
-};
-
-HWY_NOINLINE void TestAllFloatFromInt() {
-  ForFloatTypes(ForPartialVectors<TestFloatFromInt>());
-}
-
-struct TestFloatFromUint {
-  template <typename TF, class DF>
-  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
-    using TU = MakeUnsigned<TF>;
-    const RebindToUnsigned<DF> du;
-
-    // Integer positive
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), ConvertTo(df, Iota(du, TU(4))));
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(65535.0)),
-                      ConvertTo(df, Iota(du, 65535)));  // 2^16-1
-    if (sizeof(TF) > 4) {
-      HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4294967295.0)),
-                        ConvertTo(df, Iota(du, 4294967295ULL)));  // 2^32-1
-    }
-
-    // Max positive
-    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TU>())),
-                      ConvertTo(df, Set(du, LimitsMax<TU>())));
-
-    // Zero
-    HWY_ASSERT_VEC_EQ(df, Zero(df), ConvertTo(df, Zero(du)));
-  }
-};
-
-HWY_NOINLINE void TestAllFloatFromUint() {
-  ForFloatTypes(ForPartialVectors<TestFloatFromUint>());
-}
-
-struct TestI32F64 {
-  template <typename TF, class DF>
-  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
-    using TI = int32_t;
-    const Rebind<TI, DF> di;
-    const size_t N = Lanes(df);
-
-    // Integer positive
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
-
-    // Integer negative
-    HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N))));
-
-    // Above positive
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2))));
-
-    // Below positive
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4))));
-
-    // Above negative
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4))));
-
-    // Below negative
-    HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2))));
-
-    // Max positive int
-    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMax<TI>())),
-                      PromoteTo(df, Set(di, LimitsMax<TI>())));
-
-    // Min negative int
-    HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())),
-                      PromoteTo(df, Set(di, LimitsMin<TI>())));
-  }
-};
-
-HWY_NOINLINE void TestAllI32F64() {
-#if HWY_HAVE_FLOAT64
-  ForDemoteVectors<TestI32F64>()(double());
-#endif
-}
-
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyConvertTest);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllTruncate);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromUint);
-HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllI32F64);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/crypto_test.cc b/third_party/highway/hwy/tests/crypto_test.cc
deleted file mode 100644 (file)
index b7dfb19..0000000
+++ /dev/null
@@ -1,553 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcpy
-
-#include "hwy/aligned_allocator.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/crypto_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-#define HWY_PRINT_CLMUL_GOLDEN 0
-
-#if HWY_TARGET != HWY_SCALAR
-
-class TestAES {
-  template <typename T, class D>
-  HWY_NOINLINE void TestSBox(T /*unused*/, D d) {
-    // The generic implementation of the S-box is difficult to verify by
-    // inspection, so we add a white-box test that verifies it using enumeration
-    // (outputs for 0..255 vs. https://en.wikipedia.org/wiki/Rijndael_S-box).
-    const uint8_t sbox[256] = {
-        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b,
-        0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
-        0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26,
-        0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
-        0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
-        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
-        0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed,
-        0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
-        0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f,
-        0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
-        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec,
-        0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
-        0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14,
-        0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
-        0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
-        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
-        0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f,
-        0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
-        0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1, 0xf8, 0x98, 0x11,
-        0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
-        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f,
-        0xb0, 0x54, 0xbb, 0x16};
-
-    // Ensure it's safe to load an entire vector by padding.
-    const size_t N = Lanes(d);
-    const size_t padded = RoundUpTo(256, N);
-    auto expected = AllocateAligned<T>(padded);
-    // Must wrap around to match the input (Iota).
-    for (size_t pos = 0; pos < padded;) {
-      const size_t remaining = HWY_MIN(padded - pos, size_t(256));
-      memcpy(expected.get() + pos, sbox, remaining);
-      pos += remaining;
-    }
-
-    for (size_t i = 0; i < 256; i += N) {
-      const auto in = Iota(d, static_cast<T>(i));
-      HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in));
-    }
-  }
-
- public:
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    // Test vector (after first KeyAddition) from
-    // https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/AES_Core128.pdf
-    alignas(16) constexpr uint8_t test_lanes[16] = {
-        0x40, 0xBF, 0xAB, 0xF4, 0x06, 0xEE, 0x4D, 0x30,
-        0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16};
-    const auto test = LoadDup128(d, test_lanes);
-
-    // = ShiftRow result
-    alignas(16) constexpr uint8_t expected_sr_lanes[16] = {
-        0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF,
-        0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE};
-    const auto expected_sr = LoadDup128(d, expected_sr_lanes);
-
-    // = MixColumn result
-    alignas(16) constexpr uint8_t expected_mc_lanes[16] = {
-        0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA,
-        0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59};
-    const auto expected_mc = LoadDup128(d, expected_mc_lanes);
-
-    // = KeyAddition result
-    alignas(16) constexpr uint8_t expected_lanes[16] = {
-        0xF2, 0x65, 0xE8, 0xD5, 0x1F, 0xD2, 0x39, 0x7B,
-        0xC3, 0xB9, 0x97, 0x6D, 0x90, 0x76, 0x50, 0x5C};
-    const auto expected = LoadDup128(d, expected_lanes);
-
-    alignas(16) uint8_t key_lanes[16];
-    for (size_t i = 0; i < 16; ++i) {
-      key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i];
-    }
-    const auto round_key = LoadDup128(d, key_lanes);
-
-    HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d)));
-    HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key));
-    HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d)));
-    HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key),
-                      AESLastRound(test, round_key));
-
-    TestSBox(t, d);
-  }
-};
-HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); }
-
-#else
-HWY_NOINLINE void TestAllAES() {}
-#endif  // HWY_TARGET != HWY_SCALAR
-
-struct TestCLMul {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // needs 64 bit lanes and 128-bit result
-#if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64
-    const size_t N = Lanes(d);
-    if (N == 1) return;
-
-    auto in1 = AllocateAligned<T>(N);
-    auto in2 = AllocateAligned<T>(N);
-
-    constexpr size_t kCLMulNum = 512;
-    // Depends on rng!
-    static constexpr uint64_t kCLMulLower[kCLMulNum] = {
-        0x24511d4ce34d6350ULL, 0x4ca582edde1236bbULL, 0x537e58f72dac25a8ULL,
-        0x4e942d5e130b9225ULL, 0x75a906c519257a68ULL, 0x1df9f85126d96c5eULL,
-        0x464e7c13f4ad286aULL, 0x138535ee35dabc40ULL, 0xb2f7477b892664ecULL,
-        0x01557b077167c25dULL, 0xf32682490ee49624ULL, 0x0025bac603b9e140ULL,
-        0xcaa86aca3e3daf40ULL, 0x1fbcfe4af73eb6c4ULL, 0x8ee8064dd0aae5dcULL,
-        0x1248cb547858c213ULL, 0x37a55ee5b10fb34cULL, 0x6eb5c97b958f86e2ULL,
-        0x4b1ab3eb655ea7cdULL, 0x1d66645a85627520ULL, 0xf8728e96daa36748ULL,
-        0x38621043e6ff5e3bULL, 0xd1d28b5da5ffefb4ULL, 0x0a5cd65931546df7ULL,
-        0x2a0639be3d844150ULL, 0x0e2d0f18c8d6f045ULL, 0xfacc770b963326c1ULL,
-        0x19611b31ca2ef141ULL, 0xabea29510dd87518ULL, 0x18a7dc4b205f2768ULL,
-        0x9d3975ea5612dc86ULL, 0x06319c139e374773ULL, 0x6641710400b4c390ULL,
-        0x356c29b6001c3670ULL, 0xe9e04d851e040a00ULL, 0x21febe561222d79aULL,
-        0xc071eaae6e148090ULL, 0x0eed351a0af94f5bULL, 0x04324eedb3c03688ULL,
-        0x39e89b136e0d6ccdULL, 0x07d0fd2777a31600ULL, 0x44b8573827209822ULL,
-        0x6d690229ea177d78ULL, 0x1b9749d960ba9f18ULL, 0x190945271c0fbb94ULL,
-        0x189aea0e07d2c88eULL, 0xf18eab6b65a6beb2ULL, 0x57744b21c13d0d84ULL,
-        0xf63050a613e95c2eULL, 0x12cd20d25f97102fULL, 0x5a5df0678dbcba60ULL,
-        0x0b08fb80948bfafcULL, 0x44cf1cbe7c6fc3c8ULL, 0x166a470ef25da288ULL,
-        0x2c498a609204e48cULL, 0x261b0a22585697ecULL, 0x737750574af7dde4ULL,
-        0x4079959c60b01e0cULL, 0x06ed8aac13f782d6ULL, 0x019d454ba9b5ef20ULL,
-        0xea1edbf96d49e858ULL, 0x17c2f3ebde9ac469ULL, 0x5cf72706e3d6f5e4ULL,
-        0x16e856aa3c841516ULL, 0x256f7e3cef83368eULL, 0x47e17c8eb2774e77ULL,
-        0x9b48ac150a804821ULL, 0x584523f61ccfdf22ULL, 0xedcb6a2a75d9e7f2ULL,
-        0x1fe3d1838e537aa7ULL, 0x778872e9f64549caULL, 0x2f1cea6f0d3faf92ULL,
-        0x0e8c4b6a9343f326ULL, 0x01902d1ba3048954ULL, 0xc5c1fd5269e91dc0ULL,
-        0x0ef8a4707817eb9cULL, 0x1f696f09a5354ca4ULL, 0x369cd9de808b818cULL,
-        0xf6917d1dd43fd784ULL, 0x7f4b76bf40dc166fULL, 0x4ce67698724ace12ULL,
-        0x02c3bf60e6e9cd92ULL, 0xb8229e45b21458e8ULL, 0x415efd41e91adf49ULL,
-        0x5edfcd516bb921cdULL, 0x5ff2c29429fd187eULL, 0x0af666b17103b3e0ULL,
-        0x1f5e4ff8f54c9a5bULL, 0x429253d8a5544ba6ULL, 0x19de2fdf9f4d9dcaULL,
-        0x29bf3d37ddc19a40ULL, 0x04d4513a879552baULL, 0x5cc7476cf71ee155ULL,
-        0x40011f8c238784a5ULL, 0x1a3ae50b0fd2ee2bULL, 0x7db22f432ba462baULL,
-        0x417290b0bee2284aULL, 0x055a6bd5bb853db2ULL, 0xaa667daeed8c2a34ULL,
-        0x0d6b316bda7f3577ULL, 0x72d35598468e3d5dULL, 0x375b594804bfd33aULL,
-        0x16ed3a319b540ae8ULL, 0x093bace4b4695afdULL, 0xc7118754ec2737ceULL,
-        0x0fff361f0505c81aULL, 0x996e9e7291321af0ULL, 0x496b1d9b0b89ba8cULL,
-        0x65a98b2e9181da9cULL, 0x70759c8dd45575dfULL, 0x3446fe727f5e2cbbULL,
-        0x1121ae609d195e74ULL, 0x5ff5d68ce8a21018ULL, 0x0e27eca3825b60d6ULL,
-        0x82f628bceca3d1daULL, 0x2756a0914e344047ULL, 0xa460406c1c708d50ULL,
-        0x63ce32a0c083e491ULL, 0xc883e5a685c480e0ULL, 0x602c951891e600f9ULL,
-        0x02ecb2e3911ca5f8ULL, 0x0d8675f4bb70781aULL, 0x43545cc3c78ea496ULL,
-        0x04164b01d6b011c2ULL, 0x3acbb323dcab2c9bULL, 0x31c5ba4e22793082ULL,
-        0x5a6484af5f7c2d10ULL, 0x1a929b16194e8078ULL, 0x7a6a75d03b313924ULL,
-        0x0553c73a35b1d525ULL, 0xf18628c51142be34ULL, 0x1b51cf80d7efd8f5ULL,
-        0x52e0ca4df63ee258ULL, 0x0e977099160650c9ULL, 0x6be1524e92024f70ULL,
-        0x0ee2152625438b9dULL, 0xfa32af436f6d8eb4ULL, 0x5ecf49c2154287e5ULL,
-        0x6b72f4ae3590569dULL, 0x086c5ee6e87bfb68ULL, 0x737a4f0dc04b6187ULL,
-        0x08c3439280edea41ULL, 0x9547944f01636c5cULL, 0x6acfbfc2571cd71fULL,
-        0x85d7842972449637ULL, 0x252ea5e5a7fad86aULL, 0x4e41468f99ba1632ULL,
-        0x095e0c3ae63b25a2ULL, 0xb005ce88fd1c9425ULL, 0x748e668abbe09f03ULL,
-        0xb2cfdf466b187d18ULL, 0x60b11e633d8fe845ULL, 0x07144c4d246db604ULL,
-        0x139bcaac55e96125ULL, 0x118679b5a6176327ULL, 0x1cebe90fa4d9f83fULL,
-        0x22244f52f0d312acULL, 0x669d4e17c9bfb713ULL, 0x96390e0b834bb0d0ULL,
-        0x01f7f0e82ba08071ULL, 0x2dffeee31ca6d284ULL, 0x1f4738745ef039feULL,
-        0x4ce0dd2b603b6420ULL, 0x0035fc905910a4d5ULL, 0x07df2b533df6fb04ULL,
-        0x1cee2735c9b910ddULL, 0x2bc4af565f7809eaULL, 0x2f876c1f5cb1076cULL,
-        0x33e079524099d056ULL, 0x169e0405d2f9efbaULL, 0x018643ab548a358cULL,
-        0x1bb6fc4331cffe92ULL, 0x05111d3a04e92faaULL, 0x23c27ecf0d638b73ULL,
-        0x1b79071dc1685d68ULL, 0x0662d20aba8e1e0cULL, 0xe7f6440277144c6fULL,
-        0x4ca38b64c22196c0ULL, 0x43c05f6d1936fbeeULL, 0x0654199d4d1faf0fULL,
-        0xf2014054e71c2d04ULL, 0x0a103e47e96b4c84ULL, 0x7986e691dd35b040ULL,
-        0x4e1ebb53c306a341ULL, 0x2775bb3d75d65ba6ULL, 0x0562ab0adeff0f15ULL,
-        0x3c2746ad5eba3eacULL, 0x1facdb5765680c60ULL, 0xb802a60027d81d00ULL,
-        0x1191d0f6366ae3a9ULL, 0x81a97b5ae0ea5d14ULL, 0x06bee05b6178a770ULL,
-        0xc7baeb2fe1d6aeb3ULL, 0x594cb5b867d04fdfULL, 0xf515a80138a4e350ULL,
-        0x646417ad8073cf38ULL, 0x4a229a43373fb8d4ULL, 0x10fa6eafff1ca453ULL,
-        0x9f060700895cc731ULL, 0x00521133d11d11f4ULL, 0xb940a2bb912a7a5cULL,
-        0x3fab180670ad2a3cULL, 0x45a5f0e5b6fdb95dULL, 0x27c1baad6f946b15ULL,
-        0x336c6bdbe527cf58ULL, 0x3b83aa602a5baea3ULL, 0xdf749153f9bcc376ULL,
-        0x1a05513a6c0b4a90ULL, 0xb81e0b570a075c47ULL, 0x471fabb40bdc27ceULL,
-        0x9dec9472f6853f60ULL, 0x361f71b88114193bULL, 0x3b550a8c4feeff00ULL,
-        0x0f6cde5a68bc9bc0ULL, 0x3f50121a925703e0ULL, 0x6967ff66d6d343a9ULL,
-        0xff6b5bd2ce7bc3ccULL, 0x05474cea08bf6cd8ULL, 0xf76eabbfaf108eb0ULL,
-        0x067529be4fc6d981ULL, 0x4d766b137cf8a988ULL, 0x2f09c7395c5cfbbdULL,
-        0x388793712da06228ULL, 0x02c9ff342c8f339aULL, 0x152c734139a860a3ULL,
-        0x35776eb2b270c04dULL, 0x0f8d8b41f11c4608ULL, 0x0c2071665be6b288ULL,
-        0xc034e212b3f71d88ULL, 0x071d961ef3276f99ULL, 0xf98598ee75b60773ULL,
-        0x062062c58c6724e4ULL, 0xd156438e2125572cULL, 0x38552d59a7f0f7c8ULL,
-        0x1a402178206e413cULL, 0x1f1f996c68293b26ULL, 0x8bce3cafe1730f7eULL,
-        0x2d0480a0828f6bf5ULL, 0x6c99cffa171f92f6ULL, 0x0087f842bb0ac681ULL,
-        0x11d7ed06e1e7fd3eULL, 0x07cb1186f2385dc6ULL, 0x5d7763ebff1e170fULL,
-        0x2dacc870231ac292ULL, 0x8486317a9ffb390cULL, 0x1c3a6dd20c959ac6ULL,
-        0x90dc96e3992e06b8ULL, 0x70d60bfa33e72b67ULL, 0x70c9bddd0985ee63ULL,
-        0x012c9767b3673093ULL, 0xfcd3bc5580f6a88aULL, 0x0ac80017ef6308c3ULL,
-        0xdb67d709ef4bba09ULL, 0x4c63e324f0e247ccULL, 0xa15481d3fe219d60ULL,
-        0x094c4279cdccb501ULL, 0x965a28c72575cb82ULL, 0x022869db25e391ebULL,
-        0x37f528c146023910ULL, 0x0c1290636917deceULL, 0x9aee25e96251ca9cULL,
-        0x728ac5ba853b69c2ULL, 0x9f272c93c4be20c8ULL, 0x06c1aa6319d28124ULL,
-        0x4324496b1ca8a4f7ULL, 0x0096ecfe7dfc0189ULL, 0x9e06131b19ae0020ULL,
-        0x15278b15902f4597ULL, 0x2a9fece8c13842d8ULL, 0x1d4e6781f0e1355eULL,
-        0x6855b712d3dbf7c0ULL, 0x06a07fad99be6f46ULL, 0x3ed9d7957e4d1d7cULL,
-        0x0c326f7cbc248bb2ULL, 0xe6363ad2c537cf51ULL, 0x0e12eb1c40723f13ULL,
-        0xf5c6ac850afba803ULL, 0x0322a79d615fa9f0ULL, 0x6116696ed97bd5f8ULL,
-        0x0d438080fbbdc9f1ULL, 0x2e4dc42c38f1e243ULL, 0x64948e9104f3a5bfULL,
-        0x9fd622371bdb5f00ULL, 0x0f12bf082b2a1b6eULL, 0x4b1f8d867d78031cULL,
-        0x134392ea9f5ef832ULL, 0xf3d70472321bc23eULL, 0x05fcbe5e9eea268eULL,
-        0x136dede7175a22cfULL, 0x1308f8baac2cbcccULL, 0xd691026f0915eb64ULL,
-        0x0e49a668345c3a38ULL, 0x24ddbbe8bc96f331ULL, 0x4d2ec9479b640578ULL,
-        0x450f0697327b359cULL, 0x32b45360f4488ee0ULL, 0x4f6d9ecec46a105aULL,
-        0x5500c63401ae8e80ULL, 0x47dea495cf6f98baULL, 0x13dc9a2dfca80babULL,
-        0xe6f8a93f7b24ca92ULL, 0x073f57a6d900a87fULL, 0x9ddb935fd3aa695aULL,
-        0x101e98d24b39e8aaULL, 0x6b8d0eb95a507ddcULL, 0x45a908b3903d209bULL,
-        0x6c96a3e119e617d4ULL, 0x2442787543d3be48ULL, 0xd3bc055c7544b364ULL,
-        0x7693bb042ca8653eULL, 0xb95e3a4ea5d0101eULL, 0x116f0d459bb94a73ULL,
-        0x841244b72cdc5e90ULL, 0x1271acced6cb34d3ULL, 0x07d289106524d638ULL,
-        0x537c9cf49c01b5bbULL, 0x8a8e16706bb7a5daULL, 0x12e50a9c499dc3a9ULL,
-        0x1cade520db2ba830ULL, 0x1add52f000d7db70ULL, 0x12cf15db2ce78e30ULL,
-        0x0657eaf606bfc866ULL, 0x4026816d3b05b1d0ULL, 0x1ba0ebdf90128e4aULL,
-        0xdfd649375996dd6eULL, 0x0f416e906c23d9aeULL, 0x384273cad0582a24ULL,
-        0x2ff27b0378a46189ULL, 0xc4ecd18a2d7a7616ULL, 0x35cef0b5cd51d640ULL,
-        0x7d582363643f48b7ULL, 0x0984ad746ad0ab7cULL, 0x2990a999835f9688ULL,
-        0x2d4df66a97b19e05ULL, 0x592c79720af99aa2ULL, 0x052863c230602cd3ULL,
-        0x5f5e2b15edcf2840ULL, 0x01dff1b694b978b0ULL, 0x14345a48b622025eULL,
-        0x028fab3b6407f715ULL, 0x3455d188e6feca50ULL, 0x1d0d40288fb1b5fdULL,
-        0x4685c5c2b6a1e5aeULL, 0x3a2077b1e5fe5adeULL, 0x1bc55d611445a0d8ULL,
-        0x05480ae95f3f83feULL, 0xbbb59cfcf7e17fb6ULL, 0x13f7f10970bbb990ULL,
-        0x6d00ac169425a352ULL, 0x7da0db397ef2d5d3ULL, 0x5b512a247f8d2479ULL,
-        0x637eaa6a977c3c32ULL, 0x3720f0ae37cba89cULL, 0x443df6e6aa7f525bULL,
-        0x28664c287dcef321ULL, 0x03c267c00cf35e49ULL, 0x690185572d4021deULL,
-        0x2707ff2596e321c2ULL, 0xd865f5af7722c380ULL, 0x1ea285658e33aafbULL,
-        0xc257c5e88755bef4ULL, 0x066f67275cfcc31eULL, 0xb09931945cc0fed0ULL,
-        0x58c1dc38d6e3a03fULL, 0xf99489678fc94ee8ULL, 0x75045bb99be5758aULL,
-        0x6c163bc34b40feefULL, 0x0420063ce7bdd3b4ULL, 0xf86ef10582bf2e28ULL,
-        0x162c3449ca14858cULL, 0x94106aa61dfe3280ULL, 0x4073ae7a4e7e4941ULL,
-        0x32b13fd179c250b4ULL, 0x0178fbb216a7e744ULL, 0xf840ae2f1cf92669ULL,
-        0x18fc709acc80243dULL, 0x20ac2ebd69f4d558ULL, 0x6e580ad9c73ad46aULL,
-        0x76d2b535b541c19dULL, 0x6c7a3fb9dd0ce0afULL, 0xc3481689b9754f28ULL,
-        0x156e813b6557abdbULL, 0x6ee372e31276eb10ULL, 0x19cf37c038c8d381ULL,
-        0x00d4d906c9ae3072ULL, 0x09f03cbb6dfbfd40ULL, 0x461ba31c4125f3cfULL,
-        0x25b29fc63ad9f05bULL, 0x6808c95c2dddede9ULL, 0x0564224337066d9bULL,
-        0xc87eb5f4a4d966f2ULL, 0x66fc66e1701f5847ULL, 0xc553a3559f74da28ULL,
-        0x1dfd841be574df43ULL, 0x3ee2f100c3ebc082ULL, 0x1a2c4f9517b56e89ULL,
-        0x502f65c4b535c8ffULL, 0x1da5663ab6f96ec0ULL, 0xba1f80b73988152cULL,
-        0x364ff12182ac8dc1ULL, 0xe3457a3c4871db31ULL, 0x6ae9cadf92fd7e84ULL,
-        0x9621ba3d6ca15186ULL, 0x00ff5af878c144ceULL, 0x918464dc130101a4ULL,
-        0x036511e6b187efa6ULL, 0x06667d66550ff260ULL, 0x7fd18913f9b51bc1ULL,
-        0x3740e6b27af77aa8ULL, 0x1f546c2fd358ff8aULL, 0x42f1424e3115c891ULL,
-        0x03767db4e3a1bb33ULL, 0xa171a1c564345060ULL, 0x0afcf632fd7b1324ULL,
-        0xb59508d933ffb7d0ULL, 0x57d766c42071be83ULL, 0x659f0447546114a2ULL,
-        0x4070364481c460aeULL, 0xa2b9752280644d52ULL, 0x04ab884bea5771bdULL,
-        0x87cd135602a232b4ULL, 0x15e54cd9a8155313ULL, 0x1e8005efaa3e1047ULL,
-        0x696b93f4ab15d39fULL, 0x0855a8e540de863aULL, 0x0bb11799e79f9426ULL,
-        0xeffa61e5c1b579baULL, 0x1e060a1d11808219ULL, 0x10e219205667c599ULL,
-        0x2f7b206091c49498ULL, 0xb48854c820064860ULL, 0x21c4aaa3bfbe4a38ULL,
-        0x8f4a032a3fa67e9cULL, 0x3146b3823401e2acULL, 0x3afee26f19d88400ULL,
-        0x167087c485791d38ULL, 0xb67a1ed945b0fb4bULL, 0x02436eb17e27f1c0ULL,
-        0xe05afce2ce2d2790ULL, 0x49c536fc6224cfebULL, 0x178865b3b862b856ULL,
-        0x1ce530de26acde5bULL, 0x87312c0b30a06f38ULL, 0x03e653b578558d76ULL,
-        0x4d3663c21d8b3accULL, 0x038003c23626914aULL, 0xd9d5a2c052a09451ULL,
-        0x39b5acfe08a49384ULL, 0x40f349956d5800e4ULL, 0x0968b6950b1bd8feULL,
-        0xd60b2ca030f3779cULL, 0x7c8bc11a23ce18edULL, 0xcc23374e27630bc2ULL,
-        0x2e38fc2a8bb33210ULL, 0xe421357814ee5c44ULL, 0x315fb65ea71ec671ULL,
-        0xfb1b0223f70ed290ULL, 0x30556c9f983eaf07ULL, 0x8dd438c3d0cd625aULL,
-        0x05a8fd0c7ffde71bULL, 0x764d1313b5aeec7aULL, 0x2036af5de9622f47ULL,
-        0x508a5bfadda292feULL, 0x3f77f04ba2830e90ULL, 0x9047cd9c66ca66d2ULL,
-        0x1168b5318a54eb21ULL, 0xc93462d221da2e15ULL, 0x4c2c7cc54abc066eULL,
-        0x767a56fec478240eULL, 0x095de72546595bd3ULL, 0xc9da535865158558ULL,
-        0x1baccf36f33e73fbULL, 0xf3d7dbe64df77f18ULL, 0x1f8ebbb7be4850b8ULL,
-        0x043c5ed77bce25a1ULL, 0x07d401041b2a178aULL, 0x9181ebb8bd8d5618ULL,
-        0x078b935dc3e4034aULL, 0x7b59c08954214300ULL, 0x03570dc2a4f84421ULL,
-        0xdd8715b82f6b4078ULL, 0x2bb49c8bb544163bULL, 0xc9eb125564d59686ULL,
-        0x5fdc7a38f80b810aULL, 0x3a4a6d8fff686544ULL, 0x28360e2418627d3aULL,
-        0x60874244c95ed992ULL, 0x2115cc1dd9c34ed3ULL, 0xfaa3ef61f55e9efcULL,
-        0x27ac9b1ef1adc7e6ULL, 0x95ea00478fec3f54ULL, 0x5aea808b2d99ab43ULL,
-        0xc8f79e51fe43a580ULL, 0x5dbccd714236ce25ULL, 0x783fa76ed0753458ULL,
-        0x48cb290f19d84655ULL, 0xc86a832f7696099aULL, 0x52f30c6fec0e71d3ULL,
-        0x77d4e91e8cdeb886ULL, 0x7169a703c6a79ccdULL, 0x98208145b9596f74ULL,
-        0x0945695c761c0796ULL, 0x0be897830d17bae0ULL, 0x033ad3924caeeeb4ULL,
-        0xedecb6cfa2d303a8ULL, 0x3f86b074818642e7ULL, 0xeefa7c878a8b03f4ULL,
-        0x093c101b80922551ULL, 0xfb3b4e6c26ac0034ULL, 0x162bf87999b94f5eULL,
-        0xeaedae76e975b17cULL, 0x1852aa090effe18eULL};
-
-    static constexpr uint64_t kCLMulUpper[kCLMulNum] = {
-        0xbb41199b1d587c69ULL, 0x514d94d55894ee29ULL, 0xebc6cd4d2efd5d16ULL,
-        0x042044ad2de477fdULL, 0xb865c8b0fcdf4b15ULL, 0x0724d7e551cc40f3ULL,
-        0xb15a16f39edb0bccULL, 0x37d64419ede7a171ULL, 0x2aa01bb80c753401ULL,
-        0x06ff3f8a95fdaf4dULL, 0x79898cc0838546deULL, 0x776acbd1b237c60aULL,
-        0x4c1753be4f4e0064ULL, 0x0ba9243601206ed3ULL, 0xd567c3b1bf3ec557ULL,
-        0x043fac7bcff61fb3ULL, 0x49356232b159fb2fULL, 0x3910c82038102d4dULL,
-        0x30592fef753eb300ULL, 0x7b2660e0c92a9e9aULL, 0x8246c9248d671ef0ULL,
-        0x5a0dcd95147af5faULL, 0x43fde953909cc0eaULL, 0x06147b972cb96e1bULL,
-        0xd84193a6b2411d80ULL, 0x00cd7711b950196fULL, 0x1088f9f4ade7fa64ULL,
-        0x05a13096ec113cfbULL, 0x958d816d53b00edcULL, 0x3846154a7cdba9cbULL,
-        0x8af516db6b27d1e6ULL, 0x1a1d462ab8a33b13ULL, 0x4040b0ac1b2c754cULL,
-        0x05127fe9af2fe1d6ULL, 0x9f96e79374321fa6ULL, 0x06ff64a4d9c326f3ULL,
-        0x28709566e158ac15ULL, 0x301701d7111ca51cULL, 0x31e0445d1b9d9544ULL,
-        0x0a95aff69bf1d03eULL, 0x7c298c8414ecb879ULL, 0x00801499b4143195ULL,
-        0x91521a00dd676a5cULL, 0x2777526a14c2f723ULL, 0xfa26aac6a6357dddULL,
-        0x1d265889b0187a4bULL, 0xcd6e70fa8ed283e4ULL, 0x18a815aa50ea92caULL,
-        0xc01e082694a263c6ULL, 0x4b40163ba53daf25ULL, 0xbc658caff6501673ULL,
-        0x3ba35359586b9652ULL, 0x74f96acc97a4936cULL, 0x3989dfdb0cf1d2cfULL,
-        0x358a01eaa50dda32ULL, 0x01109a5ed8f0802bULL, 0x55b84922e63c2958ULL,
-        0x55b14843d87551d5ULL, 0x1db8ec61b1b578d8ULL, 0x79a2d49ef8c3658fULL,
-        0xa304516816b3fbe0ULL, 0x163ecc09cc7b82f9ULL, 0xab91e8d22aabef00ULL,
-        0x0ed6b09262de8354ULL, 0xcfd47d34cf73f6f2ULL, 0x7dbd1db2390bc6c3ULL,
-        0x5ae789d3875e7b00ULL, 0x1d60fd0e70fe8fa4ULL, 0x690bc15d5ae4f6f5ULL,
-        0x121ef5565104fb44ULL, 0x6e98e89297353b54ULL, 0x42554949249d62edULL,
-        0xd6d6d16b12df78d2ULL, 0x320b33549b74975dULL, 0xd2a0618763d22e00ULL,
-        0x0808deb93cba2017ULL, 0x01bd3b2302a2cc70ULL, 0x0b7b8dd4d71c8dd6ULL,
-        0x34d60a3382a0756cULL, 0x40984584c8219629ULL, 0xf1152cba10093a66ULL,
-        0x068001c6b2159ccbULL, 0x3d70f13c6cda0800ULL, 0x0e6b6746a322b956ULL,
-        0x83a494319d8c770bULL, 0x0faecf64a8553e9aULL, 0xa34919222c39b1bcULL,
-        0x0c63850d89e71c6fULL, 0x585f0bee92e53dc8ULL, 0x10f222b13b4fa5deULL,
-        0x61573114f94252f2ULL, 0x09d59c311fba6c27ULL, 0x014effa7da49ed4eULL,
-        0x4a400a1bc1c31d26ULL, 0xc9091c047b484972ULL, 0x3989f341ec2230ccULL,
-        0xdcb03a98b3aee41eULL, 0x4a54a676a33a95e1ULL, 0xe499b7753951ef7cULL,
-        0x2f43b1d1061d8b48ULL, 0xc3313bdc68ceb146ULL, 0x5159f6bc0e99227fULL,
-        0x98128e6d9c05efcaULL, 0x15ea32b27f77815bULL, 0xe882c054e2654eecULL,
-        0x003d2cdb8faee8c6ULL, 0xb416dd333a9fe1dfULL, 0x73f6746aefcfc98bULL,
-        0x93dc114c10a38d70ULL, 0x05055941657845eaULL, 0x2ed7351347349334ULL,
-        0x26fb1ee2c69ae690ULL, 0xa4575d10dc5b28e0ULL, 0x3395b11295e485ebULL,
-        0xe840f198a224551cULL, 0x78e6e5a431d941d4ULL, 0xa1fee3ceab27f391ULL,
-        0x07d35b3c5698d0dcULL, 0x983c67fca9174a29ULL, 0x2bb6bbae72b5144aULL,
-        0xa7730b8d13ce58efULL, 0x51b5272883de1998ULL, 0xb334e128bb55e260ULL,
-        0x1cacf5fbbe1b9974ULL, 0x71a9df4bb743de60ULL, 0x5176fe545c2d0d7aULL,
-        0xbe592ecf1a16d672ULL, 0x27aa8a30c3efe460ULL, 0x4c78a32f47991e06ULL,
-        0x383459294312f26aULL, 0x97ba789127f1490cULL, 0x51c9aa8a3abd1ef1ULL,
-        0xcc7355188121e50fULL, 0x0ecb3a178ae334c1ULL, 0x84879a5e574b7160ULL,
-        0x0765298f6389e8f3ULL, 0x5c6750435539bb22ULL, 0x11a05cf056c937b5ULL,
-        0xb5dc2172dbfb7662ULL, 0x3ffc17915d9f40e8ULL, 0xbc7904daf3b431b0ULL,
-        0x71f2088490930a7cULL, 0xa89505fd9efb53c4ULL, 0x02e194afd61c5671ULL,
-        0x99a97f4abf35fcecULL, 0x26830aad30fae96fULL, 0x4b2abc16b25cf0b0ULL,
-        0x07ec6fffa1cafbdbULL, 0xf38188fde97a280cULL, 0x121335701afff64dULL,
-        0xea5ef38b4e672a64ULL, 0x477edbcae3eabf03ULL, 0xa32813cc0e0d244dULL,
-        0x13346d2af4972eefULL, 0xcbc18357af1cfa9aULL, 0x561b630316e73fa6ULL,
-        0xe9dfb53249249305ULL, 0x5d2b9dd1479312eeULL, 0x3458008119b56d04ULL,
-        0x50e6790b49801385ULL, 0x5bb9febe2349492bULL, 0x0c2813954299098fULL,
-        0xf747b0c890a071d5ULL, 0x417e8f82cc028d77ULL, 0xa134fee611d804f8ULL,
-        0x24c99ee9a0408761ULL, 0x3ebb224e727137f3ULL, 0x0686022073ceb846ULL,
-        0xa05e901fb82ad7daULL, 0x0ece7dc43ab470fcULL, 0x2d334ecc58f7d6a3ULL,
-        0x23166fadacc54e40ULL, 0x9c3a4472f839556eULL, 0x071717ab5267a4adULL,
-        0xb6600ac351ba3ea0ULL, 0x30ec748313bb63d4ULL, 0xb5374e39287b23ccULL,
-        0x074d75e784238aebULL, 0x77315879243914a4ULL, 0x3bbb1971490865f1ULL,
-        0xa355c21f4fbe02d3ULL, 0x0027f4bb38c8f402ULL, 0xeef8708e652bc5f0ULL,
-        0x7b9aa56cf9440050ULL, 0x113ac03c16cfc924ULL, 0x395db36d3e4bef9fULL,
-        0x5d826fabcaa597aeULL, 0x2a77d3c58786d7e0ULL, 0x85996859a3ba19d4ULL,
-        0x01e7e3c904c2d97fULL, 0x34f90b9b98d51fd0ULL, 0x243aa97fd2e99bb7ULL,
-        0x40a0cebc4f65c1e8ULL, 0x46d3922ed4a5503eULL, 0x446e7ecaf1f9c0a4ULL,
-        0x49dc11558bc2e6aeULL, 0xe7a9f20881793af8ULL, 0x5771cc4bc98103f1ULL,
-        0x2446ea6e718fce90ULL, 0x25d14aca7f7da198ULL, 0x4347af186f9af964ULL,
-        0x10cb44fc9146363aULL, 0x8a35587afce476b4ULL, 0x575144662fee3d3aULL,
-        0x69f41177a6bc7a05ULL, 0x02ff8c38d6b3c898ULL, 0x57c73589a226ca40ULL,
-        0x732f6b5baae66683ULL, 0x00c008bbedd4bb34ULL, 0x7412ff09524d6cadULL,
-        0xb8fd0b5ad8c145a8ULL, 0x74bd9f94b6cdc7dfULL, 0x68233b317ca6c19cULL,
-        0x314b9c2c08b15c54ULL, 0x5bd1ad72072ebd08ULL, 0x6610e6a6c07030e4ULL,
-        0xa4fc38e885ead7ceULL, 0x36975d1ca439e034ULL, 0xa358f0fe358ffb1aULL,
-        0x38e247ad663acf7dULL, 0x77daed3643b5deb8ULL, 0x5507c2aeae1ec3d0ULL,
-        0xfdec226c73acf775ULL, 0x1b87ff5f5033492dULL, 0xa832dee545d9033fULL,
-        0x1cee43a61e41783bULL, 0xdff82b2e2d822f69ULL, 0x2bbc9a376cb38cf2ULL,
-        0x117b1cdaf765dc02ULL, 0x26a407f5682be270ULL, 0x8eb664cf5634af28ULL,
-        0x17cb4513bec68551ULL, 0xb0df6527900cbfd0ULL, 0x335a2dc79c5afdfcULL,
-        0xa2f0ca4cd38dca88ULL, 0x1c370713b81a2de1ULL, 0x849d5df654d1adfcULL,
-        0x2fd1f7675ae14e44ULL, 0x4ff64dfc02247f7bULL, 0x3a2bcf40e395a48dULL,
-        0x436248c821b187c1ULL, 0x29f4337b1c7104c0ULL, 0xfc317c46e6630ec4ULL,
-        0x2774bccc4e3264c7ULL, 0x2d03218d9d5bee23ULL, 0x36a0ed04d659058aULL,
-        0x452484461573cab6ULL, 0x0708edf87ed6272bULL, 0xf07960a1587446cbULL,
-        0x3660167b067d84e0ULL, 0x65990a6993ddf8c4ULL, 0x0b197cd3d0b40b3fULL,
-        0x1dcec4ab619f3a05ULL, 0x722ab223a84f9182ULL, 0x0822d61a81e7c38fULL,
-        0x3d22ad75da563201ULL, 0x93cef6979fd35e0fULL, 0x05c3c25ae598b14cULL,
-        0x1338df97dd496377ULL, 0x15bc324dc9c20acfULL, 0x96397c6127e6e8cfULL,
-        0x004d01069ef2050fULL, 0x2fcf2e27893fdcbcULL, 0x072f77c3e44f4a5cULL,
-        0x5eb1d80b3fe44918ULL, 0x1f59e7c28cc21f22ULL, 0x3390ce5df055c1f8ULL,
-        0x4c0ef11df92cb6bfULL, 0x50f82f9e0848c900ULL, 0x08d0fde3ffc0ae38ULL,
-        0xbd8d0089a3fbfb73ULL, 0x118ba5b0f311ef59ULL, 0x9be9a8407b926a61ULL,
-        0x4ea04fbb21318f63ULL, 0xa1c8e7bb07b871ffULL, 0x1253a7262d5d3b02ULL,
-        0x13e997a0512e5b29ULL, 0x54318460ce9055baULL, 0x4e1d8a4db0054798ULL,
-        0x0b235226e2cade32ULL, 0x2588732c1476b315ULL, 0x16a378750ba8ac68ULL,
-        0xba0b116c04448731ULL, 0x4dd02bd47694c2f1ULL, 0x16d6797b218b6b25ULL,
-        0x769eb3709cfbf936ULL, 0x197746a0ce396f38ULL, 0x7d17ad8465961d6eULL,
-        0xfe58f4998ae19bb4ULL, 0x36df24305233ce69ULL, 0xb88a4eb008f4ee72ULL,
-        0x302b2eb923334787ULL, 0x15a4e3edbe13d448ULL, 0x39a4bf64dd7730ceULL,
-        0xedf25421b31090c4ULL, 0x4d547fc131be3b69ULL, 0x2b316e120ca3b90eULL,
-        0x0faf2357bf18a169ULL, 0x71f34b54ee2c1d62ULL, 0x18eaf6e5c93a3824ULL,
-        0x7e168ba03c1b4c18ULL, 0x1a534dd586d9e871ULL, 0xa2cccd307f5f8c38ULL,
-        0x2999a6fb4dce30f6ULL, 0x8f6d3b02c1d549a6ULL, 0x5cf7f90d817aac5aULL,
-        0xd2a4ceefe66c8170ULL, 0x11560edc4ca959feULL, 0x89e517e6f0dc464dULL,
-        0x75bb8972dddd2085ULL, 0x13859ed1e459d65aULL, 0x057114653326fa84ULL,
-        0xe2e6f465173cc86cULL, 0x0ada4076497d7de4ULL, 0xa856fa10ec6dbf8aULL,
-        0x41505d9a7c25d875ULL, 0x3091b6278382eccdULL, 0x055737185b2c3f13ULL,
-        0x2f4df8ecd6f9c632ULL, 0x0633e89c33552d98ULL, 0xf7673724d16db440ULL,
-        0x7331bd08e636c391ULL, 0x0252f29672fee426ULL, 0x1fc384946b6b9ddeULL,
-        0x03460c12c901443aULL, 0x003a0792e10abcdaULL, 0x8dbec31f624e37d0ULL,
-        0x667420d5bfe4dcbeULL, 0xfbfa30e874ed7641ULL, 0x46d1ae14db7ecef6ULL,
-        0x216bd7e8f5448768ULL, 0x32bcd40d3d69cc88ULL, 0x2e991dbc39b65abeULL,
-        0x0e8fb123a502f553ULL, 0x3d2d486b2c7560c0ULL, 0x09aba1db3079fe03ULL,
-        0xcb540c59398c9bceULL, 0x363970e5339ed600ULL, 0x2caee457c28af00eULL,
-        0x005e7d7ee47f41a0ULL, 0x69fad3eb10f44100ULL, 0x048109388c75beb3ULL,
-        0x253dddf96c7a6fb8ULL, 0x4c47f705b9d47d09ULL, 0x6cec894228b5e978ULL,
-        0x04044bb9f8ff45c2ULL, 0x079e75704d775caeULL, 0x073bd54d2a9e2c33ULL,
-        0xcec7289270a364fbULL, 0x19e7486f19cd9e4eULL, 0xb50ac15b86b76608ULL,
-        0x0620cf81f165c812ULL, 0x63eaaf13be7b11d4ULL, 0x0e0cf831948248c2ULL,
-        0xf0412df8f46e7957ULL, 0x671c1fe752517e3fULL, 0x8841bfb04dd3f540ULL,
-        0x122de4142249f353ULL, 0x40a4959fb0e76870ULL, 0x25cfd3d4b4bbc459ULL,
-        0x78a07c82930c60d0ULL, 0x12c2de24d4cbc969ULL, 0x85d44866096ad7f4ULL,
-        0x1fd917ca66b2007bULL, 0x01fbbb0751764764ULL, 0x3d2a4953c6fe0fdcULL,
-        0xcc1489c5737afd94ULL, 0x1817c5b6a5346f41ULL, 0xe605a6a7e9985644ULL,
-        0x3c50412328ff1946ULL, 0xd8c7fd65817f1291ULL, 0x0bd66975ab66339bULL,
-        0x2baf8fa1c7d10fa9ULL, 0x24abdf06ddef848dULL, 0x14df0c9b2ea4f6c2ULL,
-        0x2be950edfd2cb1f7ULL, 0x21911e21094178b6ULL, 0x0fa54d518a93b379ULL,
-        0xb52508e0ac01ab42ULL, 0x0e035b5fd8cb79beULL, 0x1c1c6d1a3b3c8648ULL,
-        0x286037b42ea9871cULL, 0xfe67bf311e48a340ULL, 0x02324131e932a472ULL,
-        0x2486dc2dd919e2deULL, 0x008aec7f1da1d2ebULL, 0x63269ba0e8d3eb3aULL,
-        0x23c0f11154adb62fULL, 0xc6052393ecd4c018ULL, 0x523585b7d2f5b9fcULL,
-        0xf7e6f8c1e87564c9ULL, 0x09eb9fe5dd32c1a3ULL, 0x4d4f86886e055472ULL,
-        0x67ea17b58a37966bULL, 0x3d3ce8c23b1ed1a8ULL, 0x0df97c5ac48857ceULL,
-        0x9b6992623759eb12ULL, 0x275aa9551ae091f2ULL, 0x08855e19ac5e62e5ULL,
-        0x1155fffe0ae083ccULL, 0xbc9c78db7c570240ULL, 0x074560c447dd2418ULL,
-        0x3bf78d330bcf1e70ULL, 0x49867cd4b7ed134bULL, 0x8e6eee0cb4470accULL,
-        0x1dabafdf59233dd6ULL, 0xea3a50d844fc3fb8ULL, 0x4f03f4454764cb87ULL,
-        0x1f2f41cc36c9e6ecULL, 0x53cba4df42963441ULL, 0x10883b70a88d91fbULL,
-        0x62b1fc77d4eb9481ULL, 0x893d8f2604b362e1ULL, 0x0933b7855368b440ULL,
-        0x9351b545703b2fceULL, 0x59c1d489b9bdd3b4ULL, 0xe72a9c4311417b18ULL,
-        0x5355df77e88eb226ULL, 0xe802c37aa963d7e1ULL, 0x381c3747bd6c3bc3ULL,
-        0x378565573444258cULL, 0x37848b1e52b43c18ULL, 0x5da2cd32bdce12b6ULL,
-        0x13166c5da615f6fdULL, 0xa51ef95efcc66ac8ULL, 0x640c95e473f1e541ULL,
-        0x6ec68def1f217500ULL, 0x49ce3543c76a4079ULL, 0x5fc6fd3cddc706b5ULL,
-        0x05c3c0f0f6a1fb0dULL, 0xe7820c0996ad1bddULL, 0x21f0d752a088f35cULL,
-        0x755405b51d6fc4a0ULL, 0x7ec7649ca4b0e351ULL, 0x3d2b6a46a251f790ULL,
-        0x23e1176b19f418adULL, 0x06056575efe8ac05ULL, 0x0f75981b6966e477ULL,
-        0x06e87ec41ad437e4ULL, 0x43f6c255d5e1cb84ULL, 0xe4e67d1120ceb580ULL,
-        0x2cd67b9e12c26d7bULL, 0xcd00b5ff7fd187f1ULL, 0x3f6cd40accdc4106ULL,
-        0x3e895c835459b330ULL, 0x0814d53a217c0850ULL, 0xc9111fe78bc3a62dULL,
-        0x719967e351473204ULL, 0xe757707d24282aa4ULL, 0x7226b7f5607f98e6ULL,
-        0x7b268ffae3c08d96ULL, 0x16d3917c8b86020eULL, 0x5128bca51c49ea64ULL,
-        0x345ffea02bb1698dULL, 0x9460f5111fe4fbc8ULL, 0x60dd1aa5762852cbULL,
-        0xbb7440ed3c81667cULL, 0x0a4b12affa7f6f5cULL, 0x95cbcb0ae03861b6ULL,
-        0x07ab3b0591db6070ULL, 0xc6476a4c3de78982ULL, 0x204e82e8623ad725ULL,
-        0x569a5b4e8ac2a5ccULL, 0x425a1d77d72ebae2ULL, 0xcdaad5551ab33830ULL,
-        0x0b7c68fd8422939eULL, 0x46d9a01f53ec3020ULL, 0x102871edbb29e852ULL,
-        0x7a8e8084039075a5ULL, 0x40eaede8615e376aULL, 0x4dc67d757a1c751fULL,
-        0x1176ef33063f9145ULL, 0x4ea230285b1c8156ULL, 0x6b2aa46ce0027392ULL,
-        0x32b13230fba1b068ULL, 0x0e69796851bb984fULL, 0xb749f4542db698c0ULL,
-        0x19ad0241ffffd49cULL, 0x2f41e92ef6caff52ULL, 0x4d0b068576747439ULL,
-        0x14d607aef7463e00ULL, 0x1443d00d85fb440eULL, 0x529b43bf68688780ULL,
-        0x21133a6bc3a3e378ULL, 0x865b6436dae0e7e5ULL, 0x6b4fe83dc1d6defcULL,
-        0x03a5858a0ca0be46ULL, 0x1e841b187e67f312ULL, 0x61ee22ef40a66940ULL,
-        0x0494bd2e9e741ef8ULL, 0x4eb59e323010e72cULL, 0x19f2abcfb749810eULL,
-        0xb30f1e4f994ef9bcULL, 0x53cf6cdd51bd2d96ULL, 0x263943036497a514ULL,
-        0x0d4b52170aa2edbaULL, 0x0c4758a1c7b4f758ULL, 0x178dadb1b502b51aULL,
-        0x1ddbb20a602eb57aULL, 0x1fc2e2564a9f27fdULL, 0xd5f8c50a0e3d6f90ULL,
-        0x0081da3bbe72ac09ULL, 0xcf140d002ccdb200ULL, 0x0ae8389f09b017feULL,
-        0x17cc9ffdc03f4440ULL, 0x04eb921d704bcdddULL, 0x139a0ce4cdc521abULL,
-        0x0bfce00c145cb0f0ULL, 0x99925ff132eff707ULL, 0x063f6e5da50c3d35ULL,
-        0xa0c25dea3f0e6e29ULL, 0x0c7a9048cc8e040fULL,
-    };
-
-    const size_t padded = RoundUpTo(kCLMulNum, N);
-    auto expected_lower = AllocateAligned<T>(padded);
-    auto expected_upper = AllocateAligned<T>(padded);
-    CopyBytes<kCLMulNum * sizeof(T)>(kCLMulLower, expected_lower.get());
-    CopyBytes<kCLMulNum * sizeof(T)>(kCLMulUpper, expected_upper.get());
-    const size_t padding_size = (padded - kCLMulNum) * sizeof(T);
-    memset(expected_lower.get() + kCLMulNum, 0, padding_size);
-    memset(expected_upper.get() + kCLMulNum, 0, padding_size);
-
-    // Random inputs in each lane
-    RandomState rng;
-    for (size_t rep = 0; rep < kCLMulNum / N; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in1[i] = Random64(&rng);
-        in2[i] = Random64(&rng);
-      }
-
-      const auto a = Load(d, in1.get());
-      const auto b = Load(d, in2.get());
-#if HWY_PRINT_CLMUL_GOLDEN
-      Store(CLMulLower(a, b), d, expected_lower.get() + rep * N);
-      Store(CLMulUpper(a, b), d, expected_upper.get() + rep * N);
-#else
-      HWY_ASSERT_VEC_EQ(d, expected_lower.get() + rep * N, CLMulLower(a, b));
-      HWY_ASSERT_VEC_EQ(d, expected_upper.get() + rep * N, CLMulUpper(a, b));
-#endif
-    }
-
-#if HWY_PRINT_CLMUL_GOLDEN
-    // RVV lacks PRIu64, so print 32-bit halves.
-    for (size_t i = 0; i < kCLMulNum; ++i) {
-      printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_lower[i] >> 32),
-             static_cast<uint32_t>(expected_lower[i] & 0xFFFFFFFFU));
-    }
-    printf("\n");
-    for (size_t i = 0; i < kCLMulNum; ++i) {
-      printf("0x%08x%08xULL,", static_cast<uint32_t>(expected_upper[i] >> 32),
-             static_cast<uint32_t>(expected_upper[i] & 0xFFFFFFFFU));
-    }
-#endif  // HWY_PRINT_CLMUL_GOLDEN
-#else
-    (void)d;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); }
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyCryptoTest);
-HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllAES);
-HWY_EXPORT_AND_TEST_P(HwyCryptoTest, TestAllCLMul);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/demote_test.cc b/third_party/highway/hwy/tests/demote_test.cc
deleted file mode 100644 (file)
index 4339a54..0000000
+++ /dev/null
@@ -1,326 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/demote_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-// Causes build timeout.
-#if !HWY_IS_MSAN
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <typename T, HWY_IF_FLOAT(T)>
-bool IsFiniteT(T t) {
-  return std::isfinite(t);
-}
-// Wrapper avoids calling std::isfinite for integer types (ambiguous).
-template <typename T, HWY_IF_NOT_FLOAT(T)>
-bool IsFiniteT(T /*unused*/) {
-  return true;
-}
-
-template <typename ToT>
-struct TestDemoteTo {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
-    static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output");
-    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
-    const Rebind<ToT, D> to_d;
-
-    const size_t N = Lanes(from_d);
-    auto from = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<ToT>(N);
-
-    // Narrower range in the wider type, for clamping before we cast
-    const T min = LimitsMin<ToT>();
-    const T max = LimitsMax<ToT>();
-
-    const auto value_ok = [&](T& value) {
-      if (!IsFiniteT(value)) return false;
-      return true;
-    };
-
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        do {
-          const uint64_t bits = rng();
-          CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
-        } while (!value_ok(from[i]));
-        expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max));
-      }
-
-      const auto in = Load(from_d, from.get());
-      HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllDemoteToInt() {
-  ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t());
-  ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t());
-
-  ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t());
-  ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t());
-
-  const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16;
-  to_u16(int32_t());
-
-  const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16;
-  to_i16(int32_t());
-}
-
-HWY_NOINLINE void TestAllDemoteToMixed() {
-#if HWY_HAVE_FLOAT64
-  const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32;
-  to_i32(double());
-#endif
-}
-
-template <typename ToT>
-struct TestDemoteToFloat {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D from_d) {
-    // For floats, we clamp differently and cannot call LimitsMin.
-    static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output");
-    static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider");
-    const Rebind<ToT, D> to_d;
-
-    const size_t N = Lanes(from_d);
-    auto from = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<ToT>(N);
-
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        do {
-          const uint64_t bits = rng();
-          CopyBytes<sizeof(T)>(&bits, &from[i]);  // not same size
-        } while (!IsFiniteT(from[i]));
-        const T magn = std::abs(from[i]);
-        const T max_abs = HighestValue<ToT>();
-        // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
-        // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
-        const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]);
-        expected[i] = static_cast<ToT>(clipped);
-      }
-
-      HWY_ASSERT_VEC_EQ(to_d, expected.get(),
-                        DemoteTo(to_d, Load(from_d, from.get())));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllDemoteToFloat() {
-  // Must test f16 separately because we can only load/store/convert them.
-
-#if HWY_HAVE_FLOAT64
-  const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float;
-  to_float(double());
-#endif
-}
-
-template <class D>
-AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) {
-  const float test_cases[] = {
-      // Same as BF16TestCases:
-      // +/- 1
-      1.0f,
-      -1.0f,
-      // +/- 0
-      0.0f,
-      -0.0f,
-      // near 0
-      0.25f,
-      -0.25f,
-      // +/- integer
-      4.0f,
-      -32.0f,
-      // positive +/- delta
-      2.015625f,
-      3.984375f,
-      // negative +/- delta
-      -2.015625f,
-      -3.984375f,
-
-      // No huge values - would interfere with sum. But add more to fill 2 * N:
-      -2.0f,
-      -10.0f,
-      0.03125f,
-      1.03125f,
-      1.5f,
-      2.0f,
-      4.0f,
-      5.0f,
-      6.0f,
-      8.0f,
-      10.0f,
-      256.0f,
-      448.0f,
-      2080.0f,
-  };
-  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
-  const size_t N = Lanes(d);
-  padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
-  auto in = AllocateAligned<float>(padded);
-  auto expected = AllocateAligned<float>(padded);
-  std::copy(test_cases, test_cases + kNumTestCases, in.get());
-  std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f);
-  return in;
-}
-
-class TestReorderDemote2To {
-  // In-place N^2 selection sort to avoid dependencies
-  void Sort(float* p, size_t count) {
-    for (size_t i = 0; i < count - 1; ++i) {
-      // Find min_element
-      size_t idx_min = i;
-      for (size_t j = i + 1; j < count; j++) {
-        if (p[j] < p[idx_min]) {
-          idx_min = j;
-        }
-      }
-
-      // Swap with current
-      const float tmp = p[i];
-      p[i] = p[idx_min];
-      p[idx_min] = tmp;
-    }
-  }
-
- public:
-  template <typename TF32, class DF32>
-  HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) {
-#if HWY_TARGET != HWY_SCALAR
-    size_t padded;
-    auto in = ReorderBF16TestCases(d32, padded);
-
-    using TBF16 = bfloat16_t;
-    const Repartition<TBF16, DF32> dbf16;
-    const Half<decltype(dbf16)> dbf16_half;
-    const size_t N = Lanes(d32);
-    auto temp16 = AllocateAligned<TBF16>(2 * N);
-    auto expected = AllocateAligned<float>(2 * N);
-    auto actual = AllocateAligned<float>(2 * N);
-
-    for (size_t i = 0; i < padded; i += 2 * N) {
-      const auto f0 = Load(d32, &in[i + 0]);
-      const auto f1 = Load(d32, &in[i + N]);
-      const auto v16 = ReorderDemote2To(dbf16, f0, f1);
-      Store(v16, dbf16, temp16.get());
-      const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0));
-      const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N));
-
-      // Smoke test: sum should be same (with tolerance for non-associativity)
-      const auto sum_expected = GetLane(SumOfLanes(d32, Add(f0, f1)));
-      const auto sum_actual =
-          GetLane(SumOfLanes(d32, Add(promoted0, promoted1)));
-
-      HWY_ASSERT(sum_expected - 1E-4 <= sum_actual &&
-                 sum_actual <= sum_expected + 1E-4);
-
-      // Ensure values are the same after sorting to undo the Reorder
-      Store(f0, d32, expected.get() + 0);
-      Store(f1, d32, expected.get() + N);
-      Store(promoted0, d32, actual.get() + 0);
-      Store(promoted1, d32, actual.get() + N);
-      Sort(expected.get(), 2 * N);
-      Sort(actual.get(), 2 * N);
-      HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0));
-      HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N));
-    }
-#else  // HWY_SCALAR
-    (void)d32;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllReorderDemote2To() {
-  ForShrinkableVectors<TestReorderDemote2To>()(float());
-}
-
-struct TestI32F64 {
-  template <typename TF, class DF>
-  HWY_NOINLINE void operator()(TF /*unused*/, const DF df) {
-    using TI = int32_t;
-    const Rebind<TI, DF> di;
-    const size_t N = Lanes(df);
-
-    // Integer positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0))));
-
-    // Integer negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N))));
-
-    // Above positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001))));
-
-    // Below positive
-    HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999))));
-
-    const TF eps = static_cast<TF>(0.0001);
-    // Above negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)),
-                      DemoteTo(di, Iota(df, -TF(N + 1) + eps)));
-
-    // Below negative
-    HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)),
-                      DemoteTo(di, Iota(df, -TF(N + 1) - eps)));
-
-    // Huge positive float
-    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()),
-                      DemoteTo(di, Set(df, TF(1E12))));
-
-    // Huge negative float
-    HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()),
-                      DemoteTo(di, Set(df, TF(-1E12))));
-  }
-};
-
-HWY_NOINLINE void TestAllI32F64() {
-#if HWY_HAVE_FLOAT64
-  ForDemoteVectors<TestI32F64>()(double());
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  //  !HWY_IS_MSAN
-
-#if HWY_ONCE
-
-namespace hwy {
-#if !HWY_IS_MSAN
-HWY_BEFORE_TEST(HwyDemoteTest);
-HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt);
-HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed);
-HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat);
-HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To);
-HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64);
-#endif  //  !HWY_IS_MSAN
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/float_test.cc b/third_party/highway/hwy/tests/float_test.cc
deleted file mode 100644 (file)
index 05d7b76..0000000
+++ /dev/null
@@ -1,349 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Tests some ops specific to floating-point types (Div, Round etc.)
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <algorithm>
-#include <limits>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/float_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestDiv {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Iota(d, T(-2));
-    const auto v1 = Set(d, T(1));
-
-    // Unchanged after division by 1.
-    HWY_ASSERT_VEC_EQ(d, v, Div(v, v1));
-
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = (T(i) - 2) / T(2);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Div(v, Set(d, T(2))));
-  }
-};
-
-HWY_NOINLINE void TestAllDiv() { ForFloatTypes(ForPartialVectors<TestDiv>()); }
-
-struct TestApproximateReciprocal {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Iota(d, T(-2));
-    const auto nonzero = IfThenElse(Eq(v, Zero(d)), Set(d, T(1)), v);
-    const size_t N = Lanes(d);
-    auto input = AllocateAligned<T>(N);
-    Store(nonzero, d, input.get());
-
-    auto actual = AllocateAligned<T>(N);
-    Store(ApproximateReciprocal(nonzero), d, actual.get());
-
-    double max_l1 = 0.0;
-    double worst_expected = 0.0;
-    double worst_actual = 0.0;
-    for (size_t i = 0; i < N; ++i) {
-      const double expected = 1.0 / input[i];
-      const double l1 = std::abs(expected - actual[i]);
-      if (l1 > max_l1) {
-        max_l1 = l1;
-        worst_expected = expected;
-        worst_actual = actual[i];
-      }
-    }
-    const double abs_worst_expected = std::abs(worst_expected);
-    if (abs_worst_expected > 1E-5) {
-      const double max_rel = max_l1 / abs_worst_expected;
-      fprintf(stderr, "max l1 %f rel %f (%f vs %f)\n", max_l1, max_rel,
-              worst_expected, worst_actual);
-      HWY_ASSERT(max_rel < 0.004);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllApproximateReciprocal() {
-  ForPartialVectors<TestApproximateReciprocal>()(float());
-}
-
-struct TestSquareRoot {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto vi = Iota(d, 0);
-    HWY_ASSERT_VEC_EQ(d, vi, Sqrt(Mul(vi, vi)));
-  }
-};
-
-HWY_NOINLINE void TestAllSquareRoot() {
-  ForFloatTypes(ForPartialVectors<TestSquareRoot>());
-}
-
-struct TestReciprocalSquareRoot {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Set(d, 123.0f);
-    const size_t N = Lanes(d);
-    auto lanes = AllocateAligned<T>(N);
-    Store(ApproximateReciprocalSqrt(v), d, lanes.get());
-    for (size_t i = 0; i < N; ++i) {
-      float err = lanes[i] - 0.090166f;
-      if (err < 0.0f) err = -err;
-      if (err >= 4E-4f) {
-        HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
-                  static_cast<int>(N), lanes[i], err);
-      }
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllReciprocalSquareRoot() {
-  ForPartialVectors<TestReciprocalSquareRoot>()(float());
-}
-
-template <typename T, class D>
-AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) {
-  const T eps = std::numeric_limits<T>::epsilon();
-  const T test_cases[] = {
-    // +/- 1
-    T(1),
-    T(-1),
-    // +/- 0
-    T(0),
-    T(-0),
-    // near 0
-    T(0.4),
-    T(-0.4),
-    // +/- integer
-    T(4),
-    T(-32),
-    // positive near limit
-    MantissaEnd<T>() - T(1.5),
-    MantissaEnd<T>() + T(1.5),
-    // negative near limit
-    -MantissaEnd<T>() - T(1.5),
-    -MantissaEnd<T>() + T(1.5),
-    // positive tiebreak
-    T(1.5),
-    T(2.5),
-    // negative tiebreak
-    T(-1.5),
-    T(-2.5),
-    // positive +/- delta
-    T(2.0001),
-    T(3.9999),
-    // negative +/- delta
-    T(-999.9999),
-    T(-998.0001),
-    // positive +/- epsilon
-    T(1) + eps,
-    T(1) - eps,
-    // negative +/- epsilon
-    T(-1) + eps,
-    T(-1) - eps,
-    // +/- huge (but still fits in float)
-    T(1E34),
-    T(-1E35),
-    // +/- infinity
-    std::numeric_limits<T>::infinity(),
-    -std::numeric_limits<T>::infinity(),
-    // qNaN
-    GetLane(NaN(d))
-  };
-  const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]);
-  const size_t N = Lanes(d);
-  padded = RoundUpTo(kNumTestCases, N);  // allow loading whole vectors
-  auto in = AllocateAligned<T>(padded);
-  auto expected = AllocateAligned<T>(padded);
-  std::copy(test_cases, test_cases + kNumTestCases, in.get());
-  std::fill(in.get() + kNumTestCases, in.get() + padded, T(0));
-  return in;
-}
-
-struct TestRound {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    size_t padded;
-    auto in = RoundTestCases(t, d, padded);
-    auto expected = AllocateAligned<T>(padded);
-
-    for (size_t i = 0; i < padded; ++i) {
-      // Avoid [std::]round, which does not round to nearest *even*.
-      // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
-      // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
-      expected[i] = static_cast<T>(nearbyint(in[i]));
-    }
-    for (size_t i = 0; i < padded; i += Lanes(d)) {
-      HWY_ASSERT_VEC_EQ(d, &expected[i], Round(Load(d, &in[i])));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllRound() {
-  ForFloatTypes(ForPartialVectors<TestRound>());
-}
-
-struct TestNearestInt {
-  template <typename TF, class DF>
-  HWY_NOINLINE void operator()(TF tf, const DF df) {
-    using TI = MakeSigned<TF>;
-    const RebindToSigned<DF> di;
-
-    size_t padded;
-    auto in = RoundTestCases(tf, df, padded);
-    auto expected = AllocateAligned<TI>(padded);
-
-    constexpr double max = static_cast<double>(LimitsMax<TI>());
-    for (size_t i = 0; i < padded; ++i) {
-      if (std::isnan(in[i])) {
-        // We replace NaN with 0 below (no_nan)
-        expected[i] = 0;
-      } else if (std::isinf(in[i]) || double{std::abs(in[i])} >= max) {
-        // Avoid undefined result for lrintf
-        expected[i] = std::signbit(in[i]) ? LimitsMin<TI>() : LimitsMax<TI>();
-      } else {
-        expected[i] = static_cast<TI>(lrintf(in[i]));
-      }
-    }
-    for (size_t i = 0; i < padded; i += Lanes(df)) {
-      const auto v = Load(df, &in[i]);
-      const auto no_nan = IfThenElse(Eq(v, v), v, Zero(df));
-      HWY_ASSERT_VEC_EQ(di, &expected[i], NearestInt(no_nan));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllNearestInt() {
-  ForPartialVectors<TestNearestInt>()(float());
-}
-
-struct TestTrunc {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    size_t padded;
-    auto in = RoundTestCases(t, d, padded);
-    auto expected = AllocateAligned<T>(padded);
-
-    for (size_t i = 0; i < padded; ++i) {
-      // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
-      // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
-      expected[i] = static_cast<T>(trunc(in[i]));
-    }
-    for (size_t i = 0; i < padded; i += Lanes(d)) {
-      HWY_ASSERT_VEC_EQ(d, &expected[i], Trunc(Load(d, &in[i])));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllTrunc() {
-  ForFloatTypes(ForPartialVectors<TestTrunc>());
-}
-
-struct TestCeil {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    size_t padded;
-    auto in = RoundTestCases(t, d, padded);
-    auto expected = AllocateAligned<T>(padded);
-
-    for (size_t i = 0; i < padded; ++i) {
-      expected[i] = std::ceil(in[i]);
-    }
-    for (size_t i = 0; i < padded; i += Lanes(d)) {
-      HWY_ASSERT_VEC_EQ(d, &expected[i], Ceil(Load(d, &in[i])));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllCeil() {
-  ForFloatTypes(ForPartialVectors<TestCeil>());
-}
-
-struct TestFloor {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    size_t padded;
-    auto in = RoundTestCases(t, d, padded);
-    auto expected = AllocateAligned<T>(padded);
-
-    for (size_t i = 0; i < padded; ++i) {
-      expected[i] = std::floor(in[i]);
-    }
-    for (size_t i = 0; i < padded; i += Lanes(d)) {
-      HWY_ASSERT_VEC_EQ(d, &expected[i], Floor(Load(d, &in[i])));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllFloor() {
-  ForFloatTypes(ForPartialVectors<TestFloor>());
-}
-
-struct TestAbsDiff {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto in_lanes_a = AllocateAligned<T>(N);
-    auto in_lanes_b = AllocateAligned<T>(N);
-    auto out_lanes = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes_a[i] = static_cast<T>((i ^ 1u) << i);
-      in_lanes_b[i] = static_cast<T>(i << i);
-      out_lanes[i] = std::abs(in_lanes_a[i] - in_lanes_b[i]);
-    }
-    const auto a = Load(d, in_lanes_a.get());
-    const auto b = Load(d, in_lanes_b.get());
-    const auto expected = Load(d, out_lanes.get());
-    HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(a, b));
-    HWY_ASSERT_VEC_EQ(d, expected, AbsDiff(b, a));
-  }
-};
-
-HWY_NOINLINE void TestAllAbsDiff() {
-  ForPartialVectors<TestAbsDiff>()(float());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyFloatTest);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllDiv);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllApproximateReciprocal);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllSquareRoot);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllReciprocalSquareRoot);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllRound);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllNearestInt);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllTrunc);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllCeil);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllFloor);
-HWY_EXPORT_AND_TEST_P(HwyFloatTest, TestAllAbsDiff);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/hwy_gtest.h b/third_party/highway/hwy/tests/hwy_gtest.h
deleted file mode 100644 (file)
index acecee8..0000000
+++ /dev/null
@@ -1,157 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HWY_TESTS_HWY_GTEST_H_
-#define HWY_TESTS_HWY_GTEST_H_
-
-// Adapters for GUnit to run tests for all targets.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <string>
-#include <utility>  // std::tuple
-
-#include "gtest/gtest.h"
-#include "hwy/highway.h"
-
-namespace hwy {
-
-// googletest before 1.10 didn't define INSTANTIATE_TEST_SUITE_P() but instead
-// used INSTANTIATE_TEST_CASE_P which is now deprecated.
-#ifdef INSTANTIATE_TEST_SUITE_P
-#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_SUITE_P
-#else
-#define HWY_GTEST_INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
-#endif
-
-// Helper class to run parametric tests using the hwy target as parameter. To
-// use this define the following in your test:
-//   class MyTestSuite : public TestWithParamTarget {
-//    ...
-//   };
-//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
-//   TEST_P(MyTestSuite, MyTest) { ... }
-class TestWithParamTarget : public testing::TestWithParam<int64_t> {
- protected:
-  void SetUp() override { SetSupportedTargetsForTest(GetParam()); }
-
-  void TearDown() override {
-    // Check that the parametric test calls SupportedTargets() when the source
-    // was compiled with more than one target. In the single-target case only
-    // static dispatch will be used anyway.
-#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
-    EXPECT_TRUE(GetChosenTarget().IsInitialized())
-        << "This hwy target parametric test doesn't use dynamic-dispatch and "
-           "doesn't need to be parametric.";
-#endif
-    SetSupportedTargetsForTest(0);
-  }
-};
-
-// Function to convert the test parameter of a TestWithParamTarget for
-// displaying it in the gtest test name.
-static inline std::string TestParamTargetName(
-    const testing::TestParamInfo<int64_t>& info) {
-  return TargetName(info.param);
-}
-
-#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite)              \
-  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                           \
-      suite##Group, suite,                                      \
-      testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
-      ::hwy::TestParamTargetName)
-
-// Helper class similar to TestWithParamTarget to run parametric tests that
-// depend on the target and another parametric test. If you need to use multiple
-// extra parameters use a std::tuple<> of them and ::testing::Generate(...) as
-// the generator. To use this class define the following in your test:
-//   class MyTestSuite : public TestWithParamTargetT<int> {
-//    ...
-//   };
-//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(MyTestSuite, ::testing::Range(0, 9));
-//   TEST_P(MyTestSuite, MyTest) { ... GetParam() .... }
-template <typename T>
-class TestWithParamTargetAndT
-    : public ::testing::TestWithParam<std::tuple<int64_t, T>> {
- public:
-  // Expose the parametric type here so it can be used by the
-  // HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T macro.
-  using HwyParamType = T;
-
- protected:
-  void SetUp() override {
-    SetSupportedTargetsForTest(std::get<0>(
-        ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam()));
-  }
-
-  void TearDown() override {
-    // Check that the parametric test calls SupportedTargets() when the source
-    // was compiled with more than one target. In the single-target case only
-    // static dispatch will be used anyway.
-#if (HWY_TARGETS & (HWY_TARGETS - 1)) != 0
-    EXPECT_TRUE(GetChosenTarget().IsInitialized())
-        << "This hwy target parametric test doesn't use dynamic-dispatch and "
-           "doesn't need to be parametric.";
-#endif
-    SetSupportedTargetsForTest(0);
-  }
-
-  T GetParam() {
-    return std::get<1>(
-        ::testing::TestWithParam<std::tuple<int64_t, T>>::GetParam());
-  }
-};
-
-template <typename T>
-std::string TestParamTargetNameAndT(
-    const testing::TestParamInfo<std::tuple<int64_t, T>>& info) {
-  return std::string(TargetName(std::get<0>(info.param))) + "_" +
-         ::testing::PrintToString(std::get<1>(info.param));
-}
-
-#define HWY_TARGET_INSTANTIATE_TEST_SUITE_P_T(suite, generator)     \
-  HWY_GTEST_INSTANTIATE_TEST_SUITE_P(                               \
-      suite##Group, suite,                                          \
-      ::testing::Combine(                                           \
-          testing::ValuesIn(::hwy::SupportedAndGeneratedTargets()), \
-          generator),                                               \
-      ::hwy::TestParamTargetNameAndT<suite::HwyParamType>)
-
-// Helper macro to export a function and define a test that tests it. This is
-// equivalent to do a HWY_EXPORT of a void(void) function and run it in a test:
-//   class MyTestSuite : public TestWithParamTarget {
-//    ...
-//   };
-//   HWY_TARGET_INSTANTIATE_TEST_SUITE_P(MyTestSuite);
-//   HWY_EXPORT_AND_TEST_P(MyTestSuite, MyTest);
-#define HWY_EXPORT_AND_TEST_P(suite, func_name)                   \
-  HWY_EXPORT(func_name);                                          \
-  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(); } \
-  static_assert(true, "For requiring trailing semicolon")
-
-#define HWY_EXPORT_AND_TEST_P_T(suite, func_name)                           \
-  HWY_EXPORT(func_name);                                                    \
-  TEST_P(suite, func_name) { HWY_DYNAMIC_DISPATCH(func_name)(GetParam()); } \
-  static_assert(true, "For requiring trailing semicolon")
-
-#define HWY_BEFORE_TEST(suite)                      \
-  class suite : public hwy::TestWithParamTarget {}; \
-  HWY_TARGET_INSTANTIATE_TEST_SUITE_P(suite);       \
-  static_assert(true, "For requiring trailing semicolon")
-
-}  // namespace hwy
-
-#endif  // HWY_TESTS_HWY_GTEST_H_
diff --git a/third_party/highway/hwy/tests/if_test.cc b/third_party/highway/hwy/tests/if_test.cc
deleted file mode 100644 (file)
index e44a878..0000000
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "hwy/aligned_allocator.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/if_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestIfThenElse {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-    auto in1 = AllocateAligned<T>(N);
-    auto in2 = AllocateAligned<T>(N);
-    auto bool_lanes = AllocateAligned<TI>(N);
-    auto expected = AllocateAligned<T>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in1[i] = static_cast<T>(Random32(&rng));
-        in2[i] = static_cast<T>(Random32(&rng));
-        bool_lanes[i] = (Random32(&rng) & 16) ? TI(1) : TI(0);
-      }
-
-      const auto v1 = Load(d, in1.get());
-      const auto v2 = Load(d, in2.get());
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = bool_lanes[i] ? in1[i] : in2[i];
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElse(mask, v1, v2));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = bool_lanes[i] ? in1[i] : T(0);
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenElseZero(mask, v1));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = bool_lanes[i] ? T(0) : in2[i];
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfThenZeroElse(mask, v2));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllIfThenElse() {
-  ForAllTypes(ForPartialVectors<TestIfThenElse>());
-}
-
-struct TestIfVecThenElse {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TU = MakeUnsigned<T>;  // For all-one mask
-    const Rebind<TU, D> du;
-    const size_t N = Lanes(d);
-    auto in1 = AllocateAligned<T>(N);
-    auto in2 = AllocateAligned<T>(N);
-    auto vec_lanes = AllocateAligned<TU>(N);
-    auto expected = AllocateAligned<T>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in1[i] = static_cast<T>(Random32(&rng));
-        in2[i] = static_cast<T>(Random32(&rng));
-        vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0);
-      }
-
-      const auto v1 = Load(d, in1.get());
-      const auto v2 = Load(d, in2.get());
-      const auto vec = BitCast(d, Load(du, vec_lanes.get()));
-
-      for (size_t i = 0; i < N; ++i) {
-        expected[i] = vec_lanes[i] ? in1[i] : in2[i];
-      }
-      HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllIfVecThenElse() {
-  ForAllTypes(ForPartialVectors<TestIfVecThenElse>());
-}
-
-struct TestZeroIfNegative {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vp = Iota(d, 1);
-    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
-
-    // Zero and positive remain unchanged
-    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(v0));
-    HWY_ASSERT_VEC_EQ(d, vp, ZeroIfNegative(vp));
-
-    // Negative are all replaced with zero
-    HWY_ASSERT_VEC_EQ(d, v0, ZeroIfNegative(vn));
-  }
-};
-
-HWY_NOINLINE void TestAllZeroIfNegative() {
-  ForFloatTypes(ForPartialVectors<TestZeroIfNegative>());
-}
-
-struct TestIfNegative {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vp = Iota(d, 1);
-    const auto vn = Or(vp, SignBit(d));
-
-    // Zero and positive remain unchanged
-    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0));
-    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn));
-    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp));
-    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn));
-
-    // Negative are replaced with 2nd arg
-    HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp));
-    HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0));
-    HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn));
-  }
-};
-
-HWY_NOINLINE void TestAllIfNegative() {
-  ForFloatTypes(ForPartialVectors<TestIfNegative>());
-  ForSignedTypes(ForPartialVectors<TestIfNegative>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyIfTest);
-HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfThenElse);
-HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfVecThenElse);
-HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllZeroIfNegative);
-HWY_EXPORT_AND_TEST_P(HwyIfTest, TestAllIfNegative);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/interleaved_test.cc b/third_party/highway/hwy/tests/interleaved_test.cc
deleted file mode 100644 (file)
index 4d1fbd5..0000000
+++ /dev/null
@@ -1,256 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/interleaved_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestLoadStoreInterleaved2 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-
-    RandomState rng;
-
-    // Data to be interleaved
-    auto bytes = AllocateAligned<T>(2 * N);
-    for (size_t i = 0; i < 2 * N; ++i) {
-      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
-    }
-    const auto in0 = Load(d, &bytes[0 * N]);
-    const auto in1 = Load(d, &bytes[1 * N]);
-
-    // Interleave here, ensure vector results match scalar
-    auto expected = AllocateAligned<T>(3 * N);
-    auto actual_aligned = AllocateAligned<T>(3 * N + 1);
-    T* actual = actual_aligned.get() + 1;
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        expected[2 * i + 0] = bytes[0 * N + i];
-        expected[2 * i + 1] = bytes[1 * N + i];
-        // Ensure we do not write more than 2*N bytes
-        expected[2 * N + i] = actual[2 * N + i] = 0;
-      }
-      StoreInterleaved2(in0, in1, d, actual);
-      size_t pos = 0;
-      if (!BytesEqual(expected.get(), actual, 3 * N * sizeof(T), &pos)) {
-        Print(d, "in0", in0, pos / 4);
-        Print(d, "in1", in1, pos / 4);
-        const size_t i = pos;
-        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
-                static_cast<int>(i), static_cast<double>(actual[i]),
-                static_cast<double>(actual[i + 1]),
-                static_cast<double>(actual[i + 2]),
-                static_cast<double>(actual[i + 3]),
-                static_cast<double>(actual[i + 4]),
-                static_cast<double>(actual[i + 5]),
-                static_cast<double>(actual[i + 6]),
-                static_cast<double>(actual[i + 7]));
-        HWY_ASSERT(false);
-      }
-
-      Vec<D> out0, out1;
-      LoadInterleaved2(d, actual, out0, out1);
-      HWY_ASSERT_VEC_EQ(d, in0, out0);
-      HWY_ASSERT_VEC_EQ(d, in1, out1);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved2() {
-#if HWY_TARGET == HWY_RVV
-  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
-  const ForExtendableVectors<TestLoadStoreInterleaved2, 2> test;
-#else
-  const ForPartialVectors<TestLoadStoreInterleaved2> test;
-#endif
-  ForAllTypes(test);
-}
-
-// Workaround for build timeout on GCC 12 aarch64, see #776
-#if HWY_COMPILER_GCC_ACTUAL >= 1200 && HWY_ARCH_ARM_A64
-#define HWY_BROKEN_LOAD34 1
-#else
-#define HWY_BROKEN_LOAD34 0
-#endif
-
-#if !HWY_BROKEN_LOAD34
-
-struct TestLoadStoreInterleaved3 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-
-    RandomState rng;
-
-    // Data to be interleaved
-    auto bytes = AllocateAligned<T>(3 * N);
-    for (size_t i = 0; i < 3 * N; ++i) {
-      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
-    }
-    const auto in0 = Load(d, &bytes[0 * N]);
-    const auto in1 = Load(d, &bytes[1 * N]);
-    const auto in2 = Load(d, &bytes[2 * N]);
-
-    // Interleave here, ensure vector results match scalar
-    auto expected = AllocateAligned<T>(4 * N);
-    auto actual_aligned = AllocateAligned<T>(4 * N + 1);
-    T* actual = actual_aligned.get() + 1;
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        expected[3 * i + 0] = bytes[0 * N + i];
-        expected[3 * i + 1] = bytes[1 * N + i];
-        expected[3 * i + 2] = bytes[2 * N + i];
-        // Ensure we do not write more than 3*N bytes
-        expected[3 * N + i] = actual[3 * N + i] = 0;
-      }
-      StoreInterleaved3(in0, in1, in2, d, actual);
-      size_t pos = 0;
-      if (!BytesEqual(expected.get(), actual, 4 * N * sizeof(T), &pos)) {
-        Print(d, "in0", in0, pos / 3, N);
-        Print(d, "in1", in1, pos / 3, N);
-        Print(d, "in2", in2, pos / 3, N);
-        const size_t i = pos;
-        fprintf(stderr, "interleaved i=%d %f %f %f  %f %f %f\n",
-                static_cast<int>(i), static_cast<double>(actual[i]),
-                static_cast<double>(actual[i + 1]),
-                static_cast<double>(actual[i + 2]),
-                static_cast<double>(actual[i + 3]),
-                static_cast<double>(actual[i + 4]),
-                static_cast<double>(actual[i + 5]));
-        HWY_ASSERT(false);
-      }
-
-      Vec<D> out0, out1, out2;
-      LoadInterleaved3(d, actual, out0, out1, out2);
-      HWY_ASSERT_VEC_EQ(d, in0, out0);
-      HWY_ASSERT_VEC_EQ(d, in1, out1);
-      HWY_ASSERT_VEC_EQ(d, in2, out2);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved3() {
-#if HWY_TARGET == HWY_RVV
-  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
-  const ForExtendableVectors<TestLoadStoreInterleaved3, 2> test;
-#else
-  const ForPartialVectors<TestLoadStoreInterleaved3> test;
-#endif
-  ForAllTypes(test);
-}
-
-struct TestLoadStoreInterleaved4 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-
-    RandomState rng;
-
-    // Data to be interleaved
-    auto bytes = AllocateAligned<T>(4 * N);
-
-    for (size_t i = 0; i < 4 * N; ++i) {
-      bytes[i] = static_cast<T>(Random32(&rng) & 0xFF);
-    }
-    const auto in0 = Load(d, &bytes[0 * N]);
-    const auto in1 = Load(d, &bytes[1 * N]);
-    const auto in2 = Load(d, &bytes[2 * N]);
-    const auto in3 = Load(d, &bytes[3 * N]);
-
-    // Interleave here, ensure vector results match scalar
-    auto expected = AllocateAligned<T>(5 * N);
-    auto actual_aligned = AllocateAligned<T>(5 * N + 1);
-    T* actual = actual_aligned.get() + 1;
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        expected[4 * i + 0] = bytes[0 * N + i];
-        expected[4 * i + 1] = bytes[1 * N + i];
-        expected[4 * i + 2] = bytes[2 * N + i];
-        expected[4 * i + 3] = bytes[3 * N + i];
-        // Ensure we do not write more than 4*N bytes
-        expected[4 * N + i] = actual[4 * N + i] = 0;
-      }
-      StoreInterleaved4(in0, in1, in2, in3, d, actual);
-      size_t pos = 0;
-      if (!BytesEqual(expected.get(), actual, 5 * N * sizeof(T), &pos)) {
-        Print(d, "in0", in0, pos / 4);
-        Print(d, "in1", in1, pos / 4);
-        Print(d, "in2", in2, pos / 4);
-        Print(d, "in3", in3, pos / 4);
-        const size_t i = pos;
-        fprintf(stderr, "interleaved i=%d %f %f %f %f  %f %f %f %f\n",
-                static_cast<int>(i), static_cast<double>(actual[i]),
-                static_cast<double>(actual[i + 1]),
-                static_cast<double>(actual[i + 2]),
-                static_cast<double>(actual[i + 3]),
-                static_cast<double>(actual[i + 4]),
-                static_cast<double>(actual[i + 5]),
-                static_cast<double>(actual[i + 6]),
-                static_cast<double>(actual[i + 7]));
-        HWY_ASSERT(false);
-      }
-
-      Vec<D> out0, out1, out2, out3;
-      LoadInterleaved4(d, actual, out0, out1, out2, out3);
-      HWY_ASSERT_VEC_EQ(d, in0, out0);
-      HWY_ASSERT_VEC_EQ(d, in1, out1);
-      HWY_ASSERT_VEC_EQ(d, in2, out2);
-      HWY_ASSERT_VEC_EQ(d, in3, out3);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLoadStoreInterleaved4() {
-#if HWY_TARGET == HWY_RVV
-  // Segments are limited to 8 registers, so we can only go up to LMUL=2.
-  const ForExtendableVectors<TestLoadStoreInterleaved4, 2> test;
-#else
-  const ForPartialVectors<TestLoadStoreInterleaved4> test;
-#endif
-  ForAllTypes(test);
-}
-
-#endif  // !HWY_BROKEN_LOAD34
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyInterleavedTest);
-HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved2);
-#if !HWY_BROKEN_LOAD34
-HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved3);
-HWY_EXPORT_AND_TEST_P(HwyInterleavedTest, TestAllLoadStoreInterleaved4);
-#endif
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/list_targets.cc b/third_party/highway/hwy/tests/list_targets.cc
deleted file mode 100644 (file)
index d09ee4f..0000000
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2020 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Simple tool to print the list of targets that were compiled in when building
-// this tool.
-
-#include <stdio.h>
-
-#include "hwy/highway.h"
-
-void PrintTargets(const char* msg, int64_t targets) {
-  fprintf(stderr, "%s", msg);
-  // For each bit:
-  for (int64_t x = targets; x != 0; x = x & (x - 1)) {
-    // Extract value of least-significant bit.
-    fprintf(stderr, " %s", hwy::TargetName(x & (~x + 1)));
-  }
-  fprintf(stderr, "\n");
-}
-
-int main() {
-#ifdef HWY_COMPILE_ONLY_EMU128
-  const int only_emu128 = 1;
-#else
-  const int only_emu128 = 0;
-#endif
-#ifdef HWY_COMPILE_ONLY_SCALAR
-  const int only_scalar = 1;
-#else
-  const int only_scalar = 0;
-#endif
-#ifdef HWY_COMPILE_ONLY_STATIC
-  const int only_static = 1;
-#else
-  const int only_static = 0;
-#endif
-#ifdef HWY_COMPILE_ALL_ATTAINABLE
-  const int all_attain = 1;
-#else
-  const int all_attain = 0;
-#endif
-#ifdef HWY_IS_TEST
-  const int is_test = 1;
-#else
-  const int is_test = 0;
-#endif
-
-  fprintf(stderr,
-          "Config: emu128:%d scalar:%d static:%d all_attain:%d is_test:%d\n",
-          only_emu128, only_scalar, only_static, all_attain, is_test);
-  PrintTargets("Compiled HWY_TARGETS:  ", HWY_TARGETS);
-  PrintTargets("HWY_ATTAINABLE_TARGETS:", HWY_ATTAINABLE_TARGETS);
-  PrintTargets("HWY_BASELINE_TARGETS:  ", HWY_BASELINE_TARGETS);
-  PrintTargets("HWY_STATIC_TARGET:     ", HWY_STATIC_TARGET);
-  PrintTargets("HWY_BROKEN_TARGETS:    ", HWY_BROKEN_TARGETS);
-  PrintTargets("HWY_DISABLED_TARGETS:  ", HWY_DISABLED_TARGETS);
-  PrintTargets("Current CPU supports:  ", hwy::SupportedTargets());
-  return 0;
-}
diff --git a/third_party/highway/hwy/tests/logical_test.cc b/third_party/highway/hwy/tests/logical_test.cc
deleted file mode 100644 (file)
index fa2b9b9..0000000
+++ /dev/null
@@ -1,270 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcmp
-
-#include "hwy/aligned_allocator.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/logical_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestLogicalInteger {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vi = Iota(d, 0);
-    const auto ones = VecFromMask(d, Eq(v0, v0));
-    const auto v1 = Set(d, 1);
-    const auto vnot1 = Set(d, T(~T(1)));
-
-    HWY_ASSERT_VEC_EQ(d, v0, Not(ones));
-    HWY_ASSERT_VEC_EQ(d, ones, Not(v0));
-    HWY_ASSERT_VEC_EQ(d, v1, Not(vnot1));
-    HWY_ASSERT_VEC_EQ(d, vnot1, Not(v1));
-
-    HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, v0, Or3(v0, v0, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Or3(v0, vi, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Or3(vi, vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi));
-
-    auto v = vi;
-    v = And(v, vi);
-    HWY_ASSERT_VEC_EQ(d, vi, v);
-    v = And(v, v0);
-    HWY_ASSERT_VEC_EQ(d, v0, v);
-
-    v = Or(v, vi);
-    HWY_ASSERT_VEC_EQ(d, vi, v);
-    v = Or(v, v0);
-    HWY_ASSERT_VEC_EQ(d, vi, v);
-
-    v = Xor(v, vi);
-    HWY_ASSERT_VEC_EQ(d, v0, v);
-    v = Xor(v, v0);
-    HWY_ASSERT_VEC_EQ(d, v0, v);
-  }
-};
-
-HWY_NOINLINE void TestAllLogicalInteger() {
-  ForIntegerTypes(ForPartialVectors<TestLogicalInteger>());
-}
-
-struct TestLogicalFloat {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vi = Iota(d, 0);
-
-    HWY_ASSERT_VEC_EQ(d, v0, And(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, v0, And(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, And(vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, vi, Or(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, vi, Or(vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, vi, Xor(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Xor(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, Xor(vi, vi));
-
-    HWY_ASSERT_VEC_EQ(d, vi, AndNot(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi));
-
-    auto v = vi;
-    v = And(v, vi);
-    HWY_ASSERT_VEC_EQ(d, vi, v);
-    v = And(v, v0);
-    HWY_ASSERT_VEC_EQ(d, v0, v);
-
-    v = Or(v, vi);
-    HWY_ASSERT_VEC_EQ(d, vi, v);
-    v = Or(v, v0);
-    HWY_ASSERT_VEC_EQ(d, vi, v);
-
-    v = Xor(v, vi);
-    HWY_ASSERT_VEC_EQ(d, v0, v);
-    v = Xor(v, v0);
-    HWY_ASSERT_VEC_EQ(d, v0, v);
-  }
-};
-
-HWY_NOINLINE void TestAllLogicalFloat() {
-  ForFloatTypes(ForPartialVectors<TestLogicalFloat>());
-}
-
-struct TestCopySign {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto vp = Iota(d, 1);
-    const auto vn = Iota(d, T(-1E5));  // assumes N < 10^5
-
-    // Zero remains zero regardless of sign
-    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vp));
-    HWY_ASSERT_VEC_EQ(d, v0, CopySign(v0, vn));
-    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vp));
-    HWY_ASSERT_VEC_EQ(d, v0, CopySignToAbs(v0, vn));
-
-    // Positive input, positive sign => unchanged
-    HWY_ASSERT_VEC_EQ(d, vp, CopySign(vp, vp));
-    HWY_ASSERT_VEC_EQ(d, vp, CopySignToAbs(vp, vp));
-
-    // Positive input, negative sign => negated
-    HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySign(vp, vn));
-    HWY_ASSERT_VEC_EQ(d, Neg(vp), CopySignToAbs(vp, vn));
-
-    // Negative input, negative sign => unchanged
-    HWY_ASSERT_VEC_EQ(d, vn, CopySign(vn, vn));
-
-    // Negative input, positive sign => negated
-    HWY_ASSERT_VEC_EQ(d, Neg(vn), CopySign(vn, vp));
-  }
-};
-
-HWY_NOINLINE void TestAllCopySign() {
-  ForFloatTypes(ForPartialVectors<TestCopySign>());
-}
-
-struct TestBroadcastSignBit {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto s0 = Zero(d);
-    const auto s1 = Set(d, -1);  // all bit set
-    const auto vpos = And(Iota(d, 0), Set(d, LimitsMax<T>()));
-    const auto vneg = Sub(s1, vpos);
-
-    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(vpos));
-    HWY_ASSERT_VEC_EQ(d, s0, BroadcastSignBit(Set(d, LimitsMax<T>())));
-
-    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(vneg));
-    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>())));
-    HWY_ASSERT_VEC_EQ(d, s1, BroadcastSignBit(Set(d, LimitsMin<T>() / 2)));
-  }
-};
-
-HWY_NOINLINE void TestAllBroadcastSignBit() {
-  ForSignedTypes(ForPartialVectors<TestBroadcastSignBit>());
-}
-
-struct TestTestBit {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t kNumBits = sizeof(T) * 8;
-    for (size_t i = 0; i < kNumBits; ++i) {
-      const auto bit1 = Set(d, T(1ull << i));
-      const auto bit2 = Set(d, T(1ull << ((i + 1) % kNumBits)));
-      const auto bit3 = Set(d, T(1ull << ((i + 2) % kNumBits)));
-      const auto bits12 = Or(bit1, bit2);
-      const auto bits23 = Or(bit2, bit3);
-      HWY_ASSERT(AllTrue(d, TestBit(bit1, bit1)));
-      HWY_ASSERT(AllTrue(d, TestBit(bits12, bit1)));
-      HWY_ASSERT(AllTrue(d, TestBit(bits12, bit2)));
-
-      HWY_ASSERT(AllFalse(d, TestBit(bits12, bit3)));
-      HWY_ASSERT(AllFalse(d, TestBit(bits23, bit1)));
-      HWY_ASSERT(AllFalse(d, TestBit(bit1, bit2)));
-      HWY_ASSERT(AllFalse(d, TestBit(bit2, bit1)));
-      HWY_ASSERT(AllFalse(d, TestBit(bit1, bit3)));
-      HWY_ASSERT(AllFalse(d, TestBit(bit3, bit1)));
-      HWY_ASSERT(AllFalse(d, TestBit(bit2, bit3)));
-      HWY_ASSERT(AllFalse(d, TestBit(bit3, bit2)));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllTestBit() {
-  ForIntegerTypes(ForPartialVectors<TestTestBit>());
-}
-
-struct TestPopulationCount {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-    size_t N = Lanes(d);
-    auto data = AllocateAligned<T>(N);
-    auto popcnt = AllocateAligned<T>(N);
-    for (size_t i = 0; i < AdjustedReps(1 << 18) / N; i++) {
-      for (size_t i = 0; i < N; i++) {
-        data[i] = static_cast<T>(rng());
-        popcnt[i] = static_cast<T>(PopCount(data[i]));
-      }
-      HWY_ASSERT_VEC_EQ(d, popcnt.get(), PopulationCount(Load(d, data.get())));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllPopulationCount() {
-  ForUnsignedTypes(ForPartialVectors<TestPopulationCount>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyLogicalTest);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit);
-HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/mask_mem_test.cc b/third_party/highway/hwy/tests/mask_mem_test.cc
deleted file mode 100644 (file)
index c44119d..0000000
+++ /dev/null
@@ -1,197 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef __STDC_FORMAT_MACROS
-#define __STDC_FORMAT_MACROS  // before inttypes.h
-#endif
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcmp
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/mask_mem_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestMaskedLoad {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-    auto bool_lanes = AllocateAligned<TI>(N);
-
-    auto lanes = AllocateAligned<T>(N);
-    Store(Iota(d, T{1}), d, lanes.get());
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
-      }
-
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-      const auto expected = IfThenElseZero(mask, Load(d, lanes.get()));
-      const auto actual = MaskedLoad(mask, d, lanes.get());
-      HWY_ASSERT_VEC_EQ(d, expected, actual);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllMaskedLoad() {
-  ForAllTypes(ForPartialVectors<TestMaskedLoad>());
-}
-
-struct TestBlendedStore {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-    auto bool_lanes = AllocateAligned<TI>(N);
-
-    const Vec<D> v = Iota(d, T{1});
-    auto actual = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<T>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
-        // Re-initialize to something distinct from v[i].
-        actual[i] = static_cast<T>(127 - (i & 127));
-        expected[i] = bool_lanes[i] ? static_cast<T>(i + 1) : actual[i];
-      }
-
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-      BlendedStore(v, mask, d, actual.get());
-      HWY_ASSERT_VEC_EQ(d, expected.get(), Load(d, actual.get()));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllBlendedStore() {
-  ForAllTypes(ForPartialVectors<TestBlendedStore>());
-}
-
-class TestStoreMaskBits {
- public:
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*t*/, D /*d*/) {
-    RandomState rng;
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(di);
-    auto bool_lanes = AllocateAligned<TI>(N);
-
-    const ScalableTag<uint8_t, -3> d_bits;
-    const size_t expected_num_bytes = (N + 7) / 8;
-    auto expected = AllocateAligned<uint8_t>(expected_num_bytes);
-    auto actual = AllocateAligned<uint8_t>(HWY_MAX(8, expected_num_bytes));
-
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      // Generate random mask pattern.
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = static_cast<TI>((rng() & 1024) ? 1 : 0);
-      }
-      const auto bools = Load(di, bool_lanes.get());
-      const auto mask = Gt(bools, Zero(di));
-
-      // Requires at least 8 bytes, ensured above.
-      const size_t bytes_written = StoreMaskBits(di, mask, actual.get());
-      if (bytes_written != expected_num_bytes) {
-        fprintf(stderr, "%s expected %" PRIu64 " bytes, actual %" PRIu64 "\n",
-                TypeName(T(), N).c_str(),
-                static_cast<uint64_t>(expected_num_bytes),
-                static_cast<uint64_t>(bytes_written));
-
-        HWY_ASSERT(false);
-      }
-
-      // Requires at least 8 bytes, ensured above.
-      const auto mask2 = LoadMaskBits(di, actual.get());
-      HWY_ASSERT_MASK_EQ(di, mask, mask2);
-
-      memset(expected.get(), 0, expected_num_bytes);
-      for (size_t i = 0; i < N; ++i) {
-        expected[i / 8] =
-            static_cast<uint8_t>(expected[i / 8] | (bool_lanes[i] << (i % 8)));
-      }
-
-      size_t i = 0;
-      // Stored bits must match original mask
-      for (; i < N; ++i) {
-        const TI is_set = (actual[i / 8] & (1 << (i % 8))) ? 1 : 0;
-        if (is_set != bool_lanes[i]) {
-          fprintf(stderr, "%s lane %" PRIu64 ": expected %d, actual %d\n",
-                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i),
-                  static_cast<int>(bool_lanes[i]), static_cast<int>(is_set));
-          Print(di, "bools", bools, 0, N);
-          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
-                expected_num_bytes);
-          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
-                expected_num_bytes);
-
-          HWY_ASSERT(false);
-        }
-      }
-      // Any partial bits in the last byte must be zero
-      for (; i < 8 * bytes_written; ++i) {
-        const int bit = (actual[i / 8] & (1 << (i % 8)));
-        if (bit != 0) {
-          fprintf(stderr, "%s: bit #%" PRIu64 " should be zero\n",
-                  TypeName(T(), N).c_str(), static_cast<uint64_t>(i));
-          Print(di, "bools", bools, 0, N);
-          Print(d_bits, "expected bytes", Load(d_bits, expected.get()), 0,
-                expected_num_bytes);
-          Print(d_bits, "actual bytes", Load(d_bits, actual.get()), 0,
-                expected_num_bytes);
-
-          HWY_ASSERT(false);
-        }
-      }
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllStoreMaskBits() {
-  ForAllTypes(ForPartialVectors<TestStoreMaskBits>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyMaskTest);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskedLoad);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllBlendedStore);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllStoreMaskBits);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/mask_test.cc b/third_party/highway/hwy/tests/mask_test.cc
deleted file mode 100644 (file)
index 90004d7..0000000
+++ /dev/null
@@ -1,287 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>  // memcmp
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/mask_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// All types.
-struct TestFromVec {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto lanes = AllocateAligned<T>(N);
-
-    memset(lanes.get(), 0, N * sizeof(T));
-    const auto actual_false = MaskFromVec(Load(d, lanes.get()));
-    HWY_ASSERT_MASK_EQ(d, MaskFalse(d), actual_false);
-
-    memset(lanes.get(), 0xFF, N * sizeof(T));
-    const auto actual_true = MaskFromVec(Load(d, lanes.get()));
-    HWY_ASSERT_MASK_EQ(d, MaskTrue(d), actual_true);
-  }
-};
-
-HWY_NOINLINE void TestAllFromVec() {
-  ForAllTypes(ForPartialVectors<TestFromVec>());
-}
-
-struct TestFirstN {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto bool_lanes = AllocateAligned<T>(N);
-
-    using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(T))>;
-    const size_t max_len = static_cast<size_t>(LimitsMax<TN>());
-
-    const size_t max_lanes = HWY_MIN(2 * N, AdjustedReps(512));
-    for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) {
-      // Loop instead of Iota+Lt to avoid wraparound for 8-bit T.
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = (i < len) ? T{1} : 0;
-      }
-      const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1}));
-      HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, len));
-    }
-
-    // Also ensure huge values yield all-true (unless the vector is actually
-    // larger than max_len).
-    for (size_t i = 0; i < N; ++i) {
-      bool_lanes[i] = (i < max_len) ? T{1} : 0;
-    }
-    const auto expected = Eq(Load(d, bool_lanes.get()), Set(d, T{1}));
-    HWY_ASSERT_MASK_EQ(d, expected, FirstN(d, max_len));
-  }
-};
-
-HWY_NOINLINE void TestAllFirstN() {
-  ForAllTypes(ForPartialVectors<TestFirstN>());
-}
-
-struct TestMaskVec {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(d);
-    auto bool_lanes = AllocateAligned<TI>(N);
-
-    // Each lane should have a chance of having mask=true.
-    for (size_t rep = 0; rep < AdjustedReps(200); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        bool_lanes[i] = (Random32(&rng) & 1024) ? TI(1) : TI(0);
-      }
-
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-      HWY_ASSERT_MASK_EQ(d, mask, MaskFromVec(VecFromMask(d, mask)));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllMaskVec() {
-  const ForPartialVectors<TestMaskVec> test;
-
-  test(uint16_t());
-  test(int16_t());
-  // TODO(janwas): float16_t - cannot compare yet
-
-  ForUIF3264(test);
-}
-
-struct TestAllTrueFalse {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto zero = Zero(d);
-    auto v = zero;
-
-    const size_t N = Lanes(d);
-    auto lanes = AllocateAligned<T>(N);
-    std::fill(lanes.get(), lanes.get() + N, T(0));
-
-    HWY_ASSERT(AllTrue(d, Eq(v, zero)));
-    HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
-
-    // Single lane implies AllFalse = !AllTrue. Otherwise, there are multiple
-    // lanes and one is nonzero.
-    const bool expected_all_false = (N != 1);
-
-    // Set each lane to nonzero and back to zero
-    for (size_t i = 0; i < N; ++i) {
-      lanes[i] = T(1);
-      v = Load(d, lanes.get());
-
-      HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
-
-      HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
-
-      lanes[i] = T(-1);
-      v = Load(d, lanes.get());
-      HWY_ASSERT(!AllTrue(d, Eq(v, zero)));
-      HWY_ASSERT(expected_all_false ^ AllFalse(d, Eq(v, zero)));
-
-      // Reset to all zero
-      lanes[i] = T(0);
-      v = Load(d, lanes.get());
-      HWY_ASSERT(AllTrue(d, Eq(v, zero)));
-      HWY_ASSERT(!AllFalse(d, Eq(v, zero)));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllAllTrueFalse() {
-  ForAllTypes(ForPartialVectors<TestAllTrueFalse>());
-}
-
-struct TestCountTrue {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(di);
-    auto bool_lanes = AllocateAligned<TI>(N);
-    memset(bool_lanes.get(), 0, N * sizeof(TI));
-
-    // For all combinations of zero/nonzero state of subset of lanes:
-    const size_t max_lanes = HWY_MIN(N, size_t(10));
-
-    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
-      // Number of zeros written = number of mask lanes that are true.
-      size_t expected = 0;
-      for (size_t i = 0; i < max_lanes; ++i) {
-        const bool is_true = (code & (1ull << i)) != 0;
-        bool_lanes[i] = is_true ? TI(1) : TI(0);
-        expected += is_true;
-      }
-
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-      const size_t actual = CountTrue(d, mask);
-      HWY_ASSERT_EQ(expected, actual);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllCountTrue() {
-  ForAllTypes(ForPartialVectors<TestCountTrue>());
-}
-
-struct TestFindFirstTrue {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(di);
-    auto bool_lanes = AllocateAligned<TI>(N);
-    memset(bool_lanes.get(), 0, N * sizeof(TI));
-
-    // For all combinations of zero/nonzero state of subset of lanes:
-    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9)));
-
-    HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d)));
-    HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d)));
-
-    for (size_t code = 1; code < (1ull << max_lanes); ++code) {
-      for (size_t i = 0; i < max_lanes; ++i) {
-        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
-      }
-
-      const intptr_t expected = static_cast<intptr_t>(
-          Num0BitsBelowLS1Bit_Nonzero32(static_cast<uint32_t>(code)));
-      const auto mask = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-      const intptr_t actual = FindFirstTrue(d, mask);
-      HWY_ASSERT_EQ(expected, actual);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllFindFirstTrue() {
-  ForAllTypes(ForPartialVectors<TestFindFirstTrue>());
-}
-
-struct TestLogicalMask {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto m0 = MaskFalse(d);
-    const auto m_all = MaskTrue(d);
-
-    using TI = MakeSigned<T>;  // For mask > 0 comparison
-    const Rebind<TI, D> di;
-    const size_t N = Lanes(di);
-    auto bool_lanes = AllocateAligned<TI>(N);
-    memset(bool_lanes.get(), 0, N * sizeof(TI));
-
-    HWY_ASSERT_MASK_EQ(d, m0, Not(m_all));
-    HWY_ASSERT_MASK_EQ(d, m_all, Not(m0));
-
-    // For all combinations of zero/nonzero state of subset of lanes:
-    const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6)));
-    for (size_t code = 0; code < (1ull << max_lanes); ++code) {
-      for (size_t i = 0; i < max_lanes; ++i) {
-        bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0);
-      }
-
-      const auto m = RebindMask(d, Gt(Load(di, bool_lanes.get()), Zero(di)));
-
-      HWY_ASSERT_MASK_EQ(d, m0, Xor(m, m));
-      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m, m));
-      HWY_ASSERT_MASK_EQ(d, m0, AndNot(m_all, m));
-
-      HWY_ASSERT_MASK_EQ(d, m, Or(m, m));
-      HWY_ASSERT_MASK_EQ(d, m, Or(m0, m));
-      HWY_ASSERT_MASK_EQ(d, m, Or(m, m0));
-      HWY_ASSERT_MASK_EQ(d, m, Xor(m0, m));
-      HWY_ASSERT_MASK_EQ(d, m, Xor(m, m0));
-      HWY_ASSERT_MASK_EQ(d, m, And(m, m));
-      HWY_ASSERT_MASK_EQ(d, m, And(m_all, m));
-      HWY_ASSERT_MASK_EQ(d, m, And(m, m_all));
-      HWY_ASSERT_MASK_EQ(d, m, AndNot(m0, m));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLogicalMask() {
-  ForAllTypes(ForPartialVectors<TestLogicalMask>());
-}
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyMaskTest);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFromVec);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFirstN);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllMaskVec);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllAllTrueFalse);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllCountTrue);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllFindFirstTrue);
-HWY_EXPORT_AND_TEST_P(HwyMaskTest, TestAllLogicalMask);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/memory_test.cc b/third_party/highway/hwy/tests/memory_test.cc
deleted file mode 100644 (file)
index b78be2b..0000000
+++ /dev/null
@@ -1,341 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Ensure incompabilities with Windows macros (e.g. #define StoreFence) are
-// detected. Must come before Highway headers.
-#include "hwy/base.h"
-#if defined(_WIN32) || defined(_WIN64)
-#include <windows.h>
-#endif
-
-#include <stddef.h>
-#include <stdint.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
-#include "hwy/cache_control.h"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestLoadStore {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const auto hi = Iota(d, static_cast<T>(1 + N));
-    const auto lo = Iota(d, 1);
-    auto lanes = AllocateAligned<T>(2 * N);
-    Store(hi, d, &lanes[N]);
-    Store(lo, d, &lanes[0]);
-
-    // Aligned load
-    const auto lo2 = Load(d, &lanes[0]);
-    HWY_ASSERT_VEC_EQ(d, lo2, lo);
-
-    // Aligned store
-    auto lanes2 = AllocateAligned<T>(2 * N);
-    Store(lo2, d, &lanes2[0]);
-    Store(hi, d, &lanes2[N]);
-    for (size_t i = 0; i < 2 * N; ++i) {
-      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
-    }
-
-    // Unaligned load
-    const auto vu = LoadU(d, &lanes[1]);
-    auto lanes3 = AllocateAligned<T>(N);
-    Store(vu, d, lanes3.get());
-    for (size_t i = 0; i < N; ++i) {
-      HWY_ASSERT_EQ(T(i + 2), lanes3[i]);
-    }
-
-    // Unaligned store
-    StoreU(lo2, d, &lanes2[N / 2]);
-    size_t i = 0;
-    for (; i < N / 2; ++i) {
-      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
-    }
-    for (; i < 3 * N / 2; ++i) {
-      HWY_ASSERT_EQ(T(i - N / 2 + 1), lanes2[i]);
-    }
-    // Subsequent values remain unchanged.
-    for (; i < 2 * N; ++i) {
-      HWY_ASSERT_EQ(T(i + 1), lanes2[i]);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllLoadStore() {
-  ForAllTypes(ForPartialVectors<TestLoadStore>());
-}
-
-struct TestSafeCopyN {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const auto v = Iota(d, 1);
-    auto from = AllocateAligned<T>(N + 2);
-    auto to = AllocateAligned<T>(N + 2);
-    Store(v, d, from.get());
-
-    // 0: nothing changes
-    to[0] = T();
-    SafeCopyN(0, d, from.get(), to.get());
-    HWY_ASSERT_EQ(T(), to[0]);
-
-    // 1: only first changes
-    to[1] = T();
-    SafeCopyN(1, d, from.get(), to.get());
-    HWY_ASSERT_EQ(static_cast<T>(1), to[0]);
-    HWY_ASSERT_EQ(T(), to[1]);
-
-    // N-1: last does not change
-    to[N - 1] = T();
-    SafeCopyN(N - 1, d, from.get(), to.get());
-    HWY_ASSERT_EQ(T(), to[N - 1]);
-    // Also check preceding lanes
-    to[N - 1] = static_cast<T>(N);
-    HWY_ASSERT_VEC_EQ(d, to.get(), v);
-
-    // N: all change
-    to[N] = T();
-    SafeCopyN(N, d, from.get(), to.get());
-    HWY_ASSERT_VEC_EQ(d, to.get(), v);
-    HWY_ASSERT_EQ(T(), to[N]);
-
-    // N+1: subsequent lane does not change if using masked store
-    to[N + 1] = T();
-    SafeCopyN(N + 1, d, from.get(), to.get());
-    HWY_ASSERT_VEC_EQ(d, to.get(), v);
-#if !HWY_MEM_OPS_MIGHT_FAULT
-    HWY_ASSERT_EQ(T(), to[N + 1]);
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllSafeCopyN() {
-  ForAllTypes(ForPartialVectors<TestSafeCopyN>());
-}
-
-struct TestLoadDup128 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    // Scalar does not define LoadDup128.
-#if HWY_TARGET != HWY_SCALAR || HWY_IDE
-    constexpr size_t N128 = 16 / sizeof(T);
-    alignas(16) T lanes[N128];
-    for (size_t i = 0; i < N128; ++i) {
-      lanes[i] = static_cast<T>(1 + i);
-    }
-
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>(i % N128 + 1);
-    }
-
-    HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes));
-#else
-    (void)d;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllLoadDup128() {
-  ForAllTypes(ForGEVectors<128, TestLoadDup128>());
-}
-
-struct TestStream {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Iota(d, T(1));
-    const size_t affected_bytes =
-        (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
-        ~size_t(HWY_STREAM_MULTIPLE - 1);
-    const size_t affected_lanes = affected_bytes / sizeof(T);
-    auto out = AllocateAligned<T>(2 * affected_lanes);
-    std::fill(out.get(), out.get() + 2 * affected_lanes, T(0));
-
-    Stream(v, d, out.get());
-    FlushStream();
-    const auto actual = Load(d, out.get());
-    HWY_ASSERT_VEC_EQ(d, v, actual);
-    // Ensure Stream didn't modify more memory than expected
-    for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) {
-      HWY_ASSERT_EQ(T(0), out[i]);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllStream() {
-  const ForPartialVectors<TestStream> test;
-  // No u8,u16.
-  test(uint32_t());
-  test(uint64_t());
-  // No i8,i16.
-  test(int32_t());
-  test(int64_t());
-  ForFloatTypes(test);
-}
-
-// Assumes little-endian byte order!
-struct TestScatter {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using Offset = MakeSigned<T>;
-
-    const size_t N = Lanes(d);
-    const size_t range = 4 * N;                  // number of items to scatter
-    const size_t max_bytes = range * sizeof(T);  // upper bound on offset
-
-    RandomState rng;
-
-    // Data to be scattered
-    auto bytes = AllocateAligned<uint8_t>(max_bytes);
-    for (size_t i = 0; i < max_bytes; ++i) {
-      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
-    }
-    const auto data = Load(d, reinterpret_cast<const T*>(bytes.get()));
-
-    // Scatter into these regions, ensure vector results match scalar
-    auto expected = AllocateAligned<T>(range);
-    auto actual = AllocateAligned<T>(range);
-
-    const Rebind<Offset, D> d_offsets;
-    auto offsets = AllocateAligned<Offset>(N);  // or indices
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      // Byte offsets
-      std::fill(expected.get(), expected.get() + range, T(0));
-      std::fill(actual.get(), actual.get() + range, T(0));
-      for (size_t i = 0; i < N; ++i) {
-        // Must be aligned
-        offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
-        CopyBytes<sizeof(T)>(
-            bytes.get() + i * sizeof(T),
-            reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
-      }
-      const auto voffsets = Load(d_offsets, offsets.get());
-      ScatterOffset(data, d, actual.get(), voffsets);
-      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
-        Print(d, "Data", data);
-        Print(d_offsets, "Offsets", voffsets);
-        HWY_ASSERT(false);
-      }
-
-      // Indices
-      std::fill(expected.get(), expected.get() + range, T(0));
-      std::fill(actual.get(), actual.get() + range, T(0));
-      for (size_t i = 0; i < N; ++i) {
-        offsets[i] = static_cast<Offset>(Random32(&rng) % range);
-        CopyBytes<sizeof(T)>(bytes.get() + i * sizeof(T),
-                             &expected[size_t(offsets[i])]);
-      }
-      const auto vindices = Load(d_offsets, offsets.get());
-      ScatterIndex(data, d, actual.get(), vindices);
-      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
-        Print(d, "Data", data);
-        Print(d_offsets, "Indices", vindices);
-        HWY_ASSERT(false);
-      }
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllScatter() {
-  ForUIF3264(ForPartialVectors<TestScatter>());
-}
-
-struct TestGather {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using Offset = MakeSigned<T>;
-
-    const size_t N = Lanes(d);
-    const size_t range = 4 * N;                  // number of items to gather
-    const size_t max_bytes = range * sizeof(T);  // upper bound on offset
-
-    RandomState rng;
-
-    // Data to be gathered from
-    auto bytes = AllocateAligned<uint8_t>(max_bytes);
-    for (size_t i = 0; i < max_bytes; ++i) {
-      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
-    }
-
-    auto expected = AllocateAligned<T>(N);
-    auto offsets = AllocateAligned<Offset>(N);
-    auto indices = AllocateAligned<Offset>(N);
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      // Offsets
-      for (size_t i = 0; i < N; ++i) {
-        // Must be aligned
-        offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
-        CopyBytes<sizeof(T)>(bytes.get() + offsets[i], &expected[i]);
-      }
-
-      const Rebind<Offset, D> d_offset;
-      const T* base = reinterpret_cast<const T*>(bytes.get());
-      auto actual = GatherOffset(d, base, Load(d_offset, offsets.get()));
-      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
-
-      // Indices
-      for (size_t i = 0; i < N; ++i) {
-        indices[i] =
-            static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T)));
-        CopyBytes<sizeof(T)>(base + indices[i], &expected[i]);
-      }
-      actual = GatherIndex(d, base, Load(d_offset, indices.get()));
-      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllGather() {
-  ForUIF3264(ForPartialVectors<TestGather>());
-}
-
-HWY_NOINLINE void TestAllCache() {
-  LoadFence();
-  FlushStream();
-  int test = 0;
-  Prefetch(&test);
-  FlushCacheline(&test);
-  Pause();
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyMemoryTest);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
-HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/mul_test.cc b/third_party/highway/hwy/tests/mul_test.cc
deleted file mode 100644 (file)
index f04bf0c..0000000
+++ /dev/null
@@ -1,436 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/mul_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestUnsignedMul {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    const auto v1 = Set(d, T(1));
-    const auto vi = Iota(d, 1);
-    const auto vj = Iota(d, 3);
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
-    HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((1 + i) * (1 + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((1 + i) * (3 + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vj));
-
-    const T max = LimitsMax<T>();
-    const auto vmax = Set(d, max);
-    HWY_ASSERT_VEC_EQ(d, vmax, Mul(vmax, v1));
-    HWY_ASSERT_VEC_EQ(d, vmax, Mul(v1, vmax));
-
-    const size_t bits = sizeof(T) * 8;
-    const uint64_t mask = bits==64 ? (~uint64_t{0}) : (1ull << bits) - 1;
-    const T max2 = (static_cast<uint64_t>(max) * max) & mask;
-    HWY_ASSERT_VEC_EQ(d, Set(d, max2), Mul(vmax, vmax));
-  }
-};
-
-struct TestSignedMul {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    const auto v0 = Zero(d);
-    const auto v1 = Set(d, T(1));
-    const auto vi = Iota(d, 1);
-    const auto vn = Iota(d, -T(N));  // no i8 supported, so no wraparound
-    HWY_ASSERT_VEC_EQ(d, v0, Mul(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v1, Mul(v1, v1));
-    HWY_ASSERT_VEC_EQ(d, vi, Mul(v1, vi));
-    HWY_ASSERT_VEC_EQ(d, vi, Mul(vi, v1));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((1 + i) * (1 + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vi));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((-T(N) + T(i)) * T(1u + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vn, vi));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Mul(vi, vn));
-  }
-};
-
-HWY_NOINLINE void TestAllMul() {
-  const ForPartialVectors<TestUnsignedMul> test_unsigned;
-  // No u8.
-  test_unsigned(uint16_t());
-  test_unsigned(uint32_t());
-  test_unsigned(uint64_t());
-
-  const ForPartialVectors<TestSignedMul> test_signed;
-  // No i8.
-  test_signed(int16_t());
-  test_signed(int32_t());
-  test_signed(int64_t());
-}
-
-struct TestMulHigh {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using Wide = MakeWide<T>;
-    const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-    auto expected_lanes = AllocateAligned<T>(N);
-
-    const auto vi = Iota(d, 1);
-    // no i8 supported, so no wraparound
-    const auto vni = Iota(d, T(static_cast<T>(~N + 1)));
-
-    const auto v0 = Zero(d);
-    HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, MulHigh(v0, vi));
-    HWY_ASSERT_VEC_EQ(d, v0, MulHigh(vi, v0));
-
-    // Large positive squared
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = T(LimitsMax<T>() >> i);
-      expected_lanes[i] = T((Wide(in_lanes[i]) * in_lanes[i]) >> 16);
-    }
-    auto v = Load(d, in_lanes.get());
-    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, v));
-
-    // Large positive * small positive
-    for (size_t i = 0; i < N; ++i) {
-      expected_lanes[i] = T((Wide(in_lanes[i]) * T(1u + i)) >> 16);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vi));
-    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vi, v));
-
-    // Large positive * small negative
-    for (size_t i = 0; i < N; ++i) {
-      expected_lanes[i] = T((Wide(in_lanes[i]) * T(i - N)) >> 16);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(v, vni));
-    HWY_ASSERT_VEC_EQ(d, expected_lanes.get(), MulHigh(vni, v));
-  }
-};
-
-HWY_NOINLINE void TestAllMulHigh() {
-  ForPartialVectors<TestMulHigh> test;
-  test(int16_t());
-  test(uint16_t());
-}
-
-struct TestMulFixedPoint15 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v0 = Zero(d);
-    HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, v0, MulFixedPoint15(v0, v0));
-
-    const size_t N = Lanes(d);
-    auto in1 = AllocateAligned<T>(N);
-    auto in2 = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<T>(N);
-
-    // Random inputs in each lane
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(10000); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in1[i] = static_cast<T>(Random64(&rng) & 0xFFFF);
-        in2[i] = static_cast<T>(Random64(&rng) & 0xFFFF);
-      }
-
-      for (size_t i = 0; i < N; ++i) {
-        // There are three ways to compute the results. x86 and ARM are defined
-        // using 32-bit multiplication results:
-        const int arm = (2 * in1[i] * in2[i] + 0x8000) >> 16;
-        const int x86 = (((in1[i] * in2[i]) >> 14) + 1) >> 1;
-        // On other platforms, split the result into upper and lower 16 bits.
-        const auto v1 = Set(d, in1[i]);
-        const auto v2 = Set(d, in2[i]);
-        const int hi = GetLane(MulHigh(v1, v2));
-        const int lo = GetLane(Mul(v1, v2)) & 0xFFFF;
-        const int split = 2 * hi + ((lo + 0x4000) >> 15);
-        expected[i] = static_cast<T>(arm);
-        if (in1[i] != -32768 || in2[i] != -32768) {
-          HWY_ASSERT_EQ(arm, x86);
-          HWY_ASSERT_EQ(arm, split);
-        }
-      }
-
-      const auto a = Load(d, in1.get());
-      const auto b = Load(d, in2.get());
-      HWY_ASSERT_VEC_EQ(d, expected.get(), MulFixedPoint15(a, b));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllMulFixedPoint15() {
-  ForPartialVectors<TestMulFixedPoint15>()(int16_t());
-}
-
-struct TestMulEven {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using Wide = MakeWide<T>;
-    const Repartition<Wide, D> d2;
-    const auto v0 = Zero(d);
-    HWY_ASSERT_VEC_EQ(d2, Zero(d2), MulEven(v0, v0));
-
-    const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-    auto expected = AllocateAligned<Wide>(Lanes(d2));
-    for (size_t i = 0; i < N; i += 2) {
-      in_lanes[i + 0] = LimitsMax<T>() >> i;
-      if (N != 1) {
-        in_lanes[i + 1] = 1;  // unused
-      }
-      expected[i / 2] = Wide(in_lanes[i + 0]) * in_lanes[i + 0];
-    }
-
-    const auto v = Load(d, in_lanes.get());
-    HWY_ASSERT_VEC_EQ(d2, expected.get(), MulEven(v, v));
-  }
-};
-
-struct TestMulEvenOdd64 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-#if HWY_TARGET != HWY_SCALAR
-    const auto v0 = Zero(d);
-    HWY_ASSERT_VEC_EQ(d, Zero(d), MulEven(v0, v0));
-    HWY_ASSERT_VEC_EQ(d, Zero(d), MulOdd(v0, v0));
-
-    const size_t N = Lanes(d);
-    if (N == 1) return;
-
-    auto in1 = AllocateAligned<T>(N);
-    auto in2 = AllocateAligned<T>(N);
-    auto expected_even = AllocateAligned<T>(N);
-    auto expected_odd = AllocateAligned<T>(N);
-
-    // Random inputs in each lane
-    RandomState rng;
-    for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in1[i] = Random64(&rng);
-        in2[i] = Random64(&rng);
-      }
-
-      for (size_t i = 0; i < N; i += 2) {
-        expected_even[i] = Mul128(in1[i], in2[i], &expected_even[i + 1]);
-        expected_odd[i] = Mul128(in1[i + 1], in2[i + 1], &expected_odd[i + 1]);
-      }
-
-      const auto a = Load(d, in1.get());
-      const auto b = Load(d, in2.get());
-      HWY_ASSERT_VEC_EQ(d, expected_even.get(), MulEven(a, b));
-      HWY_ASSERT_VEC_EQ(d, expected_odd.get(), MulOdd(a, b));
-    }
-#else
-    (void)d;
-#endif  // HWY_TARGET != HWY_SCALAR
-  }
-};
-
-HWY_NOINLINE void TestAllMulEven() {
-  ForGEVectors<64, TestMulEven> test;
-  test(int32_t());
-  test(uint32_t());
-
-  ForGEVectors<128, TestMulEvenOdd64>()(uint64_t());
-}
-
-#ifndef HWY_NATIVE_FMA
-#error "Bug in set_macros-inl.h, did not set HWY_NATIVE_FMA"
-#endif
-
-struct TestMulAdd {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto k0 = Zero(d);
-    const auto kNeg0 = Set(d, T(-0.0));
-    const auto v1 = Iota(d, 1);
-    const auto v2 = Iota(d, 2);
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    HWY_ASSERT_VEC_EQ(d, k0, MulAdd(k0, k0, k0));
-    HWY_ASSERT_VEC_EQ(d, v2, MulAdd(k0, v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v2, MulAdd(v1, k0, v2));
-    HWY_ASSERT_VEC_EQ(d, k0, NegMulAdd(k0, k0, k0));
-    HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(k0, v1, v2));
-    HWY_ASSERT_VEC_EQ(d, v2, NegMulAdd(v1, k0, v2));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((i + 1) * (i + 2));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v1, k0));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v1, v2, k0));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v1, k0));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v1, Neg(v2), k0));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((i + 2) * (i + 2) + (i + 1));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulAdd(v2, v2, v1));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(Neg(v2), v2, v1));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] =
-          T(-T(i + 2u) * static_cast<T>(i + 2) + static_cast<T>(1 + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulAdd(v2, v2, v1));
-
-    HWY_ASSERT_VEC_EQ(d, k0, MulSub(k0, k0, k0));
-    HWY_ASSERT_VEC_EQ(d, kNeg0, NegMulSub(k0, k0, k0));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = -T(i + 2);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(k0, v1, v2));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, k0, v2));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(k0), v1, v2));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v1, Neg(k0), v2));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((i + 1) * (i + 2));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v1, v2, k0));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v1, k0));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v1), v2, k0));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(v2, Neg(v1), k0));
-
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((i + 2) * (i + 2) - (1 + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), MulSub(v2, v2, v1));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), NegMulSub(Neg(v2), v2, v1));
-  }
-};
-
-HWY_NOINLINE void TestAllMulAdd() {
-  ForFloatTypes(ForPartialVectors<TestMulAdd>());
-}
-
-struct TestReorderWidenMulAccumulate {
-  template <typename TN, class DN>
-  HWY_NOINLINE void operator()(TN /*unused*/, DN dn) {
-    using TW = MakeWide<TN>;
-    const RepartitionToWide<DN> dw;
-    const auto f0 = Zero(dw);
-    const auto f1 = Set(dw, 1.0f);
-    const auto fi = Iota(dw, 1);
-    const auto bf0 = ReorderDemote2To(dn, f0, f0);
-    const auto bf1 = ReorderDemote2To(dn, f1, f1);
-    const auto bfi = ReorderDemote2To(dn, fi, fi);
-    const size_t NW = Lanes(dw);
-    auto delta = AllocateAligned<TW>(2 * NW);
-    for (size_t i = 0; i < 2 * NW; ++i) {
-      delta[i] = 0.0f;
-    }
-
-    // Any input zero => both outputs zero
-    auto sum1 = f0;
-    HWY_ASSERT_VEC_EQ(dw, f0,
-                      ReorderWidenMulAccumulate(dw, bf0, bf0, f0, sum1));
-    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
-    HWY_ASSERT_VEC_EQ(dw, f0,
-                      ReorderWidenMulAccumulate(dw, bf0, bfi, f0, sum1));
-    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
-    HWY_ASSERT_VEC_EQ(dw, f0,
-                      ReorderWidenMulAccumulate(dw, bfi, bf0, f0, sum1));
-    HWY_ASSERT_VEC_EQ(dw, f0, sum1);
-
-    // delta[p] := 1.0, all others zero. For each p: Dot(delta, all-ones) == 1.
-    for (size_t p = 0; p < 2 * NW; ++p) {
-      delta[p] = 1.0f;
-      const auto delta0 = Load(dw, delta.get() + 0);
-      const auto delta1 = Load(dw, delta.get() + NW);
-      delta[p] = 0.0f;
-      const auto bf_delta = ReorderDemote2To(dn, delta0, delta1);
-
-      {
-        sum1 = f0;
-        const auto sum0 =
-            ReorderWidenMulAccumulate(dw, bf_delta, bf1, f0, sum1);
-        HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
-      }
-      // Swapped arg order
-      {
-        sum1 = f0;
-        const auto sum0 =
-            ReorderWidenMulAccumulate(dw, bf1, bf_delta, f0, sum1);
-        HWY_ASSERT_EQ(1.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
-      }
-      // Start with nonzero sum0 or sum1
-      {
-        sum1 = delta1;
-        const auto sum0 =
-            ReorderWidenMulAccumulate(dw, bf_delta, bf1, delta0, sum1);
-        HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
-      }
-      // Start with nonzero sum0 or sum1, and swap arg order
-      {
-        sum1 = delta1;
-        const auto sum0 =
-            ReorderWidenMulAccumulate(dw, bf1, bf_delta, delta0, sum1);
-        HWY_ASSERT_EQ(2.0f, GetLane(SumOfLanes(dw, Add(sum0, sum1))));
-      }
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllReorderWidenMulAccumulate() {
-  ForShrinkableVectors<TestReorderWidenMulAccumulate>()(bfloat16_t());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyMulTest);
-HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMul);
-HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulHigh);
-HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulFixedPoint15);
-HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulEven);
-HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllMulAdd);
-HWY_EXPORT_AND_TEST_P(HwyMulTest, TestAllReorderWidenMulAccumulate);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/reduction_test.cc b/third_party/highway/hwy/tests/reduction_test.cc
deleted file mode 100644 (file)
index 981e3f3..0000000
+++ /dev/null
@@ -1,228 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/reduction_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestSumOfLanes {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-
-    // Lane i = bit i, higher lanes 0
-    double sum = 0.0;
-    // Avoid setting sign bit and cap at double precision
-    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
-      sum += static_cast<double>(in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)),
-                      SumOfLanes(d, Load(d, in_lanes.get())));
-
-    // Lane i = i (iota) to include upper lanes
-    sum = 0.0;
-    for (size_t i = 0; i < N; ++i) {
-      sum += static_cast<double>(i);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, T(sum)), SumOfLanes(d, Iota(d, 0)));
-  }
-};
-
-HWY_NOINLINE void TestAllSumOfLanes() {
-  ForUIF3264(ForPartialVectors<TestSumOfLanes>());
-}
-
-struct TestMinOfLanes {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-
-    // Lane i = bit i, higher lanes = 2 (not the minimum)
-    T min = HighestValue<T>();
-    // Avoid setting sign bit and cap at double precision
-    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 2;
-      min = HWY_MIN(min, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
-
-    // Lane i = N - i to include upper lanes
-    min = HighestValue<T>();
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = static_cast<T>(N - i);  // no 8-bit T so no wraparound
-      min = HWY_MIN(min, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
-
-    // Bug #910: also check negative values
-    min = HighestValue<T>();
-    const T input_copy[] = {static_cast<T>(-1),
-                            static_cast<T>(-2),
-                            1,
-                            2,
-                            3,
-                            4,
-                            5,
-                            6,
-                            7,
-                            8,
-                            9,
-                            10,
-                            11,
-                            12,
-                            13,
-                            14};
-    size_t i = 0;
-    for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
-      in_lanes[i] = input_copy[i];
-      min = HWY_MIN(min, input_copy[i]);
-    }
-    // Pad with neutral element to full vector (so we can load)
-    for (; i < N; ++i) {
-      in_lanes[i] = min;
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get())));
-  }
-};
-
-struct TestMaxOfLanes {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto in_lanes = AllocateAligned<T>(N);
-
-    T max = LowestValue<T>();
-    // Avoid setting sign bit and cap at double precision
-    constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51);
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = i < kBits ? static_cast<T>(1ull << i) : 0;
-      max = HWY_MAX(max, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
-
-    // Lane i = i to include upper lanes
-    max = LowestValue<T>();
-    for (size_t i = 0; i < N; ++i) {
-      in_lanes[i] = static_cast<T>(i);  // no 8-bit T so no wraparound
-      max = HWY_MAX(max, in_lanes[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
-
-    // Bug #910: also check negative values
-    max = LowestValue<T>();
-    const T input_copy[] = {static_cast<T>(-1),
-                            static_cast<T>(-2),
-                            1,
-                            2,
-                            3,
-                            4,
-                            5,
-                            6,
-                            7,
-                            8,
-                            9,
-                            10,
-                            11,
-                            12,
-                            13,
-                            14};
-    size_t i = 0;
-    for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) {
-      in_lanes[i] = input_copy[i];
-      max = HWY_MAX(max, in_lanes[i]);
-    }
-    // Pad with neutral element to full vector (so we can load)
-    for (; i < N; ++i) {
-      in_lanes[i] = max;
-    }
-    HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get())));
-  }
-};
-
-HWY_NOINLINE void TestAllMinMaxOfLanes() {
-  const ForPartialVectors<TestMinOfLanes> test_min;
-  const ForPartialVectors<TestMaxOfLanes> test_max;
-  ForUIF3264(test_min);
-  ForUIF3264(test_max);
-  test_min(uint16_t());
-  test_max(uint16_t());
-  test_min(int16_t());
-  test_max(int16_t());
-}
-
-struct TestSumsOf8 {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    RandomState rng;
-
-    const size_t N = Lanes(d);
-    if (N < 8) return;
-    const Repartition<uint64_t, D> du64;
-
-    auto in_lanes = AllocateAligned<T>(N);
-    auto sum_lanes = AllocateAligned<uint64_t>(N / 8);
-
-    for (size_t rep = 0; rep < 100; ++rep) {
-      for (size_t i = 0; i < N; ++i) {
-        in_lanes[i] = Random64(&rng) & 0xFF;
-      }
-
-      for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) {
-        uint64_t sum = 0;
-        for (size_t i = 0; i < 8; ++i) {
-          sum += in_lanes[idx_sum * 8 + i];
-        }
-        sum_lanes[idx_sum] = sum;
-      }
-
-      const Vec<D> in = Load(d, in_lanes.get());
-      HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in));
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllSumsOf8() {
-  ForGEVectors<64, TestSumsOf8>()(uint8_t());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyReductionTest);
-HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes);
-HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes);
-HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/reverse_test.cc b/third_party/highway/hwy/tests/reverse_test.cc
deleted file mode 100644 (file)
index fcbcb7f..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright 2022 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/reverse_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestReverse {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
-    const auto v = BitCast(d, Iota(du, 1));
-    auto expected = AllocateAligned<T>(N);
-
-    // Can't set float16_t value directly, need to permute in memory.
-    auto copy = AllocateAligned<T>(N);
-    Store(v, d, copy.get());
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = copy[N - 1 - i];
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse(d, v));
-  }
-};
-
-struct TestReverse2 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
-    const auto v = BitCast(d, Iota(du, 1));
-    auto expected = AllocateAligned<T>(N);
-
-    // Can't set float16_t value directly, need to permute in memory.
-    auto copy = AllocateAligned<T>(N);
-    Store(v, d, copy.get());
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = copy[i ^ 1];
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v));
-  }
-};
-
-struct TestReverse4 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
-    const auto v = BitCast(d, Iota(du, 1));
-    auto expected = AllocateAligned<T>(N);
-
-    // Can't set float16_t value directly, need to permute in memory.
-    auto copy = AllocateAligned<T>(N);
-    Store(v, d, copy.get());
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = copy[i ^ 3];
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v));
-  }
-};
-
-struct TestReverse8 {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
-    const auto v = BitCast(d, Iota(du, 1));
-    auto expected = AllocateAligned<T>(N);
-
-    // Can't set float16_t value directly, need to permute in memory.
-    auto copy = AllocateAligned<T>(N);
-    Store(v, d, copy.get());
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = copy[i ^ 7];
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v));
-  }
-};
-
-HWY_NOINLINE void TestAllReverse() {
-  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
-  // which requires 16 bits.
-  ForUIF163264(ForPartialVectors<TestReverse>());
-}
-
-HWY_NOINLINE void TestAllReverse2() {
-  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
-  // which requires 16 bits.
-  ForUIF64(ForGEVectors<128, TestReverse2>());
-  ForUIF32(ForGEVectors<64, TestReverse2>());
-  ForUIF16(ForGEVectors<32, TestReverse2>());
-}
-
-HWY_NOINLINE void TestAllReverse4() {
-  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
-  // which requires 16 bits.
-  ForUIF64(ForGEVectors<256, TestReverse4>());
-  ForUIF32(ForGEVectors<128, TestReverse4>());
-  ForUIF16(ForGEVectors<64, TestReverse4>());
-}
-
-HWY_NOINLINE void TestAllReverse8() {
-  // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
-  // which requires 16 bits.
-  ForUIF64(ForGEVectors<512, TestReverse8>());
-  ForUIF32(ForGEVectors<256, TestReverse8>());
-  ForUIF16(ForGEVectors<128, TestReverse8>());
-}
-
-struct TestReverseBlocks {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const RebindToUnsigned<D> du;  // Iota does not support float16_t.
-    const auto v = BitCast(d, Iota(du, 1));
-    auto expected = AllocateAligned<T>(N);
-
-    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
-    const size_t num_blocks = N / kLanesPerBlock;
-    HWY_ASSERT(num_blocks != 0);
-
-    // Can't set float16_t value directly, need to permute in memory.
-    auto copy = AllocateAligned<T>(N);
-    Store(v, d, copy.get());
-    for (size_t i = 0; i < N; ++i) {
-      const size_t idx_block = i / kLanesPerBlock;
-      const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock;
-      expected[i] = copy[base + (i % kLanesPerBlock)];
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v));
-  }
-};
-
-HWY_NOINLINE void TestAllReverseBlocks() {
-  ForAllTypes(ForGEVectors<128, TestReverseBlocks>());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyReverseTest);
-HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse);
-HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse2);
-HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse4);
-HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverse8);
-HWY_EXPORT_AND_TEST_P(HwyReverseTest, TestAllReverseBlocks);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/shift_test.cc b/third_party/highway/hwy/tests/shift_test.cc
deleted file mode 100644 (file)
index 585eba7..0000000
+++ /dev/null
@@ -1,428 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <algorithm>
-#include <limits>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/shift_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-template <bool kSigned>
-struct TestLeftShifts {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    if (kSigned) {
-      // Also test positive values
-      TestLeftShifts</*kSigned=*/false>()(t, d);
-    }
-
-    using TI = MakeSigned<T>;
-    using TU = MakeUnsigned<T>;
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    // Values to shift
-    const auto values = Iota(d, static_cast<T>(kSigned ? -TI(N) : TI(0)));
-    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
-
-    // 0
-    HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values));
-    HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0));
-
-    // 1
-    for (size_t i = 0; i < N; ++i) {
-      const T value = kSigned ? T(T(i) - T(N)) : T(i);
-      expected[i] = T(TU(value) << 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1));
-
-    // max
-    for (size_t i = 0; i < N; ++i) {
-      const T value = kSigned ? T(T(i) - T(N)) : T(i);
-      expected[i] = T(TU(value) << kMaxShift);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift));
-  }
-};
-
-template <bool kSigned>
-struct TestVariableLeftShifts {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    if (kSigned) {
-      // Also test positive values
-      TestVariableLeftShifts</*kSigned=*/false>()(t, d);
-    }
-
-    using TI = MakeSigned<T>;
-    using TU = MakeUnsigned<T>;
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    const auto v0 = Zero(d);
-    const auto v1 = Set(d, 1);
-    const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
-
-    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
-    const auto max_shift = Set(d, kMaxShift);
-    const auto small_shifts = And(Iota(d, 0), max_shift);
-    const auto large_shifts = max_shift - small_shifts;
-
-    // Same: 0
-    HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0));
-
-    // Same: 1
-    for (size_t i = 0; i < N; ++i) {
-      const T value = kSigned ? T(i) - T(N) : T(i);
-      expected[i] = T(TU(value) << 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1));
-
-    // Same: max
-    for (size_t i = 0; i < N; ++i) {
-      const T value = kSigned ? T(i) - T(N) : T(i);
-      expected[i] = T(TU(value) << kMaxShift);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift));
-
-    // Variable: small
-    for (size_t i = 0; i < N; ++i) {
-      const T value = kSigned ? T(i) - T(N) : T(i);
-      expected[i] = T(TU(value) << (i & kMaxShift));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts));
-
-    // Variable: large
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift)));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts));
-  }
-};
-
-struct TestUnsignedRightShifts {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    const auto values = Iota(d, 0);
-
-    const T kMax = LimitsMax<T>();
-    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
-
-    // Shift by 0
-    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
-    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
-
-    // Shift by 1
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(T(i & kMax) >> 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
-
-    // max
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(T(i & kMax) >> kMaxShift);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift));
-  }
-};
-
-struct TestRotateRight {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    constexpr size_t kBits = sizeof(T) * 8;
-    const auto mask_shift = Set(d, T{kBits});
-    // Cover as many bit positions as possible to test shifting out
-    const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift));
-
-    // Rotate by 0
-    HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values));
-
-    // Rotate by 1
-    Store(values, d, expected.get());
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values));
-
-    // Rotate by half
-    Store(values, d, expected.get());
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values));
-
-    // Rotate by max
-    Store(values, d, expected.get());
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values));
-  }
-};
-
-struct TestVariableUnsignedRightShifts {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    const auto v0 = Zero(d);
-    const auto v1 = Set(d, 1);
-    const auto values = Iota(d, 0);
-
-    const T kMax = LimitsMax<T>();
-    const auto max = Set(d, kMax);
-
-    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
-    const auto max_shift = Set(d, kMaxShift);
-    const auto small_shifts = And(Iota(d, 0), max_shift);
-    const auto large_shifts = max_shift - small_shifts;
-
-    // Same: 0
-    HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0));
-
-    // Same: 1
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(T(i & kMax) >> 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1));
-
-    // Same: max
-    HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift));
-
-    // Variable: small
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(i) >> (i & kMaxShift);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts));
-
-    // Variable: Large
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = kMax >> (kMaxShift - (i & kMaxShift));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts));
-  }
-};
-
-template <int kAmount, typename T>
-T RightShiftNegative(T val) {
-  // C++ shifts are implementation-defined for negative numbers, and we have
-  // seen divisions replaced with shifts, so resort to bit operations.
-  using TU = hwy::MakeUnsigned<T>;
-  TU bits;
-  CopySameSize(&val, &bits);
-
-  const TU shifted = TU(bits >> kAmount);
-
-  const TU all = TU(~TU(0));
-  const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount;
-  const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>());
-
-  bits = shifted | sign_extended;
-  CopySameSize(&bits, &val);
-  return val;
-}
-
-class TestSignedRightShifts {
- public:
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    constexpr T kMin = LimitsMin<T>();
-    constexpr T kMax = LimitsMax<T>();
-    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
-
-    // First test positive values, negative are checked below.
-    const auto v0 = Zero(d);
-    const auto values = And(Iota(d, 0), Set(d, kMax));
-
-    // Shift by 0
-    HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values));
-    HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0));
-
-    // Shift by 1
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(T(i & kMax) >> 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1));
-
-    // max
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift));
-
-    // Even negative value
-    Test<0>(kMin, d, __LINE__);
-    Test<1>(kMin, d, __LINE__);
-    Test<2>(kMin, d, __LINE__);
-    Test<kMaxShift>(kMin, d, __LINE__);
-
-    const T odd = static_cast<T>(kMin + 1);
-    Test<0>(odd, d, __LINE__);
-    Test<1>(odd, d, __LINE__);
-    Test<2>(odd, d, __LINE__);
-    Test<kMaxShift>(odd, d, __LINE__);
-  }
-
- private:
-  template <int kAmount, typename T, class D>
-  void Test(T val, D d, int line) {
-    const auto expected = Set(d, RightShiftNegative<kAmount>(val));
-    const auto in = Set(d, val);
-    const char* file = __FILE__;
-    AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line);
-    AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line);
-  }
-};
-
-struct TestVariableSignedRightShifts {
-  template <typename T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using TU = MakeUnsigned<T>;
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-
-    constexpr T kMin = LimitsMin<T>();
-    constexpr T kMax = LimitsMax<T>();
-
-    constexpr size_t kMaxShift = (sizeof(T) * 8) - 1;
-
-    // First test positive values, negative are checked below.
-    const auto v0 = Zero(d);
-    const auto positive = Iota(d, 0) & Set(d, kMax);
-
-    // Shift by 0
-    HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive));
-    HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0));
-
-    // Shift by 1
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = T(T(i & kMax) >> 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive));
-    HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1));
-
-    // max
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive));
-    HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift));
-
-    const auto max_shift = Set(d, kMaxShift);
-    const auto small_shifts = And(Iota(d, 0), max_shift);
-    const auto large_shifts = max_shift - small_shifts;
-
-    const auto negative = Iota(d, kMin);
-
-    // Test varying negative to shift
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1)));
-
-    // Shift MSB right by small amounts
-    for (size_t i = 0; i < N; ++i) {
-      const size_t amount = i & kMaxShift;
-      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
-      CopySameSize(&shifted, &expected[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts));
-
-    // Shift MSB right by large amounts
-    for (size_t i = 0; i < N; ++i) {
-      const size_t amount = kMaxShift - (i & kMaxShift);
-      const TU shifted = ~((1ull << (kMaxShift - amount)) - 1);
-      CopySameSize(&shifted, &expected[i]);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts));
-  }
-};
-
-HWY_NOINLINE void TestAllShifts() {
-  ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>());
-  ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>());
-  ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>());
-  ForSignedTypes(ForPartialVectors<TestSignedRightShifts>());
-}
-
-HWY_NOINLINE void TestAllVariableShifts() {
-  const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u;
-  const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s;
-  const ForPartialVectors<TestUnsignedRightShifts> shr_u;
-  const ForPartialVectors<TestSignedRightShifts> shr_s;
-
-  shl_u(uint16_t());
-  shr_u(uint16_t());
-
-  shl_u(uint32_t());
-  shr_u(uint32_t());
-
-  shl_s(int16_t());
-  shr_s(int16_t());
-
-  shl_s(int32_t());
-  shr_s(int32_t());
-
-#if HWY_HAVE_INTEGER64
-  shl_u(uint64_t());
-  shr_u(uint64_t());
-
-  shl_s(int64_t());
-  shr_s(int64_t());
-#endif
-}
-
-HWY_NOINLINE void TestAllRotateRight() {
-  const ForPartialVectors<TestRotateRight> test;
-  test(uint32_t());
-#if HWY_HAVE_INTEGER64
-  test(uint64_t());
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwyShiftTest);
-HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts);
-HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts);
-HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/swizzle_test.cc b/third_party/highway/hwy/tests/swizzle_test.cc
deleted file mode 100644 (file)
index f447f7a..0000000
+++ /dev/null
@@ -1,272 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <string.h>  // memset
-
-#include "hwy/base.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/swizzle_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestGetLane {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Iota(d, T(1));
-    HWY_ASSERT_EQ(T(1), GetLane(v));
-  }
-};
-
-HWY_NOINLINE void TestAllGetLane() {
-  ForAllTypes(ForPartialVectors<TestGetLane>());
-}
-
-struct TestExtractLane {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const auto v = Iota(d, T(1));
-    for (size_t i = 0; i < Lanes(d); ++i) {
-      const T actual = ExtractLane(v, i);
-      HWY_ASSERT_EQ(static_cast<T>(i + 1), actual);
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllExtractLane() {
-  ForAllTypes(ForPartialVectors<TestExtractLane>());
-}
-
-struct TestInsertLane {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    using V = Vec<D>;
-    const V v = Iota(d, T(1));
-    const size_t N = Lanes(d);
-    auto lanes = AllocateAligned<T>(N);
-    Store(v, d, lanes.get());
-
-    for (size_t i = 0; i < Lanes(d); ++i) {
-      lanes[i] = T{0};
-      const V actual = InsertLane(v, i, static_cast<T>(i + 1));
-      HWY_ASSERT_VEC_EQ(d, v, actual);
-      Store(v, d, lanes.get());  // restore lane i
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllInsertLane() {
-  ForAllTypes(ForPartialVectors<TestInsertLane>());
-}
-
-struct TestDupEven {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1)));
-  }
-};
-
-HWY_NOINLINE void TestAllDupEven() {
-  ForUIF3264(ForShrinkableVectors<TestDupEven>());
-}
-
-struct TestDupOdd {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-#if HWY_TARGET != HWY_SCALAR
-    const size_t N = Lanes(d);
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1)));
-#else
-    (void)d;
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllDupOdd() {
-  ForUIF3264(ForShrinkableVectors<TestDupOdd>());
-}
-
-struct TestOddEven {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const auto even = Iota(d, 1);
-    const auto odd = Iota(d, static_cast<T>(1 + N));
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEven(odd, even));
-  }
-};
-
-HWY_NOINLINE void TestAllOddEven() {
-  ForAllTypes(ForShrinkableVectors<TestOddEven>());
-}
-
-struct TestOddEvenBlocks {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    const auto even = Iota(d, 1);
-    const auto odd = Iota(d, static_cast<T>(1 + N));
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      const size_t idx_block = i / (16 / sizeof(T));
-      expected[i] = static_cast<T>(1 + i + ((idx_block & 1) ? N : 0));
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), OddEvenBlocks(odd, even));
-  }
-};
-
-HWY_NOINLINE void TestAllOddEvenBlocks() {
-  ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>());
-}
-
-struct TestSwapAdjacentBlocks {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const size_t N = Lanes(d);
-    constexpr size_t kLanesPerBlock = 16 / sizeof(T);
-    if (N < 2 * kLanesPerBlock) return;
-    const auto vi = Iota(d, 1);
-    auto expected = AllocateAligned<T>(N);
-    for (size_t i = 0; i < N; ++i) {
-      const size_t idx_block = i / kLanesPerBlock;
-      const size_t base = (idx_block ^ 1) * kLanesPerBlock;
-      const size_t mod = i % kLanesPerBlock;
-      expected[i] = static_cast<T>(1 + base + mod);
-    }
-    HWY_ASSERT_VEC_EQ(d, expected.get(), SwapAdjacentBlocks(vi));
-  }
-};
-
-HWY_NOINLINE void TestAllSwapAdjacentBlocks() {
-  ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>());
-}
-
-struct TestTableLookupLanes {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T /*unused*/, D d) {
-    const RebindToSigned<D> di;
-    using TI = TFromD<decltype(di)>;
-#if HWY_TARGET != HWY_SCALAR
-    const size_t N = Lanes(d);
-    auto idx = AllocateAligned<TI>(N);
-    memset(idx.get(), 0, N * sizeof(TI));
-    auto expected = AllocateAligned<T>(N);
-    const auto v = Iota(d, 1);
-
-    if (N <= 8) {  // Test all permutations
-      for (size_t i0 = 0; i0 < N; ++i0) {
-        idx[0] = static_cast<TI>(i0);
-
-        for (size_t i1 = 0; i1 < N; ++i1) {
-          if (N >= 2) idx[1] = static_cast<TI>(i1);
-          for (size_t i2 = 0; i2 < N; ++i2) {
-            if (N >= 4) idx[2] = static_cast<TI>(i2);
-            for (size_t i3 = 0; i3 < N; ++i3) {
-              if (N >= 4) idx[3] = static_cast<TI>(i3);
-
-              for (size_t i = 0; i < N; ++i) {
-                expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
-              }
-
-              const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
-              const auto actual1 = TableLookupLanes(v, opaque1);
-              HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
-
-              const auto opaque2 = SetTableIndices(d, idx.get());
-              const auto actual2 = TableLookupLanes(v, opaque2);
-              HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
-            }
-          }
-        }
-      }
-    } else {
-      // Too many permutations to test exhaustively; choose one with repeated
-      // and cross-block indices and ensure indices do not exceed #lanes.
-      // For larger vectors, upper lanes will be zero.
-      HWY_ALIGN TI idx_source[16] = {1,  3,  2,  2,  8, 1, 7, 6,
-                                     15, 14, 14, 15, 4, 9, 8, 5};
-      for (size_t i = 0; i < N; ++i) {
-        idx[i] = (i < 16) ? idx_source[i] : 0;
-        // Avoid undefined results / asan error for scalar by capping indices.
-        if (idx[i] >= static_cast<TI>(N)) {
-          idx[i] = static_cast<TI>(N - 1);
-        }
-        expected[i] = static_cast<T>(idx[i] + 1);  // == v[idx[i]]
-      }
-
-      const auto opaque1 = IndicesFromVec(d, Load(di, idx.get()));
-      const auto actual1 = TableLookupLanes(v, opaque1);
-      HWY_ASSERT_VEC_EQ(d, expected.get(), actual1);
-
-      const auto opaque2 = SetTableIndices(d, idx.get());
-      const auto actual2 = TableLookupLanes(v, opaque2);
-      HWY_ASSERT_VEC_EQ(d, expected.get(), actual2);
-    }
-#else
-    const TI index = 0;
-    const auto v = Set(d, 1);
-    const auto opaque1 = SetTableIndices(d, &index);
-    HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque1));
-    const auto opaque2 = IndicesFromVec(d, Zero(di));
-    HWY_ASSERT_VEC_EQ(d, v, TableLookupLanes(v, opaque2));
-#endif
-  }
-};
-
-HWY_NOINLINE void TestAllTableLookupLanes() {
-  ForUIF3264(ForPartialVectors<TestTableLookupLanes>());
-}
-
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(HwySwizzleTest);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllExtractLane);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllInsertLane);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks);
-HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/hwy/tests/test_util-inl.h b/third_party/highway/hwy/tests/test_util-inl.h
deleted file mode 100644 (file)
index d9c1aeb..0000000
+++ /dev/null
@@ -1,665 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Target-specific helper functions for use by *_test.cc.
-
-#include <stdint.h>
-
-#include "hwy/base.h"
-#include "hwy/tests/hwy_gtest.h"
-#include "hwy/tests/test_util.h"
-
-// After test_util (also includes highway.h)
-#include "hwy/print-inl.h"
-
-// Per-target include guard
-#if defined(HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_) == \
-    defined(HWY_TARGET_TOGGLE)
-#ifdef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
-#undef HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
-#else
-#define HIGHWAY_HWY_TESTS_TEST_UTIL_INL_H_
-#endif
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-// Compare expected vector to vector.
-// HWY_INLINE works around a Clang SVE compiler bug where all but the first
-// 128 bits (the NEON register) of actual are zero.
-template <class D, typename T = TFromD<D>, class V = Vec<D>>
-HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual,
-                               const char* filename, const int line) {
-  const size_t N = Lanes(d);
-  auto actual_lanes = AllocateAligned<T>(N);
-  Store(actual, d, actual_lanes.get());
-
-  const auto info = hwy::detail::MakeTypeInfo<T>();
-  const char* target_name = hwy::TargetName(HWY_TARGET);
-  hwy::detail::AssertArrayEqual(info, expected, actual_lanes.get(), N,
-                                target_name, filename, line);
-}
-
-// Compare expected lanes to vector.
-// HWY_INLINE works around a Clang SVE compiler bug where all but the first
-// 128 bits (the NEON register) of actual are zero.
-template <class D, typename T = TFromD<D>, class V = Vec<D>>
-HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual,
-                               const char* filename, int line) {
-  auto expected_lanes = AllocateAligned<T>(Lanes(d));
-  Store(expected, d, expected_lanes.get());
-  AssertVecEqual(d, expected_lanes.get(), actual, filename, line);
-}
-
-// Only checks the valid mask elements (those whose index < Lanes(d)).
-template <class D>
-HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b,
-                                  const char* filename, int line) {
-  // lvalues prevented MSAN failure in farm_sve.
-  const Vec<D> va = VecFromMask(d, a);
-  const Vec<D> vb = VecFromMask(d, b);
-  AssertVecEqual(d, va, vb, filename, line);
-
-  const char* target_name = hwy::TargetName(HWY_TARGET);
-  AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line);
-  AssertEqual(AllTrue(d, a), AllTrue(d, b), target_name, filename, line);
-  AssertEqual(AllFalse(d, a), AllFalse(d, b), target_name, filename, line);
-
-  const size_t N = Lanes(d);
-#if HWY_TARGET == HWY_SCALAR
-  const Rebind<uint8_t, D> d8;
-#else
-  const Repartition<uint8_t, D> d8;
-#endif
-  const size_t N8 = Lanes(d8);
-  auto bits_a = AllocateAligned<uint8_t>(HWY_MAX(8, N8));
-  auto bits_b = AllocateAligned<uint8_t>(HWY_MAX(8, N8));
-  memset(bits_a.get(), 0, N8);
-  memset(bits_b.get(), 0, N8);
-  const size_t num_bytes_a = StoreMaskBits(d, a, bits_a.get());
-  const size_t num_bytes_b = StoreMaskBits(d, b, bits_b.get());
-  AssertEqual(num_bytes_a, num_bytes_b, target_name, filename, line);
-  size_t i = 0;
-  // First check whole bytes (if that many elements are still valid)
-  for (; i < N / 8; ++i) {
-    if (bits_a[i] != bits_b[i]) {
-      fprintf(stderr, "Mismatch in byte %d: %d != %d\n", static_cast<int>(i),
-              bits_a[i], bits_b[i]);
-      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
-      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
-      hwy::Abort(filename, line, "Masks not equal");
-    }
-  }
-  // Then the valid bit(s) in the last byte.
-  const size_t remainder = N % 8;
-  if (remainder != 0) {
-    const int mask = (1 << remainder) - 1;
-    const int valid_a = bits_a[i] & mask;
-    const int valid_b = bits_b[i] & mask;
-    if (valid_a != valid_b) {
-      fprintf(stderr, "Mismatch in last byte %d: %d != %d\n",
-              static_cast<int>(i), valid_a, valid_b);
-      Print(d8, "expect", Load(d8, bits_a.get()), 0, N8);
-      Print(d8, "actual", Load(d8, bits_b.get()), 0, N8);
-      hwy::Abort(filename, line, "Masks not equal");
-    }
-  }
-}
-
-// Only sets valid elements (those whose index < Lanes(d)). This helps catch
-// tests that are not masking off the (undefined) upper mask elements.
-//
-// TODO(janwas): with HWY_NOINLINE GCC zeros the upper half of AVX2 masks.
-template <class D>
-HWY_INLINE Mask<D> MaskTrue(const D d) {
-  return FirstN(d, Lanes(d));
-}
-
-template <class D>
-HWY_INLINE Mask<D> MaskFalse(const D d) {
-  const auto zero = Zero(RebindToSigned<D>());
-  return RebindMask(d, Lt(zero, zero));
-}
-
-#ifndef HWY_ASSERT_EQ
-
-#define HWY_ASSERT_EQ(expected, actual)                                     \
-  hwy::AssertEqual(expected, actual, hwy::TargetName(HWY_TARGET), __FILE__, \
-                   __LINE__)
-
-#define HWY_ASSERT_ARRAY_EQ(expected, actual, count)                          \
-  hwy::AssertArrayEqual(expected, actual, count, hwy::TargetName(HWY_TARGET), \
-                        __FILE__, __LINE__)
-
-#define HWY_ASSERT_STRING_EQ(expected, actual)                          \
-  hwy::AssertStringEqual(expected, actual, hwy::TargetName(HWY_TARGET), \
-                         __FILE__, __LINE__)
-
-#define HWY_ASSERT_VEC_EQ(d, expected, actual) \
-  AssertVecEqual(d, expected, actual, __FILE__, __LINE__)
-
-#define HWY_ASSERT_MASK_EQ(d, expected, actual) \
-  AssertMaskEqual(d, expected, actual, __FILE__, __LINE__)
-
-#endif  // HWY_ASSERT_EQ
-
-namespace detail {
-
-// Helpers for instantiating tests with combinations of lane types / counts.
-
-// Calls Test for each CappedTag<T, N> where N is in [kMinLanes, kMul * kMinArg]
-// and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
-// is required to ensure capped vectors remain extendable. Implemented by
-// recursively halving kMul until it is zero.
-template <typename T, size_t kMul, size_t kMinArg, class Test>
-struct ForeachCappedR {
-  static void Do(size_t min_lanes, size_t max_lanes) {
-    const CappedTag<T, kMul * kMinArg> d;
-
-    // If we already don't have enough lanes, stop.
-    const size_t lanes = Lanes(d);
-    if (lanes < min_lanes) return;
-
-    if (lanes <= max_lanes) {
-      Test()(T(), d);
-    }
-    ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes);
-  }
-};
-
-// Base case to stop the recursion.
-template <typename T, size_t kMinArg, class Test>
-struct ForeachCappedR<T, 0, kMinArg, Test> {
-  static void Do(size_t, size_t) {}
-};
-
-#if HWY_HAVE_SCALABLE
-
-template <typename T>
-constexpr int MinPow2() {
-  // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
-  // as kPow2 == -3). The fraction also must not result in zero lanes for the
-  // smallest possible vector size, which is 128 bits even on RISC-V (with the
-  // application processor profile).
-  return HWY_MAX(-3, -static_cast<int>(CeilLog2(16 / sizeof(T))));
-}
-
-// Iterates kPow2 upward through +3.
-template <typename T, int kPow2, int kAddPow2, class Test>
-struct ForeachShiftR {
-  static void Do(size_t min_lanes) {
-    const ScalableTag<T, kPow2 + kAddPow2> d;
-
-    // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
-    // vector size, so we always have enough lanes, except ForGEVectors.
-    if (Lanes(d) >= min_lanes) {
-      Test()(T(), d);
-    } else {
-      fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n",
-              static_cast<int>(Lanes(d)), static_cast<int>(min_lanes),
-              static_cast<int>(sizeof(T)), kPow2 + kAddPow2);
-      HWY_ASSERT(min_lanes != 1);
-    }
-
-    ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes);
-  }
-};
-
-// Base case to stop the recursion.
-template <typename T, int kAddPow2, class Test>
-struct ForeachShiftR<T, 4, kAddPow2, Test> {
-  static void Do(size_t) {}
-};
-#else
-// ForeachCappedR already handled all possible sizes.
-#endif  // HWY_HAVE_SCALABLE
-
-}  // namespace detail
-
-// These 'adapters' call a test for all possible N or kPow2 subject to
-// constraints such as "vectors must be extendable" or "vectors >= 128 bits".
-// They may be called directly, or via For*Types. Note that for an adapter C,
-// `C<Test>(T())` does not call the test - the correct invocation is
-// `C<Test>()(T())`, or preferably `ForAllTypes(C<Test>())`. We check at runtime
-// that operator() is called to prevent such bugs. Note that this is not
-// thread-safe, but that is fine because C are typically local variables.
-
-// Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. This is for
-// ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
-template <class Test, int kPow2 = 1>
-class ForExtendableVectors {
-  mutable bool called_ = false;
-
- public:
-  ~ForExtendableVectors() {
-    if (!called_) {
-      HWY_ABORT("Test is incorrect, ensure operator() is called");
-    }
-  }
-
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    called_ = true;
-    constexpr size_t kMaxCapped = HWY_LANES(T);
-    // Skip CappedTag that are already full vectors.
-    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
-    (void)kMaxCapped;
-    (void)max_lanes;
-#if HWY_TARGET == HWY_SCALAR
-    // not supported
-#else
-    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes);
-#if HWY_TARGET == HWY_RVV
-    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(1);
-#elif HWY_HAVE_SCALABLE
-    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
-                          Test>::Do(1);
-#endif
-#endif  // HWY_SCALAR
-  }
-};
-
-// Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops
-// that narrow their input, e.g. UpperHalf.
-template <class Test, int kPow2 = 1>
-class ForShrinkableVectors {
-  mutable bool called_ = false;
-
- public:
-  ~ForShrinkableVectors() {
-    if (!called_) {
-      HWY_ABORT("Test is incorrect, ensure operator() is called");
-    }
-  }
-
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    called_ = true;
-    constexpr size_t kMinLanes = size_t{1} << kPow2;
-    constexpr size_t kMaxCapped = HWY_LANES(T);
-    // For shrinking, an upper limit is unnecessary.
-    constexpr size_t max_lanes = kMaxCapped;
-
-    (void)kMinLanes;
-    (void)max_lanes;
-    (void)max_lanes;
-#if HWY_TARGET == HWY_SCALAR
-    // not supported
-#else
-    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
-        kMinLanes, max_lanes);
-#if HWY_TARGET == HWY_RVV
-    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
-        kMinLanes);
-#elif HWY_HAVE_SCALABLE
-    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
-        kMinLanes);
-#endif
-#endif  // HWY_TARGET == HWY_SCALAR
-  }
-};
-
-// Calls Test for all supported power of two vectors of at least kMinBits.
-// Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
-template <size_t kMinBits, class Test>
-class ForGEVectors {
-  mutable bool called_ = false;
-
- public:
-  ~ForGEVectors() {
-    if (!called_) {
-      HWY_ABORT("Test is incorrect, ensure operator() is called");
-    }
-  }
-
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    called_ = true;
-    constexpr size_t kMaxCapped = HWY_LANES(T);
-    constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T);
-    // An upper limit is unnecessary.
-    constexpr size_t max_lanes = kMaxCapped;
-    (void)max_lanes;
-#if HWY_TARGET == HWY_SCALAR
-    (void)kMinLanes;  // not supported
-#else
-    detail::ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(
-        kMinLanes, max_lanes);
-#if HWY_TARGET == HWY_RVV
-    // Can be 0 (handled below) if kMinBits > 64.
-    constexpr size_t kRatio = 128 / kMinBits;
-    constexpr int kMinPow2 =
-        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
-    // For each [kMinPow2, 3]; counter is [kMinPow2, 3].
-    detail::ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes);
-#elif HWY_HAVE_SCALABLE
-    // Can be 0 (handled below) if kMinBits > 128.
-    constexpr size_t kRatio = 128 / kMinBits;
-    constexpr int kMinPow2 =
-        kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio));
-    // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
-    detail::ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes);
-#endif
-#endif  // HWY_TARGET == HWY_SCALAR
-  }
-};
-
-template <class Test>
-using ForGE128Vectors = ForGEVectors<128, Test>;
-
-// Calls Test for all N that can be promoted (not the same as Extendable because
-// HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
-template <class Test, int kPow2 = 1>
-class ForPromoteVectors {
-  mutable bool called_ = false;
-
- public:
-  ~ForPromoteVectors() {
-    if (!called_) {
-      HWY_ABORT("Test is incorrect, ensure operator() is called");
-    }
-  }
-
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    called_ = true;
-    constexpr size_t kFactor = size_t{1} << kPow2;
-    static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), "");
-    constexpr size_t kMaxCapped = HWY_LANES(T);
-    constexpr size_t kMinLanes = kFactor;
-    // Skip CappedTag that are already full vectors.
-    const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2;
-    (void)kMaxCapped;
-    (void)kMinLanes;
-    (void)max_lanes;
-#if HWY_TARGET == HWY_SCALAR
-    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
-#else
-    // TODO(janwas): call Extendable if kMinLanes check not required?
-    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes,
-                                                                  max_lanes);
-#if HWY_TARGET == HWY_RVV
-    // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, -kPow2, Test>::Do(
-        kMinLanes);
-#elif HWY_HAVE_SCALABLE
-    // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -kPow2 - 3,
-                          Test>::Do(kMinLanes);
-#endif
-#endif  // HWY_SCALAR
-  }
-};
-
-// Calls Test for all N than can be demoted (not the same as Shrinkable because
-// HWY_SCALAR has one lane).
-template <class Test, int kPow2 = 1>
-class ForDemoteVectors {
-  mutable bool called_ = false;
-
- public:
-  ~ForDemoteVectors() {
-    if (!called_) {
-      HWY_ABORT("Test is incorrect, ensure operator() is called");
-    }
-  }
-
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    called_ = true;
-    constexpr size_t kMinLanes = size_t{1} << kPow2;
-    constexpr size_t kMaxCapped = HWY_LANES(T);
-    // For shrinking, an upper limit is unnecessary.
-    constexpr size_t max_lanes = kMaxCapped;
-
-    (void)kMinLanes;
-    (void)max_lanes;
-    (void)max_lanes;
-#if HWY_TARGET == HWY_SCALAR
-    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
-#else
-    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
-        kMinLanes, max_lanes);
-
-// TODO(janwas): call Extendable if kMinLanes check not required?
-#if HWY_TARGET == HWY_RVV
-    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
-        kMinLanes);
-#elif HWY_HAVE_SCALABLE
-    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
-        kMinLanes);
-#endif
-#endif  // HWY_TARGET == HWY_SCALAR
-  }
-};
-
-// For LowerHalf/Quarter.
-template <class Test, int kPow2 = 1>
-class ForHalfVectors {
-  mutable bool called_ = false;
-
- public:
-  ~ForHalfVectors() {
-    if (!called_) {
-      HWY_ABORT("Test is incorrect, ensure operator() is called");
-    }
-  }
-
-  template <typename T>
-  void operator()(T /*unused*/) const {
-    called_ = true;
-#if HWY_TARGET == HWY_SCALAR
-    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
-#else
-    constexpr size_t kMinLanes = size_t{1} << kPow2;
-    // For shrinking, an upper limit is unnecessary.
-    constexpr size_t kMaxCapped = HWY_LANES(T);
-    detail::ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(
-        kMinLanes, kMaxCapped);
-
-// TODO(janwas): call Extendable if kMinLanes check not required?
-#if HWY_TARGET == HWY_RVV
-    // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2, 0, Test>::Do(
-        kMinLanes);
-#elif HWY_HAVE_SCALABLE
-    // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
-    detail::ForeachShiftR<T, detail::MinPow2<T>() + kPow2 + 3, -3, Test>::Do(
-        kMinLanes);
-#endif
-#endif  // HWY_TARGET == HWY_SCALAR
-  }
-};
-
-// Calls Test for all power of two N in [1, Lanes(d)]. This is the default
-// for ops that do not narrow nor widen their input, nor require 128 bits.
-template <class Test>
-class ForPartialVectors {
-  mutable bool called_ = false;
-
- public:
-  ~ForPartialVectors() {
-    if (!called_) {
-      HWY_ABORT("Test is incorrect, ensure operator() is called");
-    }
-  }
-
-  template <typename T>
-  void operator()(T t) const {
-    called_ = true;
-#if HWY_TARGET == HWY_SCALAR
-    (void)t;
-    detail::ForeachCappedR<T, 1, 1, Test>::Do(1, 1);
-#else
-    ForExtendableVectors<Test, 0>()(t);
-#endif
-  }
-};
-
-// Type lists to shorten call sites:
-
-template <class Func>
-void ForSignedTypes(const Func& func) {
-  func(int8_t());
-  func(int16_t());
-  func(int32_t());
-#if HWY_HAVE_INTEGER64
-  func(int64_t());
-#endif
-}
-
-template <class Func>
-void ForUnsignedTypes(const Func& func) {
-  func(uint8_t());
-  func(uint16_t());
-  func(uint32_t());
-#if HWY_HAVE_INTEGER64
-  func(uint64_t());
-#endif
-}
-
-template <class Func>
-void ForIntegerTypes(const Func& func) {
-  ForSignedTypes(func);
-  ForUnsignedTypes(func);
-}
-
-template <class Func>
-void ForFloatTypes(const Func& func) {
-  func(float());
-#if HWY_HAVE_FLOAT64
-  func(double());
-#endif
-}
-
-template <class Func>
-void ForAllTypes(const Func& func) {
-  ForIntegerTypes(func);
-  ForFloatTypes(func);
-}
-
-template <class Func>
-void ForUI8(const Func& func) {
-  func(uint8_t());
-  func(int8_t());
-}
-
-template <class Func>
-void ForUI16(const Func& func) {
-  func(uint16_t());
-  func(int16_t());
-}
-
-template <class Func>
-void ForUIF16(const Func& func) {
-  ForUI16(func);
-#if HWY_HAVE_FLOAT16
-  func(float16_t());
-#endif
-}
-
-template <class Func>
-void ForUI32(const Func& func) {
-  func(uint32_t());
-  func(int32_t());
-}
-
-template <class Func>
-void ForUIF32(const Func& func) {
-  ForUI32(func);
-  func(float());
-}
-
-template <class Func>
-void ForUI64(const Func& func) {
-#if HWY_HAVE_INTEGER64
-  func(uint64_t());
-  func(int64_t());
-#endif
-}
-
-template <class Func>
-void ForUIF64(const Func& func) {
-  ForUI64(func);
-#if HWY_HAVE_FLOAT64
-  func(double());
-#endif
-}
-
-template <class Func>
-void ForUI3264(const Func& func) {
-  ForUI32(func);
-  ForUI64(func);
-}
-
-template <class Func>
-void ForUIF3264(const Func& func) {
-  ForUIF32(func);
-  ForUIF64(func);
-}
-
-template <class Func>
-void ForUI163264(const Func& func) {
-  ForUI16(func);
-  ForUI3264(func);
-}
-
-template <class Func>
-void ForUIF163264(const Func& func) {
-  ForUIF16(func);
-  ForUIF3264(func);
-}
-
-// For tests that involve loops, adjust the trip count so that emulated tests
-// finish quickly (but always at least 2 iterations to ensure some diversity).
-constexpr size_t AdjustedReps(size_t max_reps) {
-#if HWY_ARCH_RVV
-  return HWY_MAX(max_reps / 32, 2);
-#elif HWY_IS_DEBUG_BUILD
-  return HWY_MAX(max_reps / 8, 2);
-#elif HWY_ARCH_ARM
-  return HWY_MAX(max_reps / 4, 2);
-#else
-  return HWY_MAX(max_reps, 2);
-#endif
-}
-
-// Same as above, but the loop trip count will be 1 << max_pow2.
-constexpr size_t AdjustedLog2Reps(size_t max_pow2) {
-  // If "negative" (unsigned wraparound), use original.
-#if HWY_ARCH_RVV
-  return HWY_MIN(max_pow2 - 4, max_pow2);
-#elif HWY_IS_DEBUG_BUILD
-  return HWY_MIN(max_pow2 - 1, max_pow2);
-#elif HWY_ARCH_ARM
-  return HWY_MIN(max_pow2 - 1, max_pow2);
-#else
-  return max_pow2;
-#endif
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#endif  // per-target include guard
diff --git a/third_party/highway/hwy/tests/test_util.cc b/third_party/highway/hwy/tests/test_util.cc
deleted file mode 100644 (file)
index a0796b1..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "hwy/tests/test_util.h"
-
-#include <stddef.h>
-#include <stdio.h>
-
-#include <cmath>
-
-#include "hwy/base.h"
-#include "hwy/print.h"
-
-namespace hwy {
-
-HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
-                                   const size_t size, size_t* pos) {
-  const uint8_t* bytes1 = reinterpret_cast<const uint8_t*>(p1);
-  const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2);
-  for (size_t i = 0; i < size; ++i) {
-    if (bytes1[i] != bytes2[i]) {
-      if (pos != nullptr) {
-        *pos = i;
-      }
-      return false;
-    }
-  }
-  return true;
-}
-
-void AssertStringEqual(const char* expected, const char* actual,
-                       const char* target_name, const char* filename,
-                       int line) {
-  while (*expected == *actual++) {
-    if (*expected++ == '\0') return;
-  }
-
-  Abort(filename, line, "%s string mismatch: expected '%s', got '%s'.\n",
-        target_name, expected, actual);
-}
-
-namespace detail {
-
-HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
-                                const void* actual_ptr) {
-  if (!info.is_float) {
-    return BytesEqual(expected_ptr, actual_ptr, info.sizeof_t);
-  }
-
-  if (info.sizeof_t == 4) {
-    float expected, actual;
-    CopyBytes<4>(expected_ptr, &expected);
-    CopyBytes<4>(actual_ptr, &actual);
-    return ComputeUlpDelta(expected, actual) <= 1;
-  } else if (info.sizeof_t == 8) {
-    double expected, actual;
-    CopyBytes<8>(expected_ptr, &expected);
-    CopyBytes<8>(actual_ptr, &actual);
-    return ComputeUlpDelta(expected, actual) <= 1;
-  } else {
-    HWY_ABORT("Unexpected float size %d\n", static_cast<int>(info.sizeof_t));
-    return false;
-  }
-}
-
-HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
-    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
-    const char* target_name, const char* filename, int line, size_t lane,
-    size_t num_lanes) {
-  char type_name[100];
-  TypeName(info, 1, type_name);
-  char expected_str[100];
-  ToString(info, expected_ptr, expected_str);
-  char actual_str[100];
-  ToString(info, actual_ptr, actual_str);
-  Abort(filename, line,
-        "%s, %sx%d lane %d mismatch: expected '%s', got '%s'.\n", target_name,
-        type_name, static_cast<int>(num_lanes), static_cast<int>(lane),
-        expected_str, actual_str);
-}
-
-HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
-                                         const void* expected_void,
-                                         const void* actual_void, size_t N,
-                                         const char* target_name,
-                                         const char* filename, int line) {
-  const uint8_t* expected_array =
-      reinterpret_cast<const uint8_t*>(expected_void);
-  const uint8_t* actual_array = reinterpret_cast<const uint8_t*>(actual_void);
-  for (size_t i = 0; i < N; ++i) {
-    const void* expected_ptr = expected_array + i * info.sizeof_t;
-    const void* actual_ptr = actual_array + i * info.sizeof_t;
-    if (!IsEqual(info, expected_ptr, actual_ptr)) {
-      fprintf(stderr, "\n\n");
-      PrintArray(info, "expect", expected_array, N, i);
-      PrintArray(info, "actual", actual_array, N, i);
-
-      PrintMismatchAndAbort(info, expected_ptr, actual_ptr, target_name,
-                            filename, line, i, N);
-    }
-  }
-}
-
-}  // namespace detail
-}  // namespace hwy
diff --git a/third_party/highway/hwy/tests/test_util.h b/third_party/highway/hwy/tests/test_util.h
deleted file mode 100644 (file)
index 459de96..0000000
+++ /dev/null
@@ -1,172 +0,0 @@
-// Copyright 2021 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HWY_TESTS_TEST_UTIL_H_
-#define HWY_TESTS_TEST_UTIL_H_
-
-// Target-independent helper functions for use by *_test.cc.
-
-#include <stddef.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <string>
-
-#include "hwy/aligned_allocator.h"
-#include "hwy/base.h"
-#include "hwy/highway.h"
-#include "hwy/highway_export.h"
-#include "hwy/print.h"
-
-namespace hwy {
-
-// The maximum vector size used in tests when defining test data. DEPRECATED.
-constexpr size_t kTestMaxVectorSize = 64;
-
-// 64-bit random generator (Xorshift128+). Much smaller state than std::mt19937,
-// which triggers a compiler bug.
-class RandomState {
- public:
-  explicit RandomState(const uint64_t seed = 0x123456789ull) {
-    s0_ = SplitMix64(seed + 0x9E3779B97F4A7C15ull);
-    s1_ = SplitMix64(s0_);
-  }
-
-  HWY_INLINE uint64_t operator()() {
-    uint64_t s1 = s0_;
-    const uint64_t s0 = s1_;
-    const uint64_t bits = s1 + s0;
-    s0_ = s0;
-    s1 ^= s1 << 23;
-    s1 ^= s0 ^ (s1 >> 18) ^ (s0 >> 5);
-    s1_ = s1;
-    return bits;
-  }
-
- private:
-  static uint64_t SplitMix64(uint64_t z) {
-    z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull;
-    z = (z ^ (z >> 27)) * 0x94D049BB133111EBull;
-    return z ^ (z >> 31);
-  }
-
-  uint64_t s0_;
-  uint64_t s1_;
-};
-
-static HWY_INLINE uint32_t Random32(RandomState* rng) {
-  return static_cast<uint32_t>((*rng)());
-}
-
-static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); }
-
-// Prevents the compiler from eliding the computations that led to "output".
-// Works by indicating to the compiler that "output" is being read and modified.
-// The +r constraint avoids unnecessary writes to memory, but only works for
-// built-in types.
-template <class T>
-inline void PreventElision(T&& output) {
-#if HWY_COMPILER_MSVC
-  (void)output;
-#else   // HWY_COMPILER_MSVC
-  asm volatile("" : "+r"(output) : : "memory");
-#endif  // HWY_COMPILER_MSVC
-}
-
-HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2,
-                                   const size_t size, size_t* pos = nullptr);
-
-void AssertStringEqual(const char* expected, const char* actual,
-                       const char* target_name, const char* filename, int line);
-
-namespace detail {
-
-template <typename T, typename TU = MakeUnsigned<T>>
-TU ComputeUlpDelta(const T expected, const T actual) {
-  // Handle -0 == 0 and infinities.
-  if (expected == actual) return 0;
-
-  // Consider "equal" if both are NaN, so we can verify an expected NaN.
-  // Needs a special case because there are many possible NaN representations.
-  if (std::isnan(expected) && std::isnan(actual)) return 0;
-
-  // Compute the difference in units of last place. We do not need to check for
-  // differing signs; they will result in large differences, which is fine.
-  TU ux, uy;
-  CopySameSize(&expected, &ux);
-  CopySameSize(&actual, &uy);
-
-  // Avoid unsigned->signed cast: 2's complement is only guaranteed by C++20.
-  const TU ulp = HWY_MAX(ux, uy) - HWY_MIN(ux, uy);
-  return ulp;
-}
-
-HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr,
-                                const void* actual_ptr);
-
-HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort(
-    const TypeInfo& info, const void* expected_ptr, const void* actual_ptr,
-    const char* target_name, const char* filename, int line, size_t lane = 0,
-    size_t num_lanes = 1);
-
-HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info,
-                                         const void* expected_void,
-                                         const void* actual_void, size_t N,
-                                         const char* target_name,
-                                         const char* filename, int line);
-
-}  // namespace detail
-
-// Returns a name for the vector/part/scalar. The type prefix is u/i/f for
-// unsigned/signed/floating point, followed by the number of bits per lane;
-// then 'x' followed by the number of lanes. Example: u8x16. This is useful for
-// understanding which instantiation of a generic test failed.
-template <typename T>
-std::string TypeName(T /*unused*/, size_t N) {
-  char string100[100];
-  detail::TypeName(detail::MakeTypeInfo<T>(), N, string100);
-  return string100;
-}
-
-// Compare non-vector, non-string T.
-template <typename T>
-HWY_INLINE bool IsEqual(const T expected, const T actual) {
-  const auto info = detail::MakeTypeInfo<T>();
-  return detail::IsEqual(info, &expected, &actual);
-}
-
-template <typename T>
-HWY_INLINE void AssertEqual(const T expected, const T actual,
-                            const char* target_name, const char* filename,
-                            int line, size_t lane = 0) {
-  const auto info = detail::MakeTypeInfo<T>();
-  if (!detail::IsEqual(info, &expected, &actual)) {
-    detail::PrintMismatchAndAbort(info, &expected, &actual, target_name,
-                                  filename, line, lane);
-  }
-}
-
-template <typename T>
-HWY_INLINE void AssertArrayEqual(const T* expected, const T* actual,
-                                 size_t count, const char* target_name,
-                                 const char* filename, int line) {
-  const auto info = hwy::detail::MakeTypeInfo<T>();
-  detail::AssertArrayEqual(info, expected, actual, count, target_name, filename,
-                           line);
-}
-
-}  // namespace hwy
-
-#endif  // HWY_TESTS_TEST_UTIL_H_
diff --git a/third_party/highway/hwy/tests/test_util_test.cc b/third_party/highway/hwy/tests/test_util_test.cc
deleted file mode 100644 (file)
index d55e2e8..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2019 Google LLC
-// SPDX-License-Identifier: Apache-2.0
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//      http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <stddef.h>
-#include <stdint.h>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tests/test_util_test.cc"
-#include "hwy/foreach_target.h"  // IWYU pragma: keep
-#include "hwy/highway.h"
-#include "hwy/tests/test_util-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace hwy {
-namespace HWY_NAMESPACE {
-
-struct TestName {
-  template <class T, class D>
-  HWY_NOINLINE void operator()(T t, D d) {
-    char num[10];
-    std::string expected = IsFloat<T>() ? "f" : (IsSigned<T>() ? "i" : "u");
-    snprintf(num, sizeof(num), "%u" , static_cast<unsigned>(sizeof(T) * 8));
-    expected += num;
-
-    const size_t N = Lanes(d);
-    if (N != 1) {
-      expected += 'x';
-      snprintf(num, sizeof(num), "%u", static_cast<unsigned>(N));
-      expected += num;
-    }
-    const std::string actual = TypeName(t, N);
-    if (expected != actual) {
-      HWY_ABORT("%s mismatch: expected '%s', got '%s'.\n",
-                hwy::TargetName(HWY_TARGET), expected.c_str(), actual.c_str());
-    }
-  }
-};
-
-HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); }
-
-struct TestEqualInteger {
-  template <class T>
-  HWY_NOINLINE void operator()(T /*t*/) const {
-    HWY_ASSERT_EQ(T(0), T(0));
-    HWY_ASSERT_EQ(T(1), T(1));
-    HWY_ASSERT_EQ(T(-1), T(-1));
-    HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>());
-
-    HWY_ASSERT(!IsEqual(T(0), T(1)));
-    HWY_ASSERT(!IsEqual(T(1), T(0)));
-    HWY_ASSERT(!IsEqual(T(1), T(-1)));
-    HWY_ASSERT(!IsEqual(T(-1), T(1)));
-    HWY_ASSERT(!IsEqual(LimitsMin<T>(), LimitsMax<T>()));
-    HWY_ASSERT(!IsEqual(LimitsMax<T>(), LimitsMin<T>()));
-  }
-};
-
-struct TestEqualFloat {
-  template <class T>
-  HWY_NOINLINE void operator()(T /*t*/) const {
-    HWY_ASSERT(IsEqual(T(0), T(0)));
-    HWY_ASSERT(IsEqual(T(1), T(1)));
-    HWY_ASSERT(IsEqual(T(-1), T(-1)));
-    HWY_ASSERT(IsEqual(MantissaEnd<T>(), MantissaEnd<T>()));
-
-    HWY_ASSERT(!IsEqual(T(0), T(1)));
-    HWY_ASSERT(!IsEqual(T(1), T(0)));
-    HWY_ASSERT(!IsEqual(T(1), T(-1)));
-    HWY_ASSERT(!IsEqual(T(-1), T(1)));
-    HWY_ASSERT(!IsEqual(LowestValue<T>(), HighestValue<T>()));
-    HWY_ASSERT(!IsEqual(HighestValue<T>(), LowestValue<T>()));
-  }
-};
-
-HWY_NOINLINE void TestAllEqual() {
-  ForIntegerTypes(TestEqualInteger());
-  ForFloatTypes(TestEqualFloat());
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace hwy
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace hwy {
-HWY_BEFORE_TEST(TestUtilTest);
-HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllName);
-HWY_EXPORT_AND_TEST_P(TestUtilTest, TestAllEqual);
-}  // namespace hwy
-
-#endif
diff --git a/third_party/highway/libhwy-contrib.pc.in b/third_party/highway/libhwy-contrib.pc.in
deleted file mode 100644 (file)
index 89c45f5..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
-includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
-
-Name: libhwy-contrib
-Description: Additions to Highway: dot product, image, math, sort
-Version: @HWY_LIBRARY_VERSION@
-Libs: -L${libdir} -lhwy_contrib
-Cflags: -I${includedir}
diff --git a/third_party/highway/libhwy-test.pc.in b/third_party/highway/libhwy-test.pc.in
deleted file mode 100644 (file)
index ff91690..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
-includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
-
-Name: libhwy-test
-Description: Efficient and performance-portable SIMD wrapper, test helpers.
-Requires: gtest
-Version: @HWY_LIBRARY_VERSION@
-Libs: -L${libdir} -lhwy_test
-Cflags: -I${includedir}
diff --git a/third_party/highway/libhwy.pc.in b/third_party/highway/libhwy.pc.in
deleted file mode 100644 (file)
index 6439892..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${prefix}
-libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@
-includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@
-
-Name: libhwy
-Description: Efficient and performance-portable SIMD wrapper
-Version: @HWY_LIBRARY_VERSION@
-Libs: -L${libdir} -lhwy
-Cflags: -I${includedir} -D@DLLEXPORT_TO_DEFINE@
diff --git a/third_party/highway/preamble.js.lds b/third_party/highway/preamble.js.lds
deleted file mode 100644 (file)
index f484a19..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- * Copyright 2019 Google LLC
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/* mock crypto module for benchmarks and unit tests or std::random_device fails at runtime */
-var crypto = { getRandomValues: function(array) { for (var i = 0; i < array.length; i++) array[i] = (Math.random()*256)|0 } };
\ No newline at end of file
diff --git a/third_party/highway/run_tests.bat b/third_party/highway/run_tests.bat
deleted file mode 100644 (file)
index 26600a2..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-@echo off
-REM Switch directory of this batch file
-cd %~dp0
-
-if not exist build_win mkdir build_win
-
-cd build_win
-cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -G Ninja || goto error
-ninja || goto error
-ctest -j || goto error
-
-cd ..
-echo Success
-goto end
-
-:error
-echo Failure
-exit /b 1
-
-:end
diff --git a/third_party/highway/run_tests.sh b/third_party/highway/run_tests.sh
deleted file mode 100755 (executable)
index 017e536..0000000
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/bin/bash
-
-# Switch to directory of this script
-MYDIR=$(dirname $(realpath "$0"))
-cd "${MYDIR}"
-
-# Exit if anything fails
-set -e
-
-#######################################
-echo RELEASE
-rm -rf build
-mkdir build
-cd build
-cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
-make -j
-ctest -j
-cd ..
-rm -rf build
-
-#######################################
-echo DEBUG Clang 7
-rm -rf build_dbg
-mkdir build_dbg
-cd build_dbg
-CXX=clang++-7 CC=clang-7 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON -DCMAKE_BUILD_TYPE=Debug
-make -j
-ctest -j
-cd ..
-rm -rf build_dbg
-
-#######################################
-echo 32-bit GCC
-rm -rf build_32
-mkdir build_32
-cd build_32
-CFLAGS=-m32 CXXFLAGS=-m32 LDFLAGS=-m32 CXX=g++ CC=gcc cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
-make -j
-ctest -j
-cd ..
-rm -rf build_32
-
-#######################################
-for VER in 8 9 10; do
-  echo GCC $VER
-  rm -rf build_g$VER
-  mkdir build_g$VER
-  cd build_g$VER
-  CC=gcc-$VER CXX=g++-$VER cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
-  make -j
-  make test
-  cd ..
-  rm -rf build_g$VER
-done
-
-#######################################
-echo ARMv7 GCC
-export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf
-rm -rf build_arm7
-mkdir build_arm7
-cd build_arm7
-CC=arm-linux-gnueabihf-gcc-11 CXX=arm-linux-gnueabihf-g++-11 cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
-make -j8
-ctest
-cd ..
-rm -rf build_arm7
-
-#######################################
-echo ARMv8 GCC
-export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu
-rm -rf build_arm8
-mkdir build_arm8
-cd build_arm8
-CC=aarch64-linux-gnu-gcc-11 CXX=aarch64-linux-gnu-g++-11 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON
-make -j8
-ctest
-cd ..
-rm -rf build_arm8
-
-echo Success
index c4551de..dec6be9 100644 (file)
@@ -45,12 +45,12 @@ target_include_directories(lcms2
     PUBLIC "${CMAKE_CURRENT_LIST_DIR}/lcms/include")
 # This warning triggers with gcc-8.
 if (CMAKE_C_COMPILER_ID MATCHES "GNU")
-target_compile_options(lcms2
-  PRIVATE
-    # gcc-only flags.
-    -Wno-stringop-truncation
-    -Wno-strict-aliasing
-)
+  target_compile_options(lcms2
+    PRIVATE
+      # gcc-only flags.
+      -Wno-stringop-truncation
+      -Wno-strict-aliasing
+  )
 endif()
 # By default LCMS uses sizeof(void*) for memory alignment, but in arm 32-bits we
 # can't access doubles not aligned to 8 bytes. This forces the alignment to 8
index 4d2a79c..2dfc403 100644 (file)
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-add_library(skcms-obj OBJECT EXCLUDE_FROM_ALL skcms/skcms.cc)
-target_include_directories(skcms-obj PUBLIC "${CMAKE_CURRENT_LIST_DIR}/skcms/")
-
-# This library is meant to be compiled/used by external libs (such as plugins)
-# that need to use skcms. We use a wrapper for libjxl.
-add_library(skcms-interface INTERFACE)
-target_sources(skcms-interface INTERFACE ${CMAKE_CURRENT_LIST_DIR}/skcms/skcms.cc)
-target_include_directories(skcms-interface INTERFACE ${CMAKE_CURRENT_LIST_DIR}/skcms)
+add_library(skcms STATIC EXCLUDE_FROM_ALL skcms/skcms.cc)
+target_include_directories(skcms PUBLIC "${CMAKE_CURRENT_LIST_DIR}/skcms/")
 
 include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-Wno-psabi" CXX_WPSABI_SUPPORTED)
 if(CXX_WPSABI_SUPPORTED)
-  target_compile_options(skcms-obj PRIVATE -Wno-psabi)
-  target_compile_options(skcms-interface INTERFACE -Wno-psabi)
-endif()
-
-if(JPEGXL_BUNDLE_SKCMS)
-  target_compile_options(skcms-obj PRIVATE -DJPEGXL_BUNDLE_SKCMS=1)
-  if(MSVC)
-    target_compile_options(skcms-obj
-      PRIVATE /FI${CMAKE_CURRENT_SOURCE_DIR}/../lib/jxl/enc_jxl_skcms.h)
-  else()
-    target_compile_options(skcms-obj
-      PRIVATE -include ${CMAKE_CURRENT_SOURCE_DIR}/../lib/jxl/enc_jxl_skcms.h)
-  endif()
+  target_compile_options(skcms PRIVATE -Wno-psabi)
 endif()
 
-set_target_properties(skcms-obj PROPERTIES
+set_target_properties(skcms PROPERTIES
   POSITION_INDEPENDENT_CODE ON
   CXX_VISIBILITY_PRESET hidden
   VISIBILITY_INLINES_HIDDEN 1
 )
-
-add_library(skcms STATIC EXCLUDE_FROM_ALL $<TARGET_OBJECTS:skcms-obj>)
-target_include_directories(skcms
-  PUBLIC $<TARGET_PROPERTY:skcms-obj,INCLUDE_DIRECTORIES>)
-
index 9f49737..5d8ad92 100644 (file)
@@ -26,6 +26,7 @@ if(BUILD_TESTING)
 if (EXISTS "${SOURCE_DIR}/googletest/CMakeLists.txt" AND
     NOT JPEGXL_FORCE_SYSTEM_GTEST)
   add_subdirectory(third_party/googletest EXCLUDE_FROM_ALL)
+  include(GoogleTest)
 
   set(GTEST_ROOT "${SOURCE_DIR}/googletest/googletest")
   set(GTEST_INCLUDE_DIR "$<TARGET_PROPERTY:INCLUDE_DIRECTORIES,gtest>"
@@ -33,8 +34,6 @@ if (EXISTS "${SOURCE_DIR}/googletest/CMakeLists.txt" AND
   set(GMOCK_INCLUDE_DIR "$<TARGET_PROPERTY:INCLUDE_DIRECTORIES,gmock>")
   set(GTEST_LIBRARY "$<TARGET_FILE:gtest>")
   set(GTEST_MAIN_LIBRARY "$<TARGET_FILE:gtest_main>")
-  add_library(GTest::GTest ALIAS gtest)
-  add_library(GTest::Main ALIAS gtest_main)
 
   set_target_properties(gtest PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
   set_target_properties(gmock PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
@@ -55,29 +54,7 @@ else()
     configure_file("${JPEGXL_DEP_LICENSE_DIR}/googletest/copyright"
                    ${PROJECT_BINARY_DIR}/LICENSE.googletest COPYONLY)
   endif()  # JPEGXL_DEP_LICENSE_DIR
+  find_package(GTest REQUIRED)
 endif()
-find_package(GTest)
-if (NOT GTEST_FOUND)
-  set(BUILD_TESTING OFF CACHE BOOL "Build tests" FORCE)
-  message(SEND_ERROR "GTest not found. Install googletest package "
-          "(libgtest-dev) in the system or download googletest to "
-          "third_party/googletest from https://github.com/google/googletest ."
-          "To disable tests instead re-run cmake with -DBUILD_TESTING=OFF.")
-endif()  # NOT GTEST_FOUND
 
-# Look for gmock in the system too.
-if (NOT DEFINED GMOCK_INCLUDE_DIR)
-  find_path(
-      GMOCK_INCLUDE_DIR "gmock/gmock.h"
-      HINTS ${GTEST_INCLUDE_DIRS})
-  if (NOT GMOCK_INCLUDE_DIR)
-    set(BUILD_TESTING OFF CACHE BOOL "Build tests" FORCE)
-    message(SEND_ERROR "GMock not found. Install googletest package "
-            "(libgmock-dev) in the system or download googletest to "
-            "third_party/googletest from https://github.com/google/googletest ."
-            "To disable tests instead re-run cmake with -DBUILD_TESTING=OFF.")
-  else()
-    message(STATUS "Found GMock: ${GMOCK_INCLUDE_DIR}")
-  endif()  # NOT GMOCK_INCLUDE_DIR
-endif()  # NOT DEFINED GMOCK_INCLUDE_DIR
 endif()  # BUILD_TESTING
diff --git a/tools/BUILD b/tools/BUILD
new file mode 100644 (file)
index 0000000..58e0e9a
--- /dev/null
@@ -0,0 +1 @@
+package(default_visibility = ["//:__subpackages__"])
index 934ed89..7701e4d 100644 (file)
@@ -6,39 +6,39 @@
 # ICC detection library used by the comparison and viewer tools.
 if(JPEGXL_ENABLE_VIEWERS)
 if(WIN32)
-  find_package(Qt5 QUIET COMPONENTS Widgets)
-  if (NOT Qt5_FOUND)
-    message(WARNING "Qt5 was not found.")
+  find_package(Qt6 QUIET COMPONENTS Widgets)
+  if (NOT Qt6_FOUND)
+    message(WARNING "Qt6 was not found.")
   else()
     add_library(icc_detect STATIC EXCLUDE_FROM_ALL
       icc_detect/icc_detect_win32.cc
       icc_detect/icc_detect.h
     )
     target_include_directories(icc_detect PRIVATE "${PROJECT_SOURCE_DIR}")
-    target_link_libraries(icc_detect PUBLIC Qt5::Widgets)
+    target_link_libraries(icc_detect PUBLIC Qt6::Widgets)
     if(JPEGXL_DEP_LICENSE_DIR)
-      configure_file("${JPEGXL_DEP_LICENSE_DIR}/libqt5widgets5/copyright"
-                     ${PROJECT_BINARY_DIR}/LICENSE.libqt5widgets5 COPYONLY)
+      configure_file("${JPEGXL_DEP_LICENSE_DIR}/libqt6widgets6/copyright"
+                     ${PROJECT_BINARY_DIR}/LICENSE.libqt6widgets6 COPYONLY)
     endif()  # JPEGXL_DEP_LICENSE_DIR
   endif()
 elseif(APPLE)
-  find_package(Qt5 QUIET COMPONENTS Widgets)
-  if (Qt5_FOUND)
+  find_package(Qt6 QUIET COMPONENTS Widgets)
+  if (Qt6_FOUND)
     add_library(icc_detect STATIC EXCLUDE_FROM_ALL
       icc_detect/icc_detect_empty.cc
       icc_detect/icc_detect.h
     )
     target_include_directories(icc_detect PRIVATE "${PROJECT_SOURCE_DIR}")
-    target_link_libraries(icc_detect PUBLIC Qt5::Widgets)
+    target_link_libraries(icc_detect PUBLIC Qt6::Widgets)
   else()
-    message(WARNING "APPLE: Qt5 was not found.")
+    message(WARNING "APPLE: Qt6 was not found.")
   endif()
 else()
-  find_package(Qt5 QUIET COMPONENTS Widgets X11Extras)
+  find_package(Qt6 QUIET COMPONENTS Widgets)
   find_package(ECM QUIET NO_MODULE)
-  if (NOT Qt5_FOUND OR NOT ECM_FOUND)
-    if (NOT Qt5_FOUND)
-      message(WARNING "Qt5 was not found.")
+  if (NOT Qt6_FOUND OR NOT ECM_FOUND)
+    if (NOT Qt6_FOUND)
+      message(WARNING "Qt6 was not found.")
     else()
       message(WARNING "extra-cmake-modules were not found.")
     endif()
@@ -50,7 +50,7 @@ else()
         icc_detect/icc_detect_x11.cc
         icc_detect/icc_detect.h
       )
-      target_link_libraries(icc_detect PUBLIC jxl-static Qt5::Widgets Qt5::X11Extras XCB::XCB)
+      target_link_libraries(icc_detect PUBLIC jxl-internal Qt6::Widgets XCB::XCB)
     endif ()
   endif()
 endif()
@@ -60,17 +60,18 @@ endif()  # JPEGXL_ENABLE_VIEWERS
 set(TOOL_BINARIES)
 # Tools that depend on jxl internal functions.
 set(INTERNAL_TOOL_BINARIES)
+set(FUZZER_CORPUS_BINARIES)
 
 add_library(jxl_tool STATIC EXCLUDE_FROM_ALL
   cmdline.cc
   codec_config.cc
   speed_stats.cc
-  file_io.cc
   tool_version.cc
+  ${JXL_CMS_OBJECTS}
 )
 target_compile_options(jxl_tool PUBLIC "${JPEGXL_INTERNAL_FLAGS}")
 target_include_directories(jxl_tool PUBLIC "${PROJECT_SOURCE_DIR}")
-target_link_libraries(jxl_tool hwy)
+target_link_libraries(jxl_tool PUBLIC hwy)
 
 # The JPEGXL_VERSION is set from the builders.
 if(NOT DEFINED JPEGXL_VERSION OR JPEGXL_VERSION STREQUAL "")
@@ -125,7 +126,7 @@ if(JPEGXL_ENABLE_TOOLS)
   add_executable(cjxl cjxl_main.cc)
   target_link_libraries(cjxl
     jxl
-    jxl_extras_codec-static
+    jxl_extras_codec
     jxl_threads
     jxl_tool
   )
@@ -135,14 +136,19 @@ if(JPEGXL_ENABLE_TOOLS)
   add_executable(djxl djxl_main.cc)
   target_link_libraries(djxl
     jxl
-    jxl_extras_codec-static
+    jxl_extras_codec
     jxl_threads
     jxl_tool
   )
   list(APPEND TOOL_BINARIES djxl)
 
-  add_executable(cjpeg_hdr cjpeg_hdr.cc)
-  list(APPEND INTERNAL_TOOL_BINARIES cjpeg_hdr)
+  if(JPEGXL_ENABLE_JPEGLI)
+    # Depends on parts of jxl_extras that are only built if libjpeg is found and
+    # jpegli is enabled.
+    add_executable(cjpegli cjpegli.cc)
+    add_executable(djpegli djpegli.cc)
+    list(APPEND INTERNAL_TOOL_BINARIES cjpegli djpegli)
+  endif()
 
   add_executable(jxlinfo jxlinfo.c)
   target_link_libraries(jxlinfo jxl)
@@ -160,33 +166,53 @@ endif()  # JPEGXL_ENABLE_TOOLS
 # Other developer tools.
 if(JPEGXL_ENABLE_DEVTOOLS)
   list(APPEND INTERNAL_TOOL_BINARIES
-    fuzzer_corpus
     butteraugli_main
     decode_and_encode
     display_to_hlg
+    exr_to_pq
     pq_to_hlg
     render_hlg
+    local_tone_map
     tone_map
     texture_to_cube
     generate_lut_template
     ssimulacra_main
+    ssimulacra2
     xyb_range
     jxl_from_tree
   )
 
-  add_executable(fuzzer_corpus fuzzer_corpus.cc)
-
   add_executable(ssimulacra_main ssimulacra_main.cc ssimulacra.cc)
+  add_executable(ssimulacra2 ssimulacra2_main.cc ssimulacra2.cc)
   add_executable(butteraugli_main butteraugli_main.cc)
   add_executable(decode_and_encode decode_and_encode.cc)
   add_executable(display_to_hlg hdr/display_to_hlg.cc)
+  add_executable(exr_to_pq hdr/exr_to_pq.cc)
   add_executable(pq_to_hlg hdr/pq_to_hlg.cc)
   add_executable(render_hlg hdr/render_hlg.cc)
+  add_executable(local_tone_map hdr/local_tone_map.cc)
   add_executable(tone_map hdr/tone_map.cc)
   add_executable(texture_to_cube hdr/texture_to_cube.cc)
   add_executable(generate_lut_template hdr/generate_lut_template.cc)
   add_executable(xyb_range xyb_range.cc)
   add_executable(jxl_from_tree jxl_from_tree.cc)
+
+  list(APPEND FUZZER_CORPUS_BINARIES djxl_fuzzer_corpus)
+  add_executable(djxl_fuzzer_corpus djxl_fuzzer_corpus.cc)
+  target_link_libraries(djxl_fuzzer_corpus
+    jxl_extras-internal
+    jxl_testlib-internal
+    jxl_tool
+  )
+  if(JPEGXL_ENABLE_JPEGLI)
+    list(APPEND FUZZER_CORPUS_BINARIES jpegli_dec_fuzzer_corpus)
+    add_executable(jpegli_dec_fuzzer_corpus jpegli_dec_fuzzer_corpus.cc)
+    target_link_libraries(jpegli_dec_fuzzer_corpus
+      jpegli-static
+      jxl_tool
+      jxl_threads
+    )
+  endif()
 endif()  # JPEGXL_ENABLE_DEVTOOLS
 
 # Benchmark tools.
@@ -205,8 +231,11 @@ if(JPEGXL_ENABLE_BENCHMARK AND JPEGXL_ENABLE_TOOLS)
     benchmark/benchmark_utils.h
     benchmark/benchmark_codec_custom.cc
     benchmark/benchmark_codec_custom.h
+    benchmark/benchmark_codec_jpeg.cc
+    benchmark/benchmark_codec_jpeg.h
     benchmark/benchmark_codec_jxl.cc
     benchmark/benchmark_codec_jxl.h
+    ssimulacra2.cc
     ../third_party/dirent.cc
   )
   target_link_libraries(benchmark_xl Threads::Threads)
@@ -215,14 +244,6 @@ if(JPEGXL_ENABLE_BENCHMARK AND JPEGXL_ENABLE_TOOLS)
   target_compile_definitions(benchmark_xl PRIVATE "-DHAS_GLOB=0")
   endif() # MINGW
 
-  find_package(JPEG)
-  if(JPEG_FOUND)
-    target_sources(benchmark_xl PRIVATE
-      "${CMAKE_CURRENT_LIST_DIR}/benchmark/benchmark_codec_jpeg.cc"
-      "${CMAKE_CURRENT_LIST_DIR}/benchmark/benchmark_codec_jpeg.h"
-    )
-  endif ()
-
   if(NOT JPEGXL_BUNDLE_LIBPNG)
     find_package(PNG)
   endif()
@@ -231,6 +252,7 @@ if(JPEGXL_ENABLE_BENCHMARK AND JPEGXL_ENABLE_TOOLS)
       "${CMAKE_CURRENT_LIST_DIR}/benchmark/benchmark_codec_png.cc"
       "${CMAKE_CURRENT_LIST_DIR}/benchmark/benchmark_codec_png.h"
     )
+    target_compile_definitions(benchmark_xl PRIVATE -DBENCHMARK_PNG)
   endif()
 
   find_package(PkgConfig)
@@ -245,11 +267,16 @@ if(JPEGXL_ENABLE_BENCHMARK AND JPEGXL_ENABLE_TOOLS)
     # Use the static version of webp if available.
     find_library(WebP_STATIC_LINK_LIBRARY NAMES libwebp.a
         PATHS "${WebP_LIBDIR}")
+    find_library(SharpYuv_STATIC_LINK_LIBRARY NAMES libsharpyuv.a
+        PATHS "${WebP_LIBDIR}")
     if(NOT WebP_STATIC_LINK_LIBRARY)
       message(WARNING "Using dynamic libwebp")
       target_link_libraries(benchmark_xl PkgConfig::WebP)
     else()
       target_link_libraries(benchmark_xl "${WebP_STATIC_LINK_LIBRARY}")
+      if(SharpYuv_STATIC_LINK_LIBRARY)
+        target_link_libraries(benchmark_xl "${SharpYuv_STATIC_LINK_LIBRARY}")
+      endif()
       target_include_directories(benchmark_xl
           PRIVATE ${WebP_STATIC_INCLUDE_DIRS})
       target_compile_options(benchmark_xl PRIVATE ${WebP_STATIC_CFLAGS_OTHER})
@@ -270,33 +297,46 @@ endif()  # JPEGXL_ENABLE_BENCHMARK
 # All tool binaries depend on "jxl" library and the tool helpers.
 foreach(BINARY IN LISTS INTERNAL_TOOL_BINARIES)
   target_link_libraries("${BINARY}"
-    jxl_extras-static
+    jxl_extras-internal
+    jxl_threads
     jxl_tool
   )
 endforeach()
 
-list(APPEND TOOL_BINARIES ${INTERNAL_TOOL_BINARIES})
+list(APPEND TOOL_BINARIES ${INTERNAL_TOOL_BINARIES} ${FUZZER_CORPUS_BINARIES})
 
 foreach(BINARY IN LISTS TOOL_BINARIES)
-  if(JPEGXL_EMSCRIPTEN)
-    set_target_properties(${BINARY} PROPERTIES LINK_FLAGS "-s USE_LIBPNG=1")
+  if(EMSCRIPTEN)
+    set(JXL_WASM_TOOLS_LINK_FLAGS "\
+      -s USE_LIBPNG=1 \
+      -s ALLOW_MEMORY_GROWTH=1 \
+      -s USE_PTHREADS=1 \
+      -s PTHREAD_POOL_SIZE=16 \
+    ")
+    set_target_properties(${BINARY} PROPERTIES LINK_FLAGS "${JXL_WASM_TOOLS_LINK_FLAGS}")
   endif()
 endforeach()
 
 install(TARGETS ${TOOL_BINARIES} RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}")
 message(STATUS "Building tools: ${TOOL_BINARIES}")
 
-set(FUZZER_BINARIES
-  color_encoding_fuzzer
-  decode_basic_info_fuzzer
-  cjxl_fuzzer
-  djxl_fuzzer
-  icc_codec_fuzzer
-  fields_fuzzer
-  rans_fuzzer
-  set_from_bytes_fuzzer
-  transforms_fuzzer
-)
+# djxl_fuzzer builds even when not JPEGXL_ENABLE_TOOLS
+set(FUZZER_BINARIES djxl_fuzzer)
+if(JPEGXL_ENABLE_TOOLS)
+  list(APPEND FUZZER_BINARIES
+    color_encoding_fuzzer
+    decode_basic_info_fuzzer
+    cjxl_fuzzer
+    icc_codec_fuzzer
+    fields_fuzzer
+    rans_fuzzer
+    set_from_bytes_fuzzer
+    transforms_fuzzer
+  )
+if(JPEGXL_ENABLE_JPEGLI)
+  list(APPEND FUZZER_BINARIES jpegli_dec_fuzzer)
+endif()
+endif()
 
 # Fuzzers.
 foreach(FUZZER IN LISTS FUZZER_BINARIES)
@@ -314,12 +354,15 @@ foreach(FUZZER IN LISTS FUZZER_BINARIES)
   target_include_directories("${BINARY}" PRIVATE "${CMAKE_SOURCE_DIR}")
   if(FUZZER STREQUAL djxl_fuzzer)
     target_link_libraries("${BINARY}"
-      jxl_dec-static
-      jxl_threads-static
+      jxl_dec-internal
+      jxl_threads
     )
+  elseif(FUZZER STREQUAL jpegli_dec_fuzzer)
+    target_link_libraries("${BINARY}" jpegli-static)
   else()
     target_link_libraries("${BINARY}"
-      jxl_extras-static
+      jxl_extras_nocodec-internal
+      jxl_testlib-internal
       jxl_tool
     )
   endif()
@@ -370,37 +413,8 @@ add_subdirectory(comparison_viewer)
 add_subdirectory(flicker_test)
 endif()
 
-add_subdirectory(box)
 add_subdirectory(conformance)
-
-
-if (JPEGXL_ENABLE_TOOLS AND JPEGXL_EMSCRIPTEN)
-# WASM API facade.
-add_executable(jxl_emcc jxl_emcc.cc)
-target_link_libraries(jxl_emcc
-    jxl_extras-static
-)
-set_target_properties(jxl_emcc PROPERTIES LINK_FLAGS "\
-  -O3\
-  --closure 1 \
-  -s TOTAL_MEMORY=75mb \
-  -s USE_LIBPNG=1 \
-  -s DISABLE_EXCEPTION_CATCHING=1 \
-  -s MODULARIZE=1 \
-  -s FILESYSTEM=0 \
-  -s USE_PTHREADS=1 \
-  -s PTHREAD_POOL_SIZE=4 \
-  -s EXPORT_NAME=\"JxlCodecModule\"\
-  -s \"EXPORTED_FUNCTIONS=[\
-    _malloc,\
-    _free,\
-    _jxlCreateInstance,\
-    _jxlDestroyInstance,\
-    _jxlFlush,\
-    _jxlProcessInput\
-  ]\"\
-")
-endif ()  # JPEGXL_ENABLE_TOOLS AND JPEGXL_EMSCRIPTEN
+add_subdirectory(wasm_demo)
 
 if(JPEGXL_ENABLE_JNI)
 find_package(JNI QUIET)
@@ -412,7 +426,7 @@ if (JNI_FOUND AND Java_FOUND)
   # decoder_jni_onload.cc might be necessary for Android; not used yet.
   add_library(jxl_jni SHARED jni/org/jpeg/jpegxl/wrapper/decoder_jni.cc)
   target_include_directories(jxl_jni PRIVATE "${JNI_INCLUDE_DIRS}" "${PROJECT_SOURCE_DIR}")
-  target_link_libraries(jxl_jni PUBLIC jxl_dec-static jxl_threads-static)
+  target_link_libraries(jxl_jni PUBLIC jxl_dec-internal jxl_threads)
   if(NOT DEFINED JPEGXL_INSTALL_JNIDIR)
     set(JPEGXL_INSTALL_JNIDIR ${CMAKE_INSTALL_LIBDIR})
   endif()
@@ -455,15 +469,21 @@ endif()  # JNI_FOUND & Java_FOUND
 endif()  # JPEGXL_ENABLE_JNI
 
 # End-to-end tests for the tools
-if(BUILD_TESTING AND JPEGXL_ENABLE_TOOLS AND JPEGXL_ENABLE_DEVTOOLS AND JPEGXL_ENABLE_TRANSCODE_JPEG AND (NOT JPEGXL_ENABLE_JNI))
+if(JPEGXL_TEST_TOOLS)
 find_program (BASH_PROGRAM bash)
-if(BASH_PROGRAM AND $<TARGET_EXISTS:cjxl> AND $<TARGET_EXISTS:djxl> AND $<TARGET_EXISTS:ssimulacra_main>)
-  add_test(
-    NAME roundtrip_test
-    COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/roundtrip_test.sh
-            ${CMAKE_BINARY_DIR})
-  if (CMAKE_CROSSCOMPILING_EMULATOR)
-    set_tests_properties(roundtrip_test PROPERTIES ENVIRONMENT "EMULATOR=${CMAKE_CROSSCOMPILING_EMULATOR}")
+if (BASH_PROGRAM)
+  set(TEST_SCRIPTS)
+  find_package(JPEG)
+  if (JPEG_FOUND AND JPEGXL_ENABLE_TRANSCODE_JPEG)
+    list(APPEND TEST_SCRIPTS roundtrip_test)
   endif()
-endif()
-endif() # BUILD_TESTING
+  if (JPEG_FOUND AND JPEGXL_ENABLE_JPEGLI)
+    list(APPEND TEST_SCRIPTS jpegli_tools_test)
+  endif()
+  foreach(SCRIPT IN LISTS TEST_SCRIPTS)
+    add_test(NAME ${SCRIPT}
+      COMMAND ${BASH_PROGRAM} ${CMAKE_CURRENT_SOURCE_DIR}/scripts/${SCRIPT}.sh
+      ${CMAKE_BINARY_DIR})
+  endforeach()
+endif()  # BASH_PROGRAM
+endif()  # JPEGXL_TEST_TOOLS
diff --git a/tools/README.cjpeg_hdr.md b/tools/README.cjpeg_hdr.md
deleted file mode 100644 (file)
index bd7c793..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-# High bit depth JPEG encoder
-`cjpeg_hdr` is an (experimental) JPEG encoder that can preserve a higher bit
-depth than a traditional JPEG encoder. In particular, it may be used to produce
-HDR JPEGs that do not show obvious signs of banding.
-
-Note that at this point in time `cjpeg_hdr` does not attempt to actually
-*compress* the image - it behaves in the same way as a "quality 100" JPEG
-encoder would normally do, i.e. no quantization, to achieve the maximum
-possible visual quality.  Moreover, no Huffman optimization is performed.
-
-## Generating HBD JPEGs
-Note: this and the following sections assume that `libjxl` has been built in
-the `build/` directory, either by using CMake or by running `./ci.sh opt`.
-
-It should be sufficient to run `build/tools/cjpeg_hdr input_image output.jpg`.
-Various input formats are supported, including NetBPM and (8- or 16-bit) PNG.
-
-If the PNG image includes a colour profile, it will be copied in the resulting
-JPEG image. If this colour profile approximates the PQ or HLG transfer curves,
-some applications will consider the resulting image to be HDR.
-
-To attach a PQ profile to an image without a colour profile (or with a
-different colour profile), the following command can be used:
-
-```
- build/tools/decode_and_encode input RGB_D65_202_Rel_PeQ output_with_pq.png 16
-```
-
-Similarly, to attach an HLG profile, the following command can be used
-
-```
- build/tools/decode_and_encode input RGB_D65_202_Rel_HLG output_with_pq.png 16
-```
-
-## Decoding HBD JPEGs
-HBD JPEGs are fully retrocompatible with libjpeg, and any JPEG viewer ought to
-be able to visualize them. Nonetheless, to achieve the best visual quality, a
-high bit depth decoder should be used.
-
-Such a decoder does not exist today. As a workaround, it is possible to do a
-lossless conversion to JPEG XL and then view the resulting image:
-
-```
-  build/tools/cjxl --jpeg_transcode_disable_cfl hbd.jpeg hbd.jxl
-```
-
-The resulting JPEG XL file can be visualized, for example, in a browser,
-assuming that the corresponding flag is enabled in the settings.
-
-In particular, if the HBD JPEG has a PQ or HLG profile attached and the current
-display is an HDR display, Chrome ought to visualize the image as HDR content.
-
-It is also possible to convert the JPEG XL file back to a 16-bit PNG:
-
-```
-  build/tools/djxl hbd.jxl --bits_per_sample=16 output.png
-```
-
-Note however that as of today (2 Nov 2021) Chrome does not interpret such a PNG
-as an HDR image, even if a PQ or HLG profile is attached. Thus, to display the
-HDR image correctly it is recommended to either display the JPEG XL image
-directly or to convert the PNG to a format that Chrome interprets as HDR, such
-as AVIF. This can be done with the following command for a PQ image:
-
-```
-  avifenc -l -y 444 --depth 10 --cicp 9/16/9 image.png output.avif
-```
-
-and the following one for an HLG image:
-
-```
-  avifenc -l -y 444 --depth 10 --cicp 9/18/9 image.png output.avif
-```
index 7d04ce3..e34b75e 100644 (file)
 #include <string.h>
 
 #include <string>
-#include <vector>
+#include <utility>
 
 #include "lib/extras/dec/color_hints.h"
 #include "lib/jxl/base/override.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"  // DecoderHints
-#include "lib/jxl/gaborish.h"
-#include "lib/jxl/modular/options.h"
+#include "tools/file_io.h"
 
 namespace jpegxl {
 namespace tools {
@@ -54,8 +52,8 @@ static inline bool ParseFloatPair(const char* arg,
   return true;
 }
 
-static inline bool ParseAndAppendKeyValue(const char* arg,
-                                          jxl::extras::ColorHints* out) {
+template <typename Callback>
+static inline bool ParseAndAppendKeyValue(const char* arg, Callback* cb) {
   const char* eq = strchr(arg, '=');
   if (!eq) {
     fprintf(stderr, "Expected argument as 'key=value' but received '%s'\n",
@@ -63,26 +61,7 @@ static inline bool ParseAndAppendKeyValue(const char* arg,
     return false;
   }
   std::string key(arg, eq);
-  out->Add(key, std::string(eq + 1));
-  return true;
-}
-
-static inline bool ParsePredictor(const char* arg, jxl::Predictor* out) {
-  char* end;
-  uint64_t p = static_cast<uint64_t>(strtoull(arg, &end, 0));
-  if (end[0] != '\0') {
-    fprintf(stderr, "Invalid predictor: %s.\n", arg);
-    return JXL_FAILURE("Args");
-  }
-  if (p >= jxl::kNumModularEncoderPredictors) {
-    fprintf(stderr,
-            "Invalid predictor value %" PRIu64 ", must be less than %" PRIu64
-            ".\n",
-            p, static_cast<uint64_t>(jxl::kNumModularEncoderPredictors));
-    return JXL_FAILURE("Args");
-  }
-  *out = static_cast<jxl::Predictor>(p);
-  return true;
+  return (*cb)(key, std::string(eq + 1));
 }
 
 static inline bool ParseCString(const char* arg, const char** out) {
@@ -95,6 +74,28 @@ static inline bool IncrementUnsigned(size_t* out) {
   return true;
 }
 
+struct ColorHintsProxy {
+  jxl::extras::ColorHints target;
+  bool operator()(const std::string& key, const std::string& value) {
+    if (key == "icc_pathname") {
+      std::vector<uint8_t> icc;
+      JXL_RETURN_IF_ERROR(ReadFile(value, &icc));
+      const char* data = reinterpret_cast<const char*>(icc.data());
+      target.Add("icc", std::string(data, data + icc.size()));
+    } else if (key == "exif" || key == "xmp" || key == "jumbf") {
+      std::vector<uint8_t> metadata;
+      JXL_RETURN_IF_ERROR(ReadFile(value, &metadata));
+      const char* data = reinterpret_cast<const char*>(metadata.data());
+      target.Add(key, std::string(data, data + metadata.size()));
+    } else if (key == "strip") {
+      target.Add(value, "");
+    } else {
+      target.Add(key, value);
+    }
+    return true;
+  }
+};
+
 }  // namespace tools
 }  // namespace jpegxl
 
index 2bd3eb8..cc2504a 100644 (file)
 #include "lib/extras/dec/color_description.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
-#include "tools/benchmark/benchmark_codec_jpeg.h"  // for AddCommand..
+#include "tools/benchmark/benchmark_codec_custom.h"  // for AddCommand..
+#include "tools/benchmark/benchmark_codec_jpeg.h"    // for AddCommand..
 #include "tools/benchmark/benchmark_codec_jxl.h"
-#if JPEGXL_ENABLE_APNG
+
+#ifdef BENCHMARK_PNG
 #include "tools/benchmark/benchmark_codec_png.h"
-#endif
+#endif  // BENCHMARK_PNG
 
 #ifdef BENCHMARK_WEBP
 #include "tools/benchmark/benchmark_codec_webp.h"
@@ -31,7 +32,8 @@
 #include "tools/benchmark/benchmark_codec_avif.h"
 #endif  // BENCHMARK_AVIF
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 std::vector<std::string> SplitString(const std::string& s, char c) {
   std::vector<std::string> result;
@@ -128,6 +130,7 @@ Status BenchmarkArgs::AddCommandLineOptions() {
   AddDouble(&mul_output, "mul_output",
             "If nonzero, multiplies linear sRGB by this and clamps to 255",
             0.0);
+  AddFlag(&save_heatmap, "save_heatmap", "Saves the heatmap images.", true);
   AddDouble(&heatmap_good, "heatmap_good",
             "If greater than zero, use this as the good "
             "threshold for creating heatmap images.",
@@ -143,6 +146,11 @@ Status BenchmarkArgs::AddCommandLineOptions() {
           "Base64-encode the images in the HTML report rather than use "
           "external file names. May cause very large HTML data size.",
           false);
+  AddFlag(&html_report_use_decompressed, "html_report_use_decompressed",
+          "Show the compressed image as decompressed to --output_extension.",
+          true);
+  AddFlag(&html_report_add_heatmap, "html_report_add_heatmap",
+          "Add heatmaps to the image comparisons.", false);
 
   AddFlag(
       &markdown, "markdown",
@@ -186,13 +194,6 @@ Status BenchmarkArgs::AddCommandLineOptions() {
   AddDouble(&error_pnorm, "error_pnorm",
             "smallest p norm for pooling butteraugli values", 3.0);
 
-  AddFloat(&ba_params.hf_asymmetry, "hf_asymmetry",
-           "Multiplier for weighting HF artefacts more than features "
-           "being smoothed out. 1.0 means no HF asymmetry. 0.3 is "
-           "a good value to start exploring for asymmetry.",
-           0.8f);
-  AddFlag(&profiler, "profiler", "If true, print profiler results.", false);
-
   AddFlag(&show_progress, "show_progress",
           "Show activity dots per completed file during benchmark.", false);
 
@@ -210,13 +211,13 @@ Status BenchmarkArgs::AddCommandLineOptions() {
       "Distance numbers and compression speeds shown in the table are invalid.",
       false);
 
+  if (!AddCommandLineOptionsCustomCodec(this)) return false;
   if (!AddCommandLineOptionsJxlCodec(this)) return false;
-#ifdef BENCHMARK_JPEG
   if (!AddCommandLineOptionsJPEGCodec(this)) return false;
-#endif  // BENCHMARK_JPEG
-#if JPEGXL_ENABLE_APNG
+
+#ifdef BENCHMARK_PNG
   if (!AddCommandLineOptionsPNGCodec(this)) return false;
-#endif
+#endif  // BENCHMARK_PNG
 #ifdef BENCHMARK_WEBP
   if (!AddCommandLineOptionsWebPCodec(this)) return false;
 #endif  // BENCHMARK_WEBP
@@ -228,13 +229,12 @@ Status BenchmarkArgs::AddCommandLineOptions() {
 }
 
 Status BenchmarkArgs::ValidateArgs() {
-  size_t bits_per_sample = 0;  // unused
   if (input.empty()) {
     fprintf(stderr, "Missing --input filename(s).\n");
     return false;
   }
-  if (extras::CodecFromExtension(output_extension, &bits_per_sample) ==
-      extras::Codec::kUnknown) {
+  if (jxl::extras::CodecFromPath(output_extension) ==
+      jxl::extras::Codec::kUnknown) {
     JXL_WARNING("Unrecognized output_extension %s, try .png",
                 output_extension.c_str());
     return false;  // already warned
@@ -245,14 +245,13 @@ Status BenchmarkArgs::ValidateArgs() {
   if (!output_description.empty()) {
     // Validate, but also create the profile (only needs to happen once).
     JxlColorEncoding output_encoding_external;
-    if (!ParseDescription(output_description, &output_encoding_external)) {
+    if (!jxl::ParseDescription(output_description, &output_encoding_external)) {
       JXL_WARNING("Unrecognized output_description %s, try RGB_D65_SRG_Rel_Lin",
                   output_description.c_str());
       return false;  // already warned
     }
-    JXL_RETURN_IF_ERROR(jxl::ConvertExternalToInternalColorEncoding(
-        output_encoding_external, &output_encoding));
-    JXL_RETURN_IF_ERROR(output_encoding.CreateICC());
+    JXL_RETURN_IF_ERROR(output_encoding.FromExternal(output_encoding_external));
+    JXL_RETURN_IF_ERROR(!output_encoding.ICC().empty());
   }
 
   JXL_RETURN_IF_ERROR(ValidateArgsJxlCodec(this));
@@ -278,4 +277,5 @@ Status BenchmarkArgs::ValidateArgs() {
   return true;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index bebc0ac..bdc385c 100644 (file)
 #include "tools/args.h"
 #include "tools/cmdline.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::ColorEncoding;
+using ::jxl::Override;
+using ::jxl::Status;
 
 std::vector<std::string> SplitString(const std::string& s, char c);
 
@@ -108,7 +113,7 @@ struct BenchmarkArgs {
   bool silent_errors;
   bool save_compressed;
   bool save_decompressed;
-  std::string output_extension;    // see CodecFromExtension
+  std::string output_extension;    // see CodecFromPath
   std::string output_description;  // see ParseDescription
   ColorEncoding output_encoding;   // determined by output_description
 
@@ -126,8 +131,11 @@ struct BenchmarkArgs {
   double heatmap_good;
   double heatmap_bad;
 
+  bool save_heatmap;
   bool write_html_report;
   bool html_report_self_contained;
+  bool html_report_use_decompressed;
+  bool html_report_add_heatmap;
   bool markdown;
   bool more_columns;
 
@@ -143,9 +151,7 @@ struct BenchmarkArgs {
 
   int num_samples;
   int sample_dimensions;
-  ButteraugliParams ba_params;
 
-  bool profiler;
   double error_pnorm;
   bool show_progress;
 
@@ -169,6 +175,7 @@ struct BenchmarkArgs {
 // Returns singleton
 BenchmarkArgs* Args();
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_ARGS_H_
index 230665b..c788aef 100644 (file)
 
 #include "lib/extras/time.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
 #include "tools/benchmark/benchmark_args.h"
 #include "tools/benchmark/benchmark_codec_custom.h"
-#ifdef JPEGXL_ENABLE_JPEG
 #include "tools/benchmark/benchmark_codec_jpeg.h"
-#endif  // JPEG_ENABLE_JPEG
 #include "tools/benchmark/benchmark_codec_jxl.h"
-#include "tools/benchmark/benchmark_codec_png.h"
 #include "tools/benchmark/benchmark_stats.h"
 
+#ifdef BENCHMARK_PNG
+#include "tools/benchmark/benchmark_codec_png.h"
+#endif  // BENCHMARK_PNG
+
 #ifdef BENCHMARK_WEBP
 #include "tools/benchmark/benchmark_codec_webp.h"
 #endif  // BENCHMARK_WEBP
 #include "tools/benchmark/benchmark_codec_avif.h"
 #endif  // BENCHMARK_AVIF
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::Image3F;
 
 void ImageCodec::ParseParameters(const std::string& parameters) {
   params_ = parameters;
@@ -75,26 +76,8 @@ Status ImageCodec::ParseParam(const std::string& param) {
       return false;
     }
     butteraugli_target_ = butteraugli_target;
-
-    // full hf asymmetry at high distance
-    static const double kHighDistance = 2.5;
-
-    // no hf asymmetry at low distance
-    static const double kLowDistance = 0.6;
-
-    if (butteraugli_target_ >= kHighDistance) {
-      ba_params_.hf_asymmetry = args_.ba_params.hf_asymmetry;
-    } else if (butteraugli_target_ >= kLowDistance) {
-      float w =
-          (butteraugli_target_ - kLowDistance) / (kHighDistance - kLowDistance);
-      ba_params_.hf_asymmetry =
-          args_.ba_params.hf_asymmetry * w + 1.0f * (1.0f - w);
-    } else {
-      ba_params_.hf_asymmetry = 1.0f;
-    }
     return true;
   } else if (param[0] == 'r') {
-    ba_params_.hf_asymmetry = args_.ba_params.hf_asymmetry;
     bitrate_target_ = strtof(param.substr(1).c_str(), nullptr);
     return true;
   }
@@ -108,10 +91,9 @@ class NoneCodec : public ImageCodec {
   Status ParseParam(const std::string& param) override { return true; }
 
   Status Compress(const std::string& filename, const CodecInOut* io,
-                  ThreadPoolInternal* pool, std::vector<uint8_t>* compressed,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed,
                   jpegxl::tools::SpeedStats* speed_stats) override {
-    PROFILER_ZONE("NoneCompress");
-    const double start = Now();
+    const double start = jxl::Now();
     // Encode image size so we "decompress" something of the same size, as
     // required by butteraugli.
     const uint32_t xsize = io->xsize();
@@ -119,17 +101,16 @@ class NoneCodec : public ImageCodec {
     compressed->resize(8);
     memcpy(compressed->data(), &xsize, 4);
     memcpy(compressed->data() + 4, &ysize, 4);
-    const double end = Now();
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
     return true;
   }
 
   Status Decompress(const std::string& filename,
-                    const Span<const uint8_t> compressed,
-                    ThreadPoolInternal* pool, CodecInOut* io,
+                    const Span<const uint8_t> compressed, ThreadPool* pool,
+                    CodecInOut* io,
                     jpegxl::tools::SpeedStats* speed_stats) override {
-    PROFILER_ZONE("NoneDecompress");
-    const double start = Now();
+    const double start = jxl::Now();
     JXL_ASSERT(compressed.size() == 8);
     uint32_t xsize, ysize;
     memcpy(&xsize, compressed.data(), 4);
@@ -139,7 +120,7 @@ class NoneCodec : public ImageCodec {
     io->metadata.m.SetFloat32Samples();
     io->metadata.m.color_encoding = ColorEncoding::SRGB();
     io->SetFromImage(std::move(image), io->metadata.m.color_encoding);
-    const double end = Now();
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
     return true;
   }
@@ -162,14 +143,12 @@ ImageCodecPtr CreateImageCodec(const std::string& description) {
   } else if (name == "custom") {
     result.reset(CreateNewCustomCodec(*Args()));
 #endif
-#ifdef JPEGXL_ENABLE_JPEG
   } else if (name == "jpeg") {
     result.reset(CreateNewJPEGCodec(*Args()));
-#endif  // BENCHMARK_JPEG
-#if JPEGXL_ENABLE_APNG
+#ifdef BENCHMARK_PNG
   } else if (name == "png") {
     result.reset(CreateNewPNGCodec(*Args()));
-#endif
+#endif  // BENCHMARK_PNG
   } else if (name == "none") {
     result.reset(new NoneCodec(*Args()));
 #ifdef BENCHMARK_WEBP
@@ -180,7 +159,8 @@ ImageCodecPtr CreateImageCodec(const std::string& description) {
   } else if (name == "avif") {
     result.reset(CreateNewAvifCodec(*Args()));
 #endif  // BENCHMARK_AVIF
-  } else {
+  }
+  if (!result.get()) {
     JXL_ABORT("Unknown image codec: %s", name.c_str());
   }
   result->set_description(description);
@@ -188,4 +168,5 @@ ImageCodecPtr CreateImageCodec(const std::string& description) {
   return result;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index e554fc2..eb19c35 100644 (file)
 #include <string>
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/butteraugli/butteraugli.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/image.h"
 #include "tools/benchmark/benchmark_stats.h"
 #include "tools/cmdline.h"
 #include "tools/speed_stats.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::CodecInOut;
+using ::jxl::Span;
 
 // Thread-compatible.
 class ImageCodec {
@@ -43,36 +45,26 @@ class ImageCodec {
   void set_description(const std::string& desc) { description_ = desc; }
   const std::string& description() const { return description_; }
 
-  const ButteraugliParams& BaParams() const { return ba_params_; }
-
   virtual void ParseParameters(const std::string& parameters);
 
   virtual Status ParseParam(const std::string& param);
 
-  // Returns true iff the codec instance (including parameters) can tolerate
-  // ImageBundle c_current() != metadata()->color_encoding, and the possibility
-  // of negative (out of gamut) pixel values.
-  virtual bool IsColorAware() const { return false; }
-
-  // Returns true iff the codec instance (including parameters) will operate
-  // only with quantized DCT (JPEG) coefficients in input.
-  virtual bool IsJpegTranscoder() const { return false; }
-
   virtual Status Compress(const std::string& filename, const CodecInOut* io,
-                          ThreadPoolInternal* pool,
-                          std::vector<uint8_t>* compressed,
+                          ThreadPool* pool, std::vector<uint8_t>* compressed,
                           jpegxl::tools::SpeedStats* speed_stats) = 0;
 
   virtual Status Decompress(const std::string& filename,
                             const Span<const uint8_t> compressed,
-                            ThreadPoolInternal* pool, CodecInOut* io,
+                            ThreadPool* pool, CodecInOut* io,
                             jpegxl::tools::SpeedStats* speed_stats) = 0;
 
   virtual void GetMoreStats(BenchmarkStats* stats) {}
 
+  virtual bool IgnoreAlpha() const { return false; }
+
   virtual Status CanRecompressJpeg() const { return false; }
   virtual Status RecompressJpeg(const std::string& filename,
-                                const std::string& data,
+                                const std::vector<uint8_t>& data,
                                 std::vector<uint8_t>* compressed,
                                 jpegxl::tools::SpeedStats* speed_stats) {
     return false;
@@ -87,7 +79,6 @@ class ImageCodec {
   float butteraugli_target_;
   float q_target_;
   float bitrate_target_;
-  ButteraugliParams ba_params_;
   std::string error_message_;
 };
 
@@ -98,6 +89,7 @@ using ImageCodecPtr = std::unique_ptr<ImageCodec>;
 // then ParseParameters of the codec gets called with the part behind the colon.
 ImageCodecPtr CreateImageCodec(const std::string& description);
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_CODEC_H_
index fbe36b5..0ff1968 100644 (file)
@@ -5,15 +5,15 @@
 #include "tools/benchmark/benchmark_codec_avif.h"
 
 #include <avif/avif.h>
+#include <jxl/cms.h>
 
 #include "lib/extras/time.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/dec_external_image.h"
 #include "lib/jxl/enc_external_image.h"
 #include "tools/cmdline.h"
+#include "tools/thread_pool_internal.h"
 
 #define JXL_RETURN_IF_AVIF_ERROR(result)                                       \
   do {                                                                         \
     }                                                                          \
   } while (false)
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::Bytes;
+using ::jxl::CodecInOut;
+using ::jxl::IccBytes;
+using ::jxl::ImageBundle;
+using ::jxl::Primaries;
+using ::jxl::Span;
+using ::jxl::ThreadPool;
+using ::jxl::TransferFunction;
+using ::jxl::WhitePoint;
 
 namespace {
 
+size_t GetNumThreads(ThreadPool* pool) {
+  size_t result = 0;
+  const auto count_threads = [&](const size_t num_threads) {
+    result = num_threads;
+    return true;
+  };
+  const auto no_op = [&](const uint32_t /*task*/, size_t /*thread*/) {};
+  (void)jxl::RunOnPool(pool, 0, 1, count_threads, no_op, "Compress");
+  return result;
+}
+
 struct AvifArgs {
   avifPixelFormat chroma_subsampling = AVIF_PIXEL_FORMAT_YUV444;
 };
@@ -55,13 +77,13 @@ bool ParseChromaSubsampling(const char* arg, avifPixelFormat* subsampling) {
 }
 
 void SetUpAvifColor(const ColorEncoding& color, avifImage* const image) {
-  bool need_icc = color.white_point != WhitePoint::kD65;
+  bool need_icc = (color.GetWhitePointType() != WhitePoint::kD65);
 
   image->matrixCoefficients = AVIF_MATRIX_COEFFICIENTS_BT709;
   if (!color.HasPrimaries()) {
     need_icc = true;
   } else {
-    switch (color.primaries) {
+    switch (color.GetPrimariesType()) {
       case Primaries::kSRGB:
         image->colorPrimaries = AVIF_COLOR_PRIMARIES_BT709;
         break;
@@ -76,7 +98,7 @@ void SetUpAvifColor(const ColorEncoding& color, avifImage* const image) {
     }
   }
 
-  switch (color.tf.GetTransferFunction()) {
+  switch (color.Tf().GetTransferFunction()) {
     case TransferFunction::kSRGB:
       image->transferCharacteristics = AVIF_TRANSFER_CHARACTERISTICS_SRGB;
       break;
@@ -102,40 +124,41 @@ void SetUpAvifColor(const ColorEncoding& color, avifImage* const image) {
 
 Status ReadAvifColor(const avifImage* const image, ColorEncoding* const color) {
   if (image->icc.size != 0) {
-    PaddedBytes icc;
+    IccBytes icc;
     icc.assign(image->icc.data, image->icc.data + image->icc.size);
-    return color->SetICC(std::move(icc));
+    return color->SetICC(std::move(icc), JxlGetDefaultCms());
   }
 
-  color->white_point = WhitePoint::kD65;
+  JXL_RETURN_IF_ERROR(color->SetWhitePointType(WhitePoint::kD65));
   switch (image->colorPrimaries) {
     case AVIF_COLOR_PRIMARIES_BT709:
-      color->primaries = Primaries::kSRGB;
+      JXL_RETURN_IF_ERROR(color->SetPrimariesType(Primaries::kSRGB));
       break;
     case AVIF_COLOR_PRIMARIES_BT2020:
-      color->primaries = Primaries::k2100;
+      JXL_RETURN_IF_ERROR(color->SetPrimariesType(Primaries::k2100));
       break;
     default:
       return JXL_FAILURE("unsupported avif primaries");
   }
+  jxl::cms::CustomTransferFunction& tf = color->Tf();
   switch (image->transferCharacteristics) {
     case AVIF_TRANSFER_CHARACTERISTICS_BT470M:
-      JXL_RETURN_IF_ERROR(color->tf.SetGamma(2.2));
+      JXL_RETURN_IF_ERROR(tf.SetGamma(2.2));
       break;
     case AVIF_TRANSFER_CHARACTERISTICS_BT470BG:
-      JXL_RETURN_IF_ERROR(color->tf.SetGamma(2.8));
+      JXL_RETURN_IF_ERROR(tf.SetGamma(2.8));
       break;
     case AVIF_TRANSFER_CHARACTERISTICS_LINEAR:
-      color->tf.SetTransferFunction(TransferFunction::kLinear);
+      tf.SetTransferFunction(TransferFunction::kLinear);
       break;
     case AVIF_TRANSFER_CHARACTERISTICS_SRGB:
-      color->tf.SetTransferFunction(TransferFunction::kSRGB);
+      tf.SetTransferFunction(TransferFunction::kSRGB);
       break;
     case AVIF_TRANSFER_CHARACTERISTICS_SMPTE2084:
-      color->tf.SetTransferFunction(TransferFunction::kPQ);
+      tf.SetTransferFunction(TransferFunction::kPQ);
       break;
     case AVIF_TRANSFER_CHARACTERISTICS_HLG:
-      color->tf.SetTransferFunction(TransferFunction::kHLG);
+      tf.SetTransferFunction(TransferFunction::kHLG);
       break;
     default:
       return JXL_FAILURE("unsupported avif TRC");
@@ -213,10 +236,11 @@ class AvifCodec : public ImageCodec {
   }
 
   Status Compress(const std::string& filename, const CodecInOut* io,
-                  ThreadPoolInternal* pool, std::vector<uint8_t>* compressed,
-                  jpegxl::tools::SpeedStats* speed_stats) override {
+                  ThreadPool* pool, std::vector<uint8_t>* compressed,
+                  SpeedStats* speed_stats) override {
     double elapsed_convert_image = 0;
-    const double start = Now();
+    size_t max_threads = GetNumThreads(pool);
+    const double start = jxl::Now();
     {
       const auto depth =
           std::min<int>(16, io->metadata.m.bit_depth.bits_per_sample);
@@ -229,10 +253,15 @@ class AvifCodec : public ImageCodec {
       encoder->tileColsLog2 = log2_cols;
       encoder->tileRowsLog2 = log2_rows;
       encoder->speed = speed_;
-      encoder->maxThreads = pool->NumThreads();
+      encoder->maxThreads = max_threads;
       for (const auto& opts : codec_specific_options_) {
-        avifEncoderSetCodecSpecificOption(encoder.get(), opts.first.c_str(),
-                                          opts.second.c_str());
+#if AVIF_VERSION_MAJOR >= 1
+        JXL_RETURN_IF_AVIF_ERROR(avifEncoderSetCodecSpecificOption(
+            encoder.get(), opts.first.c_str(), opts.second.c_str()));
+#else
+        (void)avifEncoderSetCodecSpecificOption(
+            encoder.get(), opts.first.c_str(), opts.second.c_str());
+#endif
       }
       avifAddImageFlags add_image_flags = AVIF_ADD_IMAGE_FLAG_SINGLE;
       if (io->metadata.m.have_animation) {
@@ -258,14 +287,14 @@ class AvifCodec : public ImageCodec {
         avifRGBImageAllocatePixels(&rgb_image);
         std::unique_ptr<avifRGBImage, void (*)(avifRGBImage*)> pixels_freer(
             &rgb_image, &avifRGBImageFreePixels);
-        const double start_convert_image = Now();
+        const double start_convert_image = jxl::Now();
         JXL_RETURN_IF_ERROR(ConvertToExternal(
             ib, depth, /*float_out=*/false,
             /*num_channels=*/ib.HasAlpha() ? 4 : 3, JXL_NATIVE_ENDIAN,
             /*stride=*/rgb_image.rowBytes, pool, rgb_image.pixels,
             rgb_image.rowBytes * rgb_image.height,
             /*out_callback=*/{}, jxl::Orientation::kIdentity));
-        const double end_convert_image = Now();
+        const double end_convert_image = jxl::Now();
         elapsed_convert_image += end_convert_image - start_convert_image;
         JXL_RETURN_IF_AVIF_ERROR(avifImageRGBToYUV(image.get(), &rgb_image));
         JXL_RETURN_IF_AVIF_ERROR(avifEncoderAddImage(
@@ -276,24 +305,23 @@ class AvifCodec : public ImageCodec {
       compressed->assign(buffer.data, buffer.data + buffer.size);
       avifRWDataFree(&buffer);
     }
-    const double end = Now();
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start - elapsed_convert_image);
     return true;
   }
 
   Status Decompress(const std::string& filename,
-                    const Span<const uint8_t> compressed,
-                    ThreadPoolInternal* pool, CodecInOut* io,
-                    jpegxl::tools::SpeedStats* speed_stats) override {
+                    const Span<const uint8_t> compressed, ThreadPool* pool,
+                    CodecInOut* io, SpeedStats* speed_stats) override {
     io->frames.clear();
-    io->dec_pixels = 0;
+    size_t max_threads = GetNumThreads(pool);
     double elapsed_convert_image = 0;
-    const double start = Now();
+    const double start = jxl::Now();
     {
       std::unique_ptr<avifDecoder, void (*)(avifDecoder*)> decoder(
           avifDecoderCreate(), &avifDecoderDestroy);
       decoder->codecChoice = decoder_;
-      decoder->maxThreads = pool->NumThreads();
+      decoder->maxThreads = max_threads;
       JXL_RETURN_IF_AVIF_ERROR(avifDecoderSetIOMemory(
           decoder.get(), compressed.data(), compressed.size()));
       JXL_RETURN_IF_AVIF_ERROR(avifDecoderParse(decoder.get()));
@@ -316,27 +344,27 @@ class AvifCodec : public ImageCodec {
         std::unique_ptr<avifRGBImage, void (*)(avifRGBImage*)> pixels_freer(
             &rgb_image, &avifRGBImageFreePixels);
         JXL_RETURN_IF_AVIF_ERROR(avifImageYUVToRGB(decoder->image, &rgb_image));
-        const double start_convert_image = Now();
+        const double start_convert_image = jxl::Now();
         {
+          JxlPixelFormat format = {
+              (has_alpha ? 4u : 3u),
+              (rgb_image.depth <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16),
+              JXL_NATIVE_ENDIAN, 0};
           ImageBundle ib(&io->metadata.m);
           JXL_RETURN_IF_ERROR(ConvertFromExternal(
-              Span<const uint8_t>(rgb_image.pixels,
-                                  rgb_image.height * rgb_image.rowBytes),
-              rgb_image.width, rgb_image.height, color, (has_alpha ? 4 : 3),
-              /*alpha_is_premultiplied=*/false, rgb_image.depth,
-              JXL_NATIVE_ENDIAN, pool, &ib,
-              /*float_in=*/false, /*align=*/0));
+              Bytes(rgb_image.pixels, rgb_image.height * rgb_image.rowBytes),
+              rgb_image.width, rgb_image.height, color, rgb_image.depth, format,
+              pool, &ib));
           io->frames.push_back(std::move(ib));
-          io->dec_pixels += rgb_image.width * rgb_image.height;
         }
-        const double end_convert_image = Now();
+        const double end_convert_image = jxl::Now();
         elapsed_convert_image += end_convert_image - start_convert_image;
       }
       if (next_image != AVIF_RESULT_NO_IMAGES_REMAINING) {
         JXL_RETURN_IF_AVIF_ERROR(next_image);
       }
     }
-    const double end = Now();
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start - elapsed_convert_image);
     return true;
   }
@@ -355,4 +383,5 @@ ImageCodec* CreateNewAvifCodec(const BenchmarkArgs& args) {
   return new AvifCodec(args);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index b3dc38e..c3816cf 100644 (file)
 #include "tools/benchmark/benchmark_args.h"
 #include "tools/benchmark/benchmark_codec.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 ImageCodec* CreateNewAvifCodec(const BenchmarkArgs& args);
 
 // Registers the avif-specific command line options.
 Status AddCommandLineOptionsAvifCodec(BenchmarkArgs* args);
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_CODEC_AVIF_H_
index eefae6e..87fc04c 100644 (file)
 #include <fstream>
 
 #include "lib/extras/codec.h"
+#include "lib/extras/dec/color_description.h"
 #include "lib/extras/enc/apng.h"
 #include "lib/extras/time.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/image_bundle.h"
 #include "tools/benchmark/benchmark_utils.h"
+#include "tools/file_io.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
-namespace {
+namespace jpegxl {
+namespace tools {
 
-std::string GetBaseName(std::string filename) {
-  std::string result = std::move(filename);
-  result = basename(&result[0]);
-  const size_t dot = result.rfind('.');
-  if (dot != std::string::npos) {
-    result.resize(dot);
-  }
-  return result;
+struct CustomCodecArgs {
+  std::string extension;
+  std::string colorspace;
+  bool quiet;
+};
+
+static CustomCodecArgs* const custom_args = new CustomCodecArgs;
+
+Status AddCommandLineOptionsCustomCodec(BenchmarkArgs* args) {
+  args->AddString(
+      &custom_args->extension, "custom_codec_extension",
+      "Converts input and output of codec to this file type (default: png).",
+      "png");
+  args->AddString(
+      &custom_args->colorspace, "custom_codec_colorspace",
+      "If not empty, converts input and output of codec to this colorspace.",
+      "");
+  args->AddFlag(&custom_args->quiet, "custom_codec_quiet",
+                "Whether stdin and stdout of custom codec should be shown.",
+                false);
+  return true;
 }
 
+namespace {
+
 // This uses `output_filename` to determine the name of the corresponding
 // `.time` file.
 template <typename F>
 Status ReportCodecRunningTime(F&& function, std::string output_filename,
                               jpegxl::tools::SpeedStats* const speed_stats) {
-  const double start = Now();
+  const double start = jxl::Now();
   JXL_RETURN_IF_ERROR(function());
-  const double end = Now();
+  const double end = jxl::Now();
   const std::string time_filename =
       GetBaseName(std::move(output_filename)) + ".time";
   std::ifstream time_stream(time_filename);
@@ -64,21 +80,36 @@ class CustomCodec : public ImageCodec {
   explicit CustomCodec(const BenchmarkArgs& args) : ImageCodec(args) {}
 
   Status ParseParam(const std::string& param) override {
+    if (param_index_ == 0) {
+      description_ = "";
+    }
     switch (param_index_) {
       case 0:
         extension_ = param;
+        description_ += param;
         break;
-
       case 1:
         compress_command_ = param;
+        description_ += std::string(":");
+        if (param.find_last_of('/') < param.size()) {
+          description_ += param.substr(param.find_last_of('/') + 1);
+        } else {
+          description_ += param;
+        }
         break;
-
       case 2:
         decompress_command_ = param;
         break;
-
       default:
         compress_args_.push_back(param);
+        description_ += std::string(":");
+        if (param.size() > 2 && param[0] == '-' && param[1] == '-') {
+          description_ += param.substr(2);
+        } else if (param.size() > 2 && param[0] == '-') {
+          description_ += param.substr(1);
+        } else {
+          description_ += param;
+        }
         break;
     }
     ++param_index_;
@@ -86,49 +117,68 @@ class CustomCodec : public ImageCodec {
   }
 
   Status Compress(const std::string& filename, const CodecInOut* io,
-                  ThreadPoolInternal* pool, std::vector<uint8_t>* compressed,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed,
                   jpegxl::tools::SpeedStats* speed_stats) override {
     JXL_RETURN_IF_ERROR(param_index_ > 2);
 
     const std::string basename = GetBaseName(filename);
-    TemporaryFile png_file(basename, "png"), encoded_file(basename, extension_);
-    std::string png_filename, encoded_filename;
-    JXL_RETURN_IF_ERROR(png_file.GetFileName(&png_filename));
+    TemporaryFile in_file(basename, custom_args->extension);
+    TemporaryFile encoded_file(basename, extension_);
+    std::string in_filename, encoded_filename;
+    JXL_RETURN_IF_ERROR(in_file.GetFileName(&in_filename));
     JXL_RETURN_IF_ERROR(encoded_file.GetFileName(&encoded_filename));
     saved_intensity_target_ = io->metadata.m.IntensityTarget();
 
     const size_t bits = io->metadata.m.bit_depth.bits_per_sample;
-    JXL_RETURN_IF_ERROR(
-        EncodeToFile(*io, io->Main().c_current(), bits, png_filename, pool));
+    ColorEncoding c_enc = io->Main().c_current();
+    if (!custom_args->colorspace.empty()) {
+      JxlColorEncoding colorspace;
+      JXL_RETURN_IF_ERROR(
+          jxl::ParseDescription(custom_args->colorspace, &colorspace));
+      JXL_RETURN_IF_ERROR(c_enc.FromExternal(colorspace));
+    }
+    std::vector<uint8_t> encoded;
+    JXL_RETURN_IF_ERROR(Encode(*io, c_enc, bits, in_filename, &encoded, pool));
+    JXL_RETURN_IF_ERROR(WriteFile(in_filename, encoded));
     std::vector<std::string> arguments = compress_args_;
-    arguments.push_back(png_filename);
+    arguments.push_back(in_filename);
     arguments.push_back(encoded_filename);
     JXL_RETURN_IF_ERROR(ReportCodecRunningTime(
-        [&, this] { return RunCommand(compress_command_, arguments); },
+        [&, this] {
+          return RunCommand(compress_command_, arguments, custom_args->quiet);
+        },
         encoded_filename, speed_stats));
     return ReadFile(encoded_filename, compressed);
   }
 
   Status Decompress(const std::string& filename,
-                    const Span<const uint8_t> compressed,
-                    ThreadPoolInternal* pool, CodecInOut* io,
+                    const Span<const uint8_t> compressed, ThreadPool* pool,
+                    CodecInOut* io,
                     jpegxl::tools::SpeedStats* speed_stats) override {
     const std::string basename = GetBaseName(filename);
-    TemporaryFile encoded_file(basename, extension_), png_file(basename, "png");
-    std::string encoded_filename, png_filename;
+    TemporaryFile encoded_file(basename, extension_);
+    TemporaryFile out_file(basename, custom_args->extension);
+    std::string encoded_filename, out_filename;
     JXL_RETURN_IF_ERROR(encoded_file.GetFileName(&encoded_filename));
-    JXL_RETURN_IF_ERROR(png_file.GetFileName(&png_filename));
+    JXL_RETURN_IF_ERROR(out_file.GetFileName(&out_filename));
 
-    JXL_RETURN_IF_ERROR(WriteFile(compressed, encoded_filename));
+    JXL_RETURN_IF_ERROR(WriteFile(encoded_filename, compressed));
     JXL_RETURN_IF_ERROR(ReportCodecRunningTime(
         [&, this] {
           return RunCommand(
               decompress_command_,
-              std::vector<std::string>{encoded_filename, png_filename});
+              std::vector<std::string>{encoded_filename, out_filename},
+              custom_args->quiet);
         },
-        png_filename, speed_stats));
+        out_filename, speed_stats));
+    jxl::extras::ColorHints hints;
+    if (!custom_args->colorspace.empty()) {
+      hints.Add("color_space", custom_args->colorspace);
+    }
+    std::vector<uint8_t> encoded;
+    JXL_RETURN_IF_ERROR(ReadFile(out_filename, &encoded));
     JXL_RETURN_IF_ERROR(
-        SetFromFile(png_filename, extras::ColorHints(), io, pool));
+        jxl::SetFromBytes(jxl::Bytes(encoded), hints, io, pool));
     io->metadata.m.SetIntensityTarget(saved_intensity_target_);
     return true;
   }
@@ -148,14 +198,18 @@ ImageCodec* CreateNewCustomCodec(const BenchmarkArgs& args) {
   return new CustomCodec(args);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #else
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 ImageCodec* CreateNewCustomCodec(const BenchmarkArgs& args) { return nullptr; }
+Status AddCommandLineOptionsCustomCodec(BenchmarkArgs* args) { return true; }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // _MSC_VER
index b2711cd..6e3d017 100644 (file)
 #include "tools/benchmark/benchmark_args.h"
 #include "tools/benchmark/benchmark_codec.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 ImageCodec* CreateNewCustomCodec(const BenchmarkArgs& args);
+Status AddCommandLineOptionsCustomCodec(BenchmarkArgs* args);
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_CODEC_CUSTOM_H_
index ae3215a..ae6abae 100644 (file)
 #include <numeric>  // partial_sum
 #include <string>
 
+#if JPEGXL_ENABLE_JPEGLI
+#include "lib/extras/dec/jpegli.h"
+#endif
 #include "lib/extras/dec/jpg.h"
+#if JPEGXL_ENABLE_JPEGLI
+#include "lib/extras/enc/jpegli.h"
+#endif
 #include "lib/extras/enc/jpg.h"
 #include "lib/extras/packed_image.h"
 #include "lib/extras/packed_image_convert.h"
 #include "lib/extras/time.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/image_bundle.h"
+#include "tools/benchmark/benchmark_utils.h"
 #include "tools/cmdline.h"
+#include "tools/file_io.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
-
-namespace {
+namespace jpegxl {
+namespace tools {
 
 struct JPEGArgs {
-  std::string jpeg_encoder = "libjpeg";
-  std::string chroma_subsampling = "444";
+  std::string base_quant_fn;
+  float search_q_start;
+  float search_q_min;
+  float search_q_max;
+  float search_d_min;
+  float search_d_max;
+  int search_max_iters;
+  float search_tolerance;
+  float search_q_precision;
+  float search_first_iter_slope;
 };
 
-JPEGArgs* const jpegargs = new JPEGArgs;
+static JPEGArgs* const jpegargs = new JPEGArgs;
 
-}  // namespace
+#define SET_ENCODER_ARG(name)                                  \
+  if (jpegargs->name > 0) {                                    \
+    encoder->SetOption(#name, std::to_string(jpegargs->name)); \
+  }
 
 Status AddCommandLineOptionsJPEGCodec(BenchmarkArgs* args) {
-  args->cmdline.AddOptionValue(
-      '\0', "chroma_subsampling", "444/422/420/411",
-      "default JPEG chroma subsampling (default: 444).",
-      &jpegargs->chroma_subsampling, &jpegxl::tools::ParseString);
+  args->AddString(&jpegargs->base_quant_fn, "qtables",
+                  "Custom base quantization tables.");
+  args->AddFloat(&jpegargs->search_q_start, "search_q_start",
+                 "Starting quality for quality-to-target search", 0.0f);
+  args->AddFloat(&jpegargs->search_q_min, "search_q_min",
+                 "Minimum quality for quality-to-target search", 0.0f);
+  args->AddFloat(&jpegargs->search_q_max, "search_q_max",
+                 "Maximum quality for quality-to-target search", 0.0f);
+  args->AddFloat(&jpegargs->search_d_min, "search_d_min",
+                 "Minimum distance for quality-to-target search", 0.0f);
+  args->AddFloat(&jpegargs->search_d_max, "search_d_max",
+                 "Maximum distance for quality-to-target search", 0.0f);
+  args->AddFloat(&jpegargs->search_tolerance, "search_tolerance",
+                 "Percentage value, if quality-to-target search result "
+                 "relative error is within this, search stops.",
+                 0.0f);
+  args->AddFloat(&jpegargs->search_q_precision, "search_q_precision",
+                 "If last quality change in quality-to-target search is "
+                 "within this value, search stops.",
+                 0.0f);
+  args->AddFloat(&jpegargs->search_first_iter_slope, "search_first_iter_slope",
+                 "Slope of first extrapolation step in quality-to-target "
+                 "search.",
+                 0.0f);
+  args->AddSigned(&jpegargs->search_max_iters, "search_max_iters",
+                  "Maximum search steps in quality-to-target search.", 0);
   return true;
 }
 
 class JPEGCodec : public ImageCodec {
  public:
-  explicit JPEGCodec(const BenchmarkArgs& args) : ImageCodec(args) {
-    jpeg_encoder_ = jpegargs->jpeg_encoder;
-    chroma_subsampling_ = jpegargs->chroma_subsampling;
-  }
+  explicit JPEGCodec(const BenchmarkArgs& args) : ImageCodec(args) {}
 
   Status ParseParam(const std::string& param) override {
+    if (param[0] == 'q' && ImageCodec::ParseParam(param)) {
+      enc_quality_set_ = true;
+      return true;
+    }
     if (ImageCodec::ParseParam(param)) {
       return true;
     }
-    if (param == "sjpeg") {
+    if (param == "sjpeg" || param.find("cjpeg") != std::string::npos) {
       jpeg_encoder_ = param;
       return true;
     }
+#if JPEGXL_ENABLE_JPEGLI
+    if (param == "enc-jpegli") {
+      jpeg_encoder_ = "jpegli";
+      return true;
+    }
+#endif
     if (param.compare(0, 3, "yuv") == 0) {
-      if (param.size() != 6) return false;
       chroma_subsampling_ = param.substr(3);
       return true;
     }
+    if (param.compare(0, 4, "psnr") == 0) {
+      psnr_target_ = std::stof(param.substr(4));
+      return true;
+    }
+    if (param[0] == 'p') {
+      progressive_id_ = strtol(param.substr(1).c_str(), nullptr, 10);
+      return true;
+    }
+    if (param == "fix") {
+      fix_codes_ = true;
+      return true;
+    }
+    if (param[0] == 'Q') {
+      libjpeg_quality_ = strtol(param.substr(1).c_str(), nullptr, 10);
+      return true;
+    }
+    if (param.compare(0, 3, "YUV") == 0) {
+      if (param.size() != 6) return false;
+      libjpeg_chroma_subsampling_ = param.substr(3);
+      return true;
+    }
+    if (param == "noaq") {
+      enable_adaptive_quant_ = false;
+      return true;
+    }
+#if JPEGXL_ENABLE_JPEGLI
+    if (param == "xyb") {
+      xyb_mode_ = true;
+      return true;
+    }
+    if (param == "std") {
+      use_std_tables_ = true;
+      return true;
+    }
+    if (param == "dec-jpegli") {
+      jpeg_decoder_ = "jpegli";
+      return true;
+    }
+    if (param.substr(0, 2) == "bd") {
+      bitdepth_ = strtol(param.substr(2).c_str(), nullptr, 10);
+      return true;
+    }
+    if (param.substr(0, 6) == "cquant") {
+      num_colors_ = strtol(param.substr(6).c_str(), nullptr, 10);
+      return true;
+    }
+#endif
     return false;
   }
 
+  bool IgnoreAlpha() const override { return true; }
+
   Status Compress(const std::string& filename, const CodecInOut* io,
-                  ThreadPoolInternal* pool, std::vector<uint8_t>* compressed,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed,
                   jpegxl::tools::SpeedStats* speed_stats) override {
-    extras::PackedPixelFile ppf;
-    JxlPixelFormat format = {0, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
+    if (jpeg_encoder_.find("cjpeg") != std::string::npos) {
+// Not supported on Windows due to Linux-specific functions.
+// Not supported in Android NDK before API 28.
+#if !defined(_WIN32) && !defined(__EMSCRIPTEN__) && \
+    (!defined(__ANDROID_API__) || __ANDROID_API__ >= 28)
+      const std::string basename = GetBaseName(filename);
+      TemporaryFile in_file(basename, "pnm");
+      TemporaryFile encoded_file(basename, "jpg");
+      std::string in_filename, encoded_filename;
+      JXL_RETURN_IF_ERROR(in_file.GetFileName(&in_filename));
+      JXL_RETURN_IF_ERROR(encoded_file.GetFileName(&encoded_filename));
+      const size_t bits = io->metadata.m.bit_depth.bits_per_sample;
+      ColorEncoding c_enc = io->Main().c_current();
+      std::vector<uint8_t> encoded;
+      JXL_RETURN_IF_ERROR(
+          Encode(*io, c_enc, bits, in_filename, &encoded, pool));
+      JXL_RETURN_IF_ERROR(WriteFile(in_filename, encoded));
+      std::string compress_command = jpeg_encoder_;
+      std::vector<std::string> arguments;
+      arguments.push_back("-outfile");
+      arguments.push_back(encoded_filename);
+      arguments.push_back("-quality");
+      arguments.push_back(std::to_string(static_cast<int>(q_target_)));
+      arguments.push_back("-sample");
+      if (chroma_subsampling_ == "444") {
+        arguments.push_back("1x1");
+      } else if (chroma_subsampling_ == "420") {
+        arguments.push_back("2x2");
+      } else if (!chroma_subsampling_.empty()) {
+        return JXL_FAILURE("Unsupported chroma subsampling");
+      }
+      arguments.push_back("-optimize");
+      arguments.push_back(in_filename);
+      const double start = jxl::Now();
+      JXL_RETURN_IF_ERROR(RunCommand(compress_command, arguments, false));
+      const double end = jxl::Now();
+      speed_stats->NotifyElapsed(end - start);
+      return ReadFile(encoded_filename, compressed);
+#else
+      return JXL_FAILURE("Not supported on this build");
+#endif
+    }
+
+    jxl::extras::PackedPixelFile ppf;
+    size_t bits_per_sample = io->metadata.m.bit_depth.bits_per_sample;
+    JxlPixelFormat format = {
+        0,  // num_channels is ignored by the converter
+        bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16, JXL_BIG_ENDIAN,
+        0};
     JXL_RETURN_IF_ERROR(ConvertCodecInOutToPackedPixelFile(
         *io, format, io->metadata.m.color_encoding, pool, &ppf));
-    extras::EncodedImage encoded;
-    std::unique_ptr<extras::Encoder> encoder = extras::GetJPEGEncoder();
-    std::ostringstream os;
-    os << static_cast<int>(std::round(q_target_));
-    encoder->SetOption("q", os.str());
-    encoder->SetOption("jpeg_encoder", jpeg_encoder_);
-    encoder->SetOption("chroma_subsampling", chroma_subsampling_);
-    const double start = Now();
-    JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded, pool));
-    const double end = Now();
-    *compressed = encoded.bitstreams.back();
-    speed_stats->NotifyElapsed(end - start);
+    double elapsed = 0.0;
+    if (jpeg_encoder_ == "jpegli") {
+#if JPEGXL_ENABLE_JPEGLI
+      jxl::extras::JpegSettings settings;
+      settings.xyb = xyb_mode_;
+      if (!xyb_mode_) {
+        settings.use_std_quant_tables = use_std_tables_;
+      }
+      if (enc_quality_set_) {
+        settings.quality = q_target_;
+      } else {
+        settings.distance = butteraugli_target_;
+      }
+      if (progressive_id_ >= 0) {
+        settings.progressive_level = progressive_id_;
+      }
+      if (psnr_target_ > 0) {
+        settings.psnr_target = psnr_target_;
+      }
+      if (jpegargs->search_tolerance > 0) {
+        settings.search_tolerance = 0.01f * jpegargs->search_tolerance;
+      }
+      if (jpegargs->search_d_min > 0) {
+        settings.min_distance = jpegargs->search_d_min;
+      }
+      if (jpegargs->search_d_max > 0) {
+        settings.max_distance = jpegargs->search_d_max;
+      }
+      settings.chroma_subsampling = chroma_subsampling_;
+      settings.use_adaptive_quantization = enable_adaptive_quant_;
+      settings.libjpeg_quality = libjpeg_quality_;
+      settings.libjpeg_chroma_subsampling = libjpeg_chroma_subsampling_;
+      settings.optimize_coding = !fix_codes_;
+      const double start = jxl::Now();
+      JXL_RETURN_IF_ERROR(
+          jxl::extras::EncodeJpeg(ppf, settings, pool, compressed));
+      const double end = jxl::Now();
+      elapsed = end - start;
+#endif
+    } else {
+      jxl::extras::EncodedImage encoded;
+      std::unique_ptr<jxl::extras::Encoder> encoder =
+          jxl::extras::GetJPEGEncoder();
+      if (!encoder) {
+        fprintf(stderr, "libjpeg codec is not supported\n");
+        return false;
+      }
+      std::ostringstream os;
+      os << static_cast<int>(std::round(q_target_));
+      encoder->SetOption("q", os.str());
+      encoder->SetOption("jpeg_encoder", jpeg_encoder_);
+      if (!chroma_subsampling_.empty()) {
+        encoder->SetOption("chroma_subsampling", chroma_subsampling_);
+      }
+      if (progressive_id_ >= 0) {
+        encoder->SetOption("progressive", std::to_string(progressive_id_));
+      }
+      if (libjpeg_quality_ > 0) {
+        encoder->SetOption("libjpeg_quality", std::to_string(libjpeg_quality_));
+      }
+      if (!libjpeg_chroma_subsampling_.empty()) {
+        encoder->SetOption("libjpeg_chroma_subsampling",
+                           libjpeg_chroma_subsampling_);
+      }
+      if (fix_codes_) {
+        encoder->SetOption("optimize", "OFF");
+      }
+      if (!enable_adaptive_quant_) {
+        encoder->SetOption("adaptive_q", "OFF");
+      }
+      if (psnr_target_ > 0) {
+        encoder->SetOption("psnr", std::to_string(psnr_target_));
+      }
+      if (!jpegargs->base_quant_fn.empty()) {
+        encoder->SetOption("base_quant_fn", jpegargs->base_quant_fn);
+      }
+      SET_ENCODER_ARG(search_q_start);
+      SET_ENCODER_ARG(search_q_min);
+      SET_ENCODER_ARG(search_q_max);
+      SET_ENCODER_ARG(search_q_precision);
+      SET_ENCODER_ARG(search_tolerance);
+      SET_ENCODER_ARG(search_first_iter_slope);
+      SET_ENCODER_ARG(search_max_iters);
+      const double start = jxl::Now();
+      JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded, pool));
+      const double end = jxl::Now();
+      elapsed = end - start;
+      *compressed = encoded.bitstreams.back();
+    }
+    speed_stats->NotifyElapsed(elapsed);
     return true;
   }
 
   Status Decompress(const std::string& filename,
-                    const Span<const uint8_t> compressed,
-                    ThreadPoolInternal* pool, CodecInOut* io,
+                    const Span<const uint8_t> compressed, ThreadPool* pool,
+                    CodecInOut* io,
                     jpegxl::tools::SpeedStats* speed_stats) override {
-    extras::PackedPixelFile ppf;
-    const double start = Now();
-    JXL_RETURN_IF_ERROR(DecodeImageJPG(compressed, extras::ColorHints(),
-                                       SizeConstraints(), &ppf));
-    const double end = Now();
-    speed_stats->NotifyElapsed(end - start);
-    JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
+    jxl::extras::PackedPixelFile ppf;
+    if (jpeg_decoder_ == "jpegli") {
+#if JPEGXL_ENABLE_JPEGLI
+      std::vector<uint8_t> jpeg_bytes(compressed.data(),
+                                      compressed.data() + compressed.size());
+      const double start = jxl::Now();
+      jxl::extras::JpegDecompressParams dparams;
+      dparams.output_data_type =
+          bitdepth_ > 8 ? JXL_TYPE_UINT16 : JXL_TYPE_UINT8;
+      dparams.num_colors = num_colors_;
+      JXL_RETURN_IF_ERROR(
+          jxl::extras::DecodeJpeg(jpeg_bytes, dparams, pool, &ppf));
+      const double end = jxl::Now();
+      speed_stats->NotifyElapsed(end - start);
+#endif
+    } else {
+      const double start = jxl::Now();
+      jxl::extras::JPGDecompressParams dparams;
+      dparams.num_colors = num_colors_;
+      JXL_RETURN_IF_ERROR(
+          jxl::extras::DecodeImageJPG(compressed, jxl::extras::ColorHints(),
+                                      &ppf, /*constraints=*/nullptr, &dparams));
+      const double end = jxl::Now();
+      speed_stats->NotifyElapsed(end - start);
+    }
+    JXL_RETURN_IF_ERROR(
+        jxl::extras::ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
     return true;
   }
 
  protected:
-  std::string jpeg_encoder_;
+  // JPEG encoder and its parameters
+  std::string jpeg_encoder_ = "libjpeg";
   std::string chroma_subsampling_;
+  int progressive_id_ = -1;
+  bool fix_codes_ = false;
+  float psnr_target_ = 0.0f;
+  bool enc_quality_set_ = false;
+  int libjpeg_quality_ = 0;
+  std::string libjpeg_chroma_subsampling_;
+#if JPEGXL_ENABLE_JPEGLI
+  bool xyb_mode_ = false;
+  bool use_std_tables_ = false;
+#endif
+  bool enable_adaptive_quant_ = true;
+  // JPEG decoder and its parameters
+  std::string jpeg_decoder_ = "libjpeg";
+  int num_colors_ = 0;
+#if JPEGXL_ENABLE_JPEGLI
+  size_t bitdepth_ = 8;
+#endif
 };
 
 ImageCodec* CreateNewJPEGCodec(const BenchmarkArgs& args) {
   return new JPEGCodec(args);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index cd4b009..d9f0c35 100644 (file)
 #include "tools/benchmark/benchmark_args.h"
 #include "tools/benchmark/benchmark_codec.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 ImageCodec* CreateNewJPEGCodec(const BenchmarkArgs& args);
 
 // Registers the jpeg-specific command line options.
 Status AddCommandLineOptionsJPEGCodec(BenchmarkArgs* args);
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_CODEC_JPEG_H_
index 6557858..554115a 100644 (file)
@@ -4,6 +4,9 @@
 // license that can be found in the LICENSE file.
 #include "tools/benchmark/benchmark_codec_jxl.h"
 
+#include <jxl/stats.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+
 #include <cstdint>
 #include <cstdlib>
 #include <functional>
 #include <utility>
 #include <vector>
 
-#include "jxl/thread_parallel_runner_cxx.h"
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/jxl.h"
-#if JPEGXL_ENABLE_JPEG
+#include "lib/extras/enc/apng.h"
+#include "lib/extras/enc/encode.h"
 #include "lib/extras/enc/jpg.h"
-#endif
+#include "lib/extras/enc/jxl.h"
 #include "lib/extras/packed_image_convert.h"
 #include "lib/extras/time.h"
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_external_image.h"
-#include "lib/jxl/enc_file.h"
-#include "lib/jxl/enc_params.h"
-#include "lib/jxl/image_bundle.h"
-#include "lib/jxl/image_metadata.h"
-#include "lib/jxl/modular/encoding/encoding.h"
 #include "tools/benchmark/benchmark_file_io.h"
 #include "tools/benchmark/benchmark_stats.h"
 #include "tools/cmdline.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
-// Output function for EncodeBrunsli.
-size_t OutputToBytes(void* data, const uint8_t* buf, size_t count) {
-  PaddedBytes* output = reinterpret_cast<PaddedBytes*>(data);
-  output->append(buf, buf + count);
-  return count;
-}
+using ::jxl::Image3F;
+using ::jxl::extras::EncodedImage;
+using ::jxl::extras::Encoder;
+using ::jxl::extras::JXLCompressParams;
+using ::jxl::extras::JXLDecompressParams;
+using ::jxl::extras::PackedFrame;
+using ::jxl::extras::PackedPixelFile;
 
 struct JxlArgs {
-  double xmul;
-  double quant_bias;
-
-  bool use_ac_strategy;
   bool qprogressive;  // progressive with shift-quantization.
   bool progressive;
   int progressive_dc;
@@ -60,20 +51,12 @@ struct JxlArgs {
   Override dots;
   Override patches;
 
-  bool log_search_state;
   std::string debug_image_dir;
 };
 
 static JxlArgs* const jxlargs = new JxlArgs;
 
 Status AddCommandLineOptionsJxlCodec(BenchmarkArgs* args) {
-  args->AddDouble(&jxlargs->xmul, "xmul",
-                  "Multiplier for the difference in X channel in Butteraugli.",
-                  1.0);
-  args->AddDouble(&jxlargs->quant_bias, "quant_bias",
-                  "Bias border pixels during quantization by this ratio.", 0.0);
-  args->AddFlag(&jxlargs->use_ac_strategy, "use_ac_strategy",
-                "If true, AC strategy will be used.", false);
   args->AddFlag(&jxlargs->qprogressive, "qprogressive",
                 "Enable quantized progressive mode for AC.", false);
   args->AddFlag(&jxlargs->progressive, "progressive",
@@ -88,9 +71,6 @@ Status AddCommandLineOptionsJxlCodec(BenchmarkArgs* args) {
   args->AddOverride(&jxlargs->patches, "patches",
                     "Enable(1)/disable(0) patch dictionary.");
 
-  args->AddFlag(&jxlargs->log_search_state, "log_search_state",
-                "Print out debug info for tortoise mode AQ loop.", false);
-
   args->AddString(
       &jxlargs->debug_image_dir, "debug_image_dir",
       "If not empty, saves debug images for each "
@@ -101,37 +81,76 @@ Status AddCommandLineOptionsJxlCodec(BenchmarkArgs* args) {
 
 Status ValidateArgsJxlCodec(BenchmarkArgs* args) { return true; }
 
+inline bool ParseEffort(const std::string& s, int* out) {
+  if (s == "lightning") {
+    *out = 1;
+    return true;
+  } else if (s == "thunder") {
+    *out = 2;
+    return true;
+  } else if (s == "falcon") {
+    *out = 3;
+    return true;
+  } else if (s == "cheetah") {
+    *out = 4;
+    return true;
+  } else if (s == "hare") {
+    *out = 5;
+    return true;
+  } else if (s == "fast" || s == "wombat") {
+    *out = 6;
+    return true;
+  } else if (s == "squirrel") {
+    *out = 7;
+    return true;
+  } else if (s == "kitten") {
+    *out = 8;
+    return true;
+  } else if (s == "guetzli" || s == "tortoise") {
+    *out = 9;
+    return true;
+  } else if (s == "glacier") {
+    *out = 10;
+    return true;
+  }
+  size_t st = static_cast<size_t>(strtoull(s.c_str(), nullptr, 0));
+  if (st <= 10 && st >= 1) {
+    *out = st;
+    return true;
+  }
+  return false;
+}
+
 class JxlCodec : public ImageCodec {
  public:
-  explicit JxlCodec(const BenchmarkArgs& args) : ImageCodec(args) {}
+  explicit JxlCodec(const BenchmarkArgs& args)
+      : ImageCodec(args), stats_(nullptr, JxlEncoderStatsDestroy) {}
 
   Status ParseParam(const std::string& param) override {
     const std::string kMaxPassesPrefix = "max_passes=";
     const std::string kDownsamplingPrefix = "downsampling=";
     const std::string kResamplingPrefix = "resampling=";
     const std::string kEcResamplingPrefix = "ec_resampling=";
-
+    int val;
+    float fval;
     if (param.substr(0, kResamplingPrefix.size()) == kResamplingPrefix) {
       std::istringstream parser(param.substr(kResamplingPrefix.size()));
-      parser >> cparams_.resampling;
+      int resampling;
+      parser >> resampling;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_RESAMPLING, resampling);
     } else if (param.substr(0, kEcResamplingPrefix.size()) ==
                kEcResamplingPrefix) {
       std::istringstream parser(param.substr(kEcResamplingPrefix.size()));
-      parser >> cparams_.ec_resampling;
+      int ec_resampling;
+      parser >> ec_resampling;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING,
+                         ec_resampling);
     } else if (ImageCodec::ParseParam(param)) {
-      if (param[0] == 'd' && butteraugli_target_ == 0.0) {
-        cparams_.SetLossless();
-      }
+      // Nothing to do.
     } else if (param == "uint8") {
       uint8_ = true;
-    } else if (param[0] == 'u') {
-      char* end;
-      cparams_.uniform_quant = strtof(param.c_str() + 1, &end);
-      if (end == param.c_str() + 1 || *end != '\0') {
-        return JXL_FAILURE("failed to parse uniform quant parameter %s",
-                           param.c_str());
-      }
-      ba_params_.hf_asymmetry = args_.ba_params.hf_asymmetry;
+    } else if (param[0] == 'D') {
+      cparams_.alpha_distance = strtof(param.substr(1).c_str(), nullptr);
     } else if (param.substr(0, kMaxPassesPrefix.size()) == kMaxPassesPrefix) {
       std::istringstream parser(param.substr(kMaxPassesPrefix.size()));
       parser >> dparams_.max_passes;
@@ -139,159 +158,127 @@ class JxlCodec : public ImageCodec {
                kDownsamplingPrefix) {
       std::istringstream parser(param.substr(kDownsamplingPrefix.size()));
       parser >> dparams_.max_downsampling;
-    } else if (ParseSpeedTier(param, &cparams_.speed_tier)) {
-      // Nothing to do.
+    } else if (ParseEffort(param, &val)) {
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, val);
     } else if (param[0] == 'X') {
-      cparams_.channel_colors_pre_transform_percent =
-          strtol(param.substr(1).c_str(), nullptr, 10);
+      fval = strtof(param.substr(1).c_str(), nullptr);
+      cparams_.AddFloatOption(
+          JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, fval);
     } else if (param[0] == 'Y') {
-      cparams_.channel_colors_percent =
-          strtol(param.substr(1).c_str(), nullptr, 10);
+      fval = strtof(param.substr(1).c_str(), nullptr);
+      cparams_.AddFloatOption(
+          JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, fval);
     } else if (param[0] == 'p') {
-      cparams_.palette_colors = strtol(param.substr(1).c_str(), nullptr, 10);
+      val = strtol(param.substr(1).c_str(), nullptr, 10);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_PALETTE_COLORS, val);
     } else if (param == "lp") {
-      cparams_.lossy_palette = true;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_LOSSY_PALETTE, 1);
     } else if (param[0] == 'C') {
-      cparams_.colorspace = strtol(param.substr(1).c_str(), nullptr, 10);
+      val = strtol(param.substr(1).c_str(), nullptr, 10);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE, val);
     } else if (param[0] == 'c') {
-      cparams_.color_transform =
-          (jxl::ColorTransform)strtol(param.substr(1).c_str(), nullptr, 10);
+      val = strtol(param.substr(1).c_str(), nullptr, 10);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM, val);
       has_ctransform_ = true;
     } else if (param[0] == 'I') {
-      cparams_.options.nb_repeats = strtof(param.substr(1).c_str(), nullptr);
+      fval = strtof(param.substr(1).c_str(), nullptr);
+      cparams_.AddFloatOption(
+          JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, fval * 100.0);
     } else if (param[0] == 'E') {
-      cparams_.options.max_properties =
-          strtof(param.substr(1).c_str(), nullptr);
+      val = strtol(param.substr(1).c_str(), nullptr, 10);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS, val);
     } else if (param[0] == 'P') {
-      cparams_.options.predictor =
-          static_cast<Predictor>(strtof(param.substr(1).c_str(), nullptr));
+      val = strtol(param.substr(1).c_str(), nullptr, 10);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, val);
     } else if (param == "slow") {
-      cparams_.options.nb_repeats = 2;
+      cparams_.AddFloatOption(
+          JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, 50.0);
     } else if (param == "R") {
-      cparams_.responsive = 1;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);
     } else if (param[0] == 'R') {
-      cparams_.responsive = strtol(param.substr(1).c_str(), nullptr, 10);
+      val = strtol(param.substr(1).c_str(), nullptr, 10);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, val);
     } else if (param == "m") {
-      cparams_.modular_mode = true;
-      cparams_.color_transform = jxl::ColorTransform::kNone;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR, 1);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM, 1);  // kNone
+      modular_mode_ = true;
     } else if (param.substr(0, 3) == "gab") {
-      long gab = strtol(param.substr(3).c_str(), nullptr, 10);
-      if (gab != 0 && gab != 1) {
+      val = strtol(param.substr(3).c_str(), nullptr, 10);
+      if (val != 0 && val != 1) {
         return JXL_FAILURE("Invalid gab value");
       }
-      cparams_.gaborish = static_cast<Override>(gab);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, val);
     } else if (param[0] == 'g') {
-      long gsize = strtol(param.substr(1).c_str(), nullptr, 10);
-      if (gsize < 0 || gsize > 3) {
+      val = strtol(param.substr(1).c_str(), nullptr, 10);
+      if (val < 0 || val > 3) {
         return JXL_FAILURE("Invalid group size shift value");
       }
-      cparams_.modular_group_size_shift = gsize;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE, val);
     } else if (param == "plt") {
-      cparams_.options.max_properties = 0;
-      cparams_.options.nb_repeats = 0;
-      cparams_.options.predictor = Predictor::Zero;
-      cparams_.responsive = 0;
-      cparams_.colorspace = 0;
-      cparams_.channel_colors_pre_transform_percent = 0;
-      cparams_.channel_colors_percent = 0;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS, 0);
+      cparams_.AddFloatOption(
+          JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, 0.0f);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 0);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 0);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE, 0);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT,
+                         0);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, 0);
     } else if (param.substr(0, 3) == "epf") {
-      cparams_.epf = strtol(param.substr(3).c_str(), nullptr, 10);
-      if (cparams_.epf > 3) {
+      val = strtol(param.substr(3).c_str(), nullptr, 10);
+      if (val > 3) {
         return JXL_FAILURE("Invalid epf value");
       }
-    } else if (param.substr(0, 2) == "nr") {
-      normalize_bitrate_ = true;
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_EPF, val);
     } else if (param.substr(0, 16) == "faster_decoding=") {
-      cparams_.decoding_speed_tier =
-          strtol(param.substr(16).c_str(), nullptr, 10);
+      val = strtol(param.substr(16).c_str(), nullptr, 10);
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_DECODING_SPEED, val);
     } else {
       return JXL_FAILURE("Unrecognized param");
     }
     return true;
   }
 
-  bool IsColorAware() const override {
-    // Can't deal with negative values from color space conversion.
-    if (cparams_.modular_mode) return false;
-    if (normalize_bitrate_) return false;
-    // Otherwise, input may be in any color space.
-    return true;
-  }
-
-  bool IsJpegTranscoder() const override {
-    // TODO(veluca): figure out when to turn this on.
-    return false;
-  }
-
   Status Compress(const std::string& filename, const CodecInOut* io,
-                  ThreadPoolInternal* pool, std::vector<uint8_t>* compressed,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed,
                   jpegxl::tools::SpeedStats* speed_stats) override {
-    if (!jxlargs->debug_image_dir.empty()) {
-      cinfo_.dump_image = [](const CodecInOut& io, const std::string& path) {
-        return EncodeToFile(io, path);
-      };
-      cinfo_.debug_prefix =
-          JoinPath(jxlargs->debug_image_dir, FileBaseName(filename)) +
-          ".jxl:" + params_ + ".dbg/";
-      JXL_RETURN_IF_ERROR(MakeDir(cinfo_.debug_prefix));
+    PackedPixelFile ppf;
+    JxlPixelFormat format{0, JXL_TYPE_FLOAT, JXL_NATIVE_ENDIAN, 0};
+    JXL_RETURN_IF_ERROR(ConvertCodecInOutToPackedPixelFile(
+        *io, format, io->Main().c_current(), pool, &ppf));
+    cparams_.runner = pool->runner();
+    cparams_.runner_opaque = pool->runner_opaque();
+    cparams_.distance = butteraugli_target_;
+    cparams_.AddOption(JXL_ENC_FRAME_SETTING_NOISE, (int)jxlargs->noise);
+    cparams_.AddOption(JXL_ENC_FRAME_SETTING_DOTS, (int)jxlargs->dots);
+    cparams_.AddOption(JXL_ENC_FRAME_SETTING_PATCHES, (int)jxlargs->patches);
+    cparams_.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC,
+                       jxlargs->progressive);
+    cparams_.AddOption(JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC,
+                       jxlargs->qprogressive);
+    cparams_.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC,
+                       jxlargs->progressive_dc);
+    if (butteraugli_target_ > 0.f && modular_mode_ && !has_ctransform_) {
+      // Reset color transform to default XYB for lossy modular.
+      cparams_.AddOption(JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM, -1);
     }
-    cparams_.butteraugli_distance = butteraugli_target_;
-    cparams_.target_bitrate = bitrate_target_;
-
-    cparams_.dots = jxlargs->dots;
-    cparams_.patches = jxlargs->patches;
-
-    cparams_.progressive_mode = jxlargs->progressive;
-    cparams_.qprogressive_mode = jxlargs->qprogressive;
-    cparams_.progressive_dc = jxlargs->progressive_dc;
-
-    cparams_.noise = jxlargs->noise;
-
-    cparams_.quant_border_bias = static_cast<float>(jxlargs->quant_bias);
-    cparams_.ba_params.hf_asymmetry = ba_params_.hf_asymmetry;
-    cparams_.ba_params.xmul = static_cast<float>(jxlargs->xmul);
-
-    if (cparams_.butteraugli_distance > 0.f &&
-        cparams_.color_transform == ColorTransform::kNone &&
-        cparams_.modular_mode && !has_ctransform_) {
-      cparams_.color_transform = ColorTransform::kXYB;
+    std::string debug_prefix;
+    SetDebugImageCallback(filename, &debug_prefix, &cparams_);
+    if (args_.print_more_stats) {
+      stats_.reset(JxlEncoderStatsCreate());
+      cparams_.stats = stats_.get();
     }
-
-    cparams_.log_search_state = jxlargs->log_search_state;
-
-#if JPEGXL_ENABLE_JPEG
-    if (normalize_bitrate_ && cparams_.butteraugli_distance > 0.0f) {
-      extras::PackedPixelFile ppf;
-      JxlPixelFormat format = {0, JXL_TYPE_UINT8, JXL_BIG_ENDIAN, 0};
-      JXL_RETURN_IF_ERROR(ConvertCodecInOutToPackedPixelFile(
-          *io, format, io->metadata.m.color_encoding, pool, &ppf));
-      extras::EncodedImage encoded;
-      std::unique_ptr<extras::Encoder> encoder = extras::GetJPEGEncoder();
-      encoder->SetOption("q", "95");
-      JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded, pool));
-      float jpeg_bits = encoded.bitstreams.back().size() * kBitsPerByte;
-      float jpeg_bitrate = jpeg_bits / (io->xsize() * io->ysize());
-      // Formula fitted on jyrki31 corpus for distances between 1.0 and 8.0.
-      cparams_.target_bitrate = (jpeg_bitrate * 0.36f /
-                                 (0.6f * cparams_.butteraugli_distance + 0.4f));
-    }
-#endif
-
-    const double start = Now();
-    PassesEncoderState passes_encoder_state;
-    PaddedBytes compressed_padded;
-    JXL_RETURN_IF_ERROR(EncodeFile(cparams_, io, &passes_encoder_state,
-                                   &compressed_padded, GetJxlCms(), &cinfo_,
-                                   pool));
-    const double end = Now();
-    compressed->assign(compressed_padded.begin(), compressed_padded.end());
+    const double start = jxl::Now();
+    JXL_RETURN_IF_ERROR(jxl::extras::EncodeImageJXL(
+        cparams_, ppf, /*jpeg_bytes=*/nullptr, compressed));
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
     return true;
   }
 
   Status Decompress(const std::string& filename,
-                    const Span<const uint8_t> compressed,
-                    ThreadPoolInternal* pool, CodecInOut* io,
+                    const Span<const uint8_t> compressed, ThreadPool* pool,
+                    CodecInOut* io,
                     jpegxl::tools::SpeedStats* speed_stats) override {
     dparams_.runner = pool->runner();
     dparams_.runner_opaque = pool->runner_opaque();
@@ -304,35 +291,67 @@ class JxlCodec : public ImageCodec {
     // originals, so we must set the option to keep the original orientation
     // instead.
     dparams_.keep_orientation = true;
-    extras::PackedPixelFile ppf;
+    PackedPixelFile ppf;
     size_t decoded_bytes;
-    const double start = Now();
-    JXL_RETURN_IF_ERROR(DecodeImageJXL(compressed.data(), compressed.size(),
-                                       dparams_, &decoded_bytes, &ppf));
-    const double end = Now();
+    const double start = jxl::Now();
+    JXL_RETURN_IF_ERROR(jxl::extras::DecodeImageJXL(
+        compressed.data(), compressed.size(), dparams_, &decoded_bytes, &ppf));
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
     JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
     return true;
   }
 
   void GetMoreStats(BenchmarkStats* stats) override {
-    JxlStats jxl_stats;
-    jxl_stats.num_inputs = 1;
-    jxl_stats.aux_out = cinfo_;
-    stats->jxl_stats.Assimilate(jxl_stats);
+    stats->jxl_stats.num_inputs += 1;
+    JxlEncoderStatsMerge(stats->jxl_stats.stats.get(), stats_.get());
   }
 
  protected:
-  AuxOut cinfo_;
-  CompressParams cparams_;
+  JXLCompressParams cparams_;
   bool has_ctransform_ = false;
-  extras::JXLDecompressParams dparams_;
+  bool modular_mode_ = false;
+  JXLDecompressParams dparams_;
   bool uint8_ = false;
-  bool normalize_bitrate_ = false;
+  std::unique_ptr<JxlEncoderStats, decltype(JxlEncoderStatsDestroy)*> stats_;
+
+ private:
+  void SetDebugImageCallback(const std::string& filename,
+                             std::string* debug_prefix,
+                             JXLCompressParams* cparams) {
+    if (jxlargs->debug_image_dir.empty()) return;
+    *debug_prefix = JoinPath(jxlargs->debug_image_dir, FileBaseName(filename)) +
+                    ".jxl:" + params_ + ".dbg/";
+    JXL_CHECK(MakeDir(*debug_prefix));
+    cparams->debug_image_opaque = debug_prefix;
+    cparams->debug_image = [](void* opaque, const char* label, size_t xsize,
+                              size_t ysize, const JxlColorEncoding* color,
+                              const uint16_t* pixels) {
+      auto encoder = jxl::extras::GetAPNGEncoder();
+      JXL_CHECK(encoder);
+      PackedPixelFile debug_ppf;
+      JxlPixelFormat format{3, JXL_TYPE_UINT16, JXL_BIG_ENDIAN, 0};
+      PackedFrame frame(xsize, ysize, format);
+      memcpy(frame.color.pixels(), pixels, 6 * xsize * ysize);
+      debug_ppf.frames.emplace_back(std::move(frame));
+      debug_ppf.info.xsize = xsize;
+      debug_ppf.info.ysize = ysize;
+      debug_ppf.info.num_color_channels = 3;
+      debug_ppf.info.bits_per_sample = 16;
+      debug_ppf.color_encoding = *color;
+      EncodedImage encoded;
+      JXL_CHECK(encoder->Encode(debug_ppf, &encoded));
+      JXL_CHECK(!encoded.bitstreams.empty());
+      std::string* debug_prefix = reinterpret_cast<std::string*>(opaque);
+      std::string fn = *debug_prefix + std::string(label) + ".png";
+      WriteFile(fn, encoded.bitstreams[0]);
+    };
+  }
 };
 
 ImageCodec* CreateNewJxlCodec(const BenchmarkArgs& args) {
   return new JxlCodec(args);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 12e9fef..967be26 100644 (file)
 #include "tools/benchmark/benchmark_args.h"
 #include "tools/benchmark/benchmark_codec.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 ImageCodec* CreateNewJxlCodec(const BenchmarkArgs& args);
 
 // Registers the jxl-specific command line options.
 Status AddCommandLineOptionsJxlCodec(BenchmarkArgs* args);
 Status ValidateArgsJxlCodec(BenchmarkArgs* args);
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_CODEC_JXL_H_
index b310b11..2886166 100644 (file)
@@ -3,8 +3,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#if JPEGXL_ENABLE_APNG
-
 #include "tools/benchmark/benchmark_codec_png.h"
 
 #include <stddef.h>
 #include "lib/extras/packed_image.h"
 #include "lib/extras/packed_image_convert.h"
 #include "lib/extras/time.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 struct PNGArgs {
   // Empty, no PNG-specific args currently.
@@ -41,36 +41,43 @@ class PNGCodec : public ImageCodec {
   Status ParseParam(const std::string& param) override { return true; }
 
   Status Compress(const std::string& filename, const CodecInOut* io,
-                  ThreadPoolInternal* pool, std::vector<uint8_t>* compressed,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed,
                   jpegxl::tools::SpeedStats* speed_stats) override {
     const size_t bits = io->metadata.m.bit_depth.bits_per_sample;
-    const double start = Now();
-    JXL_RETURN_IF_ERROR(Encode(*io, extras::Codec::kPNG, io->Main().c_current(),
-                               bits, compressed, pool));
-    const double end = Now();
+    const double start = jxl::Now();
+    JXL_RETURN_IF_ERROR(jxl::Encode(*io, jxl::extras::Codec::kPNG,
+                                    io->Main().c_current(), bits, compressed,
+                                    pool));
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
     return true;
   }
 
   Status Decompress(const std::string& /*filename*/,
-                    const Span<const uint8_t> compressed,
-                    ThreadPoolInternal* pool, CodecInOut* io,
+                    const Span<const uint8_t> compressed, ThreadPool* pool,
+                    CodecInOut* io,
                     jpegxl::tools::SpeedStats* speed_stats) override {
-    extras::PackedPixelFile ppf;
-    const double start = Now();
-    JXL_RETURN_IF_ERROR(extras::DecodeImageAPNG(
-        compressed, extras::ColorHints(), SizeConstraints(), &ppf));
-    const double end = Now();
+    jxl::extras::PackedPixelFile ppf;
+    const double start = jxl::Now();
+    JXL_RETURN_IF_ERROR(jxl::extras::DecodeImageAPNG(
+        compressed, jxl::extras::ColorHints(), &ppf));
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
-    JXL_RETURN_IF_ERROR(ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
+    JXL_RETURN_IF_ERROR(
+        jxl::extras::ConvertPackedPixelFileToCodecInOut(ppf, pool, io));
     return true;
   }
 };
 
 ImageCodec* CreateNewPNGCodec(const BenchmarkArgs& args) {
-  return new PNGCodec(args);
+  if (jxl::extras::GetAPNGEncoder() &&
+      jxl::extras::CanDecode(jxl::extras::Codec::kPNG)) {
+    return new PNGCodec(args);
+  } else {
+    return nullptr;
+  }
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
-#endif
index 23d982e..8f29583 100644 (file)
@@ -6,21 +6,19 @@
 #ifndef TOOLS_BENCHMARK_BENCHMARK_CODEC_PNG_H_
 #define TOOLS_BENCHMARK_BENCHMARK_CODEC_PNG_H_
 
-#if JPEGXL_ENABLE_APNG
-
 #include <string>
 
 #include "lib/jxl/base/status.h"
 #include "tools/benchmark/benchmark_args.h"
 #include "tools/benchmark/benchmark_codec.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 ImageCodec* CreateNewPNGCodec(const BenchmarkArgs& args);
 
 // Registers the png-specific command line options.
 Status AddCommandLineOptionsPNGCodec(BenchmarkArgs* args);
-}  // namespace jxl
-
-#endif
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_CODEC_PNG_H_
index 3b1bb26..926dee6 100644 (file)
@@ -4,6 +4,7 @@
 // license that can be found in the LICENSE file.
 #include "tools/benchmark/benchmark_codec_webp.h"
 
+#include <jxl/cms.h>
 #include <stdint.h>
 #include <string.h>
 #include <webp/decode.h>
 #include "lib/extras/time.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/dec_external_image.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_external_image.h"
 #include "lib/jxl/enc_image_bundle.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/sanitizers.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::ImageBundle;
+using ::jxl::ImageMetadata;
+using ::jxl::ThreadPool;
 
 // Sets image data from 8-bit sRGB pixel array in bytes.
 // Amount of input bytes per pixel must be:
 // (is_gray ? 1 : 3) + (has_alpha ? 1 : 0)
 Status FromSRGB(const size_t xsize, const size_t ysize, const bool is_gray,
-                const bool has_alpha, const bool alpha_is_premultiplied,
-                const bool is_16bit, const JxlEndianness endianness,
-                const uint8_t* pixels, const uint8_t* end, ThreadPool* pool,
-                ImageBundle* ib) {
+                const bool has_alpha, const bool is_16bit,
+                const JxlEndianness endianness, const uint8_t* pixels,
+                const uint8_t* end, ThreadPool* pool, ImageBundle* ib) {
   const ColorEncoding& c = ColorEncoding::SRGB(is_gray);
-  const size_t bits_per_sample = (is_16bit ? 2 : 1) * kBitsPerByte;
+  const size_t bits_per_sample = (is_16bit ? 2 : 1) * jxl::kBitsPerByte;
+  const uint32_t num_channels = (is_gray ? 1 : 3) + (has_alpha ? 1 : 0);
+  JxlDataType data_type = is_16bit ? JXL_TYPE_UINT16 : JXL_TYPE_UINT8;
+  JxlPixelFormat format = {num_channels, data_type, endianness, 0};
   const Span<const uint8_t> span(pixels, end - pixels);
-  return ConvertFromExternal(
-      span, xsize, ysize, c, (is_gray ? 1 : 3) + (has_alpha ? 1 : 0),
-      alpha_is_premultiplied, bits_per_sample, endianness, pool, ib,
-      /*float_in=*/false, /*align=*/0);
+  return ConvertFromExternal(span, xsize, ysize, c, bits_per_sample, format,
+                             pool, ib);
 }
 
 struct WebPArgs {
@@ -85,9 +90,9 @@ class WebPCodec : public ImageCodec {
   }
 
   Status Compress(const std::string& filename, const CodecInOut* io,
-                  ThreadPoolInternal* pool, std::vector<uint8_t>* compressed,
+                  ThreadPool* pool, std::vector<uint8_t>* compressed,
                   jpegxl::tools::SpeedStats* speed_stats) override {
-    const double start = Now();
+    const double start = jxl::Now();
     const ImageBundle& ib = io->Main();
 
     if (ib.HasAlpha() && ib.metadata()->GetAlphaBits() > 8) {
@@ -99,8 +104,8 @@ class WebPCodec : public ImageCodec {
     ImageBundle store(&metadata);
     const ImageBundle* transformed;
     const ColorEncoding& c_desired = ColorEncoding::SRGB(false);
-    JXL_RETURN_IF_ERROR(TransformIfNeeded(ib, c_desired, GetJxlCms(), pool,
-                                          &store, &transformed));
+    JXL_RETURN_IF_ERROR(jxl::TransformIfNeeded(
+        ib, c_desired, *JxlGetDefaultCms(), pool, &store, &transformed));
     size_t xsize = ib.oriented_xsize();
     size_t ysize = ib.oriented_ysize();
     size_t stride = xsize * num_chans;
@@ -153,14 +158,14 @@ class WebPCodec : public ImageCodec {
     } else {
       return false;
     }
-    const double end = Now();
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
     return true;
   }
 
   Status Decompress(const std::string& filename,
-                    const Span<const uint8_t> compressed,
-                    ThreadPoolInternal* pool, CodecInOut* io,
+                    const Span<const uint8_t> compressed, ThreadPool* pool,
+                    CodecInOut* io,
                     jpegxl::tools::SpeedStats* speed_stats) override {
     WebPDecoderConfig config;
 #ifdef MEMORY_SANITIZER
@@ -177,11 +182,11 @@ class WebPCodec : public ImageCodec {
     buf->colorspace = MODE_RGBA;
     const uint8_t* webp_data = compressed.data();
     const int webp_size = compressed.size();
-    const double start = Now();
+    const double start = jxl::Now();
     if (WebPDecode(webp_data, webp_size, &config) != VP8_STATUS_OK) {
       return JXL_FAILURE("WebPDecode failed");
     }
-    const double end = Now();
+    const double end = jxl::Now();
     speed_stats->NotifyElapsed(end - start);
     JXL_CHECK(buf->u.RGBA.stride == buf->width * 4);
 
@@ -191,7 +196,7 @@ class WebPCodec : public ImageCodec {
     const uint8_t* data_end = data_begin + buf->width * buf->height * 4;
     // The image data is initialized by libwebp, which we are not instrumenting
     // with msan.
-    msan::UnpoisonMemory(data_begin, data_end - data_begin);
+    jxl::msan::UnpoisonMemory(data_begin, data_end - data_begin);
     if (io->metadata.m.color_encoding.IsGray() != is_gray) {
       // TODO(lode): either ensure is_gray matches what the color profile says,
       // or set a correct color profile, e.g.
@@ -201,13 +206,11 @@ class WebPCodec : public ImageCodec {
       return JXL_FAILURE("Color profile is-gray mismatch");
     }
     io->metadata.m.SetAlphaBits(8);
-    const Status ok =
-        FromSRGB(buf->width, buf->height, is_gray, has_alpha,
-                 /*alpha_is_premultiplied=*/false, /*is_16bit=*/false,
-                 JXL_LITTLE_ENDIAN, data_begin, data_end, pool, &io->Main());
+    const Status ok = FromSRGB(buf->width, buf->height, is_gray, has_alpha,
+                               /*is_16bit=*/false, JXL_LITTLE_ENDIAN,
+                               data_begin, data_end, pool, &io->Main());
     WebPFreeDecBuffer(buf);
     JXL_RETURN_IF_ERROR(ok);
-    io->dec_pixels = buf->width * buf->height;
     return true;
   }
 
@@ -228,7 +231,9 @@ class WebPCodec : public ImageCodec {
                           std::vector<uint8_t>* compressed) {
     compressed->clear();
     WebPConfig config;
-    WebPConfigInit(&config);
+    if (!WebPConfigInit(&config)) {
+      return JXL_FAILURE("WebPConfigInit failed");
+    }
     JXL_ASSERT(!lossless_ || !near_lossless_);  // can't have both
     config.lossless = lossless_;
     config.quality = quality;
@@ -243,7 +248,9 @@ class WebPCodec : public ImageCodec {
     JXL_CHECK(WebPValidateConfig(&config));
 
     WebPPicture pic;
-    WebPPictureInit(&pic);
+    if (!WebPPictureInit(&pic)) {
+      return JXL_FAILURE("WebPPictureInit failed");
+    }
     pic.width = static_cast<int>(xsize);
     pic.height = static_cast<int>(ysize);
     pic.writer = &WebPStringWrite;
@@ -251,9 +258,13 @@ class WebPCodec : public ImageCodec {
     pic.custom_ptr = compressed;
 
     if (num_chans == 3) {
-      WebPPictureImportRGB(&pic, srgb.data(), 3 * xsize);
+      if (!WebPPictureImportRGB(&pic, srgb.data(), 3 * xsize)) {
+        return JXL_FAILURE("WebPPictureImportRGB failed");
+      }
     } else {
-      WebPPictureImportRGBA(&pic, srgb.data(), 4 * xsize);
+      if (!WebPPictureImportRGBA(&pic, srgb.data(), 4 * xsize)) {
+        return JXL_FAILURE("WebPPictureImportRGBA failed");
+      }
     }
 
     // WebP encoding may fail, for example, if the image is more than 16384
@@ -262,7 +273,7 @@ class WebPCodec : public ImageCodec {
     WebPPictureFree(&pic);
     // Compressed image data is initialized by libwebp, which we are not
     // instrumenting with msan.
-    msan::UnpoisonMemory(compressed->data(), compressed->size());
+    jxl::msan::UnpoisonMemory(compressed->data(), compressed->size());
     return ok;
   }
 
@@ -277,4 +288,5 @@ ImageCodec* CreateNewWebPCodec(const BenchmarkArgs& args) {
   return new WebPCodec(args);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index cd4c60f..37d3c58 100644 (file)
 #include "tools/benchmark/benchmark_args.h"
 #include "tools/benchmark/benchmark_codec.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 ImageCodec* CreateNewWebPCodec(const BenchmarkArgs& args);
 
 // Registers the webp-specific command line options.
 Status AddCommandLineOptionsWebPCodec(BenchmarkArgs* args);
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_CODEC_WEBP_H_
index c5db02b..b8acbfb 100644 (file)
@@ -38,7 +38,8 @@
 #define GLOB_TILDE 0
 #endif
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 const char kPathSeparator = '/';
 
@@ -229,4 +230,5 @@ Status MatchFiles(const std::string& pattern, std::vector<std::string>* list) {
 #endif  // HAS_GLOB
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index ecb8359..3c68acc 100644 (file)
 #include <string>
 #include <vector>
 
-#include "lib/jxl/base/file_io.h"
 #include "lib/jxl/base/status.h"
+#include "tools/file_io.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::Status;
 
 // Checks if the file exists, either as file or as directory
 bool PathExists(const std::string& fname);
@@ -48,6 +51,7 @@ Status MatchFiles(const std::string& pattern, std::vector<std::string>* list);
 
 std::string JoinPath(const std::string& first, const std::string& second);
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_FILE_IO_H_
index f22e89c..87b9985 100644 (file)
 #include "lib/jxl/base/status.h"
 #include "tools/benchmark/benchmark_args.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+#define ADD_NAME(val, name) \
+  case JXL_ENC_STAT_##val:  \
+    return name
+const char* JxlStatsName(JxlEncoderStatsKey key) {
+  switch (key) {
+    ADD_NAME(HEADER_BITS, "Header bits");
+    ADD_NAME(TOC_BITS, "TOC bits");
+    ADD_NAME(DICTIONARY_BITS, "Patch dictionary bits");
+    ADD_NAME(SPLINES_BITS, "Splines bits");
+    ADD_NAME(NOISE_BITS, "Noise bits");
+    ADD_NAME(QUANT_BITS, "Quantizer bits");
+    ADD_NAME(MODULAR_TREE_BITS, "Modular tree bits");
+    ADD_NAME(MODULAR_GLOBAL_BITS, "Modular global bits");
+    ADD_NAME(DC_BITS, "DC bits");
+    ADD_NAME(MODULAR_DC_GROUP_BITS, "Modular DC group bits");
+    ADD_NAME(CONTROL_FIELDS_BITS, "Control field bits");
+    ADD_NAME(COEF_ORDER_BITS, "Coeff order bits");
+    ADD_NAME(AC_HISTOGRAM_BITS, "AC histogram bits");
+    ADD_NAME(AC_BITS, "AC token bits");
+    ADD_NAME(MODULAR_AC_GROUP_BITS, "Modular AC group bits");
+    ADD_NAME(NUM_SMALL_BLOCKS, "Number of small blocks");
+    ADD_NAME(NUM_DCT4X8_BLOCKS, "Number of 4x8 blocks");
+    ADD_NAME(NUM_AFV_BLOCKS, "Number of AFV blocks");
+    ADD_NAME(NUM_DCT8_BLOCKS, "Number of 8x8 blocks");
+    ADD_NAME(NUM_DCT8X32_BLOCKS, "Number of 8x32 blocks");
+    ADD_NAME(NUM_DCT16_BLOCKS, "Number of 16x16 blocks");
+    ADD_NAME(NUM_DCT16X32_BLOCKS, "Number of 16x32 blocks");
+    ADD_NAME(NUM_DCT32_BLOCKS, "Number of 32x32 blocks");
+    ADD_NAME(NUM_DCT32X64_BLOCKS, "Number of 32x64 blocks");
+    ADD_NAME(NUM_DCT64_BLOCKS, "Number of 64x64 blocks");
+    ADD_NAME(NUM_BUTTERAUGLI_ITERS, "Butteraugli iters");
+    default:
+      return "";
+  };
+  return "";
+}
+#undef ADD_NAME
+
+void JxlStats::Print() const {
+  for (int i = 0; i < JXL_ENC_NUM_STATS; ++i) {
+    JxlEncoderStatsKey key = static_cast<JxlEncoderStatsKey>(i);
+    size_t value = JxlEncoderStatsGet(stats.get(), key);
+    if (value) printf("%-25s  %10" PRIuS "\n", JxlStatsName(key), value);
+  }
+}
+
 namespace {
 
 // Computes longest codec name from Args()->codec, for table alignment.
@@ -61,7 +109,7 @@ struct ColumnDescriptor {
   bool more;  // Whether to print only if more_columns is enabled
 };
 
-static const ColumnDescriptor ExtraMetricDescriptor() {
+static ColumnDescriptor ExtraMetricDescriptor() {
   ColumnDescriptor d{{"DO NOT USE"}, 12, 4, TYPE_POSITIVE_FLOAT, false};
   return d;
 }
@@ -81,21 +129,11 @@ std::vector<ColumnDescriptor> GetColumnDescriptors(size_t num_extra_metrics) {
       {{"E MP/s"},          8,  3, TYPE_POSITIVE_FLOAT, false},
       {{"D MP/s"},          8,  3, TYPE_POSITIVE_FLOAT, false},
       {{"Max norm"},       13,  8, TYPE_POSITIVE_FLOAT, false},
+      {{"SSIMULACRA2"},    13,  8, TYPE_POSITIVE_FLOAT, false},
+      {{"PSNR"},            7,  2, TYPE_POSITIVE_FLOAT, false},
       {{"pnorm"},          13,  8, TYPE_POSITIVE_FLOAT, false},
-      {{"PSNR"},            7,  2, TYPE_POSITIVE_FLOAT, true},
-      {{"QABPP"},           8,  3, TYPE_POSITIVE_FLOAT, true},
-      {{"SmallB"},          8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"DCT4x8"},          8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"AFV"},             8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"DCT8x8"},          8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"8x16"},            8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"8x32"},            8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"16"},              8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"16x32"},           8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"32"},              8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"32x64"},           8,  4, TYPE_POSITIVE_FLOAT, true},
-      {{"64"},              8,  4, TYPE_POSITIVE_FLOAT, true},
       {{"BPP*pnorm"},      16, 12, TYPE_POSITIVE_FLOAT, false},
+      {{"QABPP"},           8,  3, TYPE_POSITIVE_FLOAT, false},
       {{"Bugs"},            7,  5, TYPE_COUNT, false},
   };
   // clang-format on
@@ -124,7 +162,7 @@ static std::string FormatFloat(const ColumnDescriptor& label, double value) {
     size_t point = result.rfind('.');
     if (point != std::string::npos) {
       int end = std::max<int>(point + 2, label.width - 1);
-      result = result.substr(0, end);
+      result.resize(end);
     }
   }
   return result;
@@ -148,9 +186,10 @@ void BenchmarkStats::Assimilate(const BenchmarkStats& victim) {
   total_adj_compressed_size += victim.total_adj_compressed_size;
   total_time_encode += victim.total_time_encode;
   total_time_decode += victim.total_time_decode;
-  max_distance = std::max(max_distance, victim.max_distance);
+  max_distance += pow(victim.max_distance, 2.0) * victim.total_input_pixels;
   distance_p_norm += victim.distance_p_norm;
-  distance_2 += victim.distance_2;
+  ssimulacra2 += victim.ssimulacra2;
+  psnr += victim.psnr;
   distances.insert(distances.end(), victim.distances.begin(),
                    victim.distances.end());
   total_errors += victim.total_errors;
@@ -166,13 +205,6 @@ void BenchmarkStats::Assimilate(const BenchmarkStats& victim) {
 void BenchmarkStats::PrintMoreStats() const {
   if (Args()->print_more_stats) {
     jxl_stats.Print();
-    size_t total_bits = jxl_stats.aux_out.TotalBits();
-    size_t compressed_bits = total_compressed_size * kBitsPerByte;
-    if (total_bits != compressed_bits) {
-      printf("Total layer bits: %" PRIuS " vs total compressed bits: %" PRIuS
-             "  (%.2f%% accounted for)\n",
-             total_bits, compressed_bits, total_bits * 100.0 / compressed_bits);
-    }
   }
   if (Args()->print_distance_percentiles) {
     std::vector<float> sorted = distances;
@@ -195,13 +227,12 @@ std::vector<ColumnValue> BenchmarkStats::ComputeColumns(
       ComputeSpeed(total_input_pixels, total_time_encode);
   const double decompression_speed =
       ComputeSpeed(total_input_pixels, total_time_decode);
-  // Already weighted, no need to divide by #channels.
-  const double rmse = std::sqrt(distance_2 / total_input_pixels);
-  const double psnr = total_compressed_size == 0 ? 0.0
-                      : (distance_2 == 0)        ? 99.99
-                                                 : (20 * std::log10(1 / rmse));
-  const double p_norm = distance_p_norm / total_input_pixels;
-  const double bpp_p_norm = p_norm * comp_bpp;
+  const double psnr_avg = psnr / total_input_pixels;
+  const double p_norm_avg = distance_p_norm / total_input_pixels;
+  const double ssimulacra2_avg = ssimulacra2 / total_input_pixels;
+  const double bpp_p_norm = p_norm_avg * comp_bpp;
+
+  const double max_distance_avg = sqrt(max_distance / total_input_pixels);
 
   std::vector<ColumnValue> values(
       GetColumnDescriptors(extra_metrics.size()).size());
@@ -212,40 +243,15 @@ std::vector<ColumnValue> BenchmarkStats::ComputeColumns(
   values[3].f = comp_bpp;
   values[4].f = compression_speed;
   values[5].f = decompression_speed;
-  values[6].f = static_cast<double>(max_distance);
-  values[7].f = p_norm;
-  values[8].f = psnr;
-  values[9].f = adj_comp_bpp;
-  // The DCT2, DCT4, AFV and DCT4X8 are applied to an 8x8 block by having 4x4
-  // DCT2X2s, 2x2 DCT4x4s/AFVs, or 2x1 DCT4X8s, filling the whole 8x8 blocks.
-  // Thus we need to multiply the block count by 8.0 * 8.0 pixels for these
-  // transforms.
-  values[10].f = 100.f * jxl_stats.aux_out.num_small_blocks * 8.0 * 8.0 /
-                 total_input_pixels;
-  values[11].f = 100.f * jxl_stats.aux_out.num_dct4x8_blocks * 8.0 * 8.0 /
-                 total_input_pixels;
-  values[12].f =
-      100.f * jxl_stats.aux_out.num_afv_blocks * 8.0 * 8.0 / total_input_pixels;
-  values[13].f = 100.f * jxl_stats.aux_out.num_dct8_blocks * 8.0 * 8.0 /
-                 total_input_pixels;
-  values[14].f = 100.f * jxl_stats.aux_out.num_dct8x16_blocks * 8.0 * 16.0 /
-                 total_input_pixels;
-  values[15].f = 100.f * jxl_stats.aux_out.num_dct8x32_blocks * 8.0 * 32.0 /
-                 total_input_pixels;
-  values[16].f = 100.f * jxl_stats.aux_out.num_dct16_blocks * 16.0 * 16.0 /
-                 total_input_pixels;
-  values[17].f = 100.f * jxl_stats.aux_out.num_dct16x32_blocks * 16.0 * 32.0 /
-                 total_input_pixels;
-  values[18].f = 100.f * jxl_stats.aux_out.num_dct32_blocks * 32.0 * 32.0 /
-                 total_input_pixels;
-  values[19].f = 100.f * jxl_stats.aux_out.num_dct32x64_blocks * 32.0 * 64.0 /
-                 total_input_pixels;
-  values[20].f = 100.f * jxl_stats.aux_out.num_dct64_blocks * 64.0 * 64.0 /
-                 total_input_pixels;
-  values[21].f = bpp_p_norm;
-  values[22].i = total_errors;
+  values[6].f = static_cast<double>(max_distance_avg);
+  values[7].f = ssimulacra2_avg;
+  values[8].f = psnr_avg;
+  values[9].f = p_norm_avg;
+  values[10].f = bpp_p_norm;
+  values[11].f = adj_comp_bpp;
+  values[12].i = total_errors;
   for (size_t i = 0; i < extra_metrics.size(); i++) {
-    values[23 + i].f = extra_metrics[i] / total_input_files;
+    values[13 + i].f = extra_metrics[i] / total_input_files;
   }
   return values;
 }
@@ -373,4 +379,5 @@ std::string PrintAggregate(
   return PrintFormattedEntries(num_extra_metrics, result);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index a23c4a1..deca72a 100644 (file)
@@ -6,31 +6,30 @@
 #ifndef TOOLS_BENCHMARK_BENCHMARK_STATS_H_
 #define TOOLS_BENCHMARK_BENCHMARK_STATS_H_
 
+#include <jxl/stats.h>
 #include <stddef.h>
 #include <stdint.h>
 
+#include <memory>
 #include <string>
 #include <vector>
 
-#include "lib/jxl/aux_out.h"
-
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 std::string StringPrintf(const char* format, ...);
 
 struct JxlStats {
-  JxlStats() {
-    num_inputs = 0;
-    aux_out = AuxOut();
-  }
+  JxlStats()
+      : num_inputs(0), stats(JxlEncoderStatsCreate(), JxlEncoderStatsDestroy) {}
   void Assimilate(const JxlStats& victim) {
     num_inputs += victim.num_inputs;
-    aux_out.Assimilate(victim.aux_out);
+    JxlEncoderStatsMerge(stats.get(), victim.stats.get());
   }
-  void Print() const { aux_out.Print(num_inputs); }
+  void Print() const;
 
   size_t num_inputs;
-  AuxOut aux_out;
+  std::unique_ptr<JxlEncoderStats, decltype(JxlEncoderStatsDestroy)*> stats;
 };
 
 // The value of an entry in the table. Depending on the ColumnType, the string,
@@ -61,8 +60,8 @@ struct BenchmarkStats {
   float max_distance = -1.0;  // Max butteraugli score
   // sum of 8th powers of butteraugli distmap pixels.
   double distance_p_norm = 0.0;
-  // sum of 2nd powers of differences between R, G, B.
-  double distance_2 = 0.0;
+  double psnr = 0.0;
+  double ssimulacra2 = 0.0;
   std::vector<float> distances;
   size_t total_errors = 0;
   JxlStats jxl_stats;
@@ -76,6 +75,7 @@ std::string PrintAggregate(
     size_t num_extra_metrics,
     const std::vector<std::vector<ColumnValue>>& aggregate);
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_STATS_H_
index 4b53131..11753f2 100644 (file)
 
 #include <fstream>
 
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/image_bundle.h"
+#include "tools/file_io.h"
 
 extern char** environ;
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 TemporaryFile::TemporaryFile(std::string basename, std::string extension) {
   const auto extension_size = 1 + extension.size();
   temp_filename_ = std::move(basename) + "_XXXXXX." + std::move(extension);
@@ -50,8 +50,18 @@ Status TemporaryFile::GetFileName(std::string* const output) const {
   return true;
 }
 
+std::string GetBaseName(std::string filename) {
+  std::string result = std::move(filename);
+  result = basename(&result[0]);
+  const size_t dot = result.rfind('.');
+  if (dot != std::string::npos) {
+    result.resize(dot);
+  }
+  return result;
+}
+
 Status RunCommand(const std::string& command,
-                  const std::vector<std::string>& arguments) {
+                  const std::vector<std::string>& arguments, bool quiet) {
   std::vector<char*> args;
   args.reserve(arguments.size() + 2);
   args.push_back(const_cast<char*>(command.c_str()));
@@ -60,18 +70,27 @@ Status RunCommand(const std::string& command,
   }
   args.push_back(nullptr);
   pid_t pid;
-  JXL_RETURN_IF_ERROR(posix_spawnp(&pid, command.c_str(), nullptr, nullptr,
-                                   args.data(), environ) == 0);
+  posix_spawn_file_actions_t file_actions;
+  posix_spawn_file_actions_init(&file_actions);
+  if (quiet) {
+    posix_spawn_file_actions_addclose(&file_actions, STDOUT_FILENO);
+    posix_spawn_file_actions_addclose(&file_actions, STDERR_FILENO);
+  }
+  JXL_RETURN_IF_ERROR(posix_spawnp(&pid, command.c_str(), &file_actions,
+                                   nullptr, args.data(), environ) == 0);
   int wstatus;
   waitpid(pid, &wstatus, 0);
+  posix_spawn_file_actions_destroy(&file_actions);
   return WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == EXIT_SUCCESS;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #else
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 TemporaryFile::TemporaryFile(std::string basename, std::string extension) {}
 TemporaryFile::~TemporaryFile() {}
@@ -80,11 +99,14 @@ Status TemporaryFile::GetFileName(std::string* const output) const {
   return JXL_FAILURE("Not supported on this build");
 }
 
+std::string GetBaseName(std::string filename) { return filename; }
+
 Status RunCommand(const std::string& command,
-                  const std::vector<std::string>& arguments) {
+                  const std::vector<std::string>& arguments, bool quiet) {
   return JXL_FAILURE("Not supported on this build");
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // _MSC_VER
index 027fa08..5df2bec 100644 (file)
 
 #include "lib/jxl/base/status.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::Status;
 
 class TemporaryFile final {
  public:
@@ -27,9 +30,13 @@ class TemporaryFile final {
   std::string temp_filename_;
 };
 
+std::string GetBaseName(std::string filename);
+
 Status RunCommand(const std::string& command,
-                  const std::vector<std::string>& arguments);
+                  const std::vector<std::string>& arguments,
+                  bool quiet = false);
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_BENCHMARK_BENCHMARK_UTILS_H_
index fed5e9b..86d06a3 100644 (file)
@@ -3,13 +3,15 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <math.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include <jxl/cms.h>
+#include <jxl/decode.h>
 
 #include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include <memory>
 #include <mutex>
 #include <numeric>
 #include <utility>
 #include <vector>
 
-#include "jxl/decode.h"
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/color_hints.h"
+#include "lib/extras/enc/apng.h"
+#include "lib/extras/metrics.h"
 #include "lib/extras/time.h"
 #include "lib/jxl/alpha.h"
-#include "lib/jxl/base/cache_aligned.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/profiler.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/cache_aligned.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_butteraugli_pnorm.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
 #include "tools/benchmark/benchmark_stats.h"
 #include "tools/benchmark/benchmark_utils.h"
 #include "tools/codec_config.h"
+#include "tools/file_io.h"
 #include "tools/speed_stats.h"
+#include "tools/ssimulacra2.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 namespace {
 
+using ::jxl::ButteraugliParams;
+using ::jxl::Bytes;
+using ::jxl::CodecInOut;
+using ::jxl::ColorEncoding;
+using ::jxl::Image3F;
+using ::jxl::ImageBundle;
+using ::jxl::ImageF;
+using ::jxl::Rng;
+using ::jxl::Status;
+using ::jxl::ThreadPool;
+
 Status WriteImage(Image3F&& image, ThreadPool* pool,
                   const std::string& filename) {
   CodecInOut io;
   io.metadata.m.SetUintSamples(8);
   io.metadata.m.color_encoding = ColorEncoding::SRGB();
   io.SetFromImage(std::move(image), io.metadata.m.color_encoding);
-  return EncodeToFile(io, filename, pool);
+  std::vector<uint8_t> encoded;
+  return Encode(io, filename, &encoded, pool) && WriteFile(filename, encoded);
 }
 
 Status ReadPNG(const std::string& filename, Image3F* image) {
   CodecInOut io;
-  JXL_CHECK(SetFromFile(filename, extras::ColorHints(), &io));
-  *image = CopyImage(*io.Main().color());
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(ReadFile(filename, &encoded));
+  JXL_CHECK(
+      jxl::SetFromBytes(jxl::Bytes(encoded), jxl::extras::ColorHints(), &io));
+  *image = Image3F(io.xsize(), io.ysize());
+  CopyImageTo(*io.Main().color(), image);
   return true;
 }
 
+std::string CodecToExtension(std::string codec_name, char sep) {
+  std::string result;
+  // Add in the parameters of the codec_name in reverse order, so that the
+  // name of the file format (e.g. jxl) is last.
+  int pos = static_cast<int>(codec_name.size()) - 1;
+  while (pos > 0) {
+    int prev = codec_name.find_last_of(sep, pos);
+    if (prev > pos) prev = -1;
+    result += '.' + codec_name.substr(prev + 1, pos - prev);
+    pos = prev - 1;
+  }
+  return result;
+}
+
 void DoCompress(const std::string& filename, const CodecInOut& io,
                 const std::vector<std::string>& extra_metrics_commands,
-                ImageCodec* codec, ThreadPoolInternal* inner_pool,
+                ImageCodec* codec, ThreadPool* inner_pool,
                 std::vector<uint8_t>* compressed, BenchmarkStats* s) {
-  PROFILER_FUNC;
   ++s->total_input_files;
 
   if (io.frames.size() != 1) {
@@ -104,7 +134,7 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
   if (valid && !Args()->decode_only) {
     for (size_t i = 0; i < Args()->encode_reps; ++i) {
       if (codec->CanRecompressJpeg() && (ext == ".jpg" || ext == ".jpeg")) {
-        std::string data_in;
+        std::vector<uint8_t> data_in;
         JXL_CHECK(ReadFile(filename, &data_in));
         JXL_CHECK(
             codec->RecompressJpeg(filename, data_in, compressed, &speed_stats));
@@ -142,8 +172,8 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
   if (valid) {
     speed_stats = jpegxl::tools::SpeedStats();
     for (size_t i = 0; i < Args()->decode_reps; ++i) {
-      if (!codec->Decompress(filename, Span<const uint8_t>(*compressed),
-                             inner_pool, &io2, &speed_stats)) {
+      if (!codec->Decompress(filename, Bytes(*compressed), inner_pool, &io2,
+                             &speed_stats)) {
         if (!Args()->silent_errors) {
           fprintf(stderr,
                   "%s failed to decompress encoded image. Original source:"
@@ -152,10 +182,13 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
         }
         valid = false;
       }
-
-      // io2.dec_pixels increases each time, but the total should be independent
-      // of decode_reps, so only take the value from the first iteration.
-      if (i == 0) s->total_input_pixels += io2.dec_pixels;
+      // TODO(veluca): this is a hack. codec->Decompress should set the bitdepth
+      // correctly, but for jxl it currently sets it from the pixel format (i.e.
+      // 32-bit float).
+      io2.metadata.m.bit_depth = io.metadata.m.bit_depth;
+    }
+    for (const auto& frame : io2.frames) {
+      s->total_input_pixels += frame.color().xsize() * frame.color().ysize();
     }
     JXL_CHECK(speed_stats.GetSummary(&summary));
     s->total_time_decode += summary.central_tendency;
@@ -180,9 +213,7 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
     valid = false;
   }
 
-  bool lossless = codec->IsJpegTranscoder();
-  bool skip_butteraugli =
-      Args()->skip_butteraugli || Args()->decode_only || lossless;
+  bool skip_butteraugli = Args()->skip_butteraugli || Args()->decode_only;
   ImageF distmap;
   float max_distance = 1.0f;
 
@@ -193,10 +224,9 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
       ImageBundle& ib2 = io2.frames[i];
 
       // Verify output
-      PROFILER_ZONE("Benchmark stats");
       float distance;
       if (SameSize(ib1, ib2)) {
-        ButteraugliParams params = codec->BaParams();
+        ButteraugliParams params;
         if (ib1.metadata()->IntensityTarget() !=
             ib2.metadata()->IntensityTarget()) {
           fprintf(stderr,
@@ -210,21 +240,24 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
         if (fabs(params.intensity_target - 255.0f) < 1e-3) {
           params.intensity_target = 80.0;
         }
-        distance = ButteraugliDistance(ib1, ib2, params, GetJxlCms(), &distmap,
-                                       inner_pool);
-        // Ensure pixels in range 0-1
-        s->distance_2 += ComputeDistance2(ib1, ib2, GetJxlCms());
+        distance =
+            ButteraugliDistance(ib1, ib2, params, *JxlGetDefaultCms(), &distmap,
+                                inner_pool, codec->IgnoreAlpha());
       } else {
         // TODO(veluca): re-upsample and compute proper distance.
         distance = 1e+4f;
         distmap = ImageF(1, 1);
         distmap.Row(0)[0] = distance;
-        s->distance_2 += distance;
       }
       // Update stats
+      s->psnr +=
+          compressed->empty()
+              ? 0
+              : jxl::ComputePSNR(ib1, ib2, *JxlGetDefaultCms()) * input_pixels;
       s->distance_p_norm +=
-          ComputeDistanceP(distmap, Args()->ba_params, Args()->error_pnorm) *
+          ComputeDistanceP(distmap, ButteraugliParams(), Args()->error_pnorm) *
           input_pixels;
+      s->ssimulacra2 += ComputeSSIMULACRA2(ib1, ib2).Score() * input_pixels;
       s->max_distance = std::max(s->max_distance, distance);
       s->distances.push_back(distance);
       max_distance = std::max(max_distance, distance);
@@ -265,48 +298,44 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
     std::string dir = FileDirName(filename);
     std::string outdir =
         Args()->output_dir.empty() ? dir + "/out" : Args()->output_dir;
-    std::string compressed_fn = outdir + "/" + name;
-    // Add in the parameters of the codec_name in reverse order, so that the
-    // name of the file format (e.g. jxl) is last.
-    int pos = static_cast<int>(codec_name.size()) - 1;
-    while (pos > 0) {
-      int prev = codec_name.find_last_of(':', pos);
-      if (prev > pos) prev = -1;
-      compressed_fn += '.' + codec_name.substr(prev + 1, pos - prev);
-      pos = prev - 1;
-    }
+    std::string compressed_fn =
+        outdir + "/" + name + CodecToExtension(codec_name, ':');
     std::string decompressed_fn = compressed_fn + Args()->output_extension;
-#if JPEGXL_ENABLE_APNG
-    std::string heatmap_fn = compressed_fn + ".heatmap.png";
-#else
-    std::string heatmap_fn = compressed_fn + ".heatmap.ppm";
-#endif
+    std::string heatmap_fn;
+    if (jxl::extras::GetAPNGEncoder()) {
+      heatmap_fn = compressed_fn + ".heatmap.png";
+    } else {
+      heatmap_fn = compressed_fn + ".heatmap.ppm";
+    }
     JXL_CHECK(MakeDir(outdir));
     if (Args()->save_compressed) {
-      std::string compressed_str(
-          reinterpret_cast<const char*>(compressed->data()),
-          compressed->size());
-      JXL_CHECK(WriteFile(compressed_str, compressed_fn));
+      JXL_CHECK(WriteFile(compressed_fn, *compressed));
     }
     if (Args()->save_decompressed && valid) {
       // For verifying HDR: scale output.
       if (Args()->mul_output != 0.0) {
         fprintf(stderr, "WARNING: scaling outputs by %f\n", Args()->mul_output);
         JXL_CHECK(ib2.TransformTo(ColorEncoding::LinearSRGB(ib2.IsGray()),
-                                  GetJxlCms(), inner_pool));
+                                  *JxlGetDefaultCms(), inner_pool));
         ScaleImage(static_cast<float>(Args()->mul_output), ib2.color());
       }
 
-      JXL_CHECK(EncodeToFile(io2, *c_desired,
-                             ib2.metadata()->bit_depth.bits_per_sample,
-                             decompressed_fn));
+      std::vector<uint8_t> encoded;
+      JXL_CHECK(Encode(io2, *c_desired,
+                       ib2.metadata()->bit_depth.bits_per_sample,
+                       decompressed_fn, &encoded));
+      JXL_CHECK(WriteFile(decompressed_fn, encoded));
       if (!skip_butteraugli) {
-        float good = Args()->heatmap_good > 0.0f ? Args()->heatmap_good
-                                                 : ButteraugliFuzzyInverse(1.5);
-        float bad = Args()->heatmap_bad > 0.0f ? Args()->heatmap_bad
-                                               : ButteraugliFuzzyInverse(0.5);
-        JXL_CHECK(WriteImage(CreateHeatMapImage(distmap, good, bad), inner_pool,
-                             heatmap_fn));
+        float good = Args()->heatmap_good > 0.0f
+                         ? Args()->heatmap_good
+                         : jxl::ButteraugliFuzzyInverse(1.5);
+        float bad = Args()->heatmap_bad > 0.0f
+                        ? Args()->heatmap_bad
+                        : jxl::ButteraugliFuzzyInverse(0.5);
+        if (Args()->save_heatmap) {
+          JXL_CHECK(WriteImage(CreateHeatMapImage(distmap, good, bad),
+                               inner_pool, heatmap_fn));
+        }
       }
     }
   }
@@ -324,10 +353,13 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
 
     // Convert everything to non-linear SRGB - this is what most metrics expect.
     const ColorEncoding& c_desired = ColorEncoding::SRGB(io.Main().IsGray());
-    JXL_CHECK(EncodeToFile(io, c_desired,
-                           io.metadata.m.bit_depth.bits_per_sample, tmp_in_fn));
-    JXL_CHECK(EncodeToFile(
-        io2, c_desired, io.metadata.m.bit_depth.bits_per_sample, tmp_out_fn));
+    std::vector<uint8_t> encoded;
+    JXL_CHECK(Encode(io, c_desired, io.metadata.m.bit_depth.bits_per_sample,
+                     tmp_in_fn, &encoded));
+    JXL_CHECK(WriteFile(tmp_in_fn, encoded));
+    JXL_CHECK(Encode(io2, c_desired, io.metadata.m.bit_depth.bits_per_sample,
+                     tmp_out_fn, &encoded));
+    JXL_CHECK(WriteFile(tmp_out_fn, encoded));
     if (io.metadata.m.IntensityTarget() != io2.metadata.m.IntensityTarget()) {
       fprintf(stderr,
               "WARNING: original and decoded have different intensity targets "
@@ -371,7 +403,7 @@ void DoCompress(const std::string& filename, const CodecInOut& io,
 
 // Makes a base64 data URI for embedded image in HTML
 std::string Base64Image(const std::string& filename) {
-  PaddedBytes bytes;
+  std::vector<uint8_t> bytes;
   if (!ReadFile(filename, &bytes)) {
     return "";
   }
@@ -406,12 +438,13 @@ void WriteHtmlReport(const std::string& codec_desc,
                      const std::vector<std::string>& fnames,
                      const std::vector<const Task*>& tasks,
                      const std::vector<const CodecInOut*>& images,
-                     bool self_contained) {
+                     bool add_heatmap, bool self_contained) {
   std::string toggle_js =
       "<script type=\"text/javascript\">\n"
       "  var codecname = '" +
       codec_desc + "';\n";
-  toggle_js += R"(
+  if (add_heatmap) {
+    toggle_js += R"(
   var maintitle = codecname + ' - click images to toggle, press space to' +
       ' toggle all, h to toggle all heatmaps. Zoom in with CTRL+wheel or' +
       ' CTRL+plus.';
@@ -435,7 +468,7 @@ void WriteHtmlReport(const std::string& codec_desc,
       hm.style.display = 'block';
     }
   }
-  function toggle3(i) {
+  function toggle(i) {
     for (index = counter.length; index <= i; index++) {
       counter.push(1);
     }
@@ -460,6 +493,48 @@ void WriteHtmlReport(const std::string& codec_desc,
   };
 </script>
 )";
+  } else {
+    toggle_js += R"(
+  var maintitle = codecname + ' - click images to toggle, press space to' +
+      ' toggle all. Zoom in with CTRL+wheel or CTRL+plus.';
+  document.title = maintitle;
+  var counter = [];
+  function setState(i, s) {
+    var preview = document.getElementById("preview" + i);
+    var orig = document.getElementById("orig" + i);
+    if (s == 0) {
+      preview.style.display = 'none';
+      orig.style.display = 'block';
+    } else if (s == 1) {
+      preview.style.display = 'block';
+      orig.style.display = 'none';
+    }
+  }
+  function toggle(i) {
+    for (index = counter.length; index <= i; index++) {
+      counter.push(1);
+    }
+    setState(i, counter[i]);
+    counter[i] = 1 - counter[i];
+    document.title = maintitle;
+  }
+  var toggleall_state = 1;
+  document.body.onkeydown = function(e) {
+    // space (32) to toggle orig/compr
+    if (e.keyCode == 32) {
+      var divs = document.getElementsByTagName('div');
+      toggleall_state = 1 - toggleall_state;
+      document.title = codecname + ' - ' + (toggleall_state == 0 ?
+          'originals' : 'compressed');
+      for (var i = 0; i < divs.length; i++) {
+        setState(i, toggleall_state);
+      }
+      return false;
+    }
+  };
+</script>
+)";
+  }
   std::string out_html;
   std::string outdir;
   out_html += "<body bgcolor=\"#000\">\n";
@@ -471,8 +546,12 @@ void WriteHtmlReport(const std::string& codec_desc,
     std::string name = FileBaseName(fnames[i]);
     std::string dir = FileDirName(fnames[i]);
     outdir = Args()->output_dir.empty() ? dir + "/out" : Args()->output_dir;
-    std::string name_out = name + "." + codec_name + Args()->output_extension;
-    std::string heatmap_out = name + "." + codec_name + ".heatmap.png";
+    std::string name_out = name + CodecToExtension(codec_name, '_');
+    if (Args()->html_report_use_decompressed) {
+      name_out += Args()->output_extension;
+    }
+    std::string heatmap_out =
+        name + CodecToExtension(codec_name, '_') + ".heatmap.png";
 
     std::string fname_orig = fnames[i];
     std::string fname_out = outdir + "/" + name_out;
@@ -500,28 +579,24 @@ void WriteHtmlReport(const std::string& codec_desc,
     double max_dist = tasks[i]->stats.max_distance;
     std::string compressed_title = StringPrintf(
         "compressed. bpp: %f, pnorm: %f, max dist: %f", bpp, pnorm, max_dist);
-    out_html += "<div onclick=\"toggle3(" + number +
+    out_html += "<div onclick=\"toggle(" + number +
                 ");\" style=\"display:inline-block;width:" + html_width +
                 ";height:" + html_height +
                 ";\">\n"
                 "  <img title=\"" +
                 compressed_title + "\" id=\"preview" + number + "\" src=";
-    out_html += "\"" + url_out + "\"";
-    out_html +=
-        " style=\"display:block;\"/>\n"
-        "  <img title=\"original\" id=\"orig" +
-        number + "\" src=";
-    out_html += "\"" + url_orig + "\"";
-    out_html +=
-        " style=\"display:none;\"/>\n"
-        "  <img title=\"heatmap\" id=\"hm" +
-        number + "\" src=";
-    out_html += "\"" + url_heatmap + "\"";
-    out_html += " style=\"display:none;\"/>\n</div>\n";
+    out_html += "\"" + url_out + "\"style=\"display:block;\"/>\n";
+    out_html += "  <img title=\"original\" id=\"orig" + number + "\" src=";
+    out_html += "\"" + url_orig + "\"style=\"display:none;\"/>\n";
+    if (add_heatmap) {
+      out_html = "  <img title=\"heatmap\" id=\"hm" + number + "\" src=";
+      out_html += "\"" + url_heatmap + "\"style=\"display:none;\"/>\n";
+    }
+    out_html += "</div>\n";
   }
   out_html += "</body>\n";
   out_html += toggle_js;
-  JXL_CHECK(WriteFile(out_html, outdir + "/index." + codec_name + ".html"));
+  JXL_CHECK(WriteFile(outdir + "/index." + codec_name + ".html", out_html));
 }
 
 // Prints the detailed and aggregate statistics, in the correct order but as
@@ -552,7 +627,6 @@ struct StatPrinter {
   }
 
   void TaskDone(size_t task_index, const Task& t) {
-    PROFILER_FUNC;
     std::lock_guard<std::mutex> guard(mutex);
     tasks_done_++;
     if (Args()->print_details || Args()->show_progress) {
@@ -603,17 +677,13 @@ struct StatPrinter {
     double comp_bpp =
         t.stats.total_compressed_size * 8.0 / t.stats.total_input_pixels;
     double p_norm = t.stats.distance_p_norm / t.stats.total_input_pixels;
+    double psnr = t.stats.psnr / t.stats.total_input_pixels;
+    double ssimulacra2 = t.stats.ssimulacra2 / t.stats.total_input_pixels;
     double bpp_p_norm = p_norm * comp_bpp;
 
     const double adj_comp_bpp =
         t.stats.total_adj_compressed_size * 8.0 / t.stats.total_input_pixels;
 
-    const double rmse =
-        std::sqrt(t.stats.distance_2 / t.stats.total_input_pixels);
-    const double psnr = t.stats.total_compressed_size == 0 ? 0.0
-                        : (t.stats.distance_2 == 0)
-                            ? 99.99
-                            : (20 * std::log10(1 / rmse));
     size_t pixels = t.stats.total_input_pixels;
 
     const double enc_mps =
@@ -646,10 +716,11 @@ struct StatPrinter {
       printf(
           "error:%" PRIdS "    size:%8" PRIdS "    pixels:%9" PRIdS
           "    enc_speed:%8.8f    dec_speed:%8.8f    bpp:%10.8f    dist:%10.8f"
-          "    psnr:%10.8f    p:%10.8f    bppp:%10.8f    qabpp:%10.8f ",
+          "    psnr:%10.8f    ssimulacra2:%.2f   p:%10.8f    bppp:%10.8f    "
+          "qabpp:%10.8f ",
           t.stats.total_errors, t.stats.total_compressed_size, pixels, enc_mps,
-          dec_mps, comp_bpp, t.stats.max_distance, psnr, p_norm, bpp_p_norm,
-          adj_comp_bpp);
+          dec_mps, comp_bpp, t.stats.max_distance, psnr, ssimulacra2, p_norm,
+          bpp_p_norm, adj_comp_bpp);
       for (size_t i = 0; i < t.stats.extra_metrics.size(); i++) {
         printf(" %s:%.8f", (*extra_metrics_names_)[i].c_str(),
                t.stats.extra_metrics[i]);
@@ -660,7 +731,6 @@ struct StatPrinter {
   }
 
   void PrintStats(const std::string& method, size_t idx_method) {
-    PROFILER_FUNC;
     // Assimilate all tasks with the same idx_method.
     BenchmarkStats method_stats;
     std::vector<const CodecInOut*> images;
@@ -680,6 +750,7 @@ struct StatPrinter {
 
     if (Args()->write_html_report) {
       WriteHtmlReport(method, *fnames_, tasks, images,
+                      Args()->save_heatmap && Args()->html_report_add_heatmap,
                       Args()->html_report_self_contained);
     }
 
@@ -741,27 +812,21 @@ class Benchmark {
   static int Run() {
     int ret = EXIT_SUCCESS;
     {
-      PROFILER_FUNC;
-
       const StringVec methods = GetMethods();
       const StringVec extra_metrics_names = GetExtraMetricsNames();
       const StringVec extra_metrics_commands = GetExtraMetricsCommands();
       const StringVec fnames = GetFilenames();
-      bool all_color_aware;
-      bool jpeg_transcoding_requested;
       // (non-const because Task.stats are updated)
-      std::vector<Task> tasks = CreateTasks(methods, fnames, &all_color_aware,
-                                            &jpeg_transcoding_requested);
+      std::vector<Task> tasks = CreateTasks(methods, fnames);
 
       std::unique_ptr<ThreadPoolInternal> pool;
       std::vector<std::unique_ptr<ThreadPoolInternal>> inner_pools;
-      InitThreads(static_cast<int>(tasks.size()), &pool, &inner_pools);
+      InitThreads(tasks.size(), &pool, &inner_pools);
 
-      const std::vector<CodecInOut> loaded_images = LoadImages(
-          fnames, all_color_aware, jpeg_transcoding_requested, pool.get());
+      const std::vector<CodecInOut> loaded_images = LoadImages(fnames, &*pool);
 
       if (RunTasks(methods, extra_metrics_names, extra_metrics_commands, fnames,
-                   loaded_images, pool.get(), inner_pools, &tasks) != 0) {
+                   loaded_images, &*pool, inner_pools, &tasks) != 0) {
         ret = EXIT_FAILURE;
         if (!Args()->silent_errors) {
           fprintf(stderr, "There were error(s) in the benchmark.\n");
@@ -769,24 +834,23 @@ class Benchmark {
       }
     }
 
-    // Must have exited profiler zone above before calling.
-    if (Args()->profiler) {
-      PROFILER_PRINT_RESULTS();
-    }
-    CacheAligned::PrintStats();
+    jxl::CacheAligned::PrintStats();
     return ret;
   }
 
  private:
-  static int NumOuterThreads(const int num_hw_threads, const int num_tasks) {
-    int num_threads = Args()->num_threads;
+  static size_t NumOuterThreads(const size_t num_hw_threads,
+                                const size_t num_tasks) {
     // Default to #cores
-    if (num_threads < 0) num_threads = num_hw_threads;
+    size_t num_threads = num_hw_threads;
+    if (Args()->num_threads >= 0) {
+      num_threads = static_cast<size_t>(Args()->num_threads);
+    }
 
     // As a safety precaution, limit the number of threads to 4x the number of
     // available CPUs.
     num_threads =
-        std::min<int>(num_threads, 4 * std::thread::hardware_concurrency());
+        std::min<size_t>(num_threads, 4 * std::thread::hardware_concurrency());
 
     // Don't create more threads than there are tasks (pointless/wasteful).
     num_threads = std::min(num_threads, num_tasks);
@@ -797,14 +861,21 @@ class Benchmark {
     return num_threads;
   }
 
-  static int NumInnerThreads(const int num_hw_threads, const int num_threads) {
-    int num_inner = Args()->inner_threads;
+  static int NumInnerThreads(const size_t num_hw_threads,
+                             const size_t num_threads) {
+    size_t num_inner;
 
     // Default: distribute remaining cores among tasks.
-    if (num_inner < 0) {
-      const int cores_for_outer = num_hw_threads - num_threads;
-      num_inner =
-          num_threads == 0 ? num_hw_threads : cores_for_outer / num_threads;
+    if (Args()->inner_threads < 0) {
+      if (num_threads == 0) {
+        num_inner = num_hw_threads;
+      } else if (num_hw_threads <= num_threads) {
+        num_inner = 1;
+      } else {
+        num_inner = (num_hw_threads - num_threads) / num_threads;
+      }
+    } else {
+      num_inner = static_cast<size_t>(Args()->inner_threads);
     }
 
     // Just one thread is counterproductive.
@@ -814,20 +885,21 @@ class Benchmark {
   }
 
   static void InitThreads(
-      const int num_tasks, std::unique_ptr<ThreadPoolInternal>* pool,
+      size_t num_tasks, std::unique_ptr<ThreadPoolInternal>* pool,
       std::vector<std::unique_ptr<ThreadPoolInternal>>* inner_pools) {
-    const int num_hw_threads = std::thread::hardware_concurrency();
-    const int num_threads = NumOuterThreads(num_hw_threads, num_tasks);
-    const int num_inner = NumInnerThreads(num_hw_threads, num_threads);
+    const size_t num_hw_threads = std::thread::hardware_concurrency();
+    const size_t num_threads = NumOuterThreads(num_hw_threads, num_tasks);
+    const size_t num_inner = NumInnerThreads(num_hw_threads, num_threads);
 
     fprintf(stderr,
-            "%d total threads, %d tasks, %d threads, %d inner threads\n",
+            "%" PRIuS " total threads, %" PRIuS " tasks, %" PRIuS
+            " threads, %" PRIuS " inner threads\n",
             num_hw_threads, num_tasks, num_threads, num_inner);
 
     pool->reset(new ThreadPoolInternal(num_threads));
     // Main thread OR worker threads in pool each get a possibly empty nested
     // pool (helps use all available cores when #tasks < #threads)
-    for (size_t i = 0; i < (*pool)->NumThreads(); ++i) {
+    for (size_t i = 0; i < std::max<size_t>(num_threads, 1); ++i) {
       inner_pools->emplace_back(new ThreadPoolInternal(num_inner));
     }
   }
@@ -938,76 +1010,55 @@ class Benchmark {
   }
 
   // (Load only once, not for every codec)
-  static std::vector<CodecInOut> LoadImages(
-      const StringVec& fnames, const bool all_color_aware,
-      const bool jpeg_transcoding_requested, ThreadPool* pool) {
-    PROFILER_FUNC;
+  static std::vector<CodecInOut> LoadImages(const StringVec& fnames,
+                                            ThreadPool* pool) {
     std::vector<CodecInOut> loaded_images;
     loaded_images.resize(fnames.size());
-    JXL_CHECK(RunOnPool(
-        pool, 0, static_cast<uint32_t>(fnames.size()), ThreadPool::NoInit,
-        [&](const uint32_t task, size_t /*thread*/) {
-          const size_t i = static_cast<size_t>(task);
-          Status ok = true;
-
-          if (!Args()->decode_only) {
-            PaddedBytes encoded;
-            ok = ReadFile(fnames[i], &encoded) &&
-                 (jpeg_transcoding_requested
-                      ? jpeg::DecodeImageJPG(Span<const uint8_t>(encoded),
-                                             &loaded_images[i])
-                      : SetFromBytes(Span<const uint8_t>(encoded),
-                                     Args()->color_hints, &loaded_images[i]));
-            if (ok && Args()->intensity_target != 0) {
-              loaded_images[i].metadata.m.SetIntensityTarget(
-                  Args()->intensity_target);
-            }
-          }
-          if (!ok) {
-            if (!Args()->silent_errors) {
-              fprintf(stderr, "Failed to load image %s\n", fnames[i].c_str());
-            }
-            return;
-          }
-
-          if (!Args()->decode_only && all_color_aware) {
-            const bool is_gray = loaded_images[i].Main().IsGray();
-            const ColorEncoding& c_desired = ColorEncoding::LinearSRGB(is_gray);
-            if (!loaded_images[i].TransformTo(c_desired, GetJxlCms(),
-                                              /*pool=*/nullptr)) {
-              JXL_ABORT("Failed to transform to lin. sRGB %s",
-                        fnames[i].c_str());
-            }
-          }
+    const auto process_image = [&](const uint32_t task, size_t /*thread*/) {
+      const size_t i = static_cast<size_t>(task);
+      Status ok = true;
+
+      if (!Args()->decode_only) {
+        std::vector<uint8_t> encoded;
+        ok = ReadFile(fnames[i], &encoded);
+        if (ok) {
+          ok = jxl::SetFromBytes(Bytes(encoded), Args()->color_hints,
+                                 &loaded_images[i]);
+        }
+        if (ok && Args()->intensity_target != 0) {
+          loaded_images[i].metadata.m.SetIntensityTarget(
+              Args()->intensity_target);
+        }
+      }
+      if (!ok) {
+        if (!Args()->silent_errors) {
+          fprintf(stderr, "Failed to load image %s\n", fnames[i].c_str());
+        }
+        return;
+      }
 
-          if (!Args()->decode_only && Args()->override_bitdepth != 0) {
-            if (Args()->override_bitdepth == 32) {
-              loaded_images[i].metadata.m.SetFloat32Samples();
-            } else {
-              loaded_images[i].metadata.m.SetUintSamples(
-                  Args()->override_bitdepth);
-            }
-          }
-        },
-        "Load images"));
+      if (!Args()->decode_only && Args()->override_bitdepth != 0) {
+        if (Args()->override_bitdepth == 32) {
+          loaded_images[i].metadata.m.SetFloat32Samples();
+        } else {
+          loaded_images[i].metadata.m.SetUintSamples(Args()->override_bitdepth);
+        }
+      }
+    };
+    JXL_CHECK(jxl::RunOnPool(pool, 0, static_cast<uint32_t>(fnames.size()),
+                             ThreadPool::NoInit, process_image, "Load images"));
     return loaded_images;
   }
 
   static std::vector<Task> CreateTasks(const StringVec& methods,
-                                       const StringVec& fnames,
-                                       bool* all_color_aware,
-                                       bool* jpeg_transcoding_requested) {
+                                       const StringVec& fnames) {
     std::vector<Task> tasks;
     tasks.reserve(methods.size() * fnames.size());
-    *all_color_aware = true;
-    *jpeg_transcoding_requested = false;
     for (size_t idx_image = 0; idx_image < fnames.size(); ++idx_image) {
       for (size_t idx_method = 0; idx_method < methods.size(); ++idx_method) {
         tasks.emplace_back();
         Task& t = tasks.back();
         t.codec = CreateImageCodec(methods[idx_method]);
-        *all_color_aware &= t.codec->IsColorAware();
-        *jpeg_transcoding_requested |= t.codec->IsJpegTranscoder();
         t.idx_image = idx_image;
         t.idx_method = idx_method;
         // t.stats is default-initialized.
@@ -1021,10 +1072,9 @@ class Benchmark {
   static size_t RunTasks(
       const StringVec& methods, const StringVec& extra_metrics_names,
       const StringVec& extra_metrics_commands, const StringVec& fnames,
-      const std::vector<CodecInOut>& loaded_images, ThreadPoolInternal* pool,
+      const std::vector<CodecInOut>& loaded_images, ThreadPool* pool,
       const std::vector<std::unique_ptr<ThreadPoolInternal>>& inner_pools,
       std::vector<Task>* tasks) {
-    PROFILER_FUNC;
     StatPrinter printer(methods, extra_metrics_names, fnames, *tasks);
     if (Args()->print_details_csv) {
       // Print CSV header
@@ -1038,7 +1088,7 @@ class Benchmark {
     }
 
     std::vector<uint64_t> errors_thread;
-    JXL_CHECK(RunOnPool(
+    JXL_CHECK(jxl::RunOnPool(
         pool, 0, tasks->size(),
         [&](const size_t num_threads) {
           // Reduce false sharing by only writing every 8th slot (64 bytes).
@@ -1051,14 +1101,15 @@ class Benchmark {
           t.image = &image;
           std::vector<uint8_t> compressed;
           DoCompress(fnames[t.idx_image], image, extra_metrics_commands,
-                     t.codec.get(), inner_pools[thread].get(), &compressed,
+                     t.codec.get(), &*inner_pools[thread], &compressed,
                      &t.stats);
           printer.TaskDone(i, t);
           errors_thread[8 * thread] += t.stats.total_errors;
         },
         "Benchmark tasks"));
     if (Args()->show_progress) fprintf(stderr, "\n");
-    return std::accumulate(errors_thread.begin(), errors_thread.end(), 0);
+    return std::accumulate(errors_thread.begin(), errors_thread.end(),
+                           size_t(0));
   }
 };
 
@@ -1085,6 +1136,9 @@ int BenchmarkMain(int argc, const char** argv) {
 }
 
 }  // namespace
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
-int main(int argc, const char** argv) { return jxl::BenchmarkMain(argc, argv); }
+int main(int argc, const char** argv) {
+  return jpegxl::tools::BenchmarkMain(argc, argv);
+}
diff --git a/tools/box/CMakeLists.txt b/tools/box/CMakeLists.txt
deleted file mode 100644 (file)
index c79add0..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-add_library(box STATIC EXCLUDE_FROM_ALL
-  box.cc
-  box.h
-)
-# This library can be included into position independent binaries.
-set_target_properties(box PROPERTIES POSITION_INDEPENDENT_CODE TRUE)
-target_link_libraries(box
-  jxl-static
-  jxl_threads-static
-)
-target_include_directories(box
-  PRIVATE
-  "${PROJECT_SOURCE_DIR}"
-)
-
-if(JPEGXL_ENABLE_DEVTOOLS)
-add_executable(box_list
-  box_list_main.cc
-)
-target_link_libraries(box_list
-  box
-)
-endif()  # JPEGXL_ENABLE_DEVTOOLS
diff --git a/tools/box/box.cc b/tools/box/box.cc
deleted file mode 100644 (file)
index db73c7c..0000000
+++ /dev/null
@@ -1,285 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "tools/box/box.h"
-
-#include "lib/jxl/base/byte_order.h"  // for GetMaximumBrunsliEncodedSize
-#include "lib/jxl/jpeg/dec_jpeg_data.h"
-#include "lib/jxl/jpeg/jpeg_data.h"
-
-namespace jpegxl {
-namespace tools {
-
-namespace {
-// Checks if a + b > size, taking possible integer overflow into account.
-bool OutOfBounds(size_t a, size_t b, size_t size) {
-  size_t pos = a + b;
-  if (pos > size) return true;
-  if (pos < a) return true;  // overflow happened
-  return false;
-}
-}  // namespace
-
-// Parses the header of a BMFF box. Returns the result in a Box struct.
-// Sets the position to the end of the box header after parsing. The data size
-// is output if known, or must be handled by the caller and runs until the end
-// of the container file if not known.
-jxl::Status ParseBoxHeader(const uint8_t** next_in, size_t* available_in,
-                           Box* box) {
-  size_t pos = 0;
-  size_t size = *available_in;
-  const uint8_t* in = *next_in;
-
-  if (OutOfBounds(pos, 8, size)) return JXL_FAILURE("out of bounds");
-
-  const size_t initial_pos = pos;
-
-  // Total box_size including this header itself.
-  uint64_t box_size = LoadBE32(in + pos);
-  pos += 4;
-  if (box_size == 1) {
-    // If the size is 1, it indicates extended size read from 64-bit integer.
-    if (OutOfBounds(pos, 8, size)) return JXL_FAILURE("out of bounds");
-    box_size = LoadBE64(in + pos);
-    pos += 8;
-  }
-  memcpy(box->type, in + pos, 4);
-  pos += 4;
-  if (!memcmp("uuid", box->type, 4)) {
-    if (OutOfBounds(pos, 16, size)) return JXL_FAILURE("out of bounds");
-    memcpy(box->extended_type, in + pos, 16);
-    pos += 16;
-  }
-
-  // This is the end of the box header, the box data begins here. Handle
-  // the data size now.
-  const size_t data_pos = pos;
-  const size_t header_size = data_pos - initial_pos;
-
-  if (box_size != 0) {
-    if (box_size < header_size) {
-      return JXL_FAILURE("invalid box size");
-    }
-    box->data_size_given = true;
-    box->data_size = box_size - header_size;
-  } else {
-    // The size extends to the end of the file. We don't necessarily know the
-    // end of the file here, since the input size may be only part of the full
-    // container file. Indicate the size is not given, the caller must handle
-    // this.
-    box->data_size_given = false;
-    box->data_size = 0;
-  }
-
-  // The remaining bytes are the data. If the box is a full box, the first
-  // bytes of the data have a certain structure but this is to be handled by
-  // the caller for the appropriate box type.
-  *next_in += pos;
-  *available_in -= pos;
-
-  return true;
-}
-
-jxl::Status AppendBoxHeader(const Box& box, jxl::PaddedBytes* out) {
-  bool use_extended = !memcmp("uuid", box.type, 4);
-
-  uint64_t box_size = 0;
-  bool large_size = false;
-  if (box.data_size_given) {
-    box_size = box.data_size + 8 + (use_extended ? 16 : 0);
-    if (box_size >= 0x100000000ull) {
-      large_size = true;
-    }
-  }
-
-  out->resize(out->size() + 4);
-  StoreBE32(large_size ? 1 : box_size, &out->back() - 4 + 1);
-
-  out->resize(out->size() + 4);
-  memcpy(&out->back() - 4 + 1, box.type, 4);
-
-  if (large_size) {
-    out->resize(out->size() + 8);
-    StoreBE64(box_size, &out->back() - 8 + 1);
-  }
-
-  if (use_extended) {
-    out->resize(out->size() + 16);
-    memcpy(&out->back() - 16 + 1, box.extended_type, 16);
-  }
-
-  return true;
-}
-
-bool IsContainerHeader(const uint8_t* data, size_t size) {
-  const uint8_t box_header[] = {0,   0,   0,   0xc, 'J',  'X',
-                                'L', ' ', 0xd, 0xa, 0x87, 0xa};
-  if (size < sizeof(box_header)) return false;
-  return memcmp(box_header, data, sizeof(box_header)) == 0;
-}
-
-jxl::Status DecodeJpegXlContainerOneShot(const uint8_t* data, size_t size,
-                                         JpegXlContainer* container) {
-  const uint8_t* in = data;
-  size_t available_in = size;
-
-  container->exif = nullptr;
-  container->exif_size = 0;
-  container->exfc = nullptr;
-  container->exfc_size = 0;
-  container->xml.clear();
-  container->xmlc.clear();
-  container->jumb = nullptr;
-  container->jumb_size = 0;
-  container->codestream.clear();
-  container->jpeg_reconstruction = nullptr;
-  container->jpeg_reconstruction_size = 0;
-
-  size_t box_index = 0;
-
-  while (available_in != 0) {
-    Box box;
-    if (!ParseBoxHeader(&in, &available_in, &box)) {
-      return JXL_FAILURE("Invalid box header");
-    }
-
-    size_t data_size = box.data_size_given ? box.data_size : available_in;
-
-    if (box.data_size > available_in) {
-      return JXL_FAILURE("Unexpected end of file");
-    }
-
-    if (box_index == 0) {
-      // TODO(lode): leave out magic signature box?
-      // Must be magic signature box.
-      if (memcmp("JXL ", box.type, 4) != 0) {
-        return JXL_FAILURE("Invalid magic signature");
-      }
-      if (box.data_size != 4) return JXL_FAILURE("Invalid magic signature");
-      if (in[0] != 0xd || in[1] != 0xa || in[2] != 0x87 || in[3] != 0xa) {
-        return JXL_FAILURE("Invalid magic signature");
-      }
-    } else if (box_index == 1) {
-      // Must be ftyp box.
-      if (memcmp("ftyp", box.type, 4) != 0) {
-        return JXL_FAILURE("Invalid ftyp");
-      }
-      if (box.data_size != 12) return JXL_FAILURE("Invalid ftyp");
-      const char* expected = "jxl \0\0\0\0jxl ";
-      if (memcmp(expected, in, 12) != 0) return JXL_FAILURE("Invalid ftyp");
-    } else if (!memcmp("jxli", box.type, 4)) {
-      // TODO(lode): parse JXL frame index box
-      if (!container->codestream.empty()) {
-        return JXL_FAILURE("frame index must come before codestream");
-      }
-    } else if (!memcmp("jxlc", box.type, 4)) {
-      container->codestream.append(in, in + data_size);
-    } else if (!memcmp("jxlp", box.type, 4)) {
-      if (data_size < 4) return JXL_FAILURE("Invalid jxlp");
-      // TODO(jon): don't just ignore the counter
-      container->codestream.append(in + 4, in + data_size);
-    } else if (!memcmp("Exif", box.type, 4)) {
-      if (data_size < 4) return JXL_FAILURE("Invalid Exif");
-      uint32_t tiff_header_offset = LoadBE32(in);
-      if (tiff_header_offset > data_size - 4)
-        return JXL_FAILURE("Invalid Exif tiff header offset");
-      container->exif = in + 4 + tiff_header_offset;
-      container->exif_size = data_size - 4 - tiff_header_offset;
-    } else if (!memcmp("Exfc", box.type, 4)) {
-      container->exfc = in;
-      container->exfc_size = data_size;
-    } else if (!memcmp("xml ", box.type, 4)) {
-      container->xml.emplace_back(in, data_size);
-    } else if (!memcmp("xmlc", box.type, 4)) {
-      container->xmlc.emplace_back(in, data_size);
-    } else if (!memcmp("jumb", box.type, 4)) {
-      container->jumb = in;
-      container->jumb_size = data_size;
-    } else if (!memcmp("jbrd", box.type, 4)) {
-      container->jpeg_reconstruction = in;
-      container->jpeg_reconstruction_size = data_size;
-    } else {
-      // Do nothing: box not recognized here but may be recognizable by
-      // other software.
-    }
-
-    in += data_size;
-    available_in -= data_size;
-    box_index++;
-  }
-
-  return true;
-}
-
-static jxl::Status AppendBoxAndData(const char type[4], const uint8_t* data,
-                                    size_t data_size, jxl::PaddedBytes* out,
-                                    bool exif = false) {
-  Box box;
-  memcpy(box.type, type, 4);
-  box.data_size = data_size + (exif ? 4 : 0);
-  box.data_size_given = true;
-  JXL_RETURN_IF_ERROR(AppendBoxHeader(box, out));
-  // for Exif: always use tiff header offset 0
-  if (exif)
-    for (int i = 0; i < 4; i++) out->push_back(0);
-  out->append(data, data + data_size);
-  return true;
-}
-
-jxl::Status EncodeJpegXlContainerOneShot(const JpegXlContainer& container,
-                                         jxl::PaddedBytes* out) {
-  const unsigned char header[] = {0,   0,   0,    0xc, 'J', 'X', 'L', ' ',
-                                  0xd, 0xa, 0x87, 0xa, 0,   0,   0,   0x14,
-                                  'f', 't', 'y',  'p', 'j', 'x', 'l', ' ',
-                                  0,   0,   0,    0,   'j', 'x', 'l', ' '};
-  size_t header_size = sizeof(header);
-  out->append(header, header + header_size);
-
-  if (container.exif) {
-    JXL_RETURN_IF_ERROR(AppendBoxAndData("Exif", container.exif,
-                                         container.exif_size, out, true));
-  }
-
-  if (container.exfc) {
-    JXL_RETURN_IF_ERROR(
-        AppendBoxAndData("Exfc", container.exfc, container.exfc_size, out));
-  }
-
-  for (size_t i = 0; i < container.xml.size(); i++) {
-    JXL_RETURN_IF_ERROR(AppendBoxAndData("xml ", container.xml[i].first,
-                                         container.xml[i].second, out));
-  }
-
-  for (size_t i = 0; i < container.xmlc.size(); i++) {
-    JXL_RETURN_IF_ERROR(AppendBoxAndData("xmlc", container.xmlc[i].first,
-                                         container.xmlc[i].second, out));
-  }
-
-  if (container.jpeg_reconstruction) {
-    JXL_RETURN_IF_ERROR(AppendBoxAndData("jbrd", container.jpeg_reconstruction,
-                                         container.jpeg_reconstruction_size,
-                                         out));
-  }
-
-  if (!container.codestream.empty()) {
-    JXL_RETURN_IF_ERROR(AppendBoxAndData("jxlc", container.codestream.data(),
-                                         container.codestream.size(), out));
-  } else {
-    return JXL_FAILURE("must have primary image frame");
-  }
-
-  if (container.jumb) {
-    JXL_RETURN_IF_ERROR(
-        AppendBoxAndData("jumb", container.jumb, container.jumb_size, out));
-  }
-
-  return true;
-}
-
-// TODO(veluca): the format defined here encode some things multiple times. Fix
-// that.
-
-}  // namespace tools
-}  // namespace jpegxl
diff --git a/tools/box/box.h b/tools/box/box.h
deleted file mode 100644 (file)
index 4cc3058..0000000
+++ /dev/null
@@ -1,113 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Tools for reading from / writing to ISOBMFF format for JPEG XL.
-
-#ifndef TOOLS_BOX_BOX_H_
-#define TOOLS_BOX_BOX_H_
-
-#include <string>
-#include <vector>
-
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/enc_file.h"
-
-namespace jpegxl {
-namespace tools {
-
-// A top-level box in the box format.
-struct Box {
-  // The type of the box.
-  // If "uuid", use extended_type instead
-  char type[4];
-
-  // The extended_type is only used when type == "uuid".
-  // Extended types are not used in JXL. However, the box format itself
-  // supports this so they are handled correctly.
-  char extended_type[16];
-
-  // Size of the data, excluding box header. The box ends, and next box
-  // begins, at data + size. May not be used if data_size_given is false.
-  uint64_t data_size;
-
-  // If the size is not given, the datasize extends to the end of the file.
-  // If this field is false, the size field may not be used.
-  bool data_size_given;
-};
-
-// Parses the header of a BMFF box. Returns the result in a Box struct.
-// Updates next_in and available_in to point at the data in the box, directly
-// after the header.
-// Sets the data_size if known, or must be handled by the caller and runs until
-// the end of the container file if not known.
-// NOTE: available_in should be at least 8 up to 32 bytes to parse the
-// header without error.
-jxl::Status ParseBoxHeader(const uint8_t** next_in, size_t* available_in,
-                           Box* box);
-
-// TODO(lode): streaming C API
-jxl::Status AppendBoxHeader(const Box& box, jxl::PaddedBytes* out);
-
-// NOTE: after DecodeJpegXlContainerOneShot, the exif etc. pointers point to
-// regions within the input data passed to that function.
-struct JpegXlContainer {
-  // Exif metadata, or null if not present in the container.
-  // The exif data has the format of 'Exif block' as defined in
-  // ISO/IEC23008-12:2017 Clause A.2.1
-  // Here we assume the tiff header offset is 0 and store only the
-  // actual Exif data (starting with the tiff header MM or II)
-  // TODO(lode): support the theoretical case of multiple exif boxes
-  const uint8_t* exif = nullptr;  // Not owned
-  size_t exif_size = 0;
-
-  // Brotli-compressed exif metadata, if present. The data points to the brotli
-  // compressed stream, it is not decompressed here.
-  const uint8_t* exfc = nullptr;  // Not owned
-  size_t exfc_size = 0;
-
-  // XML boxes for XMP. There may be multiple XML boxes.
-  // Each entry points to XML location and provides size.
-  // The memory is not owned.
-  // TODO(lode): for C API, cannot use std::vector.
-  std::vector<std::pair<const uint8_t*, size_t>> xml;
-
-  // Brotli-compressed xml boxes. The bytes are given in brotli-compressed form
-  // and are not decompressed here.
-  std::vector<std::pair<const uint8_t*, size_t>> xmlc;
-
-  // JUMBF superbox data, or null if not present in the container.
-  // The parsing of the nested boxes inside is not handled here.
-  const uint8_t* jumb = nullptr;  // Not owned
-  size_t jumb_size = 0;
-
-  // TODO(lode): add frame index data
-
-  // JPEG reconstruction data, or null if not present in the container.
-  const uint8_t* jpeg_reconstruction = nullptr;
-  size_t jpeg_reconstruction_size = 0;
-
-  // The main JPEG XL codestream, of which there must be 1 in the container.
-  jxl::PaddedBytes codestream;
-};
-
-// Returns whether `data` starts with a container header; definitely returns
-// false if `size` is less than 12 bytes.
-bool IsContainerHeader(const uint8_t* data, size_t size);
-
-// NOTE: the input data must remain valid as long as `container` is used,
-// because its exif etc. pointers point to that data.
-jxl::Status DecodeJpegXlContainerOneShot(const uint8_t* data, size_t size,
-                                         JpegXlContainer* container);
-
-// TODO(lode): streaming C API
-jxl::Status EncodeJpegXlContainerOneShot(const JpegXlContainer& container,
-                                         jxl::PaddedBytes* out);
-
-}  // namespace tools
-}  // namespace jpegxl
-
-#endif  // TOOLS_BOX_BOX_H_
diff --git a/tools/box/box_list_main.cc b/tools/box/box_list_main.cc
deleted file mode 100644 (file)
index 40ca910..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This binary tool lists the boxes of any box-based format (JPEG XL,
-// JPEG 2000, MP4, ...).
-// This exists as a test for manual verification, rather than an actual tool.
-
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/base/status.h"
-#include "tools/box/box.h"
-
-namespace jpegxl {
-namespace tools {
-
-int RunMain(int argc, const char* argv[]) {
-  if (argc < 2) {
-    fprintf(stderr, "Usage: %s <filename>", argv[0]);
-    return 1;
-  }
-
-  jxl::PaddedBytes compressed;
-  if (!jxl::ReadFile(argv[1], &compressed)) return 1;
-  fprintf(stderr, "Read %" PRIuS " compressed bytes\n", compressed.size());
-
-  const uint8_t* in = compressed.data();
-  size_t available_in = compressed.size();
-
-  fprintf(stderr, "File size: %" PRIuS "\n", compressed.size());
-
-  while (available_in != 0) {
-    const uint8_t* start = in;
-    Box box;
-    if (!ParseBoxHeader(&in, &available_in, &box)) {
-      fprintf(stderr, "Failed at %" PRIuS "\n",
-              compressed.size() - available_in);
-      break;
-    }
-
-    size_t data_size = box.data_size_given ? box.data_size : available_in;
-    size_t header_size = in - start;
-    size_t box_size = header_size + data_size;
-
-    for (size_t i = 0; i < sizeof(box.type); i++) {
-      char c = box.type[i];
-      if (c < 32 || c > 127) {
-        printf("Unprintable character in box type, likely not a box file.\n");
-        return 0;
-      }
-    }
-
-    printf("box: \"%.4s\" box_size:%" PRIuS " data_size:%" PRIuS, box.type,
-           box_size, data_size);
-    if (!memcmp("uuid", box.type, 4)) {
-      printf(" -- extended type:\"%.16s\"", box.extended_type);
-    }
-    if (!memcmp("ftyp", box.type, 4) && data_size > 4) {
-      std::string ftype(in, in + 4);
-      printf(" -- ftype:\"%s\"", ftype.c_str());
-    }
-    printf("\n");
-
-    if (data_size > available_in) {
-      fprintf(
-          stderr, "Unexpected end of file %" PRIuS " %" PRIuS " %" PRIuS "\n",
-          static_cast<size_t>(box.data_size), available_in, compressed.size());
-      break;
-    }
-
-    in += data_size;
-    available_in -= data_size;
-  }
-
-  return 0;
-}
-
-}  // namespace tools
-}  // namespace jpegxl
-
-int main(int argc, const char* argv[]) {
-  return jpegxl::tools::RunMain(argc, argv);
-}
diff --git a/tools/box/box_test.cc b/tools/box/box_test.cc
deleted file mode 100644 (file)
index 3146bcf..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "tools/box/box.h"
-
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "gtest/gtest.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/override.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/status.h"
-
-TEST(BoxTest, BoxTest) {
-  size_t test_size = 256;
-  jxl::PaddedBytes exif(test_size);
-  jxl::PaddedBytes xml0(test_size);
-  jxl::PaddedBytes xml1(test_size);
-  jxl::PaddedBytes jumb(test_size);
-  jxl::PaddedBytes codestream(test_size);
-  // Generate arbitrary data for the codestreams: the test is not testing
-  // the contents of them but whether they are preserved in the container.
-  uint8_t v = 0;
-  for (size_t i = 0; i < test_size; ++i) {
-    exif[i] = v++;
-    xml0[i] = v++;
-    xml1[i] = v++;
-    jumb[i] = v++;
-    codestream[i] = v++;
-  }
-
-  jpegxl::tools::JpegXlContainer container;
-  container.exif = exif.data();
-  container.exif_size = exif.size();
-  container.xml.emplace_back(xml0.data(), xml0.size());
-  container.xml.emplace_back(xml1.data(), xml1.size());
-  container.xmlc.emplace_back(xml1.data(), xml1.size());
-  container.jumb = jumb.data();
-  container.jumb_size = jumb.size();
-  container.codestream = std::move(codestream);
-
-  jxl::PaddedBytes file;
-  EXPECT_EQ(true,
-            jpegxl::tools::EncodeJpegXlContainerOneShot(container, &file));
-
-  jpegxl::tools::JpegXlContainer container2;
-  EXPECT_EQ(true, jpegxl::tools::DecodeJpegXlContainerOneShot(
-                      file.data(), file.size(), &container2));
-
-  EXPECT_EQ(exif.size(), container2.exif_size);
-  EXPECT_EQ(0, memcmp(exif.data(), container2.exif, container2.exif_size));
-  EXPECT_EQ(2u, container2.xml.size());
-  if (container2.xml.size() == 2) {
-    EXPECT_EQ(xml0.size(), container2.xml[0].second);
-    EXPECT_EQ(0, memcmp(xml0.data(), container2.xml[0].first,
-                        container2.xml[0].second));
-    EXPECT_EQ(xml1.size(), container2.xml[1].second);
-    EXPECT_EQ(0, memcmp(xml1.data(), container2.xml[1].first,
-                        container2.xml[1].second));
-  }
-  EXPECT_EQ(1u, container2.xmlc.size());
-  if (container2.xmlc.size() == 1) {
-    EXPECT_EQ(xml1.size(), container2.xmlc[0].second);
-    EXPECT_EQ(0, memcmp(xml1.data(), container2.xmlc[0].first,
-                        container2.xmlc[0].second));
-  }
-  EXPECT_EQ(jumb.size(), container2.jumb_size);
-  EXPECT_EQ(0, memcmp(jumb.data(), container2.jumb, container2.jumb_size));
-  EXPECT_EQ(container.codestream.size(), container2.codestream.size());
-  EXPECT_EQ(0, memcmp(container.codestream.data(), container2.codestream.data(),
-                      container2.codestream.size()));
-}
diff --git a/tools/build_cleaner.py b/tools/build_cleaner.py
deleted file mode 100755 (executable)
index 76857d7..0000000
+++ /dev/null
@@ -1,317 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) the JPEG XL Project Authors. All rights reserved.
-#
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-
-"""build_cleaner.py: Update build files.
-
-This tool keeps certain parts of the build files up to date.
-"""
-
-import argparse
-import collections
-import locale
-import os
-import re
-import subprocess
-import sys
-import tempfile
-
-
-def RepoFiles(src_dir):
-  """Return the list of files from the source git repository"""
-  git_bin = os.environ.get('GIT_BIN', 'git')
-  files = subprocess.check_output([git_bin, '-C', src_dir, 'ls-files'])
-  ret = files.decode(locale.getpreferredencoding()).splitlines()
-  ret.sort()
-  return ret
-
-def GetPrefixLibFiles(repo_files, prefix, suffixes=('.h', '.cc', '.ui')):
-  """Gets the library files that start with the prefix and end with source
-  code suffix."""
-  prefix_files = [
-      fn for fn in repo_files
-      if fn.startswith(prefix) and any(fn.endswith(suf) for suf in suffixes)]
-  return prefix_files
-
-# Type holding the different types of sources in libjxl:
-#   * decoder and common sources,
-#   * encoder-only sources,
-#   * tests-only sources,
-#   * google benchmark sources,
-#   * threads library sources,
-#   * extras library sources,
-#   * libjxl (encoder+decoder) public include/ headers and
-#   * threads public include/ headers.
-JxlSources = collections.namedtuple(
-    'JxlSources', ['dec', 'enc', 'test', 'gbench', 'threads',
-                   'extras', 'jxl_public_hdrs', 'threads_public_hdrs'])
-
-def SplitLibFiles(repo_files):
-  """Splits the library files into the different groups.
-
-  """
-  testonly = (
-      'testdata.h', 'test_utils.h', 'test_image.h', '_test.h', '_test.cc',
-      # _testonly.* files are library code used in tests only.
-      '_testonly.h', '_testonly.cc'
-  )
-  main_srcs = GetPrefixLibFiles(repo_files, 'lib/jxl/')
-  extras_srcs = GetPrefixLibFiles(repo_files, 'lib/extras/')
-  test_srcs = [fn for fn in main_srcs
-               if any(patt in fn for patt in testonly)]
-  lib_srcs = [fn for fn in main_srcs
-              if not any(patt in fn for patt in testonly)]
-
-  # Google benchmark sources.
-  gbench_srcs = sorted(fn for fn in lib_srcs + extras_srcs
-                       if fn.endswith('_gbench.cc'))
-  lib_srcs = [fn for fn in lib_srcs if fn not in gbench_srcs]
-  # Exclude optional codecs from extras.
-  exclude_extras = [
-    '/dec/gif',
-    '/dec/apng', '/enc/apng',
-    '/dec/exr', '/enc/exr',
-    '/dec/jpg', '/enc/jpg',
-  ]
-  extras_srcs = [fn for fn in extras_srcs if fn not in gbench_srcs and
-                 not any(patt in fn for patt in testonly) and
-                 not any(patt in fn for patt in exclude_extras)]
-
-
-  enc_srcs = [fn for fn in lib_srcs
-              if os.path.basename(fn).startswith('enc_') or
-                 os.path.basename(fn).startswith('butteraugli')]
-  enc_srcs.extend([
-      "lib/jxl/encode.cc",
-      "lib/jxl/encode_internal.h",
-      "lib/jxl/gaborish.cc",
-      "lib/jxl/gaborish.h",
-      "lib/jxl/huffman_tree.cc",
-      "lib/jxl/huffman_tree.h",
-      # Only the inlines in linalg.h header are used in the decoder.
-      # TODO(deymo): split out encoder only linalg.h functions.
-      "lib/jxl/linalg.cc",
-      "lib/jxl/optimize.cc",
-      "lib/jxl/optimize.h",
-      "lib/jxl/progressive_split.cc",
-      "lib/jxl/progressive_split.h",
-      # TODO(deymo): Add luminance.cc and luminance.h here too. Currently used
-      # by aux_out.h.
-  ])
-  # Temporarily remove enc_bit_writer from the encoder sources: a lot of
-  # decoder source code still needs to be split up into encoder and decoder.
-  # Including the enc_bit_writer in the decoder allows to build a working
-  # libjxl_dec library.
-  # TODO(lode): remove the dependencies of the decoder on enc_bit_writer and
-  # remove enc_bit_writer from the dec_srcs again.
-  enc_srcs.remove("lib/jxl/enc_bit_writer.cc")
-  enc_srcs.remove("lib/jxl/enc_bit_writer.h")
-  enc_srcs.sort()
-
-  enc_srcs_set = set(enc_srcs)
-  lib_srcs = [fn for fn in lib_srcs if fn not in enc_srcs_set]
-
-  # The remaining of the files are in the dec_library.
-  dec_srcs = lib_srcs
-
-  thread_srcs = GetPrefixLibFiles(repo_files, 'lib/threads/')
-  thread_srcs = [fn for fn in thread_srcs
-                 if not any(patt in fn for patt in testonly)]
-  public_hdrs = GetPrefixLibFiles(repo_files, 'lib/include/jxl/')
-
-  threads_public_hdrs = [fn for fn in public_hdrs if '_parallel_runner' in fn]
-  jxl_public_hdrs = list(sorted(set(public_hdrs) - set(threads_public_hdrs)))
-  return JxlSources(dec_srcs, enc_srcs, test_srcs, gbench_srcs, thread_srcs,
-                    extras_srcs, jxl_public_hdrs, threads_public_hdrs)
-
-
-def CleanFile(args, filename, pattern_data_list):
-  """Replace a pattern match with new data in the passed file.
-
-  Given a regular expression pattern with a single () match, it runs the regex
-  over the passed filename and replaces the match () with the new data. If
-  args.update is set, it will update the file with the new contents, otherwise
-  it will return True when no changes were needed.
-
-  Multiple pairs of (regular expression, new data) can be passed to the
-  pattern_data_list parameter and will be applied in order.
-
-  The regular expression must match at least once in the file.
-  """
-  filepath = os.path.join(args.src_dir, filename)
-  with open(filepath, 'r') as f:
-    src_text = f.read()
-
-  if not pattern_data_list:
-    return True
-
-  new_text = src_text
-
-  for pattern, data in pattern_data_list:
-    offset = 0
-    chunks = []
-    for match in re.finditer(pattern, new_text):
-      chunks.append(new_text[offset:match.start(1)])
-      offset = match.end(1)
-      chunks.append(data)
-    if not chunks:
-      raise Exception('Pattern not found for %s: %r' % (filename, pattern))
-    chunks.append(new_text[offset:])
-    new_text = ''.join(chunks)
-
-  if new_text == src_text:
-    return True
-
-  if args.update:
-    print('Updating %s' % filename)
-    with open(filepath, 'w') as f:
-      f.write(new_text)
-    return True
-  else:
-    with tempfile.NamedTemporaryFile(
-        mode='w', prefix=os.path.basename(filename)) as new_file:
-      new_file.write(new_text)
-      new_file.flush()
-      subprocess.call(
-          ['diff', '-u', filepath, '--label', 'a/' + filename, new_file.name,
-           '--label', 'b/' + filename])
-    return False
-
-
-def BuildCleaner(args):
-  repo_files = RepoFiles(args.src_dir)
-  ok = True
-
-  # jxl version
-  with open(os.path.join(args.src_dir, 'lib/CMakeLists.txt'), 'r') as f:
-    cmake_text = f.read()
-
-  gni_patterns = []
-  for varname in ('JPEGXL_MAJOR_VERSION', 'JPEGXL_MINOR_VERSION',
-                  'JPEGXL_PATCH_VERSION'):
-    # Defined in CMakeLists.txt as "set(varname 1234)"
-    match = re.search(r'set\(' + varname + r' ([0-9]+)\)', cmake_text)
-    version_value = match.group(1)
-    gni_patterns.append((r'"' + varname + r'=([0-9]+)"', version_value))
-
-  jxl_src = SplitLibFiles(repo_files)
-
-  # libjxl
-  jxl_cmake_patterns = []
-  jxl_cmake_patterns.append(
-      (r'set\(JPEGXL_INTERNAL_SOURCES_DEC\n([^\)]+)\)',
-       ''.join('  %s\n' % fn[len('lib/'):] for fn in jxl_src.dec)))
-  jxl_cmake_patterns.append(
-      (r'set\(JPEGXL_INTERNAL_SOURCES_ENC\n([^\)]+)\)',
-       ''.join('  %s\n' % fn[len('lib/'):] for fn in jxl_src.enc)))
-  ok = CleanFile(
-      args, 'lib/jxl.cmake',
-      jxl_cmake_patterns) and ok
-
-  ok = CleanFile(
-      args, 'lib/jxl_benchmark.cmake',
-      [(r'set\(JPEGXL_INTERNAL_SOURCES_GBENCH\n([^\)]+)\)',
-        ''.join('  %s\n' % fn[len('lib/'):] for fn in jxl_src.gbench))]) and ok
-
-  gni_patterns.append((
-      r'libjxl_dec_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn[len('lib/'):] for fn in jxl_src.dec)))
-  gni_patterns.append((
-      r'libjxl_enc_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn[len('lib/'):] for fn in jxl_src.enc)))
-  gni_patterns.append((
-      r'libjxl_gbench_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn[len('lib/'):] for fn in jxl_src.gbench)))
-
-
-  tests = [fn[len('lib/'):] for fn in jxl_src.test if fn.endswith('_test.cc')]
-  testlib = [fn[len('lib/'):] for fn in jxl_src.test
-             if not fn.endswith('_test.cc')]
-  gni_patterns.append((
-      r'libjxl_tests_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn for fn in tests)))
-  gni_patterns.append((
-      r'libjxl_testlib_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn for fn in testlib)))
-
-  # libjxl_threads
-  ok = CleanFile(
-      args, 'lib/jxl_threads.cmake',
-      [(r'set\(JPEGXL_THREADS_SOURCES\n([^\)]+)\)',
-        ''.join('  %s\n' % fn[len('lib/'):] for fn in jxl_src.threads))]) and ok
-
-  gni_patterns.append((
-      r'libjxl_threads_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn[len('lib/'):] for fn in jxl_src.threads)))
-
-  # libjxl_extras
-  ok = CleanFile(
-      args, 'lib/jxl_extras.cmake',
-      [(r'set\(JPEGXL_EXTRAS_SOURCES\n([^\)]+)\)',
-        ''.join('  %s\n' % fn[len('lib/'):] for fn in jxl_src.extras))]) and ok
-
-  gni_patterns.append((
-      r'libjxl_extras_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn[len('lib/'):] for fn in jxl_src.extras)))
-
-  # libjxl_profiler
-  profiler_srcs = [fn[len('lib/'):] for fn in repo_files
-                   if fn.startswith('lib/profiler')]
-  ok = CleanFile(
-      args, 'lib/jxl_profiler.cmake',
-      [(r'set\(JPEGXL_PROFILER_SOURCES\n([^\)]+)\)',
-        ''.join('  %s\n' % fn for fn in profiler_srcs))]) and ok
-
-  gni_patterns.append((
-      r'libjxl_profiler_sources = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn for fn in profiler_srcs)))
-
-  # Public headers.
-  gni_patterns.append((
-      r'libjxl_public_headers = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn[len('lib/'):]
-              for fn in jxl_src.jxl_public_hdrs)))
-  gni_patterns.append((
-      r'libjxl_threads_public_headers = \[\n([^\]]+)\]',
-      ''.join('    "%s",\n' % fn[len('lib/'):]
-              for fn in jxl_src.threads_public_hdrs)))
-
-
-  # Update the list of tests. CMake version include test files in other libs,
-  # not just in libjxl.
-  tests = [fn[len('lib/'):] for fn in repo_files
-           if fn.endswith('_test.cc') and fn.startswith('lib/')]
-  ok = CleanFile(
-      args, 'lib/jxl_tests.cmake',
-      [(r'set\(TEST_FILES\n([^\)]+)  ### Files before this line',
-        ''.join('  %s\n' % fn for fn in tests))]) and ok
-  ok = CleanFile(
-      args, 'lib/jxl_tests.cmake',
-      [(r'set\(TESTLIB_FILES\n([^\)]+)\)',
-        ''.join('  %s\n' % fn for fn in testlib))]) and ok
-
-  # Update lib.gni
-  ok = CleanFile(args, 'lib/lib.gni', gni_patterns) and ok
-
-  return ok
-
-
-def main():
-  parser = argparse.ArgumentParser(description=__doc__)
-  parser.add_argument('--src-dir',
-                      default=os.path.realpath(os.path.join(
-                          os.path.dirname(__file__), '..')),
-                      help='path to the build directory')
-  parser.add_argument('--update', default=False, action='store_true',
-                      help='update the build files instead of only checking')
-  args = parser.parse_args()
-  if not BuildCleaner(args):
-    print('Build files need update.')
-    sys.exit(2)
-
-
-if __name__ == '__main__':
-  main()
index 247ade8..436d290 100644 (file)
@@ -3,6 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/cms.h>
 #include <stdint.h>
 #include <stdio.h>
 
 
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/color_hints.h"
+#include "lib/extras/metrics.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/padded_bytes.h"
 #include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/butteraugli/butteraugli.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_butteraugli_pnorm.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
+#include "tools/file_io.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
 namespace {
 
+using jpegxl::tools::ThreadPoolInternal;
+using jxl::ButteraugliParams;
+using jxl::CodecInOut;
+using jxl::ColorEncoding;
+using jxl::Image3F;
+using jxl::ImageF;
+using jxl::Status;
+
 Status WriteImage(Image3F&& image, const std::string& filename) {
   ThreadPoolInternal pool(4);
   CodecInOut io;
   io.metadata.m.SetUintSamples(8);
   io.metadata.m.color_encoding = ColorEncoding::SRGB();
   io.SetFromImage(std::move(image), io.metadata.m.color_encoding);
-  return EncodeToFile(io, filename, &pool);
+
+  std::vector<uint8_t> encoded;
+  return jxl::Encode(io, filename, &encoded, &pool) &&
+         jpegxl::tools::WriteFile(filename, encoded);
 }
 
 Status RunButteraugli(const char* pathname1, const char* pathname2,
                       const std::string& distmap_filename,
+                      const std::string& raw_distmap_filename,
                       const std::string& colorspace_hint, double p,
                       float intensity_target) {
-  extras::ColorHints color_hints;
+  jxl::extras::ColorHints color_hints;
   if (!colorspace_hint.empty()) {
     color_hints.Add("color_space", colorspace_hint);
   }
 
-  CodecInOut io1;
+  const char* pathname[2] = {pathname1, pathname2};
+  CodecInOut io[2];
   ThreadPoolInternal pool(4);
-  if (!SetFromFile(pathname1, color_hints, &io1, &pool)) {
-    fprintf(stderr, "Failed to read image from %s\n", pathname1);
-    return false;
-  }
-
-  CodecInOut io2;
-  if (!SetFromFile(pathname2, color_hints, &io2, &pool)) {
-    fprintf(stderr, "Failed to read image from %s\n", pathname2);
-    return false;
+  for (size_t i = 0; i < 2; ++i) {
+    std::vector<uint8_t> encoded;
+    if (!jpegxl::tools::ReadFile(pathname[i], &encoded)) {
+      fprintf(stderr, "Failed to read image from %s\n", pathname[i]);
+      return false;
+    }
+    if (!jxl::SetFromBytes(jxl::Bytes(encoded), color_hints, &io[i], &pool)) {
+      fprintf(stderr, "Failed to decode image from %s\n", pathname[i]);
+      return false;
+    }
   }
 
+  CodecInOut& io1 = io[0];
+  CodecInOut& io2 = io[1];
   if (io1.xsize() != io2.xsize()) {
     fprintf(stderr, "Width mismatch: %" PRIuS " %" PRIuS "\n", io1.xsize(),
             io2.xsize());
@@ -75,33 +89,43 @@ Status RunButteraugli(const char* pathname1, const char* pathname2,
 
   ImageF distmap;
   ButteraugliParams ba_params;
-  ba_params.hf_asymmetry = 0.8f;
+  ba_params.hf_asymmetry = 1.0f;
   ba_params.xmul = 1.0f;
   ba_params.intensity_target = intensity_target;
-  const float distance = ButteraugliDistance(io1.Main(), io2.Main(), ba_params,
-                                             GetJxlCms(), &distmap, &pool);
+  const float distance = jxl::ButteraugliDistance(
+      io1.Main(), io2.Main(), ba_params, *JxlGetDefaultCms(), &distmap, &pool);
   printf("%.10f\n", distance);
 
-  double pnorm = ComputeDistanceP(distmap, ba_params, p);
+  double pnorm = jxl::ComputeDistanceP(distmap, ba_params, p);
   printf("%g-norm: %f\n", p, pnorm);
 
   if (!distmap_filename.empty()) {
-    float good = ButteraugliFuzzyInverse(1.5);
-    float bad = ButteraugliFuzzyInverse(0.5);
-    JXL_CHECK(
-        WriteImage(CreateHeatMapImage(distmap, good, bad), distmap_filename));
+    float good = jxl::ButteraugliFuzzyInverse(1.5);
+    float bad = jxl::ButteraugliFuzzyInverse(0.5);
+    JXL_CHECK(WriteImage(jxl::CreateHeatMapImage(distmap, good, bad),
+                         distmap_filename));
+  }
+  if (!raw_distmap_filename.empty()) {
+    FILE* out = fopen(raw_distmap_filename.c_str(), "wb");
+    JXL_CHECK(out != nullptr);
+    fprintf(out, "Pf\n%" PRIuS " %" PRIuS "\n-1.0\n", distmap.xsize(),
+            distmap.ysize());
+    for (size_t y = distmap.ysize(); y-- > 0;) {
+      fwrite(distmap.Row(y), 4, distmap.xsize(), out);
+    }
+    fclose(out);
   }
   return true;
 }
 
 }  // namespace
-}  // namespace jxl
 
 int main(int argc, char** argv) {
   if (argc < 3) {
     fprintf(stderr,
             "Usage: %s <reference> <distorted>\n"
             "  [--distmap <distmap>]\n"
+            "  [--rawdistmap <distmap.pfm>]\n"
             "  [--intensity_target <intensity_target>]\n"
             "  [--colorspace <colorspace_hint>]\n"
             "  [--pnorm <pth norm>]\n"
@@ -114,12 +138,15 @@ int main(int argc, char** argv) {
     return 1;
   }
   std::string distmap;
+  std::string raw_distmap;
   std::string colorspace;
   double p = 3;
   float intensity_target = 80.0;  // sRGB intensity target.
   for (int i = 3; i < argc; i++) {
     if (std::string(argv[i]) == "--distmap" && i + 1 < argc) {
       distmap = argv[++i];
+    } else if (std::string(argv[i]) == "--rawdistmap" && i + 1 < argc) {
+      raw_distmap = argv[++i];
     } else if (std::string(argv[i]) == "--colorspace" && i + 1 < argc) {
       colorspace = argv[++i];
     } else if (std::string(argv[i]) == "--intensity_target" && i + 1 < argc) {
@@ -137,8 +164,6 @@ int main(int argc, char** argv) {
     }
   }
 
-  return jxl::RunButteraugli(argv[1], argv[2], distmap, colorspace, p,
-                             intensity_target)
-             ? 0
-             : 1;
+  return !RunButteraugli(argv[1], argv[2], distmap, raw_distmap, colorspace, p,
+                         intensity_target);
 }
diff --git a/tools/cjpeg_hdr.cc b/tools/cjpeg_hdr.cc
deleted file mode 100644 (file)
index cfe272e..0000000
+++ /dev/null
@@ -1,306 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <tuple>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "tools/cjpeg_hdr.cc"
-#include <hwy/foreach_target.h>
-#include <hwy/highway.h>
-
-#include "lib/extras/codec.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/codec_in_out.h"
-#include "lib/jxl/common.h"
-#include "lib/jxl/enc_adaptive_quantization.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_transforms.h"
-#include "lib/jxl/enc_xyb.h"
-#include "lib/jxl/image.h"
-#include "lib/jxl/image_bundle.h"
-#include "lib/jxl/image_metadata.h"
-#include "lib/jxl/image_ops.h"
-#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
-#include "lib/jxl/quant_weights.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace jpegxl {
-namespace tools {
-namespace HWY_NAMESPACE {
-void FillJPEGData(const jxl::Image3F& ycbcr, const jxl::PaddedBytes& icc,
-                  const jxl::ImageF& quant_field,
-                  const jxl::FrameDimensions& frame_dim,
-                  jxl::jpeg::JPEGData* out) {
-  // JFIF
-  out->marker_order.push_back(0xE0);
-  out->app_data.emplace_back(std::vector<uint8_t>{
-      0xe0,                      // Marker
-      0, 16,                     // Length
-      'J', 'F', 'I', 'F', '\0',  // ID
-      1, 1,                      // Version (1.1)
-      0,                         // No density units
-      0, 1, 0, 1,                // Pixel density 1
-      0, 0                       // No thumbnail
-  });
-  // ICC
-  if (!icc.empty()) {
-    out->marker_order.push_back(0xE2);
-    std::vector<uint8_t> icc_marker(17 + icc.size());
-    icc_marker[0] = 0xe2;
-    icc_marker[1] = (icc_marker.size() - 1) >> 8;
-    icc_marker[2] = (icc_marker.size() - 1) & 0xFF;
-    memcpy(&icc_marker[3], "ICC_PROFILE", 12);
-    icc_marker[15] = 1;
-    icc_marker[16] = 1;
-    memcpy(&icc_marker[17], icc.data(), icc.size());
-    out->app_data.push_back(std::move(icc_marker));
-  }
-
-  // DQT
-  out->marker_order.emplace_back(0xdb);
-  out->quant.resize(2);
-  out->quant[0].is_last = false;
-  out->quant[0].index = 0;
-  out->quant[1].is_last = true;
-  out->quant[1].index = 1;
-  jxl::DequantMatrices dequant;
-
-  // mozjpeg q99
-  int qluma[64] = {
-      1, 1, 1, 1, 1, 1, 1, 2,  //
-      1, 1, 1, 1, 1, 1, 1, 2,  //
-      1, 1, 1, 1, 1, 1, 2, 3,  //
-      1, 1, 1, 1, 1, 1, 2, 3,  //
-      1, 1, 1, 1, 1, 2, 3, 4,  //
-      1, 1, 1, 1, 2, 2, 3, 5,  //
-      1, 1, 2, 2, 3, 3, 5, 6,  //
-      2, 2, 3, 3, 4, 5, 6, 8,  //
-  };
-  // mozjpeg q95
-  int qchroma[64] = {
-      2, 2, 2,  2,  3,  4,  6,  9,   //
-      2, 2, 2,  3,  3,  4,  5,  8,   //
-      2, 2, 2,  3,  4,  6,  9,  14,  //
-      2, 3, 3,  4,  5,  7,  11, 16,  //
-      3, 3, 4,  5,  7,  9,  13, 19,  //
-      4, 4, 6,  7,  9,  12, 17, 24,  //
-      6, 5, 9,  11, 13, 17, 23, 31,  //
-      9, 8, 14, 16, 19, 24, 31, 42,  //
-  };
-  // Disable quantization for now.
-  std::fill(std::begin(qluma), std::end(qluma), 1);
-  std::fill(std::begin(qchroma), std::end(qchroma), 1);
-
-  memcpy(out->quant[0].values.data(), qluma, sizeof(qluma));
-  memcpy(out->quant[1].values.data(), qchroma, sizeof(qchroma));
-
-  // SOF
-  out->marker_order.emplace_back(0xc2);
-  out->components.resize(3);
-  out->height = frame_dim.ysize;
-  out->width = frame_dim.xsize_padded;
-  out->components[0].id = 1;
-  out->components[1].id = 2;
-  out->components[2].id = 3;
-  out->components[0].h_samp_factor = out->components[1].h_samp_factor =
-      out->components[2].h_samp_factor = out->components[0].v_samp_factor =
-          out->components[1].v_samp_factor = out->components[2].v_samp_factor =
-              1;
-  out->components[0].width_in_blocks = out->components[1].width_in_blocks =
-      out->components[2].width_in_blocks = frame_dim.xsize_blocks;
-  out->components[0].quant_idx = 0;
-  out->components[1].quant_idx = 1;
-  out->components[2].quant_idx = 1;
-  out->components[0].coeffs.resize(frame_dim.xsize_blocks *
-                                   frame_dim.ysize_blocks * 64);
-  out->components[1].coeffs.resize(frame_dim.xsize_blocks *
-                                   frame_dim.ysize_blocks * 64);
-  out->components[2].coeffs.resize(frame_dim.xsize_blocks *
-                                   frame_dim.ysize_blocks * 64);
-
-  HWY_ALIGN float scratch_space[2 * 64];
-
-  for (size_t c = 0; c < 3; c++) {
-    int* qt = c == 0 ? qluma : qchroma;
-    for (size_t by = 0; by < frame_dim.ysize_blocks; by++) {
-      for (size_t bx = 0; bx < frame_dim.xsize_blocks; bx++) {
-        float deadzone = 0.5f / quant_field.Row(by)[bx];
-        // Disable quantization for now.
-        deadzone = 0;
-        auto q = [&](float coeff, size_t x, size_t y) -> int {
-          size_t pos = x * 8 + y;
-          float scoeff = coeff / qt[pos];
-          if (pos == 0) {
-            return std::round(scoeff);
-          }
-          if (std::abs(scoeff) < deadzone) return 0;
-          if (std::abs(scoeff) < 2 * deadzone && x + y >= 7) return 0;
-          return std::round(scoeff);
-        };
-        HWY_ALIGN float dct[64];
-        TransformFromPixels(jxl::AcStrategy::Type::DCT,
-                            ycbcr.PlaneRow(c, 8 * by) + 8 * bx,
-                            ycbcr.PixelsPerRow(), dct, scratch_space);
-        for (size_t iy = 0; iy < 8; iy++) {
-          for (size_t ix = 0; ix < 8; ix++) {
-            float coeff = dct[iy * 8 + ix] * 2040;  // not a typo
-            out->components[c]
-                .coeffs[(frame_dim.xsize_blocks * by + bx) * 64 + ix * 8 + iy] =
-                q(coeff, ix, iy);
-          }
-        }
-      }
-    }
-  }
-
-  // DHT
-  // TODO: optimize
-  out->marker_order.emplace_back(0xC4);
-  out->huffman_code.resize(2);
-  out->huffman_code[0].slot_id = 0x00;  // DC
-  out->huffman_code[0].counts = {{0, 0, 0, 0, 13}};
-  std::iota(out->huffman_code[0].values.begin(),
-            out->huffman_code[0].values.end(), 0);
-  out->huffman_code[0].is_last = false;
-
-  out->huffman_code[1].slot_id = 0x10;  // AC
-  out->huffman_code[1].counts = {{0, 0, 0, 0, 0, 0, 0, 0, 255}};
-  std::iota(out->huffman_code[1].values.begin(),
-            out->huffman_code[1].values.end(), 0);
-  out->huffman_code[1].is_last = true;
-
-  // SOS
-  for (size_t _ = 0; _ < 7; _++) {
-    out->marker_order.emplace_back(0xDA);
-  }
-  out->scan_info.resize(7);
-  // DC
-  // comp id, DC tbl, AC tbl
-  out->scan_info[0].num_components = 3;
-  out->scan_info[0].components = {{jxl::jpeg::JPEGComponentScanInfo{0, 0, 0},
-                                   jxl::jpeg::JPEGComponentScanInfo{1, 0, 0},
-                                   jxl::jpeg::JPEGComponentScanInfo{2, 0, 0}}};
-  out->scan_info[0].Ss = 0;
-  out->scan_info[0].Se = 0;
-  out->scan_info[0].Ah = out->scan_info[0].Al = 0;
-  // AC 1 - highest bits
-  out->scan_info[1].num_components = 1;
-  out->scan_info[1].components = {{jxl::jpeg::JPEGComponentScanInfo{0, 0, 0}}};
-  out->scan_info[1].Ss = 1;
-  out->scan_info[1].Se = 63;
-  out->scan_info[1].Ah = 0;
-  out->scan_info[1].Al = 1;
-
-  // Copy for X / B-Y
-  out->scan_info[2] = out->scan_info[1];
-  out->scan_info[2].components[0].comp_idx = 1;
-  out->scan_info[3] = out->scan_info[1];
-  out->scan_info[3].components[0].comp_idx = 2;
-
-  // AC 2 - lowest bit
-  out->scan_info[4].num_components = 1;
-  out->scan_info[4].components = {{jxl::jpeg::JPEGComponentScanInfo{0, 0, 0}}};
-  out->scan_info[4].Ss = 1;
-  out->scan_info[4].Se = 63;
-  out->scan_info[4].Ah = 1;
-  out->scan_info[4].Al = 0;
-
-  // Copy for X / B-Y
-  out->scan_info[5] = out->scan_info[4];
-  out->scan_info[5].components[0].comp_idx = 1;
-  out->scan_info[6] = out->scan_info[4];
-  out->scan_info[6].components[0].comp_idx = 2;
-
-  // EOI
-  out->marker_order.push_back(0xd9);
-}
-}  // namespace HWY_NAMESPACE
-}  // namespace tools
-}  // namespace jpegxl
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-
-namespace jpegxl {
-namespace tools {
-
-HWY_EXPORT(FillJPEGData);
-
-int HBDJPEGMain(int argc, const char* argv[]) {
-  if (argc < 3) {
-    fprintf(stderr, "Usage: %s input output.jpg\n", argv[0]);
-    return 1;
-  }
-  fprintf(stderr, "Compressing %s to %s\n", argv[1], argv[2]);
-  jxl::CodecInOut io;
-  if (!jxl::SetFromFile(argv[1], jxl::extras::ColorHints{}, &io)) {
-    fprintf(stderr, "Failed to read image %s.\n", argv[1]);
-    return 1;
-  }
-  jxl::Image3F ycbcr(jxl::RoundUpToBlockDim(io.xsize()),
-                     jxl::RoundUpToBlockDim(io.ysize()));
-  ycbcr.ShrinkTo(io.xsize(), io.ysize());
-  jxl::FrameDimensions frame_dim;
-  frame_dim.Set(io.xsize(), io.ysize(), 0, 0, 0, false, 1);
-  for (size_t y = 0; y < ycbcr.ysize(); y++) {
-    for (size_t x = 0; x < ycbcr.xsize(); x++) {
-      float r = io.Main().color()->PlaneRow(0, y)[x];
-      float g = io.Main().color()->PlaneRow(1, y)[x];
-      float b = io.Main().color()->PlaneRow(2, y)[x];
-      ycbcr.PlaneRow(0, y)[x] =
-          0.299 * r + 0.587 * g + 0.114 * b - (128. / 255.);
-      ycbcr.PlaneRow(1, y)[x] = -0.168736 * r - 0.331264 * g + 0.5 * b;
-      ycbcr.PlaneRow(2, y)[x] = 0.5 * r - 0.418688 * g - 0.081312 * b;
-    }
-  }
-  jxl::Image3F rgb2(ycbcr.xsize(), ycbcr.ysize());
-  jxl::Image3F ycbcr2(ycbcr.xsize(), ycbcr.ysize());
-  for (size_t y = 0; y < ycbcr.ysize(); y++) {
-    for (size_t x = 0; x < ycbcr.xsize(); x++) {
-      ycbcr2.PlaneRow(0, y)[x] = ycbcr.PlaneRow(1, y)[x];
-      ycbcr2.PlaneRow(1, y)[x] = ycbcr.PlaneRow(0, y)[x];
-      ycbcr2.PlaneRow(2, y)[x] = ycbcr.PlaneRow(2, y)[x];
-    }
-  }
-  jxl::YcbcrToRgb(ycbcr2, &rgb2, jxl::Rect(ycbcr));
-
-  PadImageToBlockMultipleInPlace(&ycbcr);
-
-  jxl::Image3F opsin(jxl::RoundUpToBlockDim(io.xsize()),
-                     jxl::RoundUpToBlockDim(io.ysize()));
-  opsin.ShrinkTo(io.xsize(), io.ysize());
-  jxl::ToXYB(io.Main(), nullptr, &opsin, jxl::GetJxlCms());
-  PadImageToBlockMultipleInPlace(&opsin);
-  jxl::ImageF mask;
-  jxl::ImageF qf =
-      InitialQuantField(1.0, opsin, frame_dim, nullptr, 1.0, &mask);
-
-  jxl::CodecInOut out;
-  out.Main().jpeg_data = jxl::make_unique<jxl::jpeg::JPEGData>();
-  HWY_DYNAMIC_DISPATCH(FillJPEGData)
-  (ycbcr, io.metadata.m.color_encoding.ICC(), qf, frame_dim,
-   out.Main().jpeg_data.get());
-  jxl::PaddedBytes output;
-  if (!jxl::jpeg::EncodeImageJPGCoefficients(&out, &output)) {
-    return 1;
-  }
-  if (!jxl::WriteFile(output, argv[2])) {
-    fprintf(stderr, "Failed to write to \"%s\"\n", argv[2]);
-    return 1;
-  }
-  return 0;
-}
-
-}  // namespace tools
-}  // namespace jpegxl
-
-int main(int argc, const char** argv) {
-  return jpegxl::tools::HBDJPEGMain(argc, argv);
-}
-#endif
diff --git a/tools/cjpegli.cc b/tools/cjpegli.cc
new file mode 100644 (file)
index 0000000..4088e27
--- /dev/null
@@ -0,0 +1,270 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <vector>
+
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/enc/jpegli.h"
+#include "lib/extras/time.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/span.h"
+#include "tools/args.h"
+#include "tools/cmdline.h"
+#include "tools/file_io.h"
+#include "tools/speed_stats.h"
+
+namespace jpegxl {
+namespace tools {
+namespace {
+
+struct Args {
+  void AddCommandLineOptions(CommandLineParser* cmdline) {
+    std::string input_help("the input can be ");
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kPNG)) {
+      input_help.append("PNG, APNG, ");
+    }
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kGIF)) {
+      input_help.append("GIF, ");
+    }
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kEXR)) {
+      input_help.append("EXR, ");
+    }
+    input_help.append("PPM, PFM, or PGX");
+    cmdline->AddPositionalOption("INPUT", /* required = */ true, input_help,
+                                 &file_in);
+    cmdline->AddPositionalOption("OUTPUT", /* required = */ true,
+                                 "the compressed JPG output file", &file_out);
+
+    cmdline->AddOptionFlag('\0', "disable_output",
+                           "No output file will be written (for benchmarking)",
+                           &disable_output, &SetBooleanTrue, 1);
+
+    cmdline->AddOptionValue(
+        'x', "dec-hints", "key=value",
+        "color_space indicates the ColorEncoding, see Description();\n"
+        "    icc_pathname refers to a binary file containing an ICC profile.",
+        &color_hints_proxy, &ParseAndAppendKeyValue<ColorHintsProxy>, 1);
+
+    opt_distance_id = cmdline->AddOptionValue(
+        'd', "distance", "maxError",
+        "Max. butteraugli distance, lower = higher quality.\n"
+        "    1.0 = visually lossless (default).\n"
+        "    Recommended range: 0.5 .. 3.0. Allowed range: 0.0 ... 25.0.\n"
+        "    Mutually exclusive with --quality and --target_size.",
+        &settings.distance, &ParseFloat);
+
+    opt_quality_id = cmdline->AddOptionValue(
+        'q', "quality", "QUALITY",
+        "Quality setting (is remapped to --distance)."
+        "    Default is quality 90.\n"
+        "    Quality values roughly match libjpeg quality.\n"
+        "    Recommended range: 68 .. 96. Allowed range: 1 .. 100.\n"
+        "    Mutually exclusive with --distance and --target_size.",
+        &quality, &ParseSigned);
+
+    cmdline->AddOptionValue('\0', "chroma_subsampling", "444|440|422|420",
+                            "Chroma subsampling setting.",
+                            &settings.chroma_subsampling, &ParseString);
+
+    cmdline->AddOptionValue(
+        'p', "progressive_level", "N",
+        "Progressive level setting. Range: 0 .. 2.\n"
+        "    Default: 2. Higher number is more scans, 0 means sequential.",
+        &settings.progressive_level, &ParseSigned);
+
+    cmdline->AddOptionFlag('\0', "xyb", "Convert to XYB colorspace",
+                           &settings.xyb, &SetBooleanTrue, 1);
+
+    cmdline->AddOptionFlag(
+        '\0', "std_quant",
+        "Use quantization tables based on Annex K of the JPEG standard.",
+        &settings.use_std_quant_tables, &SetBooleanTrue, 1);
+
+    cmdline->AddOptionFlag(
+        '\0', "noadaptive_quantization", "Disable adaptive quantization.",
+        &settings.use_adaptive_quantization, &SetBooleanFalse, 1);
+
+    cmdline->AddOptionFlag(
+        '\0', "fixed_code",
+        "Disable Huffman code optimization. Must be used together with -p 0.",
+        &settings.optimize_coding, &SetBooleanFalse, 1);
+
+    cmdline->AddOptionValue(
+        '\0', "target_size", "N",
+        "If non-zero, set target size in bytes. This is useful for image \n"
+        "    quality comparisons, but makes encoding speed up to 20x slower.\n"
+        "    Mutually exclusive with --distance and --quality.",
+        &settings.target_size, &ParseUnsigned, 2);
+
+    cmdline->AddOptionValue('\0', "num_reps", "N",
+                            "How many times to compress. (For benchmarking).",
+                            &num_reps, &ParseUnsigned, 1);
+
+    cmdline->AddOptionFlag('\0', "quiet", "Suppress informative output", &quiet,
+                           &SetBooleanTrue, 1);
+
+    cmdline->AddOptionFlag(
+        'v', "verbose",
+        "Verbose output; can be repeated, also applies to help (!).", &verbose,
+        &SetBooleanTrue);
+  }
+
+  const char* file_in = nullptr;
+  const char* file_out = nullptr;
+  bool disable_output = false;
+  ColorHintsProxy color_hints_proxy;
+  jxl::extras::JpegSettings settings;
+  int quality = 90;
+  size_t num_reps = 1;
+  bool quiet = false;
+  bool verbose = false;
+  // References (ids) of specific options to check if they were matched.
+  CommandLineParser::OptionId opt_distance_id = -1;
+  CommandLineParser::OptionId opt_quality_id = -1;
+};
+
+bool ValidateArgs(const Args& args) {
+  const jxl::extras::JpegSettings& settings = args.settings;
+  if (settings.distance < 0.0 || settings.distance > 25.0) {
+    fprintf(stderr, "Invalid --distance argument\n");
+    return false;
+  }
+  if (args.quality <= 0 || args.quality > 100) {
+    fprintf(stderr, "Invalid --quality argument\n");
+    return false;
+  }
+  std::string cs = settings.chroma_subsampling;
+  if (!cs.empty() && cs != "444" && cs != "440" && cs != "422" && cs != "420") {
+    fprintf(stderr, "Invalid --chroma_subsampling argument\n");
+    return false;
+  }
+  if (settings.progressive_level < 0 || settings.progressive_level > 2) {
+    fprintf(stderr, "Invalid --progressive_level argument\n");
+    return false;
+  }
+  if (settings.progressive_level > 0 && !settings.optimize_coding) {
+    fprintf(stderr, "--fixed_code must be used together with -p 0\n");
+    return false;
+  }
+  return true;
+}
+
+bool SetDistance(const Args& args, const CommandLineParser& cmdline,
+                 jxl::extras::JpegSettings* settings) {
+  bool distance_set = cmdline.GetOption(args.opt_distance_id)->matched();
+  bool quality_set = cmdline.GetOption(args.opt_quality_id)->matched();
+  int num_quality_settings = (distance_set ? 1 : 0) + (quality_set ? 1 : 0) +
+                             (args.settings.target_size > 0 ? 1 : 0);
+  if (num_quality_settings > 1) {
+    fprintf(
+        stderr,
+        "Only one of --distance, --quality, or --target_size can be set.\n");
+    return false;
+  }
+  if (quality_set) {
+    settings->quality = args.quality;
+  }
+  return true;
+}
+
+int CJpegliMain(int argc, const char* argv[]) {
+  Args args;
+  CommandLineParser cmdline;
+  args.AddCommandLineOptions(&cmdline);
+
+  if (!cmdline.Parse(argc, const_cast<const char**>(argv))) {
+    // Parse already printed the actual error cause.
+    fprintf(stderr, "Use '%s -h' for more information.\n", argv[0]);
+    return EXIT_FAILURE;
+  }
+
+  if (cmdline.HelpFlagPassed() || !args.file_in) {
+    cmdline.PrintHelp();
+    return EXIT_SUCCESS;
+  }
+
+  if (!args.file_out && !args.disable_output) {
+    fprintf(stderr,
+            "No output file specified and --disable_output flag not passed.\n");
+    return EXIT_FAILURE;
+  }
+
+  if (args.disable_output && !args.quiet) {
+    fprintf(stderr,
+            "Encoding will be performed, but the result will be discarded.\n");
+  }
+
+  std::vector<uint8_t> input_bytes;
+  if (!ReadFile(args.file_in, &input_bytes)) {
+    fprintf(stderr, "Failed to read input image %s\n", args.file_in);
+    return EXIT_FAILURE;
+  }
+
+  jxl::extras::PackedPixelFile ppf;
+  if (!jxl::extras::DecodeBytes(jxl::Bytes(input_bytes),
+                                args.color_hints_proxy.target, &ppf)) {
+    fprintf(stderr, "Failed to decode input image %s\n", args.file_in);
+    return EXIT_FAILURE;
+  }
+
+  if (!args.quiet) {
+    fprintf(stderr, "Read %ux%u image, %" PRIuS " bytes.\n", ppf.info.xsize,
+            ppf.info.ysize, input_bytes.size());
+  }
+
+  if (!ValidateArgs(args) || !SetDistance(args, cmdline, &args.settings)) {
+    return EXIT_FAILURE;
+  }
+
+  if (!args.quiet) {
+    const jxl::extras::JpegSettings& s = args.settings;
+    fprintf(stderr, "Encoding [%s%s d%.3f%s %sAQ p%d %s]\n",
+            s.xyb ? "XYB" : "YUV", s.chroma_subsampling.c_str(), s.distance,
+            s.use_std_quant_tables ? " StdQuant" : "",
+            s.use_adaptive_quantization ? "" : "no", s.progressive_level,
+            s.optimize_coding ? "OPT" : "FIX");
+  }
+
+  jpegxl::tools::SpeedStats stats;
+  std::vector<uint8_t> jpeg_bytes;
+  for (size_t num_rep = 0; num_rep < args.num_reps; ++num_rep) {
+    const double t0 = jxl::Now();
+    if (!jxl::extras::EncodeJpeg(ppf, args.settings, nullptr, &jpeg_bytes)) {
+      fprintf(stderr, "jpegli encoding failed\n");
+      return EXIT_FAILURE;
+    }
+    const double t1 = jxl::Now();
+    stats.NotifyElapsed(t1 - t0);
+    stats.SetImageSize(ppf.info.xsize, ppf.info.ysize);
+  }
+
+  if (args.file_out && !args.disable_output) {
+    if (!WriteFile(args.file_out, jpeg_bytes)) {
+      fprintf(stderr, "Could not write jpeg to %s\n", args.file_out);
+      return EXIT_FAILURE;
+    }
+  }
+  if (!args.quiet) {
+    fprintf(stderr, "Compressed to %" PRIuS " bytes ", jpeg_bytes.size());
+    const size_t num_pixels = ppf.info.xsize * ppf.info.ysize;
+    const double bpp =
+        static_cast<double>(jpeg_bytes.size() * jxl::kBitsPerByte) / num_pixels;
+    fprintf(stderr, "(%.3f bpp).\n", bpp);
+    stats.Print(1);
+  }
+  return EXIT_SUCCESS;
+}
+
+}  // namespace
+}  // namespace tools
+}  // namespace jpegxl
+
+int main(int argc, const char** argv) {
+  return jpegxl::tools::CJpegliMain(argc, argv);
+}
index f3a1d9f..4577143 100644 (file)
@@ -3,22 +3,21 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
 #include <limits.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include <algorithm>
 #include <functional>
+#include <hwy/targets.h>
 #include <random>
 #include <vector>
 
-#include "hwy/targets.h"
-#include "jxl/encode.h"
-#include "jxl/encode_cxx.h"
-#include "jxl/thread_parallel_runner.h"
-#include "jxl/thread_parallel_runner_cxx.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/test_image.h"
 
@@ -120,7 +119,7 @@ bool EncodeJpegXl(const FuzzSpec& spec) {
     // Reading compressed output
     JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
     while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
-      std::vector<uint8_t> buf(spec.output_buffer_size);
+      std::vector<uint8_t> buf(spec.output_buffer_size + 32);
       uint8_t* next_out = buf.data();
       size_t avail_out = buf.size();
       process_result = JxlEncoderProcessOutput(enc, &next_out, &avail_out);
index e43bb27..de1e118 100644 (file)
 // also require a change to the range-check here. The advantage is
 // that this minimizes the size of libjxl.
 
-#include <stdint.h>
-
+#include <jxl/codestream_header.h>
+#include <jxl/encode.h>
+#include <jxl/encode_cxx.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <jxl/types.h>
+
+#include <algorithm>
+#include <cerrno>
 #include <cmath>
+#include <cstdint>
+#include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <functional>
 #include <iostream>
+#include <memory>
 #include <sstream>
 #include <string>
 #include <thread>
 #include <type_traits>
 #include <vector>
 
-#include "jxl/codestream_header.h"
-#include "jxl/encode.h"
-#include "jxl/encode_cxx.h"
-#include "jxl/thread_parallel_runner.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-#include "jxl/types.h"
 #include "lib/extras/dec/apng.h"
 #include "lib/extras/dec/color_hints.h"
-#include "lib/extras/dec/gif.h"
-#include "lib/extras/dec/jpg.h"
-#include "lib/extras/dec/pgx.h"
+#include "lib/extras/dec/decode.h"
 #include "lib/extras/dec/pnm.h"
+#include "lib/extras/enc/jxl.h"
 #include "lib/extras/time.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/override.h"
 #include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/exif.h"
-#include "lib/jxl/size_constraints.h"
 #include "tools/args.h"
 #include "tools/cmdline.h"
 #include "tools/codec_config.h"
@@ -52,12 +57,11 @@ namespace tools {
 
 namespace {
 inline bool ParsePhotonNoiseParameter(const char* arg, float* out) {
-  return strncmp(arg, "ISO", 3) == 0 && ParseFloat(arg + 3, out) && *out > 0;
+  return ParseFloat(arg, out) && *out >= 0;
 }
 inline bool ParseIntensityTarget(const char* arg, float* out) {
   return ParseFloat(arg, out) && *out > 0;
 }
-
 }  // namespace
 
 enum CjxlRetCode : int {
@@ -75,141 +79,116 @@ enum CjxlRetCode : int {
 struct CompressArgs {
   // CompressArgs() = default;
   void AddCommandLineOptions(CommandLineParser* cmdline) {
+    std::string input_help("the input can be ");
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kPNG)) {
+      input_help.append("PNG, APNG, ");
+    }
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kGIF)) {
+      input_help.append("GIF, ");
+    }
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kJPG)) {
+      input_help.append("JPEG, ");
+    } else {
+      input_help.append("JPEG (lossless recompression only), ");
+    }
+    if (jxl::extras::CanDecode(jxl::extras::Codec::kEXR)) {
+      input_help.append("EXR, ");
+    }
+    input_help.append("PPM, PFM, PAM, PGX, or JXL");
     // Positional arguments.
-    cmdline->AddPositionalOption("INPUT", /* required = */ true,
-                                 "the input can be "
-#if JPEGXL_ENABLE_APNG
-                                 "PNG, APNG, "
-#endif
-#if JPEGXL_ENABLE_GIF
-                                 "GIF, "
-#endif
-#if JPEGXL_ENABLE_JPEG
-                                 "JPEG, "
-#else
-                                 "JPEG (lossless recompression only), "
-#endif
-#if JPEGXL_ENABLE_EXR
-                                 "EXR, "
-#endif
-                                 "PPM, PFM, or PGX",
+    cmdline->AddPositionalOption("INPUT", /* required = */ true, input_help,
                                  &file_in);
-    cmdline->AddPositionalOption(
-        "OUTPUT", /* required = */ true,
-        "the compressed JXL output file (can be omitted for benchmarking)",
-        &file_out);
+    cmdline->AddPositionalOption("OUTPUT", /* required = */ true,
+                                 "the compressed JXL output file", &file_out);
 
     // Flags.
-    // TODO(lode): also add options to add exif/xmp/other metadata in the
-    // container.
-    cmdline->AddOptionValue('\0', "container", "0|1",
-                            "0 = Do not encode using container format (strip "
-                            "Exif/XMP/JPEG bitstream reconstruction data)."
-                            "1 = Force using container format \n"
-                            "(default: use only if needed).\n",
-                            &container, &ParseOverride, 1);
 
-    cmdline->AddOptionValue(
-        '\0', "jpeg_store_metadata", "0|1",
-        ("If --lossless_jpeg=1, store JPEG reconstruction "
-         "metadata in the JPEG XL container "
-         "(for lossless reconstruction of the JPEG codestream)."
-         "(default: 1)"),
-        &jpeg_store_metadata, &ParseUnsigned, 2);
+    cmdline->AddHelpText("\nBasic options:", 0);
 
     // Target distance/size/bpp
     opt_distance_id = cmdline->AddOptionValue(
-        'd', "distance", "maxError",
-        "Max. butteraugli distance, lower = higher quality.\n"
+        'd', "distance", "DISTANCE",
+        "Target visual distance in JND units, lower = higher quality.\n"
         "    0.0 = mathematically lossless. Default for already-lossy input "
         "(JPEG/GIF).\n"
         "    1.0 = visually lossless. Default for other input.\n"
-        "    Recommended range: 0.5 .. 3.0. Mutually exclusive with --quality.",
+        "    Recommended range: 0.5 .. 3.0. Allowed range: 0.0 ... 25.0. "
+        "Mutually exclusive with --quality.",
         &distance, &ParseFloat);
 
     // High-level options
     opt_quality_id = cmdline->AddOptionValue(
         'q', "quality", "QUALITY",
-        "Quality setting (is remapped to --distance). Range: -inf .. 100.\n"
-        "    100 = mathematically lossless. Default for already-lossy input "
-        "(JPEG/GIF).\n"
-        "    Other input gets encoded as per --distance default.\n"
-        "    Positive quality values roughly match libjpeg quality.\n"
-        "    Mutually exclusive with --distance.",
+        "Quality setting, higher value = higher quality. This is internally "
+        "mapped to --distance.\n"
+        "    100 = mathematically lossless. 90 = visually lossless.\n"
+        "    Quality values roughly match libjpeg quality.\n"
+        "    Recommended range: 68 .. 96. Allowed range: 0 .. 100. Mutually "
+        "exclusive with --distance.",
         &quality, &ParseFloat);
 
     cmdline->AddOptionValue(
         'e', "effort", "EFFORT",
         "Encoder effort setting. Range: 1 .. 9.\n"
-        "     Default: 7. Higher number is more effort (slower).",
-        &effort, &ParseUnsigned, -1);
+        "    Default: 7. Higher numbers allow more computation "
+        "at the expense of time.\n"
+        "    For lossless, generally it will produce smaller files.\n"
+        "    For lossy, higher effort should more accurately reach "
+        "the target quality.",
+        &effort, &ParseUnsigned);
 
-    cmdline->AddOptionValue(
-        '\0', "brotli_effort", "B_EFFORT",
-        "Brotli effort setting. Range: 0 .. 11.\n"
-        "    Default: 9. Higher number is more effort (slower).",
-        &brotli_effort, &ParseUnsigned, -1);
-
-    cmdline->AddOptionValue(
-        '\0', "faster_decoding", "0|1|2|3|4",
-        "Favour higher decoding speed. 0 = default, higher "
-        "values give higher speed at the expense of quality",
-        &faster_decoding, &ParseUnsigned, 2);
+    cmdline->AddOptionFlag('V', "version",
+                           "Print encoder library version number and exit.",
+                           &version, &SetBooleanTrue);
+    cmdline->AddOptionFlag('\0', "quiet", "Be more silent", &quiet,
+                           &SetBooleanTrue);
+    cmdline->AddOptionFlag('v', "verbose",
+                           "Verbose output; can be repeated and also applies "
+                           "to help (!).",
+                           &verbose, &SetBooleanTrue);
+
+    cmdline->AddHelpText("\nAdvanced options:", 1);
+
+    opt_alpha_distance_id = cmdline->AddOptionValue(
+        'a', "alpha_distance", "A_DISTANCE",
+        "Target visual distance for the alpha channel, lower = higher "
+        "quality.\n"
+        "    0.0 = mathematically lossless. 1.0 = visually lossless.\n"
+        "    Default is to use the same value as for the color image.\n"
+        "    Recommended range: 0.5 .. 3.0. Allowed range: 0.0 ... 25.0.",
+        &alpha_distance, &ParseFloat, 1);
 
     cmdline->AddOptionFlag('p', "progressive",
-                           "Enable progressive/responsive decoding.",
-                           &progressive, &SetBooleanTrue);
-
-    cmdline->AddOptionValue('\0', "premultiply", "-1|0|1",
-                            "Force premultiplied (associated) alpha.",
-                            &premultiply, &ParseSigned, 1);
-
-    cmdline->AddOptionValue(
-        '\0', "keep_invisible", "0|1",
-        "force disable/enable preserving color of invisible "
-        "pixels (default: 1 if lossless, 0 if lossy).",
-        &keep_invisible, &ParseOverride, 1);
+                           "Enable (more) progressive/responsive decoding.",
+                           &progressive, &SetBooleanTrue, 1);
 
     cmdline->AddOptionValue(
         '\0', "group_order", "0|1",
         "Order in which 256x256 groups are stored "
-        "in the codestream for progressive rendering. "
-        "Value not provided means 'encoder default', 0 means 'scanline order', "
-        "1 means 'center-first order'.",
+        "in the codestream for progressive rendering.\n"
+        "    0 = scanline order, 1 = center-first order. Default: 0.",
         &group_order, &ParseOverride, 1);
 
     cmdline->AddOptionValue(
-        '\0', "center_x", "0..XSIZE",
-        "Determines the horizontal position of center for the center-first "
-        "group order. The value -1 means 'use the middle of the image', "
-        "other values [0..xsize) set this to a particular coordinate.",
-        &center_x, &ParseInt64, 1);
+        '\0', "container", "0|1",
+        "0 = Avoid the container format unless it is needed (default)\n"
+        "    1 = Force using the container format even if it is not needed.",
+        &container, &ParseOverride, 1);
 
-    cmdline->AddOptionValue(
-        '\0', "center_y", "0..YSIZE",
-        "Determines the vertical position of center for the center-first "
-        "group order. The value -1 means 'use the middle of the image', "
-        "other values [0..ysize) set this to a particular coordinate.",
-        &center_y, &ParseInt64, 1);
-
-    // Flags.
-    cmdline->AddOptionFlag('\0', "progressive_ac",
-                           "Use the progressive mode for AC.", &progressive_ac,
-                           &SetBooleanTrue, 1);
-
-    opt_qprogressive_ac_id = cmdline->AddOptionFlag(
-        '\0', "qprogressive_ac",
-        "Use the progressive mode for AC with shift quantization.",
-        &qprogressive_ac, &SetBooleanTrue, 1);
+    cmdline->AddOptionValue('\0', "compress_boxes", "0|1",
+                            "Disable/enable Brotli compression for metadata "
+                            "boxes. Default is 1 (enabled).",
+                            &compress_boxes, &ParseOverride, 1);
 
     cmdline->AddOptionValue(
-        '\0', "progressive_dc", "num_dc_frames",
-        "Progressive-DC setting. Valid values are: -1, 0, 1, 2.",
-        &progressive_dc, &ParseSigned, 1);
+        '\0', "brotli_effort", "B_EFFORT",
+        "Brotli effort setting. Range: 0 .. 11.\n"
+        "    Default: 9. Higher number is more effort (slower).",
+        &brotli_effort, &ParseUnsigned, 1);
 
     cmdline->AddOptionValue(
         'm', "modular", "0|1",
-        "Use modular mode (not provided = encoder chooses, 0 = enforce VarDCT, "
+        "Use modular mode (default = encoder chooses, 0 = enforce VarDCT, "
         "1 = enforce modular mode).",
         &modular, &ParseOverride, 1);
 
@@ -217,206 +196,288 @@ struct CompressArgs {
     opt_lossless_jpeg_id = cmdline->AddOptionValue(
         'j', "lossless_jpeg", "0|1",
         "If the input is JPEG, losslessly transcode JPEG, "
-        "rather than using reencode pixels.",
+        "rather than using reencode pixels. Default is 1 (losslessly "
+        "transcode)",
         &lossless_jpeg, &ParseUnsigned, 1);
 
     cmdline->AddOptionValue(
-        '\0', "jpeg_reconstruction_cfl", "0|1",
-        "Enable/disable chroma-from-luma (CFL) for lossless "
-        "JPEG reconstruction.",
-        &jpeg_reconstruction_cfl, &ParseOverride, 2);
-
-    cmdline->AddOptionValue(
         '\0', "num_threads", "N",
         "Number of worker threads (-1 == use machine default, "
         "0 == do not use multithreading).",
         &num_threads, &ParseSigned, 1);
 
-    cmdline->AddOptionValue('\0', "num_reps", "N",
-                            "How many times to compress. (For benchmarking).",
-                            &num_reps, &ParseUnsigned, 1);
-
     cmdline->AddOptionValue(
-        '\0', "photon_noise", "ISO3200",
-        "Adds noise to the image emulating photographic film noise. "
-        "The higher the given number, the grainier the image will be. "
-        "As an example, a value of 100 gives low noise whereas a value "
-        "of 3200 gives a lot of noise. The default value is 0.",
+        '\0', "photon_noise_iso", "ISO_FILM_SPEED",
+        "Adds noise to the image emulating photographic film or sensor noise.\n"
+        "    Higher number = grainier image, e.g. 100 gives a low amount of "
+        "noise,\n"
+        "    3200 gives a lot of noise. Default is 0.",
         &photon_noise_iso, &ParsePhotonNoiseParameter, 1);
 
     cmdline->AddOptionValue(
-        '\0', "dots", "0|1",
-        "Force disable/enable dots generation. "
-        "(not provided = default, 0 = disable, 1 = enable).",
-        &dots, &ParseOverride, 1);
+        '\0', "intensity_target", "N",
+        "Upper bound on the intensity level present in the image, in nits.\n"
+        "    Default is 0, which means 'choose a sensible default "
+        "value based on the color encoding.",
+        &intensity_target, &ParseIntensityTarget, 1);
 
     cmdline->AddOptionValue(
-        '\0', "patches", "0|1",
-        "Force disable/enable patches generation. "
-        "(not provided = default, 0 = disable, 1 = enable).",
-        &patches, &ParseOverride, 1);
+        'x', "dec-hints", "key=value",
+        "This is useful for 'raw' formats like PPM that cannot store "
+        "colorspace information\n"
+        "    and metadata, or to strip or modify metadata in formats that do.\n"
+        "    The key 'color_space' indicates an enumerated ColorEncoding, for "
+        "example:\n"
+        "      -x color_space=RGB_D65_SRG_Per_SRG is sRGB with perceptual "
+        "rendering intent\n"
+        "      -x color_space=RGB_D65_202_Rel_PeQ is Rec.2100 PQ with relative "
+        "rendering intent\n"
+        "    The key 'icc_pathname' refers to a binary file containing an ICC "
+        "profile.\n"
+        "    The keys 'exif', 'xmp', and 'jumbf' refer to a binary file "
+        "containing metadata;\n"
+        "    existing metadata of the same type will be overwritten.\n"
+        "    Specific metadata can be stripped using e.g. -x strip=exif",
+        &color_hints_proxy, &ParseAndAppendKeyValue<ColorHintsProxy>, 1);
+
+    cmdline->AddHelpText("\nExpert options:", 2);
+
+    cmdline->AddOptionValue(
+        '\0', "jpeg_store_metadata", "0|1",
+        ("If --lossless_jpeg=1, store JPEG reconstruction "
+         "metadata in the JPEG XL container.\n"
+         "    This allows reconstruction of the JPEG codestream. Default: 1."),
+        &jpeg_store_metadata, &ParseUnsigned, 2);
+
+    cmdline->AddOptionValue('\0', "codestream_level", "K",
+                            "The codestream level. Either `-1`, `5` or `10`.",
+                            &codestream_level, &ParseInt64, 2);
+
+    cmdline->AddOptionValue('\0', "faster_decoding", "0|1|2|3|4",
+                            "0 = default, higher values improve decode speed "
+                            "at the expense of quality or density.",
+                            &faster_decoding, &ParseUnsigned, 2);
+
+    cmdline->AddOptionValue('\0', "premultiply", "-1|0|1",
+                            "Force premultiplied (associated) alpha.",
+                            &premultiply, &ParseSigned, 2);
+
+    cmdline->AddOptionValue('\0', "keep_invisible", "0|1",
+                            "disable/enable preserving color of invisible "
+                            "pixels (default: 1 if lossless, 0 if lossy).",
+                            &keep_invisible, &ParseOverride, 2);
 
     cmdline->AddOptionValue(
-        '\0', "resampling", "-1|1|2|4|8",
-        "Resampling for extra channels. Default of -1 applies resampling only "
-        "for low quality. Value 1 does no downsampling (1x1), 2 does 2x2 "
-        "downsampling, 4 is for 4x4 downsampling, and 8 for 8x8 downsampling.",
-        &resampling, &ParseSigned, 0);
+        '\0', "center_x", "-1..XSIZE",
+        "Determines the horizontal position of center for the center-first "
+        "group order.\n"
+        "    Default -1 means 'middle of the image', "
+        "values [0..xsize) set this to a particular coordinate.",
+        &center_x, &ParseInt64, 2);
+
+    cmdline->AddOptionValue(
+        '\0', "center_y", "-1..YSIZE",
+        "Determines the vertical position of center for the center-first "
+        "group order.\n"
+        "    Default -1 means 'middle of the image', "
+        "values [0..ysize) set this to a particular coordinate.",
+        &center_y, &ParseInt64, 2);
+
+    // Flags.
+    cmdline->AddOptionFlag('\0', "progressive_ac",
+                           "Use the progressive mode for AC.", &progressive_ac,
+                           &SetBooleanTrue, 2);
+
+    cmdline->AddOptionFlag(
+        '\0', "qprogressive_ac",
+        "Use the progressive mode for AC with shift quantization.",
+        &qprogressive_ac, &SetBooleanTrue, 2);
 
     cmdline->AddOptionValue(
-        '\0', "ec_resampling", "-1|1|2|4|8",
-        "Resampling for extra channels. Default of -1 applies resampling only "
-        "for low quality. Value 1 does no downsampling (1x1), 2 does 2x2 "
-        "downsampling, 4 is for 4x4 downsampling, and 8 for 8x8 downsampling.",
-        &ec_resampling, &ParseSigned, 2);
+        '\0', "progressive_dc", "num_dc_frames",
+        "Progressive-DC setting. Valid values are: -1, 0, 1, 2.",
+        &progressive_dc, &ParseInt64, 2);
+
+    cmdline->AddOptionValue('\0', "resampling", "-1|1|2|4|8",
+                            "Resampling for color channels. Default of -1 "
+                            "applies resampling only for very low quality.\n"
+                            "    1 = downsampling (1x1), 2 = 2x2 downsampling, "
+                            "4 = 4x4 downsampling, 8 = 8x8 downsampling.",
+                            &resampling, &ParseInt64, 2);
+
+    cmdline->AddOptionValue('\0', "ec_resampling", "-1|1|2|4|8",
+                            "Resampling for extra channels. Same as "
+                            "--resampling but for extra channels like alpha.",
+                            &ec_resampling, &ParseInt64, 2);
 
     cmdline->AddOptionFlag('\0', "already_downsampled",
-                           "Do not downsample the given input before encoding, "
+                           "Do not downsample before encoding, "
                            "but still signal that the decoder should upsample.",
                            &already_downsampled, &SetBooleanTrue, 2);
 
     cmdline->AddOptionValue(
+        '\0', "upsampling_mode", "-1|0|1",
+        "Upsampling mode the decoder should use. Mostly useful in combination "
+        "with --already_downsampled. Value -1 means default (non-separable "
+        "upsampling), 0 means nearest neighbor (useful for pixel art)",
+        &upsampling_mode, &ParseInt64, 2);
+
+    cmdline->AddOptionValue(
         '\0', "epf", "-1|0|1|2|3",
-        "Edge preserving filter level, -1 to 3. "
-        "Value -1 means: default (encoder chooses), 0 to 3 set a strength.",
-        &epf, &ParseSigned, 1);
+        "Edge preserving filter level, 0-3. "
+        "Default -1 means encoder chooses, 0-3 set a strength.",
+        &epf, &ParseInt64, 2);
+
+    cmdline->AddOptionValue('\0', "gaborish", "0|1",
+                            "Force disable/enable the gaborish filter. Default "
+                            "is 'encoder chooses'",
+                            &gaborish, &ParseOverride, 2);
+
+    cmdline->AddOptionValue('\0', "override_bitdepth", "BITDEPTH",
+                            "Default is zero (use the input image bit depth); "
+                            "if nonzero, override the bit depth",
+                            &override_bitdepth, &ParseUnsigned, 2);
+
+    cmdline->AddHelpText("\nOptions for experimentation / benchmarking:", 3);
+
+    cmdline->AddOptionValue('\0', "noise", "0|1",
+                            "Force disable/enable adaptive noise generation "
+                            "(experimental). Default "
+                            "is 'encoder chooses'",
+                            &noise, &ParseOverride, 3);
 
     cmdline->AddOptionValue(
-        '\0', "gaborish", "0|1",
-        "Force disable/enable the gaborish filter. "
-        "(not provided = default, 0 = disable, 1 = enable).",
-        &gaborish, &ParseOverride, 1);
+        '\0', "jpeg_reconstruction_cfl", "0|1",
+        "Enable/disable chroma-from-luma (CFL) for lossless "
+        "JPEG reconstruction.",
+        &jpeg_reconstruction_cfl, &ParseOverride, 3);
+
+    cmdline->AddOptionValue('\0', "num_reps", "N",
+                            "How many times to compress. (For benchmarking).",
+                            &num_reps, &ParseUnsigned, 3);
+
+    cmdline->AddOptionFlag('\0', "streaming_input",
+                           "Enable streaming processing of the input file "
+                           "(works only for PPM and PGM input files).",
+                           &streaming_input, &SetBooleanTrue, 3);
+    cmdline->AddOptionFlag('\0', "streaming_output",
+                           "Enable incremental writing of the output file.",
+                           &streaming_output, &SetBooleanTrue, 3);
+    cmdline->AddOptionFlag('\0', "disable_output",
+                           "No output file will be written (for benchmarking)",
+                           &disable_output, &SetBooleanTrue, 3);
 
     cmdline->AddOptionValue(
-        '\0', "intensity_target", "N",
-        "Upper bound on the intensity level present in the image in nits. "
-        "Leaving this set to its default of 0 lets libjxl choose a sensible "
-        "default "
-        "value based on the color encoding.",
-        &intensity_target, &ParseIntensityTarget, 1);
+        '\0', "dots", "0|1",
+        "Force disable/enable dots generation. "
+        "(not provided = default, 0 = disable, 1 = enable).",
+        &dots, &ParseOverride, 3);
 
     cmdline->AddOptionValue(
-        'x', "dec-hints", "key=value",
-        "color_space indicates the ColorEncoding, see Description();\n"
-        "icc_pathname refers to a binary file containing an ICC profile.",
-        &color_hints, &ParseAndAppendKeyValue, 1);
+        '\0', "patches", "0|1",
+        "Force disable/enable patches generation. "
+        "(not provided = default, 0 = disable, 1 = enable).",
+        &patches, &ParseOverride, 3);
 
     cmdline->AddOptionValue(
-        '\0', "override_bitdepth", "0=use from image, 1-32=override",
-        "If nonzero, store the given bit depth in the JPEG XL file metadata"
-        " (1-32), instead of using the bit depth from the original input"
-        " image.",
-        &override_bitdepth, &ParseUnsigned, 2);
+        '\0', "frame_indexing", "INDICES",
+        // TODO(tfish): Add a more convenient vanilla alternative.
+        "INDICES is of the form '^(0*|1[01]*)'. The i-th position indicates "
+        "whether the\n"
+        "    i-th frame will be indexed in the frame index box.",
+        &frame_indexing, &ParseString, 3);
+
+    cmdline->AddOptionFlag('\0', "allow_expert_options",
+                           "Allow specifying advanced options; this allows "
+                           "setting effort to 10, for\n"
+                           "    somewhat better lossless compression at the "
+                           "cost of a massive speed hit.",
+                           &allow_expert_options, &SetBooleanTrue, 3);
+
+    cmdline->AddHelpText("\nModular mode options:", 4);
 
     // modular mode options
     cmdline->AddOptionValue(
-        'I', "iterations", "F",
-        "[modular encoding] Fraction of pixels used to learn MA trees as "
-        "a percentage. -1 = default, 0 = no MA and fast decode, 50 = "
-        "default value, 100 = all."
-        "Higher values use more encoder memory.",
-        &modular_ma_tree_learning_percent, &ParseFloat, 2);
+        'I', "iterations", "PERCENT",
+        "Percentage of pixels used to learn MA trees. Higher values use\n"
+        "    more encoder memory and can result in better compression. Default "
+        "of -1 means\n"
+        "    the encoder chooses. Zero means no MA trees are used.",
+        &modular_ma_tree_learning_percent, &ParseFloat, 4);
 
     cmdline->AddOptionValue(
         'C', "modular_colorspace", "K",
-        ("[modular encoding] color transform: -1=default, 0=RGB (none), "
-         "1-41=RCT (6=YCoCg, default: try several, depending on speed)"),
-        &modular_colorspace, &ParseSigned, 1);
+        ("Color transform: -1 = default (try several per group, depending\n"
+         "    on effort), 0 = RGB (none), 1-41 = fixed RCT (6 = YCoCg)."),
+        &modular_colorspace, &ParseInt64, 4);
 
     opt_modular_group_size_id = cmdline->AddOptionValue(
         'g', "modular_group_size", "K",
-        "[modular encoding] group size: -1 == default. 0 => 128, "
-        "1 => 256, 2 => 512, 3 => 1024",
-        &modular_group_size, &ParseSigned, 1);
+        "Group size: -1 = default (let the encoder choose),\n"
+        "    0 = 128x128, 1 = 256x256, 2 = 512x512, 3 = 1024x1024.",
+        &modular_group_size, &ParseInt64, 4);
 
     cmdline->AddOptionValue(
         'P', "modular_predictor", "K",
-        "[modular encoding] predictor(s) to use: 0=zero, "
-        "1=left, 2=top, 3=avg0, 4=select, 5=gradient, 6=weighted, "
-        "7=topright, 8=topleft, 9=leftleft, 10=avg1, 11=avg2, 12=avg3, "
-        "13=toptop predictive average "
-        "14=mix 5 and 6, 15=mix everything. If unset, uses default 14, "
-        "at slowest speed default 15.",
-        &modular_predictor, &ParseSigned, 1);
+        "Predictor(s) to use: 0=zero, 1=left, 2=top, 3=avg0, 4=select,\n"
+        "    5=gradient, 6=weighted, 7=topright, 8=topleft, 9=leftleft, "
+        "10=avg1, 11=avg2, 12=avg3,\n"
+        "    13=toptop predictive average, 14=mix 5 and 6, 15=mix everything.\n"
+        "    Default is 14 at effort < 9 and 15 at effort 9.",
+        &modular_predictor, &ParseInt64, 4);
 
     cmdline->AddOptionValue(
         'E', "modular_nb_prev_channels", "K",
-        "[modular encoding] number of extra MA tree properties to use",
-        &modular_nb_prev_channels, &ParseSigned, 2);
+        "Number of extra (previous-channel) MA tree properties to use.",
+        &modular_nb_prev_channels, &ParseInt64, 4);
 
     cmdline->AddOptionValue(
         '\0', "modular_palette_colors", "K",
-        "[modular encoding] Use color palette if number of colors is smaller "
-        "than or equal to this, or -1 to use the encoder default.",
-        &modular_palette_colors, &ParseSigned, 1);
+        "Use palette if number of colors is smaller than or equal to this.",
+        &modular_palette_colors, &ParseInt64, 4);
 
     cmdline->AddOptionFlag(
         '\0', "modular_lossy_palette",
-        "[modular encoding] quantize to a palette that has fewer entries than "
-        "would be necessary for perfect preservation; for the time being, it "
-        "is "
-        "recommended to set --palette=0 with this option to use the default "
-        "palette only",
-        &modular_lossy_palette, &SetBooleanTrue, 1);
-
-    cmdline->AddOptionValue(
-        'X', "pre-compact", "PERCENT",
-        "[modular encoding] Use Global channel palette if the number of "
-        "colors is smaller than this percentage of range. "
-        "Use 0-100 to set an explicit percentage, -1 to use the encoder "
-        "default.",
-        &modular_channel_colors_global_percent, &ParseFloat, 2);
+        "Use delta palette in a lossy way; it is recommended to also\n"
+        "    set --modular_palette_colors=0 with this "
+        "option to use the default palette only.",
+        &modular_lossy_palette, &SetBooleanTrue, 4);
+
+    cmdline->AddOptionValue('X', "pre-compact", "PERCENT",
+                            "Use global channel palette if the number of "
+                            "sample values is smaller\n"
+                            "    than this percentage of the nominal range. ",
+                            &modular_channel_colors_global_percent, &ParseFloat,
+                            4);
 
     cmdline->AddOptionValue(
         'Y', "post-compact", "PERCENT",
-        "[modular encoding] Use Local (per-group) channel palette if the "
-        "number "
-        "of colors is smaller than this percentage of range. Use 0-100 to set "
-        "an explicit percentage, -1 to use the encoder default.",
-        &modular_channel_colors_group_percent, &ParseFloat, 2);
-
-    cmdline->AddOptionValue('\0', "codestream_level", "K",
-                            "The codestream level. Either `-1`, `5` or `10`.",
-                            &codestream_level, &ParseSigned, 2);
-
-    opt_responsive_id = cmdline->AddOptionValue(
-        'R', "responsive", "K",
-        "[modular encoding] do Squeeze transform, 0=false, "
-        "1=true (default: true if lossy, false if lossless)",
-        &responsive, &ParseSigned, 1);
-
-    cmdline->AddOptionFlag('V', "version",
-                           "Print encoder library version number and exit.",
-                           &version, &SetBooleanTrue, 1);
-
-    cmdline->AddOptionFlag('\0', "quiet", "Be more silent", &quiet,
-                           &SetBooleanTrue, 1);
-
-    cmdline->AddOptionValue(
-        '\0', "frame_indexing", "string",
-        // TODO(tfish): Add a more convenient vanilla alternative.
-        "If non-empty, a string matching '^(0*|1[01]*)'. If this string has a "
-        "'1' in i-th position, then the i-th frame will be indexed in "
-        "the frame index box.",
-        &frame_indexing, &ParseString, 1);
-
-    cmdline->AddOptionFlag(
-        'v', "verbose",
-        "Verbose output; can be repeated, also applies to help (!).", &verbose,
-        &SetBooleanTrue);
+        "Use local (per-group) channel palette if the "
+        "number of sample values is\n"
+        "    smaller than this percentage of the nominal range.",
+        &modular_channel_colors_group_percent, &ParseFloat, 4);
+
+    opt_responsive_id =
+        cmdline->AddOptionValue('R', "responsive", "K",
+                                "Do the Squeeze transform, 0=false, "
+                                "1=true (default: 1 if lossy, 0 if lossless)",
+                                &responsive, &ParseInt64, 4);
   }
 
   // Common flags.
   bool version = false;
   jxl::Override container = jxl::Override::kDefault;
   bool quiet = false;
+  bool disable_output = false;
 
   const char* file_in = nullptr;
   const char* file_out = nullptr;
   jxl::Override print_profile = jxl::Override::kDefault;
+  bool streaming_input = false;
+  bool streaming_output = false;
 
   // Decoding source image flags
-  jxl::extras::ColorHints color_hints;
+  ColorHintsProxy color_hints_proxy;
 
   // JXL flags
   size_t override_bitdepth = 0;
@@ -424,9 +485,6 @@ struct CompressArgs {
   size_t num_reps = 1;
   float intensity_target = 0;
 
-  // Filename for the user provided saliency-map.
-  std::string saliency_map_filename;
-
   // Whether to perform lossless transcoding with kVarDCT or kJPEG encoding.
   // If true, attempts to load JPEG coefficients instead of pixels.
   // Reset to false if input image is not a JPEG.
@@ -440,10 +498,11 @@ struct CompressArgs {
   bool progressive = false;
   bool progressive_ac = false;
   bool qprogressive_ac = false;
-  int32_t progressive_dc = -1;
+  int64_t progressive_dc = -1;
   bool modular_lossy_palette = false;
   int32_t premultiply = -1;
   bool already_downsampled = false;
+  int64_t upsampling_mode = -1;
   jxl::Override jpeg_reconstruction_cfl = jxl::Override::kDefault;
   jxl::Override modular = jxl::Override::kDefault;
   jxl::Override keep_invisible = jxl::Override::kDefault;
@@ -451,38 +510,40 @@ struct CompressArgs {
   jxl::Override patches = jxl::Override::kDefault;
   jxl::Override gaborish = jxl::Override::kDefault;
   jxl::Override group_order = jxl::Override::kDefault;
+  jxl::Override compress_boxes = jxl::Override::kDefault;
+  jxl::Override noise = jxl::Override::kDefault;
 
   size_t faster_decoding = 0;
-  int32_t resampling = -1;
-  int32_t ec_resampling = -1;
-  int32_t epf = -1;
+  int64_t resampling = -1;
+  int64_t ec_resampling = -1;
+  int64_t epf = -1;
   int64_t center_x = -1;
   int64_t center_y = -1;
-  int32_t modular_group_size = -1;
-  int32_t modular_predictor = -1;
-  int32_t modular_colorspace = -1;
+  int64_t modular_group_size = -1;
+  int64_t modular_predictor = -1;
+  int64_t modular_colorspace = -1;
   float modular_channel_colors_global_percent = -1.f;
   float modular_channel_colors_group_percent = -1.f;
-  int32_t modular_palette_colors = -1;
-  int32_t modular_nb_prev_channels = -1;
+  int64_t modular_palette_colors = -1;
+  int64_t modular_nb_prev_channels = -1;
   float modular_ma_tree_learning_percent = -1.f;
   float photon_noise_iso = 0;
-  int32_t codestream_level = -1;
-  int32_t responsive = -1;
+  int64_t codestream_level = -1;
+  int64_t responsive = -1;
   float distance = 1.0;
+  float alpha_distance = 1.0;
   size_t effort = 7;
   size_t brotli_effort = 9;
   std::string frame_indexing;
 
-  // Will get passed on to AuxOut.
-  // jxl::InspectorImage3F inspector_image3f;
+  bool allow_expert_options = false;
 
   // References (ids) of specific options to check if they were matched.
   CommandLineParser::OptionId opt_lossless_jpeg_id = -1;
   CommandLineParser::OptionId opt_responsive_id = -1;
   CommandLineParser::OptionId opt_distance_id = -1;
+  CommandLineParser::OptionId opt_alpha_distance_id = -1;
   CommandLineParser::OptionId opt_quality_id = -1;
-  CommandLineParser::OptionId opt_qprogressive_ac_id = -1;
   CommandLineParser::OptionId opt_modular_group_size_id = -1;
 };
 
@@ -506,144 +567,430 @@ std::string DistanceFromArgs(const CompressArgs& args) {
 }
 
 void PrintMode(jxl::extras::PackedPixelFile& ppf, const double decode_mps,
-               size_t num_bytes, const CompressArgs& args) {
+               size_t num_bytes, const CompressArgs& args,
+               jpegxl::tools::CommandLineParser& cmdline) {
   const char* mode = ModeFromArgs(args);
   const std::string distance = DistanceFromArgs(args);
   if (args.lossless_jpeg) {
-    fprintf(stderr, "Read JPEG image with %" PRIuS " bytes.\n", num_bytes);
+    cmdline.VerbosePrintf(1, "Read JPEG image with %" PRIuS " bytes.\n",
+                          num_bytes);
   } else {
-    fprintf(stderr,
-            "Read %" PRIuS "x%" PRIuS " image, %" PRIuS " bytes, %.1f MP/s\n",
-            static_cast<size_t>(ppf.info.xsize),
-            static_cast<size_t>(ppf.info.ysize), num_bytes, decode_mps);
+    cmdline.VerbosePrintf(
+        1, "Read %" PRIuS "x%" PRIuS " image, %" PRIuS " bytes, %.1f MP/s\n",
+        static_cast<size_t>(ppf.info.xsize),
+        static_cast<size_t>(ppf.info.ysize), num_bytes, decode_mps);
   }
-  fprintf(stderr, "Encoding [%s%s, %s, effort: %" PRIuS,
-          (args.container == jxl::Override::kOn ? "Container | " : ""), mode,
-          distance.c_str(), args.effort);
+  cmdline.VerbosePrintf(
+      0, "Encoding [%s%s, %s, effort: %" PRIuS,
+      (args.container == jxl::Override::kOn ? "Container | " : ""), mode,
+      distance.c_str(), args.effort);
   if (args.container == jxl::Override::kOn) {
     if (args.lossless_jpeg && args.jpeg_store_metadata)
-      fprintf(stderr, " | JPEG reconstruction data");
+      cmdline.VerbosePrintf(0, " | JPEG reconstruction data");
     if (!ppf.metadata.exif.empty())
-      fprintf(stderr, " | %" PRIuS "-byte Exif", ppf.metadata.exif.size());
+      cmdline.VerbosePrintf(0, " | %" PRIuS "-byte Exif",
+                            ppf.metadata.exif.size());
     if (!ppf.metadata.xmp.empty())
-      fprintf(stderr, " | %" PRIuS "-byte XMP", ppf.metadata.xmp.size());
+      cmdline.VerbosePrintf(0, " | %" PRIuS "-byte XMP",
+                            ppf.metadata.xmp.size());
     if (!ppf.metadata.jumbf.empty())
-      fprintf(stderr, " | %" PRIuS "-byte JUMBF", ppf.metadata.jumbf.size());
+      cmdline.VerbosePrintf(0, " | %" PRIuS "-byte JUMBF",
+                            ppf.metadata.jumbf.size());
   }
-  fprintf(stderr, "], \n");
+  cmdline.VerbosePrintf(0, "]\n");
 }
 
-}  // namespace tools
-}  // namespace jpegxl
+bool IsJPG(const std::vector<uint8_t>& image_data) {
+  return (image_data.size() >= 2 && image_data[0] == 0xFF &&
+          image_data[1] == 0xD8);
+}
 
-namespace {
+using flag_check_fn = std::function<std::string(int64_t)>;
+using flag_check_float_fn = std::function<std::string(float)>;
 
 template <typename T>
-void SetFlagFrameOptionOrDie(const char* flag_name, T flag_value,
-                             JxlEncoderFrameSettings* frame_settings,
-                             JxlEncoderFrameSettingId encoder_option) {
-  if (JXL_ENC_SUCCESS !=
-      (std::is_same<T, float>::value
-           ? JxlEncoderFrameSettingsSetFloatOption(frame_settings,
-                                                   encoder_option, flag_value)
-           : JxlEncoderFrameSettingsSetOption(frame_settings, encoder_option,
-                                              flag_value))) {
-    std::cerr << "Setting encoder option from flag --" << flag_name
-              << " failed." << std::endl;
+void ProcessFlag(
+    const char* flag_name, T flag_value,
+    JxlEncoderFrameSettingId encoder_option,
+    jxl::extras::JXLCompressParams* params,
+    flag_check_fn flag_check = [](T x) { return std::string(); }) {
+  std::string error = flag_check(flag_value);
+  if (!error.empty()) {
+    std::cerr << "Invalid flag value for --" << flag_name << ": " << error
+              << std::endl;
     exit(EXIT_FAILURE);
   }
+  params->options.emplace_back(
+      jxl::extras::JXLOption(encoder_option, flag_value, 0));
 }
 
-void SetDistanceFromFlags(JxlEncoderFrameSettings* jxl_encoder_frame_settings,
-                          jpegxl::tools::CommandLineParser* cmdline,
-                          jpegxl::tools::CompressArgs* args,
+void ProcessBoolFlag(jxl::Override flag_value,
+                     JxlEncoderFrameSettingId encoder_option,
+                     jxl::extras::JXLCompressParams* params) {
+  if (flag_value != jxl::Override::kDefault) {
+    int64_t value = flag_value == jxl::Override::kOn ? 1 : 0;
+    params->options.emplace_back(
+        jxl::extras::JXLOption(encoder_option, value, 0));
+  }
+}
+
+void SetDistanceFromFlags(CommandLineParser* cmdline, CompressArgs* args,
+                          jxl::extras::JXLCompressParams* params,
                           const jxl::extras::Codec& codec) {
   bool distance_set = cmdline->GetOption(args->opt_distance_id)->matched();
+  bool alpha_distance_set =
+      cmdline->GetOption(args->opt_alpha_distance_id)->matched();
   bool quality_set = cmdline->GetOption(args->opt_quality_id)->matched();
+  if ((distance_set && (args->distance != 0.0)) && args->lossless_jpeg) {
+    std::cerr << "Must not set non-zero distance in combination with "
+                 "--lossless_jpeg=1, which is set by default."
+              << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  if ((quality_set && (args->quality != 100)) && args->lossless_jpeg) {
+    std::cerr << "Must not set quality below 100 in combination with "
+                 "--lossless_jpeg=1, which is set by default"
+              << std::endl;
+    exit(EXIT_FAILURE);
+  }
   if (quality_set) {
     if (distance_set) {
       std::cerr << "Must not set both --distance and --quality." << std::endl;
       exit(EXIT_FAILURE);
     }
-    double distance = args->quality >= 100 ? 0.0
-                      : args->quality >= 30
-                          ? 0.1 + (100 - args->quality) * 0.09
-                          : 6.4 + pow(2.5, (30 - args->quality) / 5.0) / 6.25;
-    args->distance = distance;
+    args->distance = JxlEncoderDistanceFromQuality(args->quality);
     distance_set = true;
   }
+
   if (!distance_set) {
     bool lossy_input = (codec == jxl::extras::Codec::kJPG ||
                         codec == jxl::extras::Codec::kGIF);
     args->distance = lossy_input ? 0.0 : 1.0;
+  } else if (args->distance > 0) {
+    args->lossless_jpeg = 0;
   }
-  if (JXL_ENC_SUCCESS !=
-      JxlEncoderSetFrameDistance(jxl_encoder_frame_settings, args->distance)) {
-    std::cerr << "Setting frame distance failed." << std::endl;
+  params->distance = args->distance;
+  params->alpha_distance =
+      alpha_distance_set ? args->alpha_distance : params->distance;
+}
+
+void ProcessFlags(const jxl::extras::Codec codec,
+                  const jxl::extras::PackedPixelFile& ppf,
+                  const std::vector<uint8_t>* jpeg_bytes,
+                  CommandLineParser* cmdline, CompressArgs* args,
+                  jxl::extras::JXLCompressParams* params) {
+  // Tuning flags.
+  ProcessBoolFlag(args->modular, JXL_ENC_FRAME_SETTING_MODULAR, params);
+  ProcessBoolFlag(args->keep_invisible, JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE,
+                  params);
+  ProcessBoolFlag(args->dots, JXL_ENC_FRAME_SETTING_DOTS, params);
+  ProcessBoolFlag(args->patches, JXL_ENC_FRAME_SETTING_PATCHES, params);
+  ProcessBoolFlag(args->gaborish, JXL_ENC_FRAME_SETTING_GABORISH, params);
+  ProcessBoolFlag(args->group_order, JXL_ENC_FRAME_SETTING_GROUP_ORDER, params);
+  ProcessBoolFlag(args->noise, JXL_ENC_FRAME_SETTING_NOISE, params);
+
+  params->allow_expert_options = args->allow_expert_options;
+
+  if (!args->frame_indexing.empty()) {
+    bool must_be_all_zeros = args->frame_indexing[0] != '1';
+    for (char c : args->frame_indexing) {
+      if (c == '1') {
+        if (must_be_all_zeros) {
+          std::cerr << "Invalid --frame_indexing. If the first character is "
+                       "'0', all must be '0'."
+                    << std::endl;
+          exit(EXIT_FAILURE);
+        }
+      } else if (c != '0') {
+        std::cerr << "Invalid --frame_indexing. Must match the pattern "
+                     "'^(0*|1[01]*)$'."
+                  << std::endl;
+        exit(EXIT_FAILURE);
+      }
+    }
+  }
+
+  ProcessFlag(
+      "effort", static_cast<int64_t>(args->effort),
+      JXL_ENC_FRAME_SETTING_EFFORT, params, [args](int64_t x) -> std::string {
+        if (args->allow_expert_options) {
+          return (1 <= x && x <= 10) ? "" : "Valid range is {1, 2, ..., 10}.";
+        } else {
+          return (1 <= x && x <= 9) ? "" : "Valid range is {1, 2, ..., 9}.";
+        }
+      });
+  ProcessFlag("brotli_effort", static_cast<int64_t>(args->brotli_effort),
+              JXL_ENC_FRAME_SETTING_BROTLI_EFFORT, params,
+              [](int64_t x) -> std::string {
+                return (-1 <= x && x <= 11)
+                           ? ""
+                           : "Valid range is {-1, 0, 1, ..., 11}.";
+              });
+  ProcessFlag(
+      "epf", args->epf, JXL_ENC_FRAME_SETTING_EPF, params,
+      [](int64_t x) -> std::string {
+        return (-1 <= x && x <= 3) ? "" : "Valid range is {-1, 0, 1, 2, 3}.\n";
+      });
+  ProcessFlag("faster_decoding", static_cast<int64_t>(args->faster_decoding),
+              JXL_ENC_FRAME_SETTING_DECODING_SPEED, params,
+              [](int64_t x) -> std::string {
+                return (0 <= x && x <= 4) ? ""
+                                          : "Valid range is {0, 1, 2, 3, 4}.\n";
+              });
+  ProcessFlag("resampling", args->resampling, JXL_ENC_FRAME_SETTING_RESAMPLING,
+              params, [](int64_t x) -> std::string {
+                return (x == -1 || x == 1 || x == 2 || x == 4 || x == 8)
+                           ? ""
+                           : "Valid values are {-1, 1, 2, 4, 8}.\n";
+              });
+  ProcessFlag("ec_resampling", args->ec_resampling,
+              JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, params,
+              [](int64_t x) -> std::string {
+                return (x == -1 || x == 1 || x == 2 || x == 4 || x == 8)
+                           ? ""
+                           : "Valid values are {-1, 1, 2, 4, 8}.\n";
+              });
+  ProcessFlag("photon_noise_iso", args->photon_noise_iso,
+              JXL_ENC_FRAME_SETTING_PHOTON_NOISE, params);
+  ProcessFlag("already_downsampled",
+              static_cast<int64_t>(args->already_downsampled),
+              JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED, params);
+  if (args->already_downsampled) params->already_downsampled = args->resampling;
+
+  SetDistanceFromFlags(cmdline, args, params, codec);
+
+  if (args->group_order != jxl::Override::kOn &&
+      (args->center_x != -1 || args->center_y != -1)) {
+    std::cerr << "Invalid flag combination. Setting --center_x or --center_y "
+              << "requires setting --group_order=1" << std::endl;
     exit(EXIT_FAILURE);
   }
-}
+  ProcessFlag("center_x", args->center_x,
+              JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X, params,
+              [](int64_t x) -> std::string {
+                if (x < -1) {
+                  return "Valid values are: -1 or [0 .. xsize).";
+                }
+                return "";
+              });
+  ProcessFlag("center_y", args->center_y,
+              JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y, params,
+              [](int64_t x) -> std::string {
+                if (x < -1) {
+                  return "Valid values are: -1 or [0 .. ysize).";
+                }
+                return "";
+              });
+
+  // Progressive/responsive mode settings.
+  bool responsive_set = cmdline->GetOption(args->opt_responsive_id)->matched();
+
+  ProcessFlag("progressive_dc", args->progressive_dc,
+              JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, params,
+              [](int64_t x) -> std::string {
+                return (-1 <= x && x <= 2) ? ""
+                                           : "Valid range is {-1, 0, 1, 2}.\n";
+              });
+  ProcessFlag("progressive_ac", static_cast<int64_t>(args->progressive_ac),
+              JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, params);
+
+  if (args->progressive) {
+    args->qprogressive_ac = true;
+    args->responsive = 1;
+    responsive_set = true;
+  }
+  if (responsive_set) {
+    ProcessFlag("responsive", args->responsive,
+                JXL_ENC_FRAME_SETTING_RESPONSIVE, params);
+  }
+  if (args->qprogressive_ac) {
+    ProcessFlag("qprogressive_ac", static_cast<int64_t>(1),
+                JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC, params);
+  }
 
-using flag_check_fn = std::function<std::string(int64_t)>;
-using flag_check_float_fn = std::function<std::string(float)>;
+  // Modular mode related.
+  ProcessFlag("modular_group_size", args->modular_group_size,
+              JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE, params,
+              [](int64_t x) -> std::string {
+                return (-1 <= x && x <= 3)
+                           ? ""
+                           : "Invalid --modular_group_size. Valid "
+                             "range is {-1, 0, 1, 2, 3}.\n";
+              });
+  ProcessFlag("modular_predictor", args->modular_predictor,
+              JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, params,
+              [](int64_t x) -> std::string {
+                return (-1 <= x && x <= 15)
+                           ? ""
+                           : "Invalid --modular_predictor. Valid "
+                             "range is {-1, 0, 1, ..., 15}.\n";
+              });
+  ProcessFlag("modular_colorspace", args->modular_colorspace,
+              JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE, params,
+              [](int64_t x) -> std::string {
+                return (-1 <= x && x <= 41)
+                           ? ""
+                           : "Invalid --modular_colorspace. Valid range is "
+                             "{-1, 0, 1, ..., 41}.\n";
+              });
+  ProcessFlag("modular_ma_tree_learning_percent",
+              args->modular_ma_tree_learning_percent,
+              JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT, params,
+              [](float x) -> std::string {
+                return -1 <= x && x <= 100
+                           ? ""
+                           : "Invalid --modular_ma_tree_learning_percent, Valid"
+                             "rang is [-1, 100].\n";
+              });
+  ProcessFlag("modular_nb_prev_channels", args->modular_nb_prev_channels,
+              JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS, params,
+              [](int64_t x) -> std::string {
+                return (-1 <= x && x <= 11)
+                           ? ""
+                           : "Invalid --modular_nb_prev_channels. Valid "
+                             "range is {-1, 0, 1, ..., 11}.\n";
+              });
+  if (args->modular_lossy_palette) {
+    if (args->progressive || args->qprogressive_ac) {
+      fprintf(stderr,
+              "WARNING: --modular_lossy_palette is ignored in "
+              "progressive mode.\n");
+      args->modular_lossy_palette = false;
+    }
+  }
+  ProcessFlag("modular_lossy_palette",
+              static_cast<int64_t>(args->modular_lossy_palette),
+              JXL_ENC_FRAME_SETTING_LOSSY_PALETTE, params);
+  ProcessFlag("modular_palette_colors", args->modular_palette_colors,
+              JXL_ENC_FRAME_SETTING_PALETTE_COLORS, params,
+              [](int64_t x) -> std::string {
+                return -1 <= x ? ""
+                               : "Invalid --modular_palette_colors, must "
+                                 "be -1 or non-negative\n";
+              });
+  ProcessFlag("modular_channel_colors_global_percent",
+              args->modular_channel_colors_global_percent,
+              JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT, params,
+              [](float x) -> std::string {
+                return (-1 <= x && x <= 100)
+                           ? ""
+                           : "Invalid --modular_channel_colors_global_percent. "
+                             "Valid "
+                             "range is [-1, 100].\n";
+              });
+  ProcessFlag("modular_channel_colors_group_percent",
+              args->modular_channel_colors_group_percent,
+              JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT, params,
+              [](float x) -> std::string {
+                return (-1 <= x && x <= 100)
+                           ? ""
+                           : "Invalid --modular_channel_colors_group_percent. "
+                             "Valid "
+                             "range is [-1, 100].\n";
+              });
+
+  if (args->num_threads < -1) {
+    std::cerr
+        << "Invalid flag value for --num_threads: must be -1, 0 or positive."
+        << std::endl;
+    exit(EXIT_FAILURE);
+  }
+  // JPEG specific options.
+  if (jpeg_bytes) {
+    ProcessBoolFlag(args->jpeg_reconstruction_cfl,
+                    JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL, params);
+    ProcessBoolFlag(args->compress_boxes,
+                    JXL_ENC_FRAME_SETTING_JPEG_COMPRESS_BOXES, params);
+  }
+  // Set per-frame options.
+  for (size_t num_frame = 0; num_frame < ppf.num_frames(); ++num_frame) {
+    if (num_frame < args->frame_indexing.size() &&
+        args->frame_indexing[num_frame] == '1') {
+      int64_t value = 1;
+      params->options.emplace_back(
+          jxl::extras::JXLOption(JXL_ENC_FRAME_INDEX_BOX, value, num_frame));
+    }
+  }
+  // Copy over the rest of the non-option params.
+  params->use_container = args->container == jxl::Override::kOn;
+  params->jpeg_store_metadata = args->jpeg_store_metadata;
+  params->intensity_target = args->intensity_target;
+  params->override_bitdepth = args->override_bitdepth;
+  params->codestream_level = args->codestream_level;
+  params->premultiply = args->premultiply;
+  params->compress_boxes = args->compress_boxes != jxl::Override::kOff;
+  params->upsampling_mode = args->upsampling_mode;
+  if (codec == jxl::extras::Codec::kPNM &&
+      ppf.info.exponent_bits_per_sample == 0) {
+    params->input_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+  }
 
-bool IsJPG(const std::vector<uint8_t>& image_data) {
-  return (image_data.size() >= 2 && image_data[0] == 0xFF &&
-          image_data[1] == 0xD8);
+  // If a metadata field is set to an empty value, it is stripped.
+  // Make sure we also strip it when the input image is read with AddJPEGFrame
+  (void)args->color_hints_proxy.target.Foreach(
+      [&params](const std::string& key,
+                const std::string& value) -> jxl::Status {
+        if (value == "") {
+          if (key == "exif") params->jpeg_strip_exif = true;
+          if (key == "xmp") params->jpeg_strip_xmp = true;
+          if (key == "jumbf") params->jpeg_strip_jumbf = true;
+        }
+        return true;
+      });
 }
 
-// TODO(tfish): Replace with non-C-API library function.
-// Implementation is in extras/.
-jxl::Status GetPixeldata(const std::vector<uint8_t>& image_data,
-                         const jxl::extras::ColorHints& color_hints,
-                         jxl::extras::PackedPixelFile& ppf,
-                         jxl::extras::Codec& codec) {
-  // Any valid encoding is larger (ensures codecs can read the first few bytes).
-  constexpr size_t kMinBytes = 9;
-
-  if (image_data.size() < kMinBytes) return JXL_FAILURE("Input too small.");
-  jxl::Span<const uint8_t> encoded(image_data);
-
-  ppf.info.orientation = JXL_ORIENT_IDENTITY;
-  jxl::SizeConstraints size_constraints;
-
-  const auto choose_codec = [&]() {
-#if JPEGXL_ENABLE_APNG
-    if (jxl::extras::DecodeImageAPNG(encoded, color_hints, size_constraints,
-                                     &ppf)) {
-      return jxl::extras::Codec::kPNG;
-    }
-#endif
-    if (jxl::extras::DecodeImagePGX(encoded, color_hints, size_constraints,
-                                    &ppf)) {
-      return jxl::extras::Codec::kPGX;
-    } else if (jxl::extras::DecodeImagePNM(encoded, color_hints,
-                                           size_constraints, &ppf)) {
-      return jxl::extras::Codec::kPNM;
+struct JxlOutputProcessor {
+  bool SetOutputPath(const std::string& path) {
+    outfile.reset(new FileWrapper(path, "wb"));
+    if (!*outfile) {
+      fprintf(stderr,
+              "Could not open %s for writing\n"
+              "Error: %s",
+              path.c_str(), strerror(errno));
+      return false;
     }
-#if JPEGXL_ENABLE_GIF
-    if (jxl::extras::DecodeImageGIF(encoded, color_hints, size_constraints,
-                                    &ppf)) {
-      return jxl::extras::Codec::kGIF;
+    return true;
+  }
+
+  JxlEncoderOutputProcessor GetOutputProcessor() {
+    return JxlEncoderOutputProcessor{this, GetBuffer, ReleaseBuffer, Seek,
+                                     SetFinalizedPosition};
+  }
+
+  static void* GetBuffer(void* opaque, size_t* size) {
+    JxlOutputProcessor* self = reinterpret_cast<JxlOutputProcessor*>(opaque);
+    self->output.resize(*size);
+    return self->output.data();
+  }
+
+  static void ReleaseBuffer(void* opaque, size_t written_bytes) {
+    JxlOutputProcessor* self = reinterpret_cast<JxlOutputProcessor*>(opaque);
+    if (*self->outfile && fwrite(self->output.data(), 1, written_bytes,
+                                 *self->outfile) != written_bytes) {
+      JXL_WARNING("Failed to write %" PRIuS " bytes to output", written_bytes);
     }
-#endif
-#if JPEGXL_ENABLE_JPEG
-    if (jxl::extras::DecodeImageJPG(encoded, color_hints, size_constraints,
-                                    &ppf)) {
-      return jxl::extras::Codec::kJPG;
+    self->output.clear();
+  }
+
+  static void Seek(void* opaque, uint64_t position) {
+    JxlOutputProcessor* self = reinterpret_cast<JxlOutputProcessor*>(opaque);
+    if (*self->outfile && fseek(*self->outfile, position, SEEK_SET) != 0) {
+      JXL_WARNING("Failed to seek output.");
     }
-#endif
-    // TODO(tfish): Bring back EXR and PSD.
-    return jxl::extras::Codec::kUnknown;
-  };
-  codec = choose_codec();
-  if (codec == jxl::extras::Codec::kUnknown) {
-    return JXL_FAILURE("Codecs failed to decode input.");
   }
-  return true;
-}
 
-}  // namespace
+  static void SetFinalizedPosition(void* opaque, uint64_t finalized_position) {
+    JxlOutputProcessor* self = reinterpret_cast<JxlOutputProcessor*>(opaque);
+    self->finalized_position = finalized_position;
+  }
+
+  std::vector<uint8_t> output;
+  size_t finalized_position = 0;
+  std::unique_ptr<FileWrapper> outfile;
+};
+
+}  // namespace tools
+}  // namespace jpegxl
 
 int main(int argc, char** argv) {
   std::string version = jpegxl::tools::CodecConfigString(JxlEncoderVersion());
@@ -672,9 +1019,15 @@ int main(int argc, char** argv) {
     return jpegxl::tools::CjxlRetCode::OK;
   }
 
-  if (!args.file_out && !args.quiet) {
+  if (!args.file_out && !args.disable_output) {
+    std::cerr
+        << "No output file specified and --disable_output flag not passed."
+        << std::endl;
+    exit(EXIT_FAILURE);
+  }
+
+  if (args.file_out && args.disable_output && !args.quiet) {
     fprintf(stderr,
-            "No output file specified.\n"
             "Encoding will be performed, but the result will be discarded.\n");
   }
 
@@ -682,533 +1035,151 @@ int main(int argc, char** argv) {
   // Depending on flags-settings, we want to either load a JPEG and
   // faithfully convert it to JPEG XL, or load (JPEG or non-JPEG)
   // pixel data.
-  std::vector<uint8_t> image_data;
-  jxl::extras::PackedPixelFile ppf;
-  jxl::extras::Codec codec = jxl::extras::Codec::kUnknown;
-  double decode_mps = 0;
-  size_t pixels = 0;
-  if (!jpegxl::tools::ReadFile(args.file_in, &image_data)) {
+  jpegxl::tools::FileWrapper f(args.file_in, "rb");
+  if (!f) {
     std::cerr << "Reading image data failed." << std::endl;
     exit(EXIT_FAILURE);
   }
-  if (!IsJPG(image_data)) args.lossless_jpeg = 0;
-  if (!args.lossless_jpeg) {
-    const double t0 = jxl::Now();
-    jxl::Status status = GetPixeldata(image_data, args.color_hints, ppf, codec);
-    if (!status) {
-      std::cerr << "Getting pixel data." << std::endl;
+  jxl::extras::JXLCompressParams params;
+  jxl::extras::PackedPixelFile ppf;
+  jxl::extras::Codec codec = jxl::extras::Codec::kUnknown;
+  std::vector<uint8_t> image_data;
+  std::vector<uint8_t>* jpeg_bytes = nullptr;
+  jxl::extras::ChunkedPNMDecoder pnm_dec;
+  size_t pixels = 0;
+  if (args.streaming_input) {
+    pnm_dec.f = f;
+    if (!DecodeImagePNM(&pnm_dec, args.color_hints_proxy.target, &ppf)) {
+      std::cerr << "PNM decoding failed." << std::endl;
       exit(EXIT_FAILURE);
     }
-    if (ppf.frames.empty()) {
-      std::cerr << "No frames on input file." << std::endl;
+    codec = jxl::extras::Codec::kPNM;
+    args.lossless_jpeg = 0;
+    pixels = ppf.info.xsize * ppf.info.ysize;
+  } else {
+    double decode_mps = 0;
+    if (!jpegxl::tools::ReadFile(f, &image_data)) {
+      std::cerr << "Reading image data failed." << std::endl;
       exit(EXIT_FAILURE);
     }
+    if (!jpegxl::tools::IsJPG(image_data)) args.lossless_jpeg = 0;
+    ProcessFlags(codec, ppf, jpeg_bytes, &cmdline, &args, &params);
+    if (!args.lossless_jpeg) {
+      const double t0 = jxl::Now();
+      jxl::Status status = jxl::extras::DecodeBytes(
+          jxl::Bytes(image_data), args.color_hints_proxy.target, &ppf, nullptr,
+          &codec);
 
-    const double t1 = jxl::Now();
-    pixels = ppf.info.xsize * ppf.info.ysize;
-    decode_mps = pixels * ppf.info.num_color_channels * 1E-6 / (t1 - t0);
-  }
-
-  JxlEncoderPtr enc = JxlEncoderMake(/*memory_manager=*/nullptr);
-  JxlEncoder* jxl_encoder = enc.get();
-  JxlThreadParallelRunnerPtr runner;
-  std::vector<uint8_t> compressed;
-  size_t num_worker_threads;
-  jpegxl::tools::SpeedStats stats;
-  for (size_t num_rep = 0; num_rep < args.num_reps; ++num_rep) {
-    const double t0 = jxl::Now();
-    JxlEncoderReset(jxl_encoder);
-    if (args.num_threads != 0) {
-      num_worker_threads = JxlThreadParallelRunnerDefaultNumWorkerThreads();
-      {
-        int64_t flag_num_worker_threads = args.num_threads;
-        if (flag_num_worker_threads > -1) {
-          num_worker_threads = flag_num_worker_threads;
-        }
-      }
-      if (runner == nullptr) {
-        runner = JxlThreadParallelRunnerMake(
-            /*memory_manager=*/nullptr, num_worker_threads);
-      }
-      if (JXL_ENC_SUCCESS !=
-          JxlEncoderSetParallelRunner(jxl_encoder, JxlThreadParallelRunner,
-                                      runner.get())) {
-        std::cerr << "JxlEncoderSetParallelRunner failed." << std::endl;
-        return EXIT_FAILURE;
-      }
-    }
-    JxlEncoderFrameSettings* jxl_encoder_frame_settings =
-        JxlEncoderFrameSettingsCreate(jxl_encoder, nullptr);
-
-    auto process_flag = [&jxl_encoder_frame_settings](
-                            const char* flag_name, int64_t flag_value,
-                            JxlEncoderFrameSettingId encoder_option,
-                            const flag_check_fn& flag_check) {
-      std::string error = flag_check(flag_value);
-      if (!error.empty()) {
-        std::cerr << "Invalid flag value for --" << flag_name << ": " << error
-                  << std::endl;
+      if (!status) {
+        std::cerr << "Getting pixel data failed." << std::endl;
         exit(EXIT_FAILURE);
       }
-      SetFlagFrameOptionOrDie(flag_name, flag_value, jxl_encoder_frame_settings,
-                              encoder_option);
-    };
-    auto process_float_flag = [&jxl_encoder_frame_settings](
-                                  const char* flag_name, float flag_value,
-                                  JxlEncoderFrameSettingId encoder_option,
-                                  const flag_check_float_fn& flag_check) {
-      std::string error = flag_check(flag_value);
-      if (!error.empty()) {
-        std::cerr << "Invalid flag value for --" << flag_name << ": " << error
-                  << std::endl;
+      if (ppf.frames.empty()) {
+        std::cerr << "No frames on input file." << std::endl;
         exit(EXIT_FAILURE);
       }
-      SetFlagFrameOptionOrDie(flag_name, flag_value, jxl_encoder_frame_settings,
-                              encoder_option);
-    };
-
-    auto process_bool_flag = [&jxl_encoder_frame_settings](
-                                 const char* flag_name,
-                                 jxl::Override flag_value,
-                                 JxlEncoderFrameSettingId encoder_option) {
-      if (flag_value != jxl::Override::kDefault) {
-        SetFlagFrameOptionOrDie(flag_name,
-                                flag_value == jxl::Override::kOn ? 1 : 0,
-                                jxl_encoder_frame_settings, encoder_option);
-      }
-    };
-
-    {  // Processing tuning flags.
-      process_bool_flag("modular", args.modular, JXL_ENC_FRAME_SETTING_MODULAR);
-      process_bool_flag("keep_invisible", args.keep_invisible,
-                        JXL_ENC_FRAME_SETTING_KEEP_INVISIBLE);
-      process_bool_flag("dots", args.dots, JXL_ENC_FRAME_SETTING_DOTS);
-      process_bool_flag("patches", args.patches, JXL_ENC_FRAME_SETTING_PATCHES);
-      process_bool_flag("gaborish", args.gaborish,
-                        JXL_ENC_FRAME_SETTING_GABORISH);
-      process_bool_flag("group_order", args.group_order,
-                        JXL_ENC_FRAME_SETTING_GROUP_ORDER);
-
-      if (!args.frame_indexing.empty()) {
-        bool must_be_all_zeros = args.frame_indexing[0] != '1';
-        for (char c : args.frame_indexing) {
-          if (c == '1') {
-            if (must_be_all_zeros) {
-              std::cerr
-                  << "Invalid --frame_indexing. If the first character is "
-                     "'0', all must be '0'."
-                  << std::endl;
-              return EXIT_FAILURE;
-            }
-          } else if (c != '0') {
-            std::cerr << "Invalid --frame_indexing. Must match the pattern "
-                         "'^(0*|1[01]*)$'."
-                      << std::endl;
-            return EXIT_FAILURE;
-          }
-        }
-      }
-
-      process_flag(
-          "effort", args.effort, JXL_ENC_FRAME_SETTING_EFFORT,
-          [](int64_t x) -> std::string {
-            return (1 <= x && x <= 9) ? "" : "Valid range is {1, 2, ..., 9}.";
-          });
-      process_flag(
-          "brotli_effort", args.brotli_effort,
-          JXL_ENC_FRAME_SETTING_BROTLI_EFFORT, [](int64_t x) -> std::string {
-            return (-1 <= x && x <= 11) ? ""
-                                        : "Valid range is {-1, 0, 1, ..., 11}.";
-          });
-      process_flag("epf", args.epf, JXL_ENC_FRAME_SETTING_EPF,
-                   [](int64_t x) -> std::string {
-                     return (-1 <= x && x <= 3)
-                                ? ""
-                                : "Valid range is {-1, 0, 1, 2, 3}.\n";
-                   });
-      process_flag(
-          "faster_decoding", args.faster_decoding,
-          JXL_ENC_FRAME_SETTING_DECODING_SPEED, [](int64_t x) -> std::string {
-            return (0 <= x && x <= 4) ? ""
-                                      : "Valid range is {0, 1, 2, 3, 4}.\n";
-          });
-      process_flag("resampling", args.resampling,
-                   JXL_ENC_FRAME_SETTING_RESAMPLING,
-                   [](int64_t x) -> std::string {
-                     return (x == -1 || x == 1 || x == 4 || x == 8)
-                                ? ""
-                                : "Valid values are {-1, 1, 2, 4, 8}.\n";
-                   });
-      process_flag("ec_resampling", args.ec_resampling,
-                   JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING,
-                   [](int64_t x) -> std::string {
-                     return (x == -1 || x == 1 || x == 4 || x == 8)
-                                ? ""
-                                : "Valid values are {-1, 1, 2, 4, 8}.\n";
-                   });
-      SetFlagFrameOptionOrDie("photon_noise_iso", args.photon_noise_iso,
-                              jxl_encoder_frame_settings,
-                              JXL_ENC_FRAME_SETTING_PHOTON_NOISE);
-      SetFlagFrameOptionOrDie("already_downsampled",
-                              static_cast<int32_t>(args.already_downsampled),
-                              jxl_encoder_frame_settings,
-                              JXL_ENC_FRAME_SETTING_ALREADY_DOWNSAMPLED);
-      SetDistanceFromFlags(jxl_encoder_frame_settings, &cmdline, &args, codec);
-
-      if (args.group_order != jxl::Override::kOn &&
-          (args.center_x != -1 || args.center_y != -1)) {
-        std::cerr
-            << "Invalid flag combination. Setting --center_x or --center_y "
-            << "requires setting --group_order=1" << std::endl;
-        return EXIT_FAILURE;
-      }
-      process_flag("center_x", args.center_x,
-                   JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_X,
-                   [](int64_t x) -> std::string {
-                     if (x < -1) {
-                       return "Valid values are: -1 or [0 .. xsize).";
-                     }
-                     return "";
-                   });
-      process_flag("center_y", args.center_y,
-                   JXL_ENC_FRAME_SETTING_GROUP_ORDER_CENTER_Y,
-                   [](int64_t x) -> std::string {
-                     if (x < -1) {
-                       return "Valid values are: -1 or [0 .. ysize).";
-                     }
-                     return "";
-                   });
+      pixels = ppf.info.xsize * ppf.info.ysize;
+      const double t1 = jxl::Now();
+      decode_mps = pixels * ppf.info.num_color_channels * 1E-6 / (t1 - t0);
     }
-    {  // Progressive/responsive mode settings.
-      bool qprogressive_ac_set =
-          cmdline.GetOption(args.opt_qprogressive_ac_id)->matched();
-      int32_t qprogressive_ac = args.qprogressive_ac ? 1 : 0;
-      bool responsive_set =
-          cmdline.GetOption(args.opt_responsive_id)->matched();
-      int32_t responsive = args.responsive ? 1 : 0;
-
-      process_flag(
-          "progressive_dc", args.progressive_dc,
-          JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC, [](int64_t x) -> std::string {
-            return (-1 <= x && x <= 2) ? "" : "Valid range is {-1, 0, 1, 2}.\n";
-          });
-      SetFlagFrameOptionOrDie(
-          "progressive_ac", static_cast<int32_t>(args.progressive_ac),
-          jxl_encoder_frame_settings, JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC);
-
-      if (args.progressive) {
-        qprogressive_ac = 1;
-        qprogressive_ac_set = true;
-        responsive = 1;
-        responsive_set = true;
-      }
-      if (responsive_set) {
-        SetFlagFrameOptionOrDie("responsive", responsive,
-                                jxl_encoder_frame_settings,
-                                JXL_ENC_FRAME_SETTING_RESPONSIVE);
-      }
-      if (qprogressive_ac_set) {
-        SetFlagFrameOptionOrDie("qprogressive_ac", qprogressive_ac,
-                                jxl_encoder_frame_settings,
-                                JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC);
-      }
-    }
-    {  // Modular mode related.
-      // TODO(firsching): consider doing more validation after image size is
-      // known, i.e. set to 512 if 256 would be silly using
-      // opt_modular_group_size_id.
-      process_flag("modular_group_size", args.modular_group_size,
-                   JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE,
-                   [](int64_t x) -> std::string {
-                     return (-1 <= x && x <= 3)
-                                ? ""
-                                : "Invalid --modular_group_size. Valid "
-                                  "range is {-1, 0, 1, 2, 3}.\n";
-                   });
-      process_flag("modular_predictor", args.modular_predictor,
-                   JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR,
-                   [](int64_t x) -> std::string {
-                     return (-1 <= x && x <= 15)
-                                ? ""
-                                : "Invalid --modular_predictor. Valid "
-                                  "range is {-1, 0, 1, ..., 15}.\n";
-                   });
-      process_flag(
-          "modular_colorspace", args.modular_colorspace,
-          JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE,
-          [](int64_t x) -> std::string {
-            return (-1 <= x && x <= 41)
-                       ? ""
-                       : "Invalid --modular_colorspace. Valid range is "
-                         "{-1, 0, 1, ..., 41}.\n";
-          });
-      process_float_flag(
-          "modular_ma_tree_learning_percent",
-          args.modular_ma_tree_learning_percent,
-          JXL_ENC_FRAME_SETTING_MODULAR_MA_TREE_LEARNING_PERCENT,
-          [](float x) -> std::string {
-            return -1 <= x && x <= 100
-                       ? ""
-                       : "Invalid --modular_ma_tree_learning_percent, Valid"
-                         "rang is [-1, 100].\n";
-          });
-      process_flag("modular_nb_prev_channels", args.modular_nb_prev_channels,
-                   JXL_ENC_FRAME_SETTING_MODULAR_NB_PREV_CHANNELS,
-                   [](int64_t x) -> std::string {
-                     return (-1 <= x && x <= 11)
-                                ? ""
-                                : "Invalid --modular_nb_prev_channels. Valid "
-                                  "range is {-1, 0, 1, ..., 11}.\n";
-                   });
-      SetFlagFrameOptionOrDie("modular_lossy_palette",
-                              static_cast<int32_t>(args.modular_lossy_palette),
-                              jxl_encoder_frame_settings,
-                              JXL_ENC_FRAME_SETTING_LOSSY_PALETTE);
-      process_flag("modular_palette_colors", args.modular_palette_colors,
-                   JXL_ENC_FRAME_SETTING_PALETTE_COLORS,
-                   [](int64_t x) -> std::string {
-                     return -1 <= x ? ""
-                                    : "Invalid --modular_palette_colors, must "
-                                      "be -1 or non-negative\n";
-                   });
-      process_float_flag(
-          "modular_channel_colors_global_percent",
-          args.modular_channel_colors_global_percent,
-          JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GLOBAL_PERCENT,
-          [](float x) -> std::string {
-            return (-1 <= x && x <= 100)
-                       ? ""
-                       : "Invalid --modular_channel_colors_global_percent. "
-                         "Valid "
-                         "range is [-1, 100].\n";
-          });
-      process_float_flag(
-          "modular_channel_colors_group_percent",
-          args.modular_channel_colors_group_percent,
-          JXL_ENC_FRAME_SETTING_CHANNEL_COLORS_GROUP_PERCENT,
-          [](float x) -> std::string {
-            return (-1 <= x && x <= 100)
-                       ? ""
-                       : "Invalid --modular_channel_colors_group_percent. "
-                         "Valid "
-                         "range is [-1, 100].\n";
-          });
-    }
-
-    bool use_container = args.container == jxl::Override::kOn;
-    if (!ppf.metadata.exif.empty() || !ppf.metadata.xmp.empty() ||
-        !ppf.metadata.jumbf.empty() || !ppf.metadata.iptc.empty() ||
-        (args.lossless_jpeg && args.jpeg_store_metadata)) {
-      use_container = true;
-    }
-    if (use_container) args.container = jxl::Override::kOn;
-
-    if (!ppf.metadata.exif.empty()) {
-      jxl::InterpretExif(ppf.metadata.exif, &ppf.info.orientation);
-    }
-
-    if (JXL_ENC_SUCCESS !=
-        JxlEncoderUseContainer(jxl_encoder, static_cast<int>(use_container))) {
-      std::cerr << "JxlEncoderUseContainer failed." << std::endl;
-      return EXIT_FAILURE;
+    if (!args.quiet) {
+      PrintMode(ppf, decode_mps, image_data.size(), args, cmdline);
     }
 
-    if (num_rep == 0 && !args.quiet)
-      PrintMode(ppf, decode_mps, image_data.size(), args);
-
-    if (args.lossless_jpeg && IsJPG(image_data)) {
+    if (args.lossless_jpeg && jpegxl::tools::IsJPG(image_data)) {
       if (!cmdline.GetOption(args.opt_lossless_jpeg_id)->matched()) {
         std::cerr << "Note: Implicit-default for JPEG is lossless-transcoding. "
                   << "To silence this message, set --lossless_jpeg=(1|0)."
                   << std::endl;
       }
-      if (args.jpeg_store_metadata) {
-        if (JXL_ENC_SUCCESS !=
-            JxlEncoderStoreJPEGMetadata(jxl_encoder, JXL_TRUE)) {
-          std::cerr << "Storing JPEG metadata failed. " << std::endl;
-          return EXIT_FAILURE;
-        }
-      }
-      process_bool_flag("jpeg_reconstruction_cfl", args.jpeg_reconstruction_cfl,
-                        JXL_ENC_FRAME_SETTING_JPEG_RECON_CFL);
-      if (JXL_ENC_SUCCESS != JxlEncoderAddJPEGFrame(jxl_encoder_frame_settings,
-                                                    image_data.data(),
-                                                    image_data.size())) {
-        std::cerr << "JxlEncoderAddJPEGFrame() failed." << std::endl;
-        return EXIT_FAILURE;
-      }
-    } else {                          // Do JxlEncoderAddImageFrame().
-      size_t num_alpha_channels = 0;  // Adjusted below.
-      {
-        JxlBasicInfo basic_info = ppf.info;
-        if (basic_info.alpha_bits > 0) num_alpha_channels = 1;
-        basic_info.intensity_target = args.intensity_target;
-        basic_info.num_extra_channels = num_alpha_channels;
-        basic_info.num_color_channels = ppf.info.num_color_channels;
-        const bool lossless = args.distance == 0;
-        basic_info.uses_original_profile = lossless;
-        if (args.override_bitdepth != 0) {
-          basic_info.bits_per_sample = args.override_bitdepth;
-          basic_info.exponent_bits_per_sample =
-              args.override_bitdepth == 32 ? 8 : 0;
-        }
-        if (JXL_ENC_SUCCESS !=
-            JxlEncoderSetCodestreamLevel(jxl_encoder, args.codestream_level)) {
-          std::cerr << "Setting --codestream_level failed." << std::endl;
-          return EXIT_FAILURE;
-        }
-        if (JXL_ENC_SUCCESS !=
-            JxlEncoderSetBasicInfo(jxl_encoder, &basic_info)) {
-          std::cerr << "JxlEncoderSetBasicInfo() failed." << std::endl;
-          return EXIT_FAILURE;
-        }
-        if (lossless &&
-            JXL_ENC_SUCCESS != JxlEncoderSetFrameLossless(
-                                   jxl_encoder_frame_settings, JXL_TRUE)) {
-          std::cerr << "JxlEncoderSetFrameLossless() failed." << std::endl;
-          return EXIT_FAILURE;
-        }
-      }
+      jpeg_bytes = &image_data;
+    }
+  }
 
-      if (!ppf.icc.empty()) {
-        if (JXL_ENC_SUCCESS != JxlEncoderSetICCProfile(jxl_encoder,
-                                                       ppf.icc.data(),
-                                                       ppf.icc.size())) {
-          std::cerr << "JxlEncoderSetICCProfile() failed." << std::endl;
-          return EXIT_FAILURE;
-        }
-      } else {
-        if (JXL_ENC_SUCCESS !=
-            JxlEncoderSetColorEncoding(jxl_encoder, &ppf.color_encoding)) {
-          std::cerr << "JxlEncoderSetColorEncoding() failed." << std::endl;
-          return EXIT_FAILURE;
-        }
-      }
+  ProcessFlags(codec, ppf, jpeg_bytes, &cmdline, &args, &params);
 
-      for (size_t num_frame = 0; num_frame < ppf.frames.size(); ++num_frame) {
-        const jxl::extras::PackedFrame& pframe = ppf.frames[num_frame];
-        const jxl::extras::PackedImage& pimage = pframe.color;
-        JxlPixelFormat ppixelformat = pimage.format;
-        {
-          if (JXL_ENC_SUCCESS !=
-              JxlEncoderSetFrameHeader(jxl_encoder_frame_settings,
-                                       &pframe.frame_info)) {
-            std::cerr << "JxlEncoderSetFrameHeader() failed." << std::endl;
-            return EXIT_FAILURE;
-          }
-        }
-        if (num_frame < args.frame_indexing.size() &&
-            args.frame_indexing[num_frame] == '1') {
-          if (JXL_ENC_SUCCESS !=
-              JxlEncoderFrameSettingsSetOption(jxl_encoder_frame_settings,
-                                               JXL_ENC_FRAME_INDEX_BOX, 1)) {
-            std::cerr << "Setting option JXL_ENC_FRAME_INDEX_BOX failed."
-                      << std::endl;
-            return EXIT_FAILURE;
-          }
-        }
-        JxlEncoderStatus enc_status;
-        {
-          if (num_alpha_channels > 0) {
-            JxlExtraChannelInfo extra_channel_info;
-            JxlEncoderInitExtraChannelInfo(JXL_CHANNEL_ALPHA,
-                                           &extra_channel_info);
-            enc_status = JxlEncoderSetExtraChannelInfo(jxl_encoder, 0,
-                                                       &extra_channel_info);
-            if (JXL_ENC_SUCCESS != enc_status) {
-              std::cerr << "JxlEncoderSetExtraChannelInfo() failed."
-                        << std::endl;
-              return EXIT_FAILURE;
-            }
-            if (args.premultiply != -1) {
-              if (args.premultiply != 0 && args.premultiply != 1) {
-                std::cerr << "Flag --premultiply must be one of: -1, 0, 1."
-                          << std::endl;
-                return EXIT_FAILURE;
-              }
-              extra_channel_info.alpha_premultiplied = args.premultiply;
-            }
-            // We take the extra channel blend info frame_info, but don't do
-            // clamping.
-            JxlBlendInfo extra_channel_blend_info =
-                pframe.frame_info.layer_info.blend_info;
-            extra_channel_blend_info.clamp = JXL_FALSE;
-            JxlEncoderSetExtraChannelBlendInfo(jxl_encoder_frame_settings, 0,
-                                               &extra_channel_blend_info);
-          }
-          enc_status =
-              JxlEncoderAddImageFrame(jxl_encoder_frame_settings, &ppixelformat,
-                                      pimage.pixels(), pimage.pixels_size);
-          if (JXL_ENC_SUCCESS != enc_status) {
-            std::cerr << "JxlEncoderAddImageFrame() failed." << std::endl;
-            return EXIT_FAILURE;
-          }
-          // Only set extra channel buffer if is is provided non-interleaved.
-          if (!pframe.extra_channels.empty()) {
-            enc_status = JxlEncoderSetExtraChannelBuffer(
-                jxl_encoder_frame_settings, &ppixelformat,
-                pframe.extra_channels[0].pixels(),
-                pframe.extra_channels[0].stride *
-                    pframe.extra_channels[0].ysize,
-                0);
-            if (JXL_ENC_SUCCESS != enc_status) {
-              std::cerr << "JxlEncoderSetExtraChannelBuffer() failed."
-                        << std::endl;
-              return EXIT_FAILURE;
-            }
-          }
-        }
-      }
+  if (!ppf.metadata.exif.empty()) {
+    jxl::InterpretExif(ppf.metadata.exif, &ppf.info.orientation);
+  }
+
+  if (!ppf.metadata.exif.empty() || !ppf.metadata.xmp.empty() ||
+      !ppf.metadata.jumbf.empty() || !ppf.metadata.iptc.empty() ||
+      (args.lossless_jpeg && args.jpeg_store_metadata)) {
+    if (args.container == jxl::Override::kDefault) {
+      args.container = jxl::Override::kOn;
+    } else if (args.container == jxl::Override::kOff) {
+      cmdline.VerbosePrintf(
+          1, "Stripping all metadata due to explicit container=0\n");
+      ppf.metadata.exif.clear();
+      ppf.metadata.xmp.clear();
+      ppf.metadata.jumbf.clear();
+      ppf.metadata.iptc.clear();
+      args.jpeg_store_metadata = 0;
     }
-    JxlEncoderCloseInput(jxl_encoder);
-    // Reading compressed output
-    compressed.clear();
-    compressed.resize(4096);
-    uint8_t* next_out = compressed.data();
-    size_t avail_out = compressed.size() - (next_out - compressed.data());
-    JxlEncoderStatus process_result = JXL_ENC_NEED_MORE_OUTPUT;
-    while (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
-      process_result =
-          JxlEncoderProcessOutput(jxl_encoder, &next_out, &avail_out);
-      if (process_result == JXL_ENC_NEED_MORE_OUTPUT) {
-        size_t offset = next_out - compressed.data();
-        compressed.resize(compressed.size() * 2);
-        next_out = compressed.data() + offset;
-        avail_out = compressed.size() - offset;
-      }
+  }
+
+  size_t num_worker_threads = JxlThreadParallelRunnerDefaultNumWorkerThreads();
+  int64_t flag_num_worker_threads = args.num_threads;
+  if (flag_num_worker_threads > -1) {
+    num_worker_threads = flag_num_worker_threads;
+  }
+  JxlThreadParallelRunnerPtr runner = JxlThreadParallelRunnerMake(
+      /*memory_manager=*/nullptr, num_worker_threads);
+  params.runner = JxlThreadParallelRunner;
+  params.runner_opaque = runner.get();
+
+  jpegxl::tools::SpeedStats stats;
+  jpegxl::tools::JxlOutputProcessor output_processor;
+  if (args.streaming_output) {
+    if (args.file_out && !args.disable_output &&
+        !output_processor.SetOutputPath(args.file_out)) {
+      return EXIT_FAILURE;
     }
-    compressed.resize(next_out - compressed.data());
-    if (JXL_ENC_SUCCESS != process_result) {
-      std::cerr << "JxlEncoderProcessOutput failed." << std::endl;
+    params.output_processor = output_processor.GetOutputProcessor();
+  }
+  std::vector<uint8_t> compressed;
+  for (size_t num_rep = 0; num_rep < args.num_reps; ++num_rep) {
+    const double t0 = jxl::Now();
+    if (!EncodeImageJXL(params, ppf, jpeg_bytes,
+                        args.streaming_output ? nullptr : &compressed)) {
+      fprintf(stderr, "EncodeImageJXL() failed.\n");
       return EXIT_FAILURE;
     }
-
     const double t1 = jxl::Now();
     stats.NotifyElapsed(t1 - t0);
     stats.SetImageSize(ppf.info.xsize, ppf.info.ysize);
   }
+  size_t compressed_size = args.streaming_output
+                               ? output_processor.finalized_position
+                               : compressed.size();
 
-  if (args.file_out) {
+  if (!args.streaming_output && args.file_out && !args.disable_output) {
     if (!jpegxl::tools::WriteFile(args.file_out, compressed)) {
       std::cerr << "Could not write jxl file." << std::endl;
       return EXIT_FAILURE;
     }
   }
   if (!args.quiet) {
-    const double bpp =
-        static_cast<double>(compressed.size() * jxl::kBitsPerByte) / pixels;
-    fprintf(stderr, "Compressed to %" PRIuS " bytes ", compressed.size());
+    if (compressed_size < 100000) {
+      cmdline.VerbosePrintf(0, "Compressed to %" PRIuS " bytes ",
+                            compressed_size);
+    } else {
+      cmdline.VerbosePrintf(0, "Compressed to %.1f kB ",
+                            compressed_size * 0.001);
+    }
     // For lossless jpeg-reconstruction, we don't print some stats, since we
     // don't have easy access to the image dimensions.
     if (args.container == jxl::Override::kOn) {
-      fprintf(stderr, "including container ");
+      cmdline.VerbosePrintf(0, "including container ");
     }
     if (!args.lossless_jpeg) {
-      fprintf(stderr, "(%.3f bpp%s).\n", bpp / ppf.frames.size(),
-              ppf.frames.size() == 1 ? "" : "/frame");
+      const double bpp =
+          static_cast<double>(compressed_size * jxl::kBitsPerByte) / pixels;
+      cmdline.VerbosePrintf(0, "(%.3f bpp%s).\n", bpp / ppf.num_frames(),
+                            ppf.num_frames() == 1 ? "" : "/frame");
       JXL_CHECK(stats.Print(num_worker_threads));
     } else {
-      fprintf(stderr, "\n");
+      cmdline.VerbosePrintf(0, "\n");
     }
   }
   return EXIT_SUCCESS;
index f777c94..29e4da8 100644 (file)
@@ -29,19 +29,30 @@ void CommandLineParser::PrintHelp() const {
   fprintf(out, " [OPTIONS...]\n");
 
   bool showed_all = true;
+  int max_verbosity = 0;
   for (const auto& option : options_) {
+    max_verbosity = std::max(option->verbosity_level(), max_verbosity);
     if (option->verbosity_level() > verbosity) {
       showed_all = false;
       continue;
     }
+    if (option->help_only()) {
+      fprintf(out, "%s\n", option->help_text());
+      continue;
+    }
     fprintf(out, " %s\n", option->help_flags().c_str());
     const char* help_text = option->help_text();
     if (help_text) {
       fprintf(out, "    %s\n", help_text);
     }
   }
-  fprintf(out, " -h, --help\n    Prints this help message%s.\n",
-          (showed_all ? "" : " (use -v to see more options)"));
+  fprintf(out, "\n -h, --help\n    Prints this help message. ");
+  if (showed_all) {
+    fprintf(out, "All options are shown above.\n");
+  } else {
+    fprintf(out, "Add -v (up to a total of %i times) to see more options.\n",
+            max_verbosity);
+  }
 }
 
 bool CommandLineParser::Parse(int argc, const char* argv[]) {
@@ -91,5 +102,15 @@ bool CommandLineParser::Parse(int argc, const char* argv[]) {
   return true;
 }
 
+void CommandLineParser::VerbosePrintf(int min_verbosity, const char* format,
+                                      ...) const {
+  if (min_verbosity > verbosity) return;
+  va_list args;
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  fflush(stderr);
+  va_end(args);
+}
+
 }  // namespace tools
 }  // namespace jpegxl
index 9b730e6..994341d 100644 (file)
@@ -6,6 +6,7 @@
 #ifndef TOOLS_CMDLINE_H_
 #define TOOLS_CMDLINE_H_
 
+#include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
 
@@ -19,7 +20,7 @@ namespace tools {
 
 class CommandLineParser {
  public:
-  typedef size_t OptionId;
+  typedef int OptionId;
 
   // An abstract class for defining command line options.
   class CmdOptionInterface {
@@ -53,16 +54,24 @@ class CommandLineParser {
     // Returns whether the option should be displayed as required in the help
     // output. No effect on validation.
     virtual bool required() const = 0;
+
+    // Returns whether the option is not really an option but just help text
+    virtual bool help_only() const = 0;
   };
 
+  // Add help text
+  void AddHelpText(const char* help_text, int verbosity_level = 0) {
+    options_.emplace_back(new CmdHelpText(help_text, verbosity_level));
+  }
+
   // Add a positional argument. Returns the id of the added option or
   // kOptionError on error.
   // The "required" flag indicates whether the parameter is mandatory or
   // optional, but is only used for how it is displayed in the command line
   // help.
   OptionId AddPositionalOption(const char* name, bool required,
-                               const char* help_text, const char** storage,
-                               int verbosity_level = 0) {
+                               const std::string& help_text,
+                               const char** storage, int verbosity_level = 0) {
     options_.emplace_back(new CmdOptionPositional(name, help_text, storage,
                                                   verbosity_level, required));
     return options_.size() - 1;
@@ -113,11 +122,44 @@ class CommandLineParser {
   // Return the remaining positional args
   std::vector<const char*> PositionalArgs() const;
 
+  // Conditionally print a message to stderr
+  void VerbosePrintf(int min_verbosity, const char* format, ...) const;
+
  private:
+  // Help text only.
+  class CmdHelpText : public CmdOptionInterface {
+   public:
+    CmdHelpText(const char* help_text, int verbosity_level)
+        : help_text_(help_text), verbosity_level_(verbosity_level) {}
+
+    std::string help_flags() const override { return ""; }
+    const char* help_text() const override { return help_text_; }
+    int verbosity_level() const override { return verbosity_level_; }
+    bool matched() const override { return false; }
+
+    bool Match(const char* arg, bool parse_options) const override {
+      return false;
+    }
+
+    bool Parse(const int argc, const char* argv[], int* i) override {
+      return true;
+    }
+
+    bool positional() const override { return false; }
+
+    bool required() const override { return false; }
+
+    bool help_only() const override { return true; }
+
+   private:
+    const char* help_text_;
+    const int verbosity_level_;
+  };
+
   // A positional argument.
   class CmdOptionPositional : public CmdOptionInterface {
    public:
-    CmdOptionPositional(const char* name, const char* help_text,
+    CmdOptionPositional(const char* name, const std::string& help_text,
                         const char** storage, int verbosity_level,
                         bool required)
         : name_(name),
@@ -127,7 +169,7 @@ class CommandLineParser {
           required_(required) {}
 
     std::string help_flags() const override { return name_; }
-    const char* help_text() const override { return help_text_; }
+    const char* help_text() const override { return help_text_.c_str(); }
     int verbosity_level() const override { return verbosity_level_; }
     bool matched() const override { return matched_; }
 
@@ -150,9 +192,11 @@ class CommandLineParser {
 
     bool required() const override { return required_; }
 
+    bool help_only() const override { return false; }
+
    private:
     const char* name_;
-    const char* help_text_;
+    const std::string help_text_;
     const char** storage_;
     const int verbosity_level_;
     const bool required_;
@@ -252,6 +296,8 @@ class CommandLineParser {
       return false;
     }
 
+    bool help_only() const override { return false; }
+
    private:
     // Returns whether arg matches the short_name flag of this option.
     bool MatchShort(const char* arg) const {
index a4f79a6..8d1c73f 100644 (file)
@@ -7,6 +7,7 @@
 #define TOOLS_CODEC_CONFIG_H_
 
 #include <stdint.h>
+
 #include <string>
 
 namespace jpegxl {
index 087bd8b..d73dc4f 100644 (file)
@@ -7,18 +7,20 @@
 
 #include "lib/extras/dec/color_description.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 int TestOneInput(const uint8_t* data, size_t size) {
   std::string description(reinterpret_cast<const char*>(data), size);
   JxlColorEncoding c;
-  (void)ParseDescription(description, &c);
+  (void)jxl::ParseDescription(description, &c);
 
   return 0;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  return jxl::TestOneInput(data, size);
+  return jpegxl::tools::TestOneInput(data, size);
 }
index b5b5fa7..3c548d0 100644 (file)
@@ -3,9 +3,9 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-find_package(Qt5 QUIET COMPONENTS Concurrent Widgets)
-if (NOT Qt5_FOUND)
-  message(WARNING "Qt5 was not found. The comparison tool will not be built.")
+find_package(Qt6 QUIET COMPONENTS Concurrent Widgets)
+if (NOT Qt6_FOUND)
+  message(WARNING "Qt6 was not found. The comparison tool will not be built.")
   return()
 endif ()
 
@@ -28,10 +28,10 @@ target_include_directories(image_loading PRIVATE
   $<TARGET_PROPERTY:lcms2,INCLUDE_DIRECTORIES>
 )
 target_link_libraries(image_loading PUBLIC
-  Qt5::Widgets
-  jxl-static
-  jxl_threads-static
-  jxl_extras-static
+  Qt6::Widgets
+  jxl-internal
+  jxl_threads
+  jxl_extras-internal
   lcms2
 )
 
@@ -51,8 +51,8 @@ add_executable(compare_codecs WIN32
 )
 target_link_libraries(compare_codecs
   image_loading
-  Qt5::Concurrent
-  Qt5::Widgets
+  Qt6::Concurrent
+  Qt6::Widgets
   icc_detect
 )
 
@@ -69,6 +69,6 @@ add_executable(compare_images WIN32
 )
 target_link_libraries(compare_images
   image_loading
-  Qt5::Widgets
+  Qt6::Widgets
   icc_detect
 )
index 9bf6253..0ecd579 100644 (file)
@@ -31,7 +31,8 @@
 #include "tools/comparison_viewer/split_image_view.h"
 #include "tools/icc_detect/icc_detect.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 static constexpr char kPngSuffix[] = "png";
 
@@ -313,4 +314,5 @@ void CodecComparisonWindow::browseDirectory(const QDir& directory, int depth) {
   }
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index b157a5a..bb23314 100644 (file)
 #include <QSet>
 #include <QString>
 
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/common.h"
 #include "tools/comparison_viewer/ui_codec_comparison_window.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 class CodecComparisonWindow : public QMainWindow {
   Q_OBJECT
 
  public:
   explicit CodecComparisonWindow(
-      const QString& directory, float intensityTarget = kDefaultIntensityTarget,
+      const QString& directory,
+      float intensityTarget = jxl::kDefaultIntensityTarget,
       QWidget* parent = nullptr);
   ~CodecComparisonWindow() override = default;
 
@@ -72,6 +73,7 @@ class CodecComparisonWindow : public QMainWindow {
   const QByteArray monitorIccProfile_;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_COMPARISON_VIEWER_CODEC_COMPARISON_WINDOW_H_
index 1fbda6a..85ba810 100644 (file)
      </layout>
     </item>
     <item>
-     <widget class="jxl::SplitImageView" name="splitImageView" native="true"/>
+     <widget class="jpegxl::tools::SplitImageView" name="splitImageView" native="true"/>
     </item>
    </layout>
   </widget>
  </widget>
  <customwidgets>
   <customwidget>
-   <class>jxl::SplitImageView</class>
+   <class>jpegxl::tools::SplitImageView</class>
    <extends>QWidget</extends>
    <header>split_image_view.h</header>
    <container>1</container>
index 932765e..3ab4c8d 100644 (file)
@@ -66,7 +66,7 @@ int main(int argc, char** argv) {
 
   for (const QString& folder : folders) {
     auto* const window =
-        new jxl::CodecComparisonWindow(folder, intensityTarget);
+        new jpegxl::tools::CodecComparisonWindow(folder, intensityTarget);
     window->setAttribute(Qt::WA_DeleteOnClose);
     window->show();
   }
index cf39f88..321b2c4 100644 (file)
@@ -87,13 +87,14 @@ int main(int argc, char** argv) {
     parser.showHelp(EXIT_FAILURE);
   }
 
-  jxl::SplitImageView view;
+  jpegxl::tools::SplitImageView view;
 
-  const QByteArray monitorIccProfile = jxl::GetMonitorIccProfile(&view);
+  const QByteArray monitorIccProfile =
+      jpegxl::tools::GetMonitorIccProfile(&view);
 
   const QString leftImagePath = arguments.takeFirst();
-  QImage leftImage = jxl::loadImage(leftImagePath, monitorIccProfile,
-                                    intensityTarget, colorSpaceHint);
+  QImage leftImage = jpegxl::tools::loadImage(leftImagePath, monitorIccProfile,
+                                              intensityTarget, colorSpaceHint);
   if (leftImage.isNull()) {
     displayLoadingError(leftImagePath);
     return EXIT_FAILURE;
@@ -101,8 +102,8 @@ int main(int argc, char** argv) {
   view.setLeftImage(std::move(leftImage));
 
   const QString rightImagePath = arguments.takeFirst();
-  QImage rightImage = jxl::loadImage(rightImagePath, monitorIccProfile,
-                                     intensityTarget, colorSpaceHint);
+  QImage rightImage = jpegxl::tools::loadImage(
+      rightImagePath, monitorIccProfile, intensityTarget, colorSpaceHint);
   if (rightImage.isNull()) {
     displayLoadingError(rightImagePath);
     return EXIT_FAILURE;
@@ -111,8 +112,8 @@ int main(int argc, char** argv) {
 
   if (!arguments.empty()) {
     const QString middleImagePath = arguments.takeFirst();
-    QImage middleImage = jxl::loadImage(middleImagePath, monitorIccProfile,
-                                        intensityTarget, colorSpaceHint);
+    QImage middleImage = jpegxl::tools::loadImage(
+        middleImagePath, monitorIccProfile, intensityTarget, colorSpaceHint);
     if (middleImage.isNull()) {
       displayLoadingError(middleImagePath);
       return EXIT_FAILURE;
index 55bebb8..4a44dec 100644 (file)
@@ -5,39 +5,56 @@
 
 #include "tools/comparison_viewer/image_loading.h"
 
+#include <jxl/cms.h>
+
 #include <QRgb>
 #include <QThread>
+#include <cstdint>
+#include <vector>
 
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/color_hints.h"
-#include "lib/jxl/base/file_io.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"
+#include "tools/file_io.h"
+#include "tools/thread_pool_internal.h"
 #include "tools/viewer/load_jxl.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using jxl::CodecInOut;
+using jxl::ColorEncoding;
+using jxl::IccBytes;
+using jxl::Image3F;
+using jxl::ImageBundle;
+using jxl::Rect;
+using jxl::Span;
+using jxl::Status;
+using jxl::ThreadPool;
+using jxl::extras::ColorHints;
 
 namespace {
 
-Status loadFromFile(const QString& filename,
-                    const extras::ColorHints& color_hints,
+Status loadFromFile(const QString& filename, const ColorHints& color_hints,
                     CodecInOut* const decoded, ThreadPool* const pool) {
-  PaddedBytes compressed;
-  JXL_RETURN_IF_ERROR(ReadFile(filename.toStdString(), &compressed));
+  std::vector<uint8_t> compressed;
+  JXL_RETURN_IF_ERROR(
+      jpegxl::tools::ReadFile(filename.toStdString(), &compressed));
   const Span<const uint8_t> compressed_span(compressed);
-  return SetFromBytes(compressed_span, color_hints, decoded, pool, nullptr);
+  return jxl::SetFromBytes(compressed_span, color_hints, decoded, pool,
+                           nullptr);
 }
 
 }  // namespace
 
 bool canLoadImageWithExtension(QString extension) {
   extension = extension.toLower();
-  size_t bitsPerSampleUnused;
-  return extension == "jxl" || extension == "j" || extension == "brn" ||
-         extras::CodecFromExtension("." + extension.toStdString(),
-                                    &bitsPerSampleUnused) !=
-             jxl::extras::Codec::kUnknown;
+  if (extension == "jxl" || extension == "j" || extension == "brn") {
+    return true;
+  }
+  const auto codec = jxl::extras::CodecFromPath("." + extension.toStdString());
+  return codec != jxl::extras::Codec::kUnknown;
 }
 
 QImage loadImage(const QString& filename, const QByteArray& targetIccProfile,
@@ -51,7 +68,7 @@ QImage loadImage(const QString& filename, const QByteArray& targetIccProfile,
   static ThreadPoolInternal pool(QThread::idealThreadCount());
 
   CodecInOut decoded;
-  extras::ColorHints color_hints;
+  ColorHints color_hints;
   if (!sourceColorSpaceHint.isEmpty()) {
     color_hints.Add("color_space", sourceColorSpaceHint.toStdString());
   }
@@ -62,22 +79,28 @@ QImage loadImage(const QString& filename, const QByteArray& targetIccProfile,
   const ImageBundle& ib = decoded.Main();
 
   ColorEncoding targetColorSpace;
-  PaddedBytes icc;
-  icc.assign(reinterpret_cast<const uint8_t*>(targetIccProfile.data()),
-             reinterpret_cast<const uint8_t*>(targetIccProfile.data() +
-                                              targetIccProfile.size()));
-  if (!targetColorSpace.SetICC(std::move(icc))) {
+  bool use_fallback_profile = true;
+  if (!targetIccProfile.isEmpty()) {
+    IccBytes icc;
+    icc.assign(reinterpret_cast<const uint8_t*>(targetIccProfile.data()),
+               reinterpret_cast<const uint8_t*>(targetIccProfile.data() +
+                                                targetIccProfile.size()));
+    use_fallback_profile =
+        !targetColorSpace.SetICC(std::move(icc), JxlGetDefaultCms());
+  }
+  if (use_fallback_profile) {
     targetColorSpace = ColorEncoding::SRGB(ib.IsGray());
   }
   Image3F converted;
-  if (!ib.CopyTo(Rect(ib), targetColorSpace, GetJxlCms(), &converted, &pool)) {
+  if (!ib.CopyTo(Rect(ib), targetColorSpace, *JxlGetDefaultCms(), &converted,
+                 &pool)) {
     return QImage();
   }
 
   QImage image(converted.xsize(), converted.ysize(), QImage::Format_ARGB32);
 
   const auto ScaleAndClamp = [](const float x) {
-    return Clamp1(x * 255 + .5f, 0.f, 255.f);
+    return jxl::Clamp1(x * 255 + .5f, 0.f, 255.f);
   };
 
   if (ib.HasAlpha()) {
@@ -108,4 +131,5 @@ QImage loadImage(const QString& filename, const QByteArray& targetIccProfile,
   return image;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 89b37d1..37baaef 100644 (file)
 #include <QImage>
 #include <QString>
 
-#include "lib/jxl/common.h"
+#include "lib/jxl/base/common.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 // `extension` should not include the dot.
 bool canLoadImageWithExtension(QString extension);
@@ -21,9 +22,10 @@ bool canLoadImageWithExtension(QString extension);
 // specified. Thread-hostile.
 QImage loadImage(const QString& filename,
                  const QByteArray& targetIccProfile = QByteArray(),
-                 float intensityTarget = kDefaultIntensityTarget,
+                 float intensityTarget = jxl::kDefaultIntensityTarget,
                  const QString& sourceColorSpaceHint = QString());
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_COMPARISON_VIEWER_IMAGE_LOADING_H_
index 9ef117b..ca5c0c9 100644 (file)
@@ -5,7 +5,8 @@
 
 #include "tools/comparison_viewer/settings.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 SettingsDialog::SettingsDialog(QWidget* const parent)
     : QDialog(parent), settings_("JPEG XL project", "Comparison tool") {
@@ -48,4 +49,5 @@ void SettingsDialog::settingsToUi() {
   ui_.grayTime->setValue(renderingSettings_.grayMSecs);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index bd91f71..a54cd87 100644 (file)
@@ -12,7 +12,8 @@
 #include "tools/comparison_viewer/split_image_renderer.h"
 #include "tools/comparison_viewer/ui_settings.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 class SettingsDialog : public QDialog {
   Q_OBJECT
@@ -35,6 +36,7 @@ class SettingsDialog : public QDialog {
   SplitImageRenderingSettings renderingSettings_;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_COMPARISON_VIEWER_SETTINGS_H_
index acade64..911229c 100644 (file)
@@ -5,10 +5,6 @@
 
 #include "tools/comparison_viewer/split_image_renderer.h"
 
-#include <algorithm>
-#include <cmath>
-#include <utility>
-
 #include <QEvent>
 #include <QGuiApplication>
 #include <QPainter>
 #include <QPen>
 #include <QPoint>
 #include <QRect>
+#include <algorithm>
+#include <cmath>
+#include <utility>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 SplitImageRenderer::SplitImageRenderer(QWidget* const parent)
     : QWidget(parent) {
@@ -32,16 +32,19 @@ SplitImageRenderer::SplitImageRenderer(QWidget* const parent)
 
 void SplitImageRenderer::setLeftImage(QImage image) {
   leftImage_ = QPixmap::fromImage(std::move(image));
+  leftImage_.setDevicePixelRatio(devicePixelRatio());
   updateMinimumSize();
   update();
 }
 void SplitImageRenderer::setRightImage(QImage image) {
   rightImage_ = QPixmap::fromImage(std::move(image));
+  rightImage_.setDevicePixelRatio(devicePixelRatio());
   updateMinimumSize();
   update();
 }
 void SplitImageRenderer::setMiddleImage(QImage image) {
   middleImage_ = QPixmap::fromImage(std::move(image));
+  middleImage_.setDevicePixelRatio(devicePixelRatio());
   updateMinimumSize();
   update();
 }
@@ -181,7 +184,8 @@ void SplitImageRenderer::paintEvent(QPaintEvent* const event) {
       painter.transform().inverted().map(QPointF(middleX_, 0.)).x();
   QRectF middleRect = middleImage_.rect();
   middleRect.setWidth(middleWidth);
-  middleRect.moveCenter(QPointF(transformedMiddleX, middleRect.center().y()));
+  middleRect.moveCenter(QPointF(transformedMiddleX * devicePixelRatio(),
+                                middleRect.center().y()));
   middleRect.setLeft(std::round(middleRect.left()));
   middleRect.setRight(std::round(middleRect.right()));
 
@@ -191,24 +195,30 @@ void SplitImageRenderer::paintEvent(QPaintEvent* const event) {
   QRectF rightRect = rightImage_.rect();
   rightRect.setLeft(middleRect.right());
 
-  painter.drawPixmap(leftRect, leftImage_, leftRect);
-  painter.drawPixmap(rightRect, rightImage_, rightRect);
-  painter.drawPixmap(middleRect, middleImage_, middleRect);
+  painter.drawPixmap(QPointF(), leftImage_, leftRect);
+  painter.drawPixmap(middleRect.topLeft() / devicePixelRatio(), middleImage_,
+                     middleRect);
+  painter.drawPixmap(rightRect.topLeft() / devicePixelRatio(), rightImage_,
+                     rightRect);
 
   QPen middlePen;
   middlePen.setStyle(Qt::DotLine);
   painter.setPen(middlePen);
-  painter.drawLine(leftRect.topRight(), leftRect.bottomRight());
-  painter.drawLine(rightRect.topLeft(), rightRect.bottomLeft());
+  painter.drawLine(leftRect.topRight() / devicePixelRatio(),
+                   leftRect.bottomRight() / devicePixelRatio());
+  painter.drawLine(rightRect.topLeft() / devicePixelRatio(),
+                   rightRect.bottomLeft() / devicePixelRatio());
 }
 
 void SplitImageRenderer::updateMinimumSize() {
-  const int imagesWidth = std::max(
-      std::max(leftImage_.width(), rightImage_.width()), middleImage_.width());
-  const int imagesHeight =
-      std::max(std::max(leftImage_.height(), rightImage_.height()),
-               middleImage_.height());
-  setMinimumSize(scale_ * QSize(imagesWidth, imagesHeight));
+  const QSizeF leftSize = leftImage_.deviceIndependentSize();
+  const QSizeF rightSize = rightImage_.deviceIndependentSize();
+  const QSizeF middleSize = middleImage_.deviceIndependentSize();
+  const qreal imagesWidth = std::max(
+      std::max(leftSize.width(), rightSize.width()), middleSize.width());
+  const qreal imagesHeight = std::max(
+      std::max(leftSize.height(), rightSize.height()), middleSize.height());
+  setMinimumSize((scale_ * QSizeF(imagesWidth, imagesHeight)).toSize());
 }
 
 void SplitImageRenderer::setRenderingMode(const RenderingMode newMode) {
@@ -236,4 +246,5 @@ void SplitImageRenderer::setRenderingMode(const RenderingMode newMode) {
   emit renderingModeChanged(mode_);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index decb407..5d3029a 100644 (file)
@@ -15,7 +15,8 @@
 #include <QWheelEvent>
 #include <QWidget>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 struct SplitImageRenderingSettings {
   int fadingMSecs;
@@ -85,6 +86,7 @@ class SplitImageRenderer : public QWidget {
   double scale_ = 1.;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_COMPARISON_VIEWER_SPLIT_IMAGE_RENDERER_H_
index 76c8edc..9c27f46 100644 (file)
@@ -11,7 +11,8 @@
 
 #include "tools/comparison_viewer/split_image_renderer.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 SplitImageView::SplitImageView(QWidget* const parent) : QWidget(parent) {
   ui_.setupUi(this);
@@ -68,4 +69,5 @@ void SplitImageView::on_settingsButton_clicked() {
   }
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 4978750..b9c3536 100644 (file)
@@ -11,7 +11,8 @@
 #include "tools/comparison_viewer/settings.h"
 #include "tools/comparison_viewer/ui_split_image_view.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 class SplitImageView : public QWidget {
   Q_OBJECT
@@ -35,6 +36,7 @@ class SplitImageView : public QWidget {
   SettingsDialog settings_;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_COMPARISON_VIEWER_SPLIT_IMAGE_VIEW_H_
index 0755a58..f3b80c9 100644 (file)
@@ -17,7 +17,7 @@
      <property name="widgetResizable">
       <bool>true</bool>
      </property>
-     <widget class="jxl::SplitImageRenderer" name="splitImageRenderer"/>
+     <widget class="jpegxl::tools::SplitImageRenderer" name="splitImageRenderer"/>
     </widget>
    </item>
    <item>
  </widget>
  <customwidgets>
   <customwidget>
-   <class>jxl::SplitImageRenderer</class>
+   <class>jpegxl::tools::SplitImageRenderer</class>
    <extends>QWidget</extends>
    <header>split_image_renderer.h</header>
    <container>1</container>
index 15158bc..e4be865 100755 (executable)
@@ -12,6 +12,7 @@ import argparse
 import json
 import numpy
 import os
+import shutil
 import subprocess
 import sys
 import tempfile
@@ -166,7 +167,13 @@ def ConformanceTestRunner(args):
                 for reference_basename, decoded_filename in exact_tests:
                     reference_filename = os.path.join(test_dir,
                                                       reference_basename)
-                    ok = ok & CompareBinaries(reference_filename, decoded_filename)
+                    binary_ok = CompareBinaries(reference_filename,
+                                                decoded_filename)
+                    if not binary_ok and args.update_on_failure:
+                        os.unlink(reference_filename)
+                        shutil.copy2(decoded_filename, reference_filename)
+                        binary_ok = True
+                    ok = ok & binary_ok
 
                 # Validate metadata.
                 with open(meta_filename, 'r') as f:
@@ -182,36 +189,50 @@ def ConformanceTestRunner(args):
                 with open(reference_icc, 'rb') as f:
                     reference_icc = f.read()
 
-                reference_npy = os.path.join(test_dir, 'reference_image.npy')
-                decoded_npy = os.path.join(work_dir, 'decoded_image.npy')
+                reference_npy_fn = os.path.join(test_dir, 'reference_image.npy')
+                decoded_npy_fn = os.path.join(work_dir, 'decoded_image.npy')
 
-                if not os.path.exists(decoded_npy):
+                if not os.path.exists(decoded_npy_fn):
                     ok = Failure('File not decoded: decoded_image.npy')
                     continue
 
-                reference_npy = numpy.load(reference_npy)
-                decoded_npy = numpy.load(decoded_npy)
+                reference_npy = numpy.load(reference_npy_fn)
+                decoded_npy = numpy.load(decoded_npy_fn)
 
+                frames_ok = True
                 for i, fd in enumerate(descriptor['frames']):
-                    ok = ok & CompareNPY(reference_npy, reference_icc, decoded_npy,
-                                         decoded_icc, i, fd['rms_error'],
-                                         fd['peak_error'])
+                    frames_ok = frames_ok & CompareNPY(
+                        reference_npy, reference_icc, decoded_npy,
+                        decoded_icc, i, fd['rms_error'],
+                        fd['peak_error'])
+
+                if not frames_ok and args.update_on_failure:
+                    os.unlink(reference_npy_fn)
+                    shutil.copy2(decoded_npy_fn, reference_npy_fn)
+                    frames_ok = True
+                ok = ok & frames_ok
 
                 if 'preview' in descriptor:
-                    reference_npy = os.path.join(test_dir,
-                                                 'reference_preview.npy')
-                    decoded_npy = os.path.join(work_dir, 'decoded_preview.npy')
+                    reference_npy_fn = os.path.join(test_dir,
+                                                    'reference_preview.npy')
+                    decoded_npy_fn = os.path.join(work_dir,
+                                                  'decoded_preview.npy')
 
-                    if not os.path.exists(decoded_npy):
+                    if not os.path.exists(decoded_npy_fn):
                         ok = Failure(
                             'File not decoded: decoded_preview.npy')
 
-                    reference_npy = numpy.load(reference_npy)
-                    decoded_npy = numpy.load(decoded_npy)
-                    ok = ok & CompareNPY(reference_npy, reference_icc, decoded_npy,
-                                         decoded_icc, 0,
-                                         descriptor['preview']['rms_error'],
-                                         descriptor['preview']['peak_error'])
+                    reference_npy = numpy.load(reference_npy_fn)
+                    decoded_npy = numpy.load(decoded_npy_fn)
+                    preview_ok = CompareNPY(reference_npy, reference_icc,
+                                            decoded_npy, decoded_icc, 0,
+                                            descriptor['preview']['rms_error'],
+                                            descriptor['preview']['peak_error'])
+                    if not preview_ok & args.update_on_failure:
+                        os.unlink(reference_npy_fn)
+                        shutil.copy2(decoded_npy_fn, reference_npy_fn)
+                        preview_ok = True
+                    ok = ok & preview_ok
 
     return ok
 
@@ -228,6 +249,9 @@ def main():
         required=True,
         help=('path to the corpus directory or corpus descriptor'
               ' text file.'))
+    parser.add_argument(
+        '--update_on_failure', action='store_true',
+        help='If set, updates reference files on failing checks.')
     args = parser.parse_args()
     if not ConformanceTestRunner(args):
         sys.exit(1)
index f8313cd..09f6334 100644 (file)
@@ -8,8 +8,12 @@ import ctypes
 from numpy.ctypeslib import ndpointer
 import numpy
 import os
+import platform
 
-lcms2_lib_path = os.getenv("LCMS2_LIB_PATH", "liblcms2.so.2")
+IS_OSX = (platform.system() == "Darwin")
+
+default_libcms2_lib_path = ["liblcms2.so.2", "liblcms2.2.dylib"][IS_OSX]
+lcms2_lib_path = os.getenv("LCMS2_LIB_PATH", default_libcms2_lib_path)
 lcms2_lib = ctypes.cdll.LoadLibrary(lcms2_lib_path)
 
 native_open_profile = lcms2_lib.cmsOpenProfileFromMem
index 59b1d6d..3c1d8e9 100644 (file)
@@ -9,11 +9,12 @@
 
 #include "lib/extras/codec.h"
 #include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
+#include "tools/file_io.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
 namespace {
 
 // Reads an input file (typically PNM) with color_space hint and writes to an
@@ -27,16 +28,26 @@ int Convert(int argc, char** argv) {
   const std::string& desc = argv[2];
   const std::string& pathname_out = argv[3];
 
-  CodecInOut io;
-  extras::ColorHints color_hints;
-  ThreadPoolInternal pool(4);
+  std::vector<uint8_t> encoded_in;
+  if (!jpegxl::tools::ReadFile(pathname_in, &encoded_in)) {
+    fprintf(stderr, "Failed to read image from %s\n", pathname_in.c_str());
+    return 1;
+  }
+  jxl::CodecInOut io;
+  jxl::extras::ColorHints color_hints;
+  jpegxl::tools::ThreadPoolInternal pool(4);
   color_hints.Add("color_space", desc);
-  if (!SetFromFile(pathname_in, color_hints, &io, &pool)) {
-    fprintf(stderr, "Failed to read %s\n", pathname_in.c_str());
+  if (!jxl::SetFromBytes(jxl::Bytes(encoded_in), color_hints, &io, &pool)) {
+    fprintf(stderr, "Failed to decode %s\n", pathname_in.c_str());
     return 1;
   }
 
-  if (!EncodeToFile(io, pathname_out, &pool)) {
+  std::vector<uint8_t> encoded_out;
+  if (!jxl::Encode(io, pathname_out, &encoded_out, &pool)) {
+    fprintf(stderr, "Failed to encode %s\n", pathname_out.c_str());
+    return 1;
+  }
+  if (!jpegxl::tools::WriteFile(pathname_out, encoded_out)) {
     fprintf(stderr, "Failed to write %s\n", pathname_out.c_str());
     return 1;
   }
@@ -45,6 +56,5 @@ int Convert(int argc, char** argv) {
 }
 
 }  // namespace
-}  // namespace jxl
 
-int main(int argc, char** argv) { return jxl::Convert(argc, argv); }
+int main(int argc, char** argv) { return Convert(argc, argv); }
index 59f7089..8e97ff6 100644 (file)
@@ -3,11 +3,11 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/decode.h>
 #include <stdint.h>
 
-#include "jxl/decode.h"
-
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 int TestOneInput(const uint8_t* data, size_t size) {
   JxlDecoderStatus status;
@@ -40,19 +40,19 @@ int TestOneInput(const uint8_t* data, size_t size) {
     return 0;
   }
 
-  JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
-  JxlDecoderGetColorAsEncodedProfile(
-      dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL, nullptr);
+  JxlDecoderGetColorAsEncodedProfile(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                     nullptr);
   size_t dec_profile_size;
-  JxlDecoderGetICCProfileSize(dec, &format, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+  JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
                               &dec_profile_size);
 
   JxlDecoderDestroy(dec);
   return 0;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  return jxl::TestOneInput(data, size);
+  return jpegxl::tools::TestOneInput(data, size);
 }
diff --git a/tools/djpegli.cc b/tools/djpegli.cc
new file mode 100644 (file)
index 0000000..bac55e1
--- /dev/null
@@ -0,0 +1,197 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+#include <vector>
+
+#include "lib/extras/dec/jpegli.h"
+#include "lib/extras/enc/apng.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/time.h"
+#include "lib/jxl/base/printf_macros.h"
+#include "tools/cmdline.h"
+#include "tools/file_io.h"
+#include "tools/speed_stats.h"
+
+namespace jpegxl {
+namespace tools {
+namespace {
+
+struct Args {
+  void AddCommandLineOptions(CommandLineParser* cmdline) {
+    std::string output_help("The output can be ");
+    if (jxl::extras::GetAPNGEncoder()) {
+      output_help.append("PNG, ");
+    }
+    output_help.append("PFM or PPM/PGM/PNM");
+    cmdline->AddPositionalOption("INPUT", /* required = */ true,
+                                 "The JPG input file.", &file_in);
+
+    cmdline->AddPositionalOption("OUTPUT", /* required = */ true, output_help,
+                                 &file_out);
+    cmdline->AddOptionFlag('\0', "disable_output",
+                           "No output file will be written (for benchmarking)",
+                           &disable_output, &SetBooleanTrue);
+
+    cmdline->AddOptionValue('\0', "bitdepth", "8|16",
+                            "Sets the output bitdepth for integer based "
+                            "formats, can be 8 (default) "
+                            "or 16. Has no impact on PFM output.",
+                            &bitdepth, &ParseUnsigned);
+
+    cmdline->AddOptionValue('\0', "num_reps", "N",
+                            "Sets the number of times to decompress the image. "
+                            "Used for benchmarking, the default is 1.",
+                            &num_reps, &ParseUnsigned);
+
+    cmdline->AddOptionFlag('\0', "quiet", "Silence output (except for errors).",
+                           &quiet, &SetBooleanTrue);
+  }
+
+  const char* file_in = nullptr;
+  const char* file_out = nullptr;
+  bool disable_output = false;
+  size_t bitdepth = 8;
+  size_t num_reps = 1;
+  bool quiet = false;
+};
+
+bool ValidateArgs(const Args& args) {
+  if (args.bitdepth != 8 && args.bitdepth != 16) {
+    fprintf(stderr, "Invalid --bitdepth argument\n");
+    return false;
+  }
+  return true;
+}
+
+void SetDecompressParams(const Args& args, const std::string& extension,
+                         jxl::extras::JpegDecompressParams* params) {
+  if (extension == ".pfm") {
+    params->output_data_type = JXL_TYPE_FLOAT;
+    params->output_endianness = JXL_BIG_ENDIAN;
+  } else if (args.bitdepth == 16) {
+    params->output_data_type = JXL_TYPE_UINT16;
+    params->output_endianness = JXL_BIG_ENDIAN;
+  }
+  if (extension == ".pgm") {
+    params->force_grayscale = true;
+  } else if (extension == ".ppm") {
+    params->force_rgb = true;
+  }
+}
+
+int DJpegliMain(int argc, const char* argv[]) {
+  Args args;
+  CommandLineParser cmdline;
+  args.AddCommandLineOptions(&cmdline);
+
+  if (!cmdline.Parse(argc, const_cast<const char**>(argv))) {
+    // Parse already printed the actual error cause.
+    fprintf(stderr, "Use '%s -h' for more information.\n", argv[0]);
+    return EXIT_FAILURE;
+  }
+
+  if (cmdline.HelpFlagPassed() || !args.file_in) {
+    cmdline.PrintHelp();
+    return EXIT_SUCCESS;
+  }
+
+  if (!args.file_out && !args.disable_output) {
+    fprintf(stderr,
+            "No output file specified and --disable_output flag not passed.\n");
+    return EXIT_FAILURE;
+  }
+
+  if (args.disable_output && !args.quiet) {
+    fprintf(stderr,
+            "Decoding will be performed, but the result will be discarded.\n");
+  }
+
+  if (!ValidateArgs(args)) {
+    return EXIT_FAILURE;
+  }
+
+  std::vector<uint8_t> jpeg_bytes;
+  if (!ReadFile(args.file_in, &jpeg_bytes)) {
+    fprintf(stderr, "Failed to read input image %s\n", args.file_in);
+    return EXIT_FAILURE;
+  }
+
+  if (!args.quiet) {
+    fprintf(stderr, "Read %" PRIuS " compressed bytes.\n", jpeg_bytes.size());
+  }
+
+  std::string filename_out;
+  std::string extension;
+  if (args.file_out) {
+    filename_out = std::string(args.file_out);
+    size_t pos = filename_out.find_last_of('.');
+    if (pos >= filename_out.size()) {
+      fprintf(stderr, "Unrecognized output extension.\n");
+      return EXIT_FAILURE;
+    }
+    extension = filename_out.substr(pos);
+  }
+
+  jxl::extras::JpegDecompressParams dparams;
+  SetDecompressParams(args, extension, &dparams);
+
+  jxl::extras::PackedPixelFile ppf;
+  jpegxl::tools::SpeedStats stats;
+  for (size_t num_rep = 0; num_rep < args.num_reps; ++num_rep) {
+    const double t0 = jxl::Now();
+    if (!jxl::extras::DecodeJpeg(jpeg_bytes, dparams, nullptr, &ppf)) {
+      fprintf(stderr, "jpegli decoding failed\n");
+      return EXIT_FAILURE;
+    }
+    const double t1 = jxl::Now();
+    stats.NotifyElapsed(t1 - t0);
+    stats.SetImageSize(ppf.info.xsize, ppf.info.ysize);
+  }
+
+  if (!args.quiet) {
+    stats.Print(1);
+  }
+
+  if (args.disable_output) {
+    return EXIT_SUCCESS;
+  }
+
+  if (extension == ".pnm") {
+    extension = ppf.info.num_color_channels == 3 ? ".ppm" : ".pgm";
+  }
+
+  std::unique_ptr<jxl::extras::Encoder> encoder =
+      jxl::extras::Encoder::FromExtension(extension);
+  if (encoder == nullptr) {
+    fprintf(stderr, "Can't decode to the file extension '%s'\n",
+            extension.c_str());
+    return EXIT_FAILURE;
+  }
+  jxl::extras::EncodedImage encoded_image;
+  if (!encoder->Encode(ppf, &encoded_image) ||
+      encoded_image.bitstreams.empty()) {
+    fprintf(stderr, "Encode failed\n");
+    return EXIT_FAILURE;
+  }
+  if (!WriteFile(filename_out, encoded_image.bitstreams[0])) {
+    fprintf(stderr, "Failed to write output file %s\n", filename_out.c_str());
+    return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
+
+}  // namespace
+}  // namespace tools
+}  // namespace jpegxl
+
+int main(int argc, const char* argv[]) {
+  return jpegxl::tools::DJpegliMain(argc, argv);
+}
index a03472a..4691eb4 100644 (file)
@@ -3,24 +3,22 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
 #include <limits.h>
 #include <stdint.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #include <algorithm>
+#include <hwy/targets.h>
 #include <map>
 #include <mutex>
 #include <random>
 #include <vector>
 
-#include "hwy/targets.h"
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/thread_parallel_runner.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-
 namespace {
 
 // Externally visible value to ensure pixels are used in the fuzzer.
@@ -81,10 +79,10 @@ bool DecodeJpegXl(const uint8_t* jxl, size_t size, size_t max_pixels,
   auto dec = JxlDecoderMake(nullptr);
   if (JXL_DEC_SUCCESS !=
       JxlDecoderSubscribeEvents(
-          dec.get(), JXL_DEC_BASIC_INFO | JXL_DEC_EXTENSIONS |
-                         JXL_DEC_COLOR_ENCODING | JXL_DEC_PREVIEW_IMAGE |
-                         JXL_DEC_FRAME | JXL_DEC_FULL_IMAGE |
-                         JXL_DEC_JPEG_RECONSTRUCTION | JXL_DEC_BOX)) {
+          dec.get(), JXL_DEC_BASIC_INFO | JXL_DEC_COLOR_ENCODING |
+                         JXL_DEC_PREVIEW_IMAGE | JXL_DEC_FRAME |
+                         JXL_DEC_FULL_IMAGE | JXL_DEC_JPEG_RECONSTRUCTION |
+                         JXL_DEC_BOX)) {
     return false;
   }
   if (JXL_DEC_SUCCESS != JxlDecoderSetParallelRunner(dec.get(),
@@ -111,7 +109,6 @@ bool DecodeJpegXl(const uint8_t* jxl, size_t size, size_t max_pixels,
   }
 
   bool seen_basic_info = false;
-  bool seen_extensions = false;
   bool seen_color_encoding = false;
   bool seen_preview = false;
   bool seen_need_image_out = false;
@@ -213,6 +210,7 @@ bool DecodeJpegXl(const uint8_t* jxl, size_t size, size_t max_pixels,
         return false;
       }
     } else if (status == JXL_DEC_JPEG_NEED_MORE_OUTPUT) {
+      if (want_preview) abort();  // expected preview before frame
       if (spec.jpeg_to_pixels) abort();
       if (!seen_jpeg_reconstruction) abort();
       seen_jpeg_need_more_output = true;
@@ -274,12 +272,6 @@ bool DecodeJpegXl(const uint8_t* jxl, size_t size, size_t max_pixels,
         }
         Consume(ec_name.cbegin(), ec_name.cend());
       }
-    } else if (status == JXL_DEC_EXTENSIONS) {
-      if (!seen_basic_info) abort();     // expected basic info first
-      if (seen_color_encoding) abort();  // should happen after this
-      if (seen_extensions) abort();      // already seen extensions
-      seen_extensions = true;
-      // TODO(eustas): get extensions?
     } else if (status == JXL_DEC_COLOR_ENCODING) {
       if (!seen_basic_info) abort();     // expected basic info first
       if (seen_color_encoding) abort();  // already seen color encoding
@@ -288,14 +280,13 @@ bool DecodeJpegXl(const uint8_t* jxl, size_t size, size_t max_pixels,
       // Get the ICC color profile of the pixel data
       size_t icc_size;
       if (JXL_DEC_SUCCESS !=
-          JxlDecoderGetICCProfileSize(
-              dec.get(), &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size)) {
+          JxlDecoderGetICCProfileSize(dec.get(), JXL_COLOR_PROFILE_TARGET_DATA,
+                                      &icc_size)) {
         return false;
       }
       icc_profile->resize(icc_size);
       if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
-                                 dec.get(), &format,
-                                 JXL_COLOR_PROFILE_TARGET_DATA,
+                                 dec.get(), JXL_COLOR_PROFILE_TARGET_DATA,
                                  icc_profile->data(), icc_profile->size())) {
         return false;
       }
@@ -313,6 +304,7 @@ bool DecodeJpegXl(const uint8_t* jxl, size_t size, size_t max_pixels,
         }
       }
     } else if (status == JXL_DEC_PREVIEW_IMAGE) {
+      // TODO(eustas): test JXL_DEC_NEED_PREVIEW_OUT_BUFFER
       if (seen_preview) abort();
       if (!want_preview) abort();
       if (!seen_color_encoding) abort();
@@ -404,7 +396,10 @@ bool DecodeJpegXl(const uint8_t* jxl, size_t size, size_t max_pixels,
         }
       }
     } else if (status == JXL_DEC_JPEG_RECONSTRUCTION) {
-      if (want_preview) abort();  // expected preview before frame
+      // Do not check preview precedence here, since this event only declares
+      // that JPEG is going to be decoded; though, when first byte of JPEG
+      // arrives (JXL_DEC_JPEG_NEED_MORE_OUTPUT) it is certain that preview
+      // should have been produced already.
       if (seen_jpeg_reconstruction) abort();
       seen_jpeg_reconstruction = true;
       if (!spec.jpeg_to_pixels) {
similarity index 87%
rename from tools/fuzzer_corpus.cc
rename to tools/djxl_fuzzer_corpus.cc
index 159256c..73c7eae 100644 (file)
@@ -15,6 +15,8 @@
 #include <unistd.h>
 #endif
 
+#include <jxl/cms.h>
+
 #include <algorithm>
 #include <functional>
 #include <iostream>
 #include <random>
 #include <vector>
 
-#if JPEGXL_ENABLE_JPEG
 #include "lib/extras/codec.h"
-#endif
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/file_io.h"
 #include "lib/jxl/base/override.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/enc_ans.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_external_image.h"
-#include "lib/jxl/enc_file.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
+#include "lib/jxl/test_utils.h"  // TODO(eustas): cut this dependency
+#include "tools/file_io.h"
+#include "tools/thread_pool_internal.h"
 
 namespace {
 
@@ -175,7 +173,6 @@ bool GenerateFile(const char* output_dir, const ImageSpec& spec,
   }
   io.metadata.m.SetAlphaBits(spec.alpha_bit_depth, spec.alpha_is_premultiplied);
   io.metadata.m.orientation = spec.orientation;
-  io.dec_pixels = spec.width * spec.height;
   io.frames.clear();
   io.frames.reserve(spec.num_frames);
 
@@ -214,46 +211,43 @@ bool GenerateFile(const char* output_dir, const ImageSpec& spec,
         }
       }
     }
-
+    uint32_t num_channels = bytes_per_pixel / bytes_per_sample;
+    JxlDataType data_type =
+        bytes_per_sample == 1 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16;
+    JxlPixelFormat format = {num_channels, data_type, JXL_LITTLE_ENDIAN, 0};
     const jxl::Span<const uint8_t> span(img_data.data(), img_data.size());
     JXL_RETURN_IF_ERROR(ConvertFromExternal(
         span, spec.width, spec.height, io.metadata.m.color_encoding,
-        bytes_per_pixel / bytes_per_sample,
-        /*alpha_is_premultiplied=*/spec.alpha_is_premultiplied,
-        io.metadata.m.bit_depth.bits_per_sample, JXL_LITTLE_ENDIAN, nullptr,
-        &ib, /*float_in=*/false, /*align=*/0));
+        io.metadata.m.bit_depth.bits_per_sample, format, nullptr, &ib));
     io.frames.push_back(std::move(ib));
   }
 
   jxl::CompressParams params;
   params.speed_tier = spec.params.speed_tier;
 
-#if JPEGXL_ENABLE_JPEG
   if (spec.is_reconstructible_jpeg) {
     // If this image is supposed to be a reconstructible JPEG, collect the JPEG
     // metadata and encode it in the beginning of the compressed bytes.
     std::vector<uint8_t> jpeg_bytes;
     io.jpeg_quality = 70;
-    JXL_RETURN_IF_ERROR(jxl::Encode(io, jxl::extras::Codec::kJPG,
-                                    io.metadata.m.color_encoding,
-                                    /*bits_per_sample=*/8, &jpeg_bytes,
-                                    /*pool=*/nullptr));
+    JXL_QUIET_RETURN_IF_ERROR(jxl::Encode(io, jxl::extras::Codec::kJPG,
+                                          io.metadata.m.color_encoding,
+                                          /*bits_per_sample=*/8, &jpeg_bytes,
+                                          /*pool=*/nullptr));
     JXL_RETURN_IF_ERROR(jxl::jpeg::DecodeImageJPG(
-        jxl::Span<const uint8_t>(jpeg_bytes.data(), jpeg_bytes.size()), &io));
-    jxl::PaddedBytes jpeg_data;
+        jxl::Bytes(jpeg_bytes.data(), jpeg_bytes.size()), &io));
+    std::vector<uint8_t> jpeg_data;
     JXL_RETURN_IF_ERROR(
         EncodeJPEGData(*io.Main().jpeg_data, &jpeg_data, params));
     std::vector<uint8_t> header;
-    header.insert(header.end(), jxl::kContainerHeader,
-                  jxl::kContainerHeader + sizeof(jxl::kContainerHeader));
+    header.insert(header.end(), jxl::kContainerHeader.begin(),
+                  jxl::kContainerHeader.end());
     jxl::AppendBoxHeader(jxl::MakeBoxType("jbrd"), jpeg_data.size(), false,
                          &header);
-    header.insert(header.end(), jpeg_data.data(),
-                  jpeg_data.data() + jpeg_data.size());
+    jxl::Bytes(jpeg_data).AppendTo(&header);
     jxl::AppendBoxHeader(jxl::MakeBoxType("jxlc"), 0, true, &header);
     compressed.append(header);
   }
-#endif
 
   params.modular_mode = spec.params.modular_mode;
   params.color_transform = spec.params.color_transform;
@@ -263,13 +257,11 @@ bool GenerateFile(const char* output_dir, const ImageSpec& spec,
   if (spec.params.preview) params.preview = jxl::Override::kOn;
   if (spec.params.noise) params.noise = jxl::Override::kOn;
 
-  jxl::AuxOut aux_out;
   jxl::PassesEncoderState passes_encoder_state;
   // EncodeFile replaces output; pass a temporary storage for it.
-  jxl::PaddedBytes compressed_image;
-  bool ok =
-      jxl::EncodeFile(params, &io, &passes_encoder_state, &compressed_image,
-                      jxl::GetJxlCms(), &aux_out, nullptr);
+  std::vector<uint8_t> compressed_image;
+  bool ok = jxl::test::EncodeFile(params, &io, &passes_encoder_state,
+                                  &compressed_image);
   if (!ok) return false;
   compressed.append(compressed_image);
 
@@ -284,7 +276,7 @@ bool GenerateFile(const char* output_dir, const ImageSpec& spec,
     }
   }
 
-  if (!jxl::WriteFile(compressed, output_fn)) return 1;
+  if (!jpegxl::tools::WriteFile(output_fn, compressed)) return 1;
   if (!quiet) {
     std::unique_lock<std::mutex> lock(stderr_mutex);
     std::cerr << "Stored " << output_fn << " size: " << compressed.size()
@@ -331,7 +323,7 @@ int main(int argc, const char** argv) {
   const char* dest_dir = nullptr;
   bool regenerate = false;
   bool quiet = false;
-  int num_threads = std::thread::hardware_concurrency();
+  size_t num_threads = std::thread::hardware_concurrency();
   for (int optind = 1; optind < argc;) {
     if (!strcmp(argv[optind], "-r")) {
       regenerate = true;
@@ -410,12 +402,8 @@ int main(int argc, const char** argv) {
             for (uint32_t num_frames : {1, 3}) {
               spec.num_frames = num_frames;
               for (uint32_t preview : {0, 1}) {
-#if JPEGXL_ENABLE_JPEG
                 for (bool reconstructible_jpeg : {false, true}) {
                   spec.is_reconstructible_jpeg = reconstructible_jpeg;
-#else   // JPEGXL_ENABLE_JPEG
-                spec.is_reconstructible_jpeg = false;
-#endif  // JPEGXL_ENABLE_JPEG
                   for (const auto& params : params_list) {
                     spec.params = params;
 
@@ -439,9 +427,7 @@ int main(int argc, const char** argv) {
                       specs.push_back(spec);
                     }
                   }
-#if JPEGXL_ENABLE_JPEG
                 }
-#endif  // JPEGXL_ENABLE_JPEG
               }
             }
           }
@@ -457,15 +443,14 @@ int main(int argc, const char** argv) {
     specs.back().params.noise = true;
     specs.back().override_decoder_spec = 0;
 
-    jxl::ThreadPoolInternal pool{num_threads};
-    if (!RunOnPool(
-            &pool, 0, specs.size(), jxl::ThreadPool::NoInit,
-            [&specs, dest_dir, regenerate, quiet](const uint32_t task,
-                                                  size_t /* thread */) {
-              const ImageSpec& spec = specs[task];
-              GenerateFile(dest_dir, spec, regenerate, quiet);
-            },
-            "FuzzerCorpus")) {
+    jpegxl::tools::ThreadPoolInternal pool{num_threads};
+    const auto generate = [&specs, dest_dir, regenerate, quiet](
+                              const uint32_t task, size_t /* thread */) {
+      const ImageSpec& spec = specs[task];
+      GenerateFile(dest_dir, spec, regenerate, quiet);
+    };
+    if (!RunOnPool(&pool, 0, specs.size(), jxl::ThreadPool::NoInit, generate,
+                   "FuzzerCorpus")) {
       std::cerr << "Error generating fuzzer corpus" << std::endl;
       return 1;
     }
index e5b35c9..1b16584 100644 (file)
@@ -3,15 +3,16 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+
+#include <cstdint>
 #include <sstream>
 #include <string>
 #include <vector>
 
-#include "gtest/gtest.h"
-#include "jxl/thread_parallel_runner.h"
-#include "jxl/thread_parallel_runner_cxx.h"
 #include "lib/jxl/test_utils.h"
-#include "lib/jxl/testdata.h"
+#include "lib/jxl/testing.h"
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
 
@@ -39,6 +40,6 @@ TEST_P(DjxlFuzzerTest, TestOne) {
   std::ostringstream os;
   os << "oss-fuzz/clusterfuzz-testcase-minimized-djxl_fuzzer-" << id;
   printf("Testing %s\n", os.str().c_str());
-  const jxl::PaddedBytes input = jxl::ReadTestData(os.str());
+  const std::vector<uint8_t> input = jxl::test::ReadTestData(os.str());
   LLVMFuzzerTestOneInput(input.data(), input.size());
 }
index 44971c0..9abe0b6 100644 (file)
@@ -3,7 +3,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/decode.h>
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <jxl/types.h>
+
 #include <climits>
+#include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #include <string>
 #include <vector>
 
-#include "jxl/decode.h"
-#include "jxl/thread_parallel_runner.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-#include "jxl/types.h"
+#include "lib/extras/alpha_blend.h"
+#include "lib/extras/codec.h"
 #include "lib/extras/dec/decode.h"
 #include "lib/extras/dec/jxl.h"
+#include "lib/extras/enc/apng.h"
 #include "lib/extras/enc/encode.h"
+#include "lib/extras/enc/exr.h"
+#include "lib/extras/enc/jpg.h"
 #include "lib/extras/enc/pnm.h"
 #include "lib/extras/packed_image.h"
 #include "lib/extras/time.h"
@@ -37,115 +44,178 @@ struct DecompressArgs {
   DecompressArgs() = default;
 
   void AddCommandLineOptions(CommandLineParser* cmdline) {
-    cmdline->AddPositionalOption("INPUT", /* required = */ true,
-                                 "The compressed input file.", &file_in);
-
-    cmdline->AddPositionalOption("OUTPUT", /* required = */ true,
-                                 "The output can be (A)PNG with ICC, JPG, or "
-                                 "PPM/PFM.",
+    std::string output_help("The output format can be ");
+    if (jxl::extras::GetAPNGEncoder()) {
+      output_help.append("PNG, APNG, ");
+    }
+    if (jxl::extras::GetJPEGEncoder()) {
+      output_help.append("JPEG, ");
+    } else {
+      output_help.append("JPEG (lossless reconstruction only), ");
+    }
+    if (jxl::extras::GetEXREncoder()) {
+      output_help.append("EXR, ");
+    }
+    output_help.append(
+        "PGM (for greyscale input), PPM (for color input), PNM, PFM, or PAM.\n"
+        "    To extract metadata, use output format EXIF, XMP, or JUMBF.\n"
+        "    The format is selected based on extension ('filename.png') or "
+        "prefix ('png:filename').\n"
+        "    Use '-' for output to stdout (e.g. 'ppm:-')");
+    cmdline->AddPositionalOption(
+        "INPUT", /* required = */ true,
+        "The compressed input file (JXL). Use '-' for input from stdin.",
+        &file_in);
+
+    cmdline->AddPositionalOption("OUTPUT", /* required = */ true, output_help,
                                  &file_out);
 
+    cmdline->AddHelpText("\nBasic options:", 0);
+
     cmdline->AddOptionFlag('V', "version", "Print version number and exit.",
-                           &version, &SetBooleanTrue);
+                           &version, &SetBooleanTrue, 0);
+    cmdline->AddOptionFlag('\0', "quiet", "Silence output (except for errors).",
+                           &quiet, &SetBooleanTrue, 0);
+    cmdline->AddOptionFlag('v', "verbose",
+                           "Verbose output; can be repeated and also applies "
+                           "to help (!).",
+                           &verbose, &SetBooleanTrue);
 
-    cmdline->AddOptionValue('\0', "num_reps", "N",
-                            "Sets the number of times to decompress the image. "
-                            "Used for benchmarking, the default is 1.",
-                            &num_reps, &ParseUnsigned);
+    cmdline->AddHelpText("\nAdvanced options:", 1);
 
     cmdline->AddOptionValue('\0', "num_threads", "N",
-                            "Sets the number of threads to use. The default 0 "
-                            "value means the machine default.",
-                            &num_threads, &ParseUnsigned);
-
-    cmdline->AddOptionValue('\0', "bits_per_sample", "N",
-                            "Sets the output bit depth. The default 0 value "
-                            "means the original (input) bit depth.",
-                            &bits_per_sample, &ParseUnsigned);
+                            "Number of worker threads (-1 == use machine "
+                            "default, 0 == do not use multithreading).",
+                            &num_threads, &ParseSigned, 1);
+
+    opt_bits_per_sample_id = cmdline->AddOptionValue(
+        '\0', "bits_per_sample", "N",
+        "Sets the output bit depth. The value 0 (default for PNM) "
+        "means the original (input) bit depth.\n"
+        "    The value -1 (default for other codecs) means it depends on the "
+        "output format capabilities\n"
+        "    and the input bit depth (e.g. decoding a 12-bit image to PNG will "
+        "produce a 16-bit PNG).",
+        &bits_per_sample, &ParseSigned, 1);
 
     cmdline->AddOptionValue('\0', "display_nits", "N",
                             "If set to a non-zero value, tone maps the image "
                             "the given peak display luminance.",
-                            &display_nits, &ParseDouble);
-
-    cmdline->AddOptionValue('\0', "color_space", "COLORSPACE_DESC",
-                            "Sets the output color space of the image. This "
-                            "flag has no effect if the image is not XYB "
-                            "encoded.",
-                            &color_space, &ParseString);
-
-    cmdline->AddOptionValue('s', "downsampling", "N",
-                            "If set and the input JXL stream is progressive "
-                            "and contains hints for target downsampling "
-                            "ratios, the decoder will skip any progressive "
-                            "passes that are not needed to produce a partially "
-                            "decoded image intended for this downsampling "
-                            "ratio.",
-                            &downsampling, &ParseUint32);
+                            &display_nits, &ParseDouble, 1);
+
+    cmdline->AddOptionValue(
+        '\0', "color_space", "COLORSPACE_DESC",
+        "Sets the desired output color space of the image. For example:\n"
+        "      --color_space=RGB_D65_SRG_Per_SRG is sRGB with perceptual "
+        "rendering intent\n"
+        "      --color_space=RGB_D65_202_Rel_PeQ is Rec.2100 PQ with relative "
+        "rendering intent",
+        &color_space, &ParseString, 1);
+
+    cmdline->AddOptionValue('s', "downsampling", "1|2|4|8",
+                            "If the input JXL stream is contains hints for "
+                            "target downsampling ratios,\n"
+                            "    only decode what is needed to produce an "
+                            "image intended for this downsampling ratio.",
+                            &downsampling, &ParseUint32, 1);
 
     cmdline->AddOptionFlag('\0', "allow_partial_files",
                            "Allow decoding of truncated files.",
-                           &allow_partial_files, &SetBooleanTrue);
+                           &allow_partial_files, &SetBooleanTrue, 1);
+
+    if (jxl::extras::GetJPEGEncoder()) {
+      cmdline->AddOptionFlag(
+          'j', "pixels_to_jpeg",
+          "By default, if the input JXL is a recompressed JPEG file, "
+          "djxl reconstructs that JPEG file.\n"
+          "    This flag causes the decoder to instead decode to pixels and "
+          "encode a new (lossy) JPEG.",
+          &pixels_to_jpeg, &SetBooleanTrue, 1);
+
+      opt_jpeg_quality_id = cmdline->AddOptionValue(
+          'q', "jpeg_quality", "N",
+          "Sets the JPEG output quality, default is 95. "
+          "Setting this option implies --pixels_to_jpeg.",
+          &jpeg_quality, &ParseUnsigned, 1);
+    }
+
+    cmdline->AddHelpText("\nOptions for experimentation / benchmarking:", 2);
+
+    cmdline->AddOptionValue('\0', "num_reps", "N",
+                            "Sets the number of times to decompress the image. "
+                            "Useful for benchmarking. Default is 1.",
+                            &num_reps, &ParseUnsigned, 2);
+
+    cmdline->AddOptionFlag('\0', "disable_output",
+                           "No output file will be written (for benchmarking)",
+                           &disable_output, &SetBooleanTrue, 2);
+
+    cmdline->AddOptionFlag('\0', "output_extra_channels",
+                           "If set, all extra channels will be written either "
+                           "as part of the main output file (e.g. alpha "
+                           "channel in png) or as separate output files with "
+                           "suffix -ecN in their names. If not set, the "
+                           "(first) alpha channel will only be written when "
+                           "the output format supports alpha channels and all "
+                           "other extra channels won't be decoded. Files are "
+                           "concatenated when outputting to stdout.",
+                           &output_extra_channels, &SetBooleanTrue, 2);
 
-#if JPEGXL_ENABLE_JPEG
     cmdline->AddOptionFlag(
-        'j', "pixels_to_jpeg",
-        "By default, if the input JPEG XL contains a recompressed JPEG file, "
-        "djxl reconstructs the exact original JPEG file. This flag causes the "
-        "decoder to instead decode the image to pixels and encode a new "
-        "(lossy) JPEG. The output file if provided must be a .jpg or .jpeg "
-        "file.",
-        &pixels_to_jpeg, &SetBooleanTrue);
-
-    opt_jpeg_quality_id = cmdline->AddOptionValue(
-        'q', "jpeg_quality", "N",
-        "Sets the JPEG output quality, default is 95. Setting an output "
-        "quality implies --pixels_to_jpeg.",
-        &jpeg_quality, &ParseUnsigned);
-#endif
-
-#if JPEGXL_ENABLE_SJPEG
+        '\0', "output_frames",
+        "If set, all frames will be written either as part of the main output "
+        "file if that supports animation, or as separate output files with "
+        "suffix -N in their names. Files are concatenated when outputting to "
+        "stdout.",
+        &output_frames, &SetBooleanTrue, 2);
+
     cmdline->AddOptionFlag('\0', "use_sjpeg",
                            "Use sjpeg instead of libjpeg for JPEG output.",
-                           &use_sjpeg, &SetBooleanTrue);
-#endif
+                           &use_sjpeg, &SetBooleanTrue, 2);
 
     cmdline->AddOptionFlag('\0', "norender_spotcolors",
-                           "Disables rendering spot colors.",
-                           &render_spotcolors, &SetBooleanFalse);
+                           "Disables rendering of spot colors.",
+                           &render_spotcolors, &SetBooleanFalse, 2);
 
     cmdline->AddOptionValue('\0', "preview_out", "FILENAME",
                             "If specified, writes the preview image to this "
                             "file.",
-                            &preview_out, &ParseString);
+                            &preview_out, &ParseString, 2);
 
     cmdline->AddOptionValue(
         '\0', "icc_out", "FILENAME",
         "If specified, writes the ICC profile of the decoded image to "
         "this file.",
-        &icc_out, &ParseString);
+        &icc_out, &ParseString, 2);
 
     cmdline->AddOptionValue(
         '\0', "orig_icc_out", "FILENAME",
         "If specified, writes the ICC profile of the original image to "
-        "this file. This can be different from the ICC profile of the "
-        "decoded image if --color_space was specified, or if the image "
-        "was XYB encoded and the color conversion to the original "
-        "profile was not supported by the decoder.",
-        &orig_icc_out, &ParseString);
-
-    cmdline->AddOptionValue(
-        '\0', "metadata_out", "FILENAME",
-        "If specified, writes decoded metadata info to this file in "
-        "JSON format. Used by the conformance test script",
-        &metadata_out, &ParseString);
+        "this file\n"
+        "    This can be different from the ICC profile of the "
+        "decoded image if --color_space was specified.",
+        &orig_icc_out, &ParseString, 2);
+
+    cmdline->AddOptionValue('\0', "metadata_out", "FILENAME",
+                            "If specified, writes metadata info to a JSON "
+                            "file. Used by the conformance test script",
+                            &metadata_out, &ParseString, 2);
+
+    cmdline->AddOptionValue('\0', "background", "#NNNNNN",
+                            "Specifies the background color for the "
+                            "--alpha_blend option. Recognized values are "
+                            "'black', 'white' (default), or '#NNNNNN'",
+                            &background_spec, &ParseString, 2);
+
+    cmdline->AddOptionFlag('\0', "alpha_blend",
+                           "Blends alpha channel with the color image using "
+                           "background color specified by --background "
+                           "(default is white).",
+                           &alpha_blend, &SetBooleanTrue, 2);
 
     cmdline->AddOptionFlag('\0', "print_read_bytes",
                            "Print total number of decoded bytes.",
-                           &print_read_bytes, &SetBooleanTrue);
-
-    cmdline->AddOptionFlag('\0', "quiet", "Silence output (except for errors).",
-                           &quiet, &SetBooleanTrue);
+                           &print_read_bytes, &SetBooleanTrue, 2);
   }
 
   // Validate the passed arguments, checking whether all passed options are
@@ -155,15 +225,23 @@ struct DecompressArgs {
       fprintf(stderr, "Missing INPUT filename.\n");
       return false;
     }
+    if (num_threads < -1) {
+      fprintf(
+          stderr,
+          "Invalid flag value for --num_threads: must be -1, 0 or positive.\n");
+      return false;
+    }
     return true;
   }
 
   const char* file_in = nullptr;
   const char* file_out = nullptr;
   bool version = false;
+  bool verbose = false;
   size_t num_reps = 1;
-  size_t num_threads = 0;
-  size_t bits_per_sample = 0;
+  bool disable_output = false;
+  int32_t num_threads = -1;
+  int bits_per_sample = -1;
   double display_nits = 0.0;
   std::string color_space;
   uint32_t downsampling = 0;
@@ -172,13 +250,18 @@ struct DecompressArgs {
   size_t jpeg_quality = 95;
   bool use_sjpeg = false;
   bool render_spotcolors = true;
+  bool output_extra_channels = false;
+  bool output_frames = false;
   std::string preview_out;
   std::string icc_out;
   std::string orig_icc_out;
   std::string metadata_out;
+  std::string background_spec = "white";
+  bool alpha_blend = false;
   bool print_read_bytes = false;
   bool quiet = false;
   // References (ids) of specific options to check if they were matched.
+  CommandLineParser::OptionId opt_bits_per_sample_id = -1;
   CommandLineParser::OptionId opt_jpeg_quality_id = -1;
 };
 
@@ -192,20 +275,21 @@ bool WriteOptionalOutput(const std::string& filename,
   if (filename.empty() || bytes.empty()) {
     return true;
   }
-  return jpegxl::tools::WriteFile(filename.data(), bytes);
+  return jpegxl::tools::WriteFile(filename, bytes);
 }
 
-std::string Filename(const std::string& base, const std::string& extension,
+std::string Filename(const std::string& filename, const std::string& extension,
                      int layer_index, int frame_index, int num_layers,
                      int num_frames) {
+  if (filename == "-") return "-";
   auto digits = [](int n) { return 1 + static_cast<int>(std::log10(n)); };
-  std::string out = base;
+  std::string out = filename;
   if (num_frames > 1) {
     std::vector<char> buf(2 + digits(num_frames));
     snprintf(buf.data(), buf.size(), "-%0*d", digits(num_frames), frame_index);
     out.append(buf.data());
   }
-  if (num_layers > 1) {
+  if (num_layers > 1 && layer_index > 0) {
     std::vector<char> buf(4 + digits(num_layers));
     snprintf(buf.data(), buf.size(), "-ec%0*d", digits(num_layers),
              layer_index);
@@ -213,12 +297,49 @@ std::string Filename(const std::string& base, const std::string& extension,
   }
   if (extension == ".ppm" && layer_index > 0) {
     out.append(".pgm");
-  } else {
+  } else if ((num_frames > 1) || (num_layers > 1 && layer_index > 0)) {
     out.append(extension);
   }
   return out;
 }
 
+void AddFormatsWithAlphaChannel(std::vector<JxlPixelFormat>* formats) {
+  auto add_format = [&](JxlPixelFormat format) {
+    for (auto f : *formats) {
+      if (memcmp(&f, &format, sizeof(format)) == 0) return;
+    }
+    formats->push_back(format);
+  };
+  size_t num_formats = formats->size();
+  for (size_t i = 0; i < num_formats; ++i) {
+    JxlPixelFormat format = (*formats)[i];
+    if (format.num_channels == 1 || format.num_channels == 3) {
+      ++format.num_channels;
+      add_format(format);
+    }
+  }
+}
+
+bool ParseBackgroundColor(const std::string& background_desc,
+                          float background[3]) {
+  if (background_desc == "black") {
+    background[0] = background[1] = background[2] = 0.0f;
+    return true;
+  }
+  if (background_desc == "white") {
+    background[0] = background[1] = background[2] = 1.0f;
+    return true;
+  }
+  if (background_desc.size() != 7 || background_desc[0] != '#') {
+    return false;
+  }
+  uint32_t color = std::stoi(background_desc.substr(1), nullptr, 16);
+  background[0] = ((color >> 16) & 0xff) * (1.0f / 255);
+  background[1] = ((color >> 8) & 0xff) * (1.0f / 255);
+  background[2] = (color & 0xff) * (1.0f / 255);
+  return true;
+}
+
 bool DecompressJxlReconstructJPEG(const jpegxl::tools::DecompressArgs& args,
                                   const std::vector<uint8_t>& compressed,
                                   void* runner,
@@ -227,6 +348,7 @@ bool DecompressJxlReconstructJPEG(const jpegxl::tools::DecompressArgs& args,
   const double t0 = jxl::Now();
   jxl::extras::PackedPixelFile ppf;  // for JxlBasicInfo
   jxl::extras::JXLDecompressParams dparams;
+  dparams.allow_partial_input = args.allow_partial_files;
   dparams.runner = JxlThreadParallelRunner;
   dparams.runner_opaque = runner;
   if (!jxl::extras::DecodeImageJXL(compressed.data(), compressed.size(),
@@ -257,6 +379,13 @@ bool DecompressJxlToPackedPixelFile(
   dparams.runner = JxlThreadParallelRunner;
   dparams.runner_opaque = runner;
   dparams.allow_partial_input = args.allow_partial_files;
+  dparams.need_icc = !args.icc_out.empty();
+  if (args.bits_per_sample == 0) {
+    dparams.output_bitdepth.type = JXL_BIT_DEPTH_FROM_CODESTREAM;
+  } else if (args.bits_per_sample > 0) {
+    dparams.output_bitdepth.type = JXL_BIT_DEPTH_CUSTOM;
+    dparams.output_bitdepth.bits_per_sample = args.bits_per_sample;
+  }
   const double t0 = jxl::Now();
   if (!jxl::extras::DecodeImageJXL(compressed.data(), compressed.size(),
                                    dparams, decoded_bytes, ppf)) {
@@ -293,7 +422,7 @@ int main(int argc, const char* argv[]) {
     fprintf(stderr, "JPEG XL decoder %s\n", version.c_str());
   }
 
-  if (cmdline.HelpFlagPassed()) {
+  if (cmdline.HelpFlagPassed() || !args.file_in) {
     cmdline.PrintHelp();
     return EXIT_SUCCESS;
   }
@@ -311,29 +440,31 @@ int main(int argc, const char* argv[]) {
     return EXIT_FAILURE;
   }
   if (!args.quiet) {
-    fprintf(stderr, "Read %" PRIuS " compressed bytes.\n", compressed.size());
+    cmdline.VerbosePrintf(1, "Read %" PRIuS " compressed bytes.\n",
+                          compressed.size());
   }
 
-  if (!args.file_out && !args.quiet) {
+  if (!args.file_out && !args.disable_output) {
+    std::cerr
+        << "No output file specified and --disable_output flag not passed."
+        << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  if (args.file_out && args.disable_output && !args.quiet) {
     fprintf(stderr,
-            "No output file specified.\n"
             "Decoding will be performed, but the result will be discarded.\n");
   }
 
   std::string filename_out;
-  std::string base;
+  std::string filename;
   std::string extension;
-  if (args.file_out) {
+  jxl::extras::Codec codec = jxl::extras::Codec::kUnknown;
+  if (args.file_out && !args.disable_output) {
     filename_out = std::string(args.file_out);
-    size_t pos = filename_out.find_last_of('.');
-    if (pos < filename_out.size()) {
-      base = filename_out.substr(0, pos);
-      extension = filename_out.substr(pos);
-    } else {
-      base = filename_out;
-    }
+    codec = jxl::extras::CodecFromPath(
+        filename_out, /* bits_per_sample */ nullptr, &filename, &extension);
   }
-  const jxl::extras::Codec codec = jxl::extras::CodecFromExtension(extension);
   if (codec == jxl::extras::Codec::kEXR) {
     std::string force_colorspace = "RGB_D65_SRG_Rel_Lin";
     if (!args.color_space.empty() && args.color_space != force_colorspace) {
@@ -341,12 +472,17 @@ int main(int argc, const char* argv[]) {
     }
     args.color_space = force_colorspace;
   }
+  if (codec == jxl::extras::Codec::kPNM && extension != ".pfm" &&
+      (args.opt_jpeg_quality_id < 0 ||
+       !cmdline.GetOption(args.opt_jpeg_quality_id)->matched())) {
+    args.bits_per_sample = 0;
+  }
 
   jpegxl::tools::SpeedStats stats;
   size_t num_worker_threads = JxlThreadParallelRunnerDefaultNumWorkerThreads();
   {
     int64_t flag_num_worker_threads = args.num_threads;
-    if (flag_num_worker_threads != 0) {
+    if (flag_num_worker_threads > -1) {
       num_worker_threads = flag_num_worker_threads;
     }
   }
@@ -354,12 +490,11 @@ int main(int argc, const char* argv[]) {
       /*memory_manager=*/nullptr, num_worker_threads);
 
   bool decode_to_pixels = (codec != jxl::extras::Codec::kJPG);
-#if JPEGXL_ENABLE_JPEG
-  if (args.pixels_to_jpeg ||
-      cmdline.GetOption(args.opt_jpeg_quality_id)->matched()) {
+  if (args.opt_jpeg_quality_id >= 0 &&
+      (args.pixels_to_jpeg ||
+       cmdline.GetOption(args.opt_jpeg_quality_id)->matched())) {
     decode_to_pixels = true;
   }
-#endif
 
   size_t num_reps = args.num_reps;
   if (!decode_to_pixels) {
@@ -380,9 +515,8 @@ int main(int argc, const char* argv[]) {
       }
     }
     if (!bytes.empty()) {
-      if (!args.quiet) fprintf(stderr, "Reconstructed to JPEG.\n");
-      if (!filename_out.empty() &&
-          !jpegxl::tools::WriteFile(filename_out.c_str(), bytes)) {
+      if (!args.quiet) cmdline.VerbosePrintf(0, "Reconstructed to JPEG.\n");
+      if (!filename_out.empty() && !jpegxl::tools::WriteFile(filename, bytes)) {
         return EXIT_FAILURE;
       }
     }
@@ -398,6 +532,9 @@ int main(int argc, const char* argv[]) {
         return EXIT_FAILURE;
       }
       accepted_formats = encoder->AcceptedFormats();
+      if (args.alpha_blend) {
+        AddFormatsWithAlphaChannel(&accepted_formats);
+      }
     }
     jxl::extras::PackedPixelFile ppf;
     size_t decoded_bytes = 0;
@@ -409,57 +546,63 @@ int main(int argc, const char* argv[]) {
         return EXIT_FAILURE;
       }
     }
-    if (!args.quiet) fprintf(stderr, "Decoded to pixels.\n");
+    if (!args.quiet) cmdline.VerbosePrintf(0, "Decoded to pixels.\n");
     if (args.print_read_bytes) {
       fprintf(stderr, "Decoded bytes: %" PRIuS "\n", decoded_bytes);
     }
-    if (extension == ".pfm") {
-      ppf.info.bits_per_sample = 32;
-    } else if (args.bits_per_sample > 0) {
-      ppf.info.bits_per_sample = args.bits_per_sample;
-    }
-#if JPEGXL_ENABLE_JPEG
+    // When --disable_output was parsed, `filename_out` is empty and we don't
+    // need to write files.
     if (encoder) {
+      if (args.alpha_blend) {
+        float background[3];
+        if (!ParseBackgroundColor(args.background_spec, background)) {
+          fprintf(stderr, "Invalid background color %s\n",
+                  args.background_spec.c_str());
+        }
+        AlphaBlend(&ppf, background);
+      }
       std::ostringstream os;
       os << args.jpeg_quality;
       encoder->SetOption("q", os.str());
-    }
-#endif
-#if JPEGXL_ENABLE_SJPEG
-    if (encoder && args.use_sjpeg) {
-      encoder->SetOption("jpeg_encoder", "sjpeg");
-    }
-#endif
-    jxl::extras::EncodedImage encoded_image;
-    if (encoder) {
+      if (args.use_sjpeg) {
+        encoder->SetOption("jpeg_encoder", "sjpeg");
+      }
+      jxl::extras::EncodedImage encoded_image;
+      if (!args.quiet) cmdline.VerbosePrintf(2, "Encoding decoded image\n");
       if (!encoder->Encode(ppf, &encoded_image)) {
         fprintf(stderr, "Encode failed\n");
         return EXIT_FAILURE;
       }
-    }
-    size_t nlayers = 1 + encoded_image.extra_channel_bitstreams.size();
-    size_t nframes = encoded_image.bitstreams.size();
-    for (size_t i = 0; i < nlayers; ++i) {
-      for (size_t j = 0; j < nframes; ++j) {
-        const std::vector<uint8_t>& bitstream =
-            (i == 0 ? encoded_image.bitstreams[j]
-                    : encoded_image.extra_channel_bitstreams[i - 1][j]);
-        std::string fn = Filename(base, extension, i, j, nlayers, nframes);
-        if (!jpegxl::tools::WriteFile(fn.c_str(), bitstream)) {
-          return EXIT_FAILURE;
+      size_t nlayers = args.output_extra_channels
+                           ? 1 + encoded_image.extra_channel_bitstreams.size()
+                           : 1;
+      size_t nframes = args.output_frames ? encoded_image.bitstreams.size() : 1;
+      for (size_t i = 0; i < nlayers; ++i) {
+        for (size_t j = 0; j < nframes; ++j) {
+          const std::vector<uint8_t>& bitstream =
+              (i == 0 ? encoded_image.bitstreams[j]
+                      : encoded_image.extra_channel_bitstreams[i - 1][j]);
+          std::string fn =
+              Filename(filename, extension, i, j, nlayers, nframes);
+          if (!jpegxl::tools::WriteFile(fn.c_str(), bitstream)) {
+            return EXIT_FAILURE;
+          }
+          if (!args.quiet)
+            cmdline.VerbosePrintf(1, "Wrote output to %s\n", fn.c_str());
         }
       }
-    }
-    if (!WriteOptionalOutput(args.preview_out,
-                             encoded_image.preview_bitstream) ||
-        !WriteOptionalOutput(args.icc_out, ppf.icc) ||
-        !WriteOptionalOutput(args.orig_icc_out, ppf.orig_icc) ||
-        !WriteOptionalOutput(args.metadata_out, encoded_image.metadata)) {
-      return EXIT_FAILURE;
+      if (!WriteOptionalOutput(args.preview_out,
+                               encoded_image.preview_bitstream) ||
+          !WriteOptionalOutput(args.icc_out, ppf.icc) ||
+          !WriteOptionalOutput(args.orig_icc_out, ppf.orig_icc) ||
+          !WriteOptionalOutput(args.metadata_out, encoded_image.metadata)) {
+        return EXIT_FAILURE;
+      }
     }
   }
   if (!args.quiet) {
     stats.Print(num_worker_threads);
   }
+
   return EXIT_SUCCESS;
 }
diff --git a/tools/fast_lossless/README.md b/tools/fast_lossless/README.md
new file mode 100644 (file)
index 0000000..5f99c13
--- /dev/null
@@ -0,0 +1,10 @@
+# Fast-lossless
+This is a script to compile a standalone version of a JXL encoder that supports
+lossless compression, up to 16 bits, of 1- to 4-channel images and animations; it is
+very fast and compression is slightly worse than PNG for 8-bit nonphoto content
+and better or much better than PNG for all other situations.
+
+The main encoder is made out of two files, `lib/jxl/enc_fast_lossless.{cc,h}`;
+it automatically selects and runs a SIMD implementation supported by your CPU.
+
+This folder contains an example build script and `main` file.
similarity index 88%
rename from experimental/fast_lossless/build-android.sh
rename to tools/fast_lossless/build-android.sh
index 41452cd..c155b21 100755 (executable)
@@ -20,7 +20,8 @@ fi
 [ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
 [ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
 
-"$CXX" -O3 -DFASTLL_ENABLE_NEON_INTRINSICS -fopenmp \
+"$CXX" -O3 \
   -I. lodepng.o \
-  "${DIR}"/fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
+  -I"${DIR}"/../../ \
+  "${DIR}"/../../lib/jxl/enc_fast_lossless.cc "${DIR}"/fast_lossless_main.cc \
   -o fast_lossless
similarity index 79%
rename from experimental/fast_lossless/build.sh
rename to tools/fast_lossless/build.sh
index b2564c6..e2c0aa3 100755 (executable)
@@ -18,9 +18,10 @@ fi
 
 [ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
 [ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
-[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -mavx2 -o lodepng.o -c
+[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
 
-"$CXX" -O3 -mavx2 -DFASTLL_ENABLE_AVX2_INTRINSICS -fopenmp \
-  -I. lodepng.o \
-  "$DIR"/fast_lossless.cc "$DIR"/fast_lossless_main.cc \
+"$CXX" -O3 \
+  -I. -g lodepng.o \
+  -I"$DIR"/../../ \
+  "$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
   -o fast_lossless
diff --git a/tools/fast_lossless/cross_compile_aarch64.sh b/tools/fast_lossless/cross_compile_aarch64.sh
new file mode 100755 (executable)
index 0000000..a5e6aa2
--- /dev/null
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+set -e
+
+DIR=$(realpath "$(dirname "$0")")
+mkdir -p "$DIR"/build-aarch64
+cd "$DIR"/build-aarch64
+
+CXX="${CXX-aarch64-linux-gnu-c++}"
+if ! command -v "$CXX" >/dev/null ; then
+  printf >&2 '%s: C++ compiler not found\n' "${0##*/}"
+  exit 1
+fi
+
+[ -f lodepng.cpp ] || curl -o lodepng.cpp --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.cpp'
+[ -f lodepng.h ] || curl -o lodepng.h --url 'https://raw.githubusercontent.com/lvandeve/lodepng/8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a/lodepng.h'
+[ -f lodepng.o ] || "$CXX" lodepng.cpp -O3 -o lodepng.o -c
+
+"$CXX" -O3 -static \
+  -I. lodepng.o \
+  -I"$DIR"/../../ \
+  "$DIR"/../../lib/jxl/enc_fast_lossless.cc "$DIR"/fast_lossless_main.cc \
+  -o fast_lossless
similarity index 61%
rename from experimental/fast_lossless/fast_lossless_main.cc
rename to tools/fast_lossless/fast_lossless_main.cc
index 4db1ec4..b59051d 100644 (file)
@@ -7,16 +7,20 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include <atomic>
 #include <chrono>
 #include <thread>
+#include <vector>
 
-#include "fast_lossless.h"
+#include "lib/jxl/enc_fast_lossless.h"
 #include "lodepng.h"
 #include "pam-input.h"
 
 int main(int argc, char** argv) {
   if (argc < 3) {
-    fprintf(stderr, "Usage: %s in.png out.jxl [effort] [num_reps]\n", argv[0]);
+    fprintf(stderr,
+            "Usage: %s in.png out.jxl [effort] [num_reps] [num_threads]\n",
+            argv[0]);
     return 1;
   }
 
@@ -24,6 +28,7 @@ int main(int argc, char** argv) {
   const char* out = argv[2];
   int effort = argc >= 4 ? atoi(argv[3]) : 2;
   size_t num_reps = argc >= 5 ? atoi(argv[4]) : 1;
+  size_t num_threads = argc >= 6 ? atoi(argv[5]) : 0;
 
   if (effort < 0 || effort > 127) {
     fprintf(
@@ -44,6 +49,35 @@ int main(int argc, char** argv) {
     return 1;
   }
 
+  auto parallel_runner = [](void* num_threads_ptr, void* opaque,
+                            void fun(void*, size_t), size_t count) {
+    size_t num_threads = *(size_t*)num_threads_ptr;
+    if (num_threads == 0) {
+      num_threads = std::thread::hardware_concurrency();
+    }
+    if (num_threads > count) {
+      num_threads = count;
+    }
+    if (num_threads == 1) {
+      for (size_t i = 0; i < count; i++) {
+        fun(opaque, i);
+      }
+    } else {
+      std::atomic<int> task{0};
+      std::vector<std::thread> threads;
+      for (size_t i = 0; i < num_threads; i++) {
+        threads.push_back(std::thread([count, opaque, fun, &task]() {
+          while (true) {
+            int t = task++;
+            if (t >= count) break;
+            fun(opaque, t);
+          }
+        }));
+      }
+      for (auto& t : threads) t.join();
+    }
+  };
+
   size_t encoded_size = 0;
   unsigned char* encoded = nullptr;
   size_t stride = width * nb_chans * (bitdepth > 8 ? 2 : 1);
@@ -51,8 +85,9 @@ int main(int argc, char** argv) {
   auto start = std::chrono::high_resolution_clock::now();
   for (size_t _ = 0; _ < num_reps; _++) {
     free(encoded);
-    encoded_size = FastLosslessEncode(png, width, stride, height, nb_chans,
-                                      bitdepth, effort, &encoded);
+    encoded_size = JxlFastLosslessEncode(
+        png, width, stride, height, nb_chans, bitdepth,
+        /*big_endian=*/true, effort, &encoded, &num_threads, +parallel_runner);
   }
   auto stop = std::chrono::high_resolution_clock::now();
   if (num_reps > 1) {
similarity index 98%
rename from experimental/fast_lossless/pam-input.h
rename to tools/fast_lossless/pam-input.h
index 8bc41ef..b5a0233 100644 (file)
@@ -251,7 +251,10 @@ bool load_file(unsigned char** out, size_t* outsize, const char* filename) {
     return false;
   }
   *out = (unsigned char*)malloc(*outsize);
-  if (!(*out)) return false;
+  if (!(*out)) {
+    fclose(file);
+    return false;
+  }
   size_t readsize;
   readsize = fread(*out, 1, *outsize, file);
   fclose(file);
@@ -270,8 +273,8 @@ bool DecodePAM(const char* filename, uint8_t** buffer, size_t* w, size_t* h,
   const uint8_t* pos = nullptr;
   if (!parser.ParseHeader(&header, &pos)) return false;
 
-  if (header.bits_per_sample == 0 || header.bits_per_sample > 12) {
-    return error_msg("PNM: bits_per_sample invalid (can do at most 12-bit)");
+  if (header.bits_per_sample == 0 || header.bits_per_sample > 16) {
+    return error_msg("PNM: bits_per_sample invalid (can do at most 16-bit)");
   }
   *w = header.xsize;
   *h = header.ysize;
index 87e1439..09ea89c 100644 (file)
@@ -5,6 +5,7 @@
 
 #include <stdint.h>
 
+#include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/headers.h"
 #include "lib/jxl/modular/encoding/encoding.h"
 #include "lib/jxl/modular/transform/transform.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::BitReader;
+using ::jxl::Bytes;
+using ::jxl::CodecMetadata;
+using ::jxl::CustomTransformData;
+using ::jxl::ImageMetadata;
+using ::jxl::SizeHeader;
 
 int TestOneInput(const uint8_t* data, size_t size) {
   // Global parameters used by some headers.
@@ -23,23 +32,23 @@ int TestOneInput(const uint8_t* data, size_t size) {
 
   // First byte controls which header to parse.
   if (size == 0) return 0;
-  BitReader reader(Span<const uint8_t>(data + 1, size - 1));
+  BitReader reader(Bytes(data + 1, size - 1));
 #define FUZZER_CASE_HEADER(number, classname, ...) \
   case number: {                                   \
-    classname header{__VA_ARGS__};                 \
-    (void)Bundle::Read(&reader, &header);          \
+    ::jxl::classname header{__VA_ARGS__};          \
+    (void)jxl::Bundle::Read(&reader, &header);     \
     break;                                         \
   }
   switch (data[0]) {
     case 0: {
       SizeHeader size_header;
-      (void)ReadSizeHeader(&reader, &size_header);
+      (void)jxl::ReadSizeHeader(&reader, &size_header);
       break;
     }
 
     case 1: {
       ImageMetadata metadata;
-      (void)ReadImageMetadata(&reader, &metadata);
+      (void)jxl::ReadImageMetadata(&reader, &metadata);
       break;
     }
 
@@ -69,7 +78,7 @@ int TestOneInput(const uint8_t* data, size_t size) {
     default: {
       CustomTransformData transform_data;
       transform_data.nonserialized_xyb_encoded = true;
-      (void)Bundle::Read(&reader, &transform_data);
+      (void)jxl::Bundle::Read(&reader, &transform_data);
       break;
     }
   }
@@ -78,8 +87,9 @@ int TestOneInput(const uint8_t* data, size_t size) {
   return 0;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  return jxl::TestOneInput(data, size);
+  return jpegxl::tools::TestOneInput(data, size);
 }
diff --git a/tools/file_io.cc b/tools/file_io.cc
deleted file mode 100644 (file)
index bc7f3b1..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "tools/file_io.h"
-
-#include <errno.h>
-#include <limits.h>
-#include <stdio.h>
-#include <string.h>
-
-namespace jpegxl {
-namespace tools {
-
-bool ReadFile(const char* filename, std::vector<uint8_t>* out) {
-  FILE* file = fopen(filename, "rb");
-  if (!file) {
-    return false;
-  }
-
-  if (fseek(file, 0, SEEK_END) != 0) {
-    fclose(file);
-    return false;
-  }
-
-  long size = ftell(file);
-  // Avoid invalid file or directory.
-  if (size >= LONG_MAX || size < 0) {
-    fclose(file);
-    return false;
-  }
-
-  if (fseek(file, 0, SEEK_SET) != 0) {
-    fclose(file);
-    return false;
-  }
-
-  out->resize(size);
-  size_t readsize = fread(out->data(), 1, size, file);
-  if (fclose(file) != 0) {
-    return false;
-  }
-
-  return readsize == static_cast<size_t>(size);
-}
-
-bool WriteFile(const char* filename, const std::vector<uint8_t>& bytes) {
-  FILE* file = fopen(filename, "wb");
-  if (!file) {
-    fprintf(stderr,
-            "Could not open %s for writing\n"
-            "Error: %s",
-            filename, strerror(errno));
-    return false;
-  }
-  if (fwrite(bytes.data(), 1, bytes.size(), file) != bytes.size()) {
-    fprintf(stderr,
-            "Could not write to file\n"
-            "Error: %s",
-            strerror(errno));
-    return false;
-  }
-  if (fclose(file) != 0) {
-    fprintf(stderr,
-            "Could not close file\n"
-            "Error: %s",
-            strerror(errno));
-    return false;
-  }
-  return true;
-}
-
-}  // namespace tools
-}  // namespace jpegxl
index 959b79d..7d9f15d 100644 (file)
 #ifndef TOOLS_FILE_IO_H_
 #define TOOLS_FILE_IO_H_
 
+#include <errno.h>
+#include <limits.h>
 #include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
 
+#include <list>
+#include <string>
 #include <vector>
 
+#include "lib/jxl/base/compiler_specific.h"
+
 namespace jpegxl {
 namespace tools {
 
-bool ReadFile(const char* filename, std::vector<uint8_t>* out);
+namespace {
+
+// RAII, ensures files are closed even when returning early.
+class FileWrapper {
+ public:
+  FileWrapper(const FileWrapper& other) = delete;
+  FileWrapper& operator=(const FileWrapper& other) = delete;
+
+  explicit FileWrapper(const std::string& pathname, const char* mode)
+      : file_(pathname == "-" ? (mode[0] == 'r' ? stdin : stdout)
+                              : fopen(pathname.c_str(), mode)),
+        close_on_delete_(pathname != "-") {
+#ifdef _WIN32
+    struct __stat64 s = {};
+    const int err = _stat64(pathname.c_str(), &s);
+    const bool is_file = (s.st_mode & S_IFREG) != 0;
+#else
+    struct stat s = {};
+    const int err = stat(pathname.c_str(), &s);
+    const bool is_file = S_ISREG(s.st_mode);
+#endif
+    if (err == 0 && is_file) {
+      size_ = s.st_size;
+    }
+  }
+
+  ~FileWrapper() {
+    if (file_ != nullptr && close_on_delete_) {
+      const int err = fclose(file_);
+      if (err) {
+        fprintf(stderr,
+                "Could not close file\n"
+                "Error: %s",
+                strerror(errno));
+      }
+    }
+  }
+
+  // We intend to use FileWrapper as a replacement of FILE.
+  // NOLINTNEXTLINE(google-explicit-constructor)
+  operator FILE*() const { return file_; }
+
+  int64_t size() { return size_; }
+
+ private:
+  FILE* const file_;
+  bool close_on_delete_ = true;
+  int64_t size_ = -1;
+};
+
+}  // namespace
+
+template <typename ContainerType>
+static inline bool ReadFile(FileWrapper& f, ContainerType* JXL_RESTRICT bytes) {
+  if (!f) return false;
+
+  // Get size of file in bytes
+  const int64_t size = f.size();
+  if (size < 0) {
+    // Size is unknown, loop reading chunks until EOF.
+    bytes->clear();
+    std::list<std::vector<uint8_t>> chunks;
+
+    size_t total_size = 0;
+    while (true) {
+      std::vector<uint8_t> chunk(16 * 1024);
+      const size_t bytes_read = fread(chunk.data(), 1, chunk.size(), f);
+      if (ferror(f) || bytes_read > chunk.size()) {
+        return false;
+      }
+
+      chunk.resize(bytes_read);
+      total_size += bytes_read;
+      if (bytes_read != 0) {
+        chunks.emplace_back(std::move(chunk));
+      }
+      if (feof(f)) {
+        break;
+      }
+    }
+    bytes->resize(total_size);
+    size_t pos = 0;
+    for (const auto& chunk : chunks) {
+      memcpy(bytes->data() + pos, chunk.data(), chunk.size());
+      pos += chunk.size();
+    }
+  } else {
+    // Size is known, read the file directly.
+    bytes->resize(static_cast<size_t>(size));
+
+    const size_t bytes_read = fread(bytes->data(), 1, bytes->size(), f);
+    if (bytes_read != static_cast<size_t>(size)) return false;
+  }
+
+  return true;
+}
+
+template <typename ContainerType>
+static inline bool ReadFile(const std::string& filename,
+                            ContainerType* JXL_RESTRICT bytes) {
+  FileWrapper f(filename, "rb");
+  return ReadFile(f, bytes);
+}
 
-bool WriteFile(const char* filename, const std::vector<uint8_t>& bytes);
+template <typename ContainerType>
+static inline bool WriteFile(const std::string& filename,
+                             const ContainerType& bytes) {
+  FileWrapper file(filename, "wb");
+  if (!file) {
+    fprintf(stderr,
+            "Could not open %s for writing\n"
+            "Error: %s",
+            filename.c_str(), strerror(errno));
+    return false;
+  }
+  if (fwrite(bytes.data(), 1, bytes.size(), file) != bytes.size()) {
+    fprintf(stderr,
+            "Could not write to file\n"
+            "Error: %s",
+            strerror(errno));
+    return false;
+  }
+  return true;
+}
 
 }  // namespace tools
 }  // namespace jpegxl
index efa4716..427a34f 100644 (file)
@@ -3,9 +3,9 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-find_package(Qt5 QUIET COMPONENTS Widgets)
-if (NOT Qt5_FOUND)
-  message(WARNING "Qt5 was not found. The flicker test tool will not be built.")
+find_package(Qt6 QUIET COMPONENTS Widgets)
+if (NOT Qt6_FOUND)
+  message(WARNING "Qt6 was not found. The flicker test tool will not be built.")
   return()
 endif ()
 
@@ -32,7 +32,7 @@ add_executable(flicker_test WIN32
   test_window.ui)
 
 target_link_libraries(flicker_test PUBLIC
-  Qt5::Widgets
+  Qt6::Widgets
   image_loading
   icc_detect
 )
index 67985a9..9617765 100644 (file)
 int main(int argc, char** argv) {
   QApplication application(argc, argv);
 
-  jxl::FlickerTestWizard wizard;
+  jpegxl::tools::FlickerTestWizard wizard;
   if (wizard.exec()) {
-    jxl::FlickerTestWindow test_window(wizard.parameters());
+    jpegxl::tools::FlickerTestWindow test_window(wizard.parameters());
     if (test_window.proceedWithTest()) {
       test_window.showMaximized();
       return application.exec();
     }
   }
+  return 0;
 }
index 575edb0..460867b 100644 (file)
@@ -5,7 +5,8 @@
 
 #include "tools/flicker_test/parameters.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 namespace {
 
@@ -84,4 +85,5 @@ void FlickerTestParameters::saveTo(QSettings* const settings) const {
   settings->endGroup();
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index a063995..777d479 100644 (file)
@@ -8,7 +8,8 @@
 
 #include <QSettings>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 struct FlickerTestParameters {
   QString originalFolder;
@@ -27,6 +28,7 @@ struct FlickerTestParameters {
   void saveTo(QSettings* settings) const;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_FLICKER_TEST_PARAMETERS_H_
index bfcddd5..ff17286 100644 (file)
@@ -11,7 +11,8 @@
 #include <QMessageBox>
 #include <QPushButton>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 FlickerTestWizard::FlickerTestWizard(QWidget* const parent)
     : QWizard(parent), settings_("JPEG XL project", "Flickering test") {
@@ -148,4 +149,5 @@ bool FlickerTestWizard::validateCurrentPage() {
   return QWizard::validateCurrentPage();
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 0da78d6..e034e28 100644 (file)
@@ -11,7 +11,8 @@
 #include "tools/flicker_test/parameters.h"
 #include "tools/flicker_test/ui_setup.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 class FlickerTestWizard : public QWizard {
   Q_OBJECT
@@ -39,6 +40,7 @@ class FlickerTestWizard : public QWizard {
   QSettings settings_;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_FLICKER_TEST_SETUP_H_
index 055c7f7..44b850c 100644 (file)
@@ -11,6 +11,9 @@
   <property name="windowTitle">
    <string>New flicker test</string>
   </property>
+  <property name="wizardStyle">
+   <enum>QWizard::ClassicStyle</enum>
+  </property>
   <property name="options">
    <set>QWizard::NoBackButtonOnStartPage</set>
   </property>
   <widget class="QWizardPage" name="spacingPage">
    <layout class="QVBoxLayout" name="verticalLayout_3" stretch="1,0,0">
     <item>
-     <widget class="jxl::SplitView" name="spacingDemo" native="true"/>
+     <widget class="jpegxl::tools::SplitView" name="spacingDemo" native="true"/>
     </item>
     <item>
      <spacer name="verticalSpacer_2">
  </widget>
  <customwidgets>
   <customwidget>
-   <class>jxl::SplitView</class>
+   <class>jpegxl::tools::SplitView</class>
    <extends>QWidget</extends>
    <header>tools/flicker_test/split_view.h</header>
    <container>1</container>
index 3455d70..87df95e 100644 (file)
@@ -8,7 +8,8 @@
 #include <QMouseEvent>
 #include <QPainter>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 SplitView::SplitView(QWidget* const parent)
     : QWidget(parent), g_(std::random_device()()) {
@@ -37,12 +38,14 @@ SplitView::SplitView(QWidget* const parent)
 
 void SplitView::setOriginalImage(QImage image) {
   original_ = QPixmap::fromImage(std::move(image));
+  original_.setDevicePixelRatio(devicePixelRatio());
   updateMinimumSize();
   update();
 }
 
 void SplitView::setAlteredImage(QImage image) {
   altered_ = QPixmap::fromImage(std::move(image));
+  altered_.setDevicePixelRatio(devicePixelRatio());
   updateMinimumSize();
   update();
 }
@@ -139,15 +142,17 @@ void SplitView::paintEvent(QPaintEvent* const event) {
   QPixmap* const leftImage = imageForSide(Side::kLeft);
   QPixmap* const rightImage = imageForSide(Side::kRight);
 
-  leftRect_ = leftImage->rect();
+  leftRect_ = QRectF(QPoint(), leftImage->deviceIndependentSize());
   leftRect_.moveCenter(rect().center());
-  leftRect_.moveRight(rect().center().x() - spacing_ / 2 - spacing_ % 2);
-  painter.drawPixmap(leftRect_, *leftImage);
+  leftRect_.moveRight(rect().center().x() -
+                      (spacing_ / 2 + spacing_ % 2) / devicePixelRatio());
+  painter.drawPixmap(leftRect_.topLeft(), *leftImage);
 
-  rightRect_ = rightImage->rect();
+  rightRect_ = QRectF(QPoint(), rightImage->deviceIndependentSize());
   rightRect_.moveCenter(rect().center());
-  rightRect_.moveLeft(rect().center().x() + 1 + spacing_ / 2);
-  painter.drawPixmap(rightRect_, *rightImage);
+  rightRect_.moveLeft(rect().center().x() +
+                      (spacing_ / 2) / devicePixelRatio());
+  painter.drawPixmap(rightRect_.topLeft(), *rightImage);
 }
 
 void SplitView::startDisplaying() {
@@ -160,8 +165,12 @@ void SplitView::startDisplaying() {
 }
 
 void SplitView::updateMinimumSize() {
-  setMinimumWidth(2 * std::max(original_.width(), altered_.width()) + spacing_);
-  setMinimumHeight(std::max(original_.height(), altered_.height()));
+  setMinimumWidth(2 * std::max(original_.deviceIndependentSize().width(),
+                               altered_.deviceIndependentSize().width()) +
+                  spacing_ / devicePixelRatio());
+  setMinimumHeight(std::max(original_.deviceIndependentSize().height(),
+                            altered_.deviceIndependentSize().height()));
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index b4c7a1d..37c5f7e 100644 (file)
@@ -14,7 +14,8 @@
 #include <QWidget>
 #include <random>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 class SplitView : public QWidget {
   Q_OBJECT
@@ -67,7 +68,7 @@ class SplitView : public QWidget {
   Side originalSide_;
   bool clicking_ = false;
   Side clickedSide_;
-  QRect leftRect_, rightRect_;
+  QRectF leftRect_, rightRect_;
   State state_ = State::kDisplaying;
   bool gray_ = false;
   QTimer blankingTimer_;
@@ -79,6 +80,7 @@ class SplitView : public QWidget {
   QElapsedTimer viewingStartTime_;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_FLICKER_TEST_SPLIT_VIEW_H_
index f3827c5..c21ca6f 100644 (file)
@@ -13,7 +13,8 @@
 
 #include "tools/icc_detect/icc_detect.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 FlickerTestWindow::FlickerTestWindow(FlickerTestParameters parameters,
                                      QWidget* const parent)
@@ -181,4 +182,5 @@ retry:
       parameters_.grayFadingTimeMSecs, parameters_.grayTimeMSecs);
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 1dfe5fc..ad712af 100644 (file)
@@ -16,7 +16,8 @@
 #include "tools/flicker_test/parameters.h"
 #include "tools/flicker_test/ui_test_window.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 class FlickerTestWindow : public QMainWindow {
   Q_OBJECT
@@ -45,6 +46,7 @@ class FlickerTestWindow : public QMainWindow {
   QStringList remainingImages_;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_FLICKER_TEST_TEST_WINDOW_H_
index 7eb2619..bd42873 100644 (file)
@@ -64,7 +64,7 @@
         </item>
        </layout>
       </widget>
-      <widget class="jxl::SplitView" name="splitView"/>
+      <widget class="jpegxl::tools::SplitView" name="splitView"/>
       <widget class="QWidget" name="finalPage">
        <layout class="QVBoxLayout" name="verticalLayout_3">
         <item>
  </widget>
  <customwidgets>
   <customwidget>
-   <class>jxl::SplitView</class>
+   <class>jpegxl::tools::SplitView</class>
    <extends>QWidget</extends>
    <header>tools/flicker_test/split_view.h</header>
    <container>1</container>
index f984c00..2f30e9e 100644 (file)
@@ -3,14 +3,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/thread_parallel_runner.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+
 #include <fstream>
 #include <iostream>
 #include <iterator>
 #include <vector>
 
-#include "jxl/thread_parallel_runner.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size);
 
 void ProcessInput(const char* filename) {
index 227b22b..85eb1bd 100644 (file)
@@ -99,6 +99,22 @@ This is the mathematical inverse of `tools/render_hlg`. Furthermore,
 `tools/pq_to_hlg` is equivalent to `tools/tone_map -t 1000` followed by
 `tools/display_to_hlg -m 1000`.
 
+## OpenEXR to PQ
+
+`tools/exr_to_pq` converts an OpenEXR image into a Rec. 2020 + PQ image, which
+can be saved as a PNG or PPM file. Luminance information is taken from the
+`whiteLuminance` tag if the input has it, and otherwise defaults to treating
+(1, 1, 1) as 100 cd/m². It is also possible to override this using the
+`--luminance` (`-l`) flag, in two different ways:
+
+```shell
+# Specifies that the brightest pixel in the image happens to be 1500 cd/m².
+$ tools/exr_to_pq --luminance='max=1500' input.exr output.png
+
+# Specifies that (1, 1, 1) in the input file is 203 cd/m².
+$ tools/exr_to_pq --luminance='white=203' input.exr output.png
+```
+
 # LUT generation
 
 There are additionally two tools that can be used to generate look-up tables
index a2caef2..8fa8fde 100644 (file)
@@ -9,13 +9,15 @@
 #include "lib/extras/codec.h"
 #include "lib/extras/hlg.h"
 #include "lib/extras/tone_mapping.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/enc_color_management.h"
+#include "lib/jxl/base/span.h"
 #include "tools/args.h"
 #include "tools/cmdline.h"
+#include "tools/file_io.h"
+#include "tools/hdr/image_utils.h"
+#include "tools/thread_pool_internal.h"
 
 int main(int argc, const char** argv) {
-  jxl::ThreadPoolInternal pool;
+  jpegxl::tools::ThreadPoolInternal pool;
 
   jpegxl::tools::CommandLineParser parser;
   float max_nits = 0;
@@ -64,9 +66,11 @@ int main(int argc, const char** argv) {
     return EXIT_FAILURE;
   }
 
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jpegxl::tools::ReadFile(input_filename, &encoded));
   jxl::CodecInOut image;
-  JXL_CHECK(jxl::SetFromFile(input_filename, jxl::extras::ColorHints(), &image,
-                             &pool));
+  JXL_CHECK(jxl::SetFromBytes(jxl::Bytes(encoded), jxl::extras::ColorHints(),
+                              &image, &pool));
   image.metadata.m.SetIntensityTarget(max_nits);
   JXL_CHECK(jxl::HlgInverseOOTF(
       &image.Main(), jxl::GetHlgGamma(max_nits, surround_nits), &pool));
@@ -75,11 +79,12 @@ int main(int argc, const char** argv) {
 
   jxl::ColorEncoding hlg;
   hlg.SetColorSpace(jxl::ColorSpace::kRGB);
-  hlg.primaries = jxl::Primaries::k2100;
-  hlg.white_point = jxl::WhitePoint::kD65;
-  hlg.tf.SetTransferFunction(jxl::TransferFunction::kHLG);
+  JXL_CHECK(hlg.SetPrimariesType(jxl::Primaries::k2100));
+  JXL_CHECK(hlg.SetWhitePointType(jxl::WhitePoint::kD65));
+  hlg.Tf().SetTransferFunction(jxl::TransferFunction::kHLG);
   JXL_CHECK(hlg.CreateICC());
-  JXL_CHECK(image.TransformTo(hlg, jxl::GetJxlCms(), &pool));
+  JXL_CHECK(jpegxl::tools::TransformCodecInOutTo(image, hlg, &pool));
   image.metadata.m.color_encoding = hlg;
-  JXL_CHECK(jxl::EncodeToFile(image, output_filename, &pool));
+  JXL_CHECK(jxl::Encode(image, output_filename, &encoded, &pool));
+  JXL_CHECK(jpegxl::tools::WriteFile(output_filename, encoded));
 }
diff --git a/tools/hdr/exr_to_pq.cc b/tools/hdr/exr_to_pq.cc
new file mode 100644 (file)
index 0000000..c7ce1b7
--- /dev/null
@@ -0,0 +1,158 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/packed_image_convert.h"
+#include "lib/jxl/cms/jxl_cms_internal.h"
+#include "lib/jxl/image_bundle.h"
+#include "tools/cmdline.h"
+#include "tools/file_io.h"
+#include "tools/hdr/image_utils.h"
+#include "tools/thread_pool_internal.h"
+
+namespace {
+
+struct LuminanceInfo {
+  enum class Kind { kWhite, kMaximum };
+  Kind kind = Kind::kWhite;
+  float luminance = 100.f;
+};
+
+bool ParseLuminanceInfo(const char* argument, LuminanceInfo* luminance_info) {
+  if (strncmp(argument, "white=", 6) == 0) {
+    luminance_info->kind = LuminanceInfo::Kind::kWhite;
+    argument += 6;
+  } else if (strncmp(argument, "max=", 4) == 0) {
+    luminance_info->kind = LuminanceInfo::Kind::kMaximum;
+    argument += 4;
+  } else {
+    fprintf(stderr,
+            "Invalid prefix for luminance info, expected white= or max=\n");
+    return false;
+  }
+  return jpegxl::tools::ParseFloat(argument, &luminance_info->luminance);
+}
+
+}  // namespace
+
+int main(int argc, const char** argv) {
+  jpegxl::tools::ThreadPoolInternal pool;
+
+  jpegxl::tools::CommandLineParser parser;
+  LuminanceInfo luminance_info;
+  auto luminance_option =
+      parser.AddOptionValue('l', "luminance", "<max|white=N>",
+                            "luminance information (defaults to whiteLuminance "
+                            "header if present, otherwise to white=100)",
+                            &luminance_info, &ParseLuminanceInfo, 0);
+  const char* input_filename = nullptr;
+  auto input_filename_option = parser.AddPositionalOption(
+      "input", true, "input image", &input_filename, 0);
+  const char* output_filename = nullptr;
+  auto output_filename_option = parser.AddPositionalOption(
+      "output", true, "output image", &output_filename, 0);
+
+  if (!parser.Parse(argc, argv)) {
+    fprintf(stderr, "See -h for help.\n");
+    return EXIT_FAILURE;
+  }
+
+  if (parser.HelpFlagPassed()) {
+    parser.PrintHelp();
+    return EXIT_SUCCESS;
+  }
+
+  if (!parser.GetOption(input_filename_option)->matched()) {
+    fprintf(stderr, "Missing input filename.\nSee -h for help.\n");
+    return EXIT_FAILURE;
+  }
+  if (!parser.GetOption(output_filename_option)->matched()) {
+    fprintf(stderr, "Missing output filename.\nSee -h for help.\n");
+    return EXIT_FAILURE;
+  }
+
+  jxl::extras::PackedPixelFile ppf;
+  std::vector<uint8_t> input_bytes;
+  JXL_CHECK(jpegxl::tools::ReadFile(input_filename, &input_bytes));
+  JXL_CHECK(jxl::extras::DecodeBytes(jxl::Bytes(input_bytes),
+                                     jxl::extras::ColorHints(), &ppf));
+
+  jxl::CodecInOut image;
+  JXL_CHECK(
+      jxl::extras::ConvertPackedPixelFileToCodecInOut(ppf, &pool, &image));
+  image.metadata.m.bit_depth.exponent_bits_per_sample = 0;
+  jxl::ColorEncoding linear_rec_2020 = image.Main().c_current();
+  JXL_CHECK(linear_rec_2020.SetPrimariesType(jxl::Primaries::k2100));
+  linear_rec_2020.Tf().SetTransferFunction(jxl::TransferFunction::kLinear);
+  JXL_CHECK(linear_rec_2020.CreateICC());
+  JXL_CHECK(
+      jpegxl::tools::TransformCodecInOutTo(image, linear_rec_2020, &pool));
+
+  float primaries_xyz[9];
+  const jxl::PrimariesCIExy p = image.Main().c_current().GetPrimaries();
+  const jxl::CIExy wp = image.Main().c_current().GetWhitePoint();
+  JXL_CHECK(jxl::PrimariesToXYZ(p.r.x, p.r.y, p.g.x, p.g.y, p.b.x, p.b.y, wp.x,
+                                wp.y, primaries_xyz));
+
+  float max_value = 0.f;
+  float max_relative_luminance = 0.f;
+  float white_luminance = ppf.info.intensity_target != 0 &&
+                                  !parser.GetOption(luminance_option)->matched()
+                              ? ppf.info.intensity_target
+                          : luminance_info.kind == LuminanceInfo::Kind::kWhite
+                              ? luminance_info.luminance
+                              : 0.f;
+  bool out_of_gamut = false;
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const float* const rows[3] = {image.Main().color()->ConstPlaneRow(0, y),
+                                  image.Main().color()->ConstPlaneRow(1, y),
+                                  image.Main().color()->ConstPlaneRow(2, y)};
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      if (!out_of_gamut &&
+          (rows[0][x] < 0 || rows[1][x] < 0 || rows[2][x] < 0)) {
+        out_of_gamut = true;
+        fprintf(stderr,
+                "WARNING: found colors outside of the Rec. 2020 gamut.\n");
+      }
+      max_value = std::max(
+          max_value, std::max(rows[0][x], std::max(rows[1][x], rows[2][x])));
+      const float luminance = primaries_xyz[1] * rows[0][x] +
+                              primaries_xyz[4] * rows[1][x] +
+                              primaries_xyz[7] * rows[2][x];
+      if (luminance_info.kind == LuminanceInfo::Kind::kMaximum &&
+          luminance > max_relative_luminance) {
+        max_relative_luminance = luminance;
+        white_luminance = luminance_info.luminance / luminance;
+      }
+    }
+  }
+  jxl::ScaleImage(1.f / max_value, image.Main().color());
+  white_luminance *= max_value;
+  image.metadata.m.SetIntensityTarget(white_luminance);
+  if (white_luminance > 10000) {
+    fprintf(stderr,
+            "WARNING: the image is too bright for PQ (would need (1, 1, 1) to "
+            "be %g cd/m^2).\n",
+            white_luminance);
+  } else {
+    fprintf(stderr,
+            "The resulting image should be compressed with "
+            "--intensity_target=%g.\n",
+            white_luminance);
+  }
+
+  jxl::ColorEncoding pq = image.Main().c_current();
+  pq.Tf().SetTransferFunction(jxl::TransferFunction::kPQ);
+  JXL_CHECK(pq.CreateICC());
+  JXL_CHECK(jpegxl::tools::TransformCodecInOutTo(image, pq, &pool));
+  image.metadata.m.color_encoding = pq;
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jxl::Encode(image, output_filename, &encoded, &pool));
+  JXL_CHECK(jpegxl::tools::WriteFile(output_filename, encoded));
+}
index 626d54f..da8ecee 100644 (file)
@@ -7,12 +7,13 @@
 #include <stdlib.h>
 
 #include "lib/extras/codec.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/image_metadata.h"
 #include "tools/args.h"
 #include "tools/cmdline.h"
+#include "tools/thread_pool_internal.h"
 
 int main(int argc, const char** argv) {
-  jxl::ThreadPoolInternal pool;
+  jpegxl::tools::ThreadPoolInternal pool;
 
   jpegxl::tools::CommandLineParser parser;
   size_t N = 64;
@@ -55,6 +56,8 @@ int main(int argc, const char** argv) {
   jxl::CodecInOut output;
   output.metadata.m.bit_depth.bits_per_sample = 16;
   output.SetFromImage(std::move(image), jxl::ColorEncoding::SRGB());
-  JXL_CHECK(jxl::EncodeToFile(output, jxl::ColorEncoding::SRGB(), 16,
-                              output_filename, &pool));
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jxl::Encode(output, jxl::ColorEncoding::SRGB(), 16, output_filename,
+                        &encoded, &pool));
+  JXL_CHECK(jpegxl::tools::WriteFile(output_filename, encoded));
 }
diff --git a/tools/hdr/image_utils.h b/tools/hdr/image_utils.h
new file mode 100644 (file)
index 0000000..901c2b6
--- /dev/null
@@ -0,0 +1,35 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef TOOLS_HDR_IMAGE_UTILS_H_
+#define TOOLS_HDR_IMAGE_UTILS_H_
+
+#include <jxl/cms.h>
+#include <jxl/cms_interface.h>
+
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/codec_in_out.h"
+#include "lib/jxl/image_bundle.h"
+
+namespace jpegxl {
+namespace tools {
+
+static inline jxl::Status TransformCodecInOutTo(
+    jxl::CodecInOut& io, const jxl::ColorEncoding& c_desired,
+    jxl::ThreadPool* pool) {
+  const JxlCmsInterface& cms = *JxlGetDefaultCms();
+  if (io.metadata.m.have_preview) {
+    JXL_RETURN_IF_ERROR(io.preview_frame.TransformTo(c_desired, cms, pool));
+  }
+  for (jxl::ImageBundle& ib : io.frames) {
+    JXL_RETURN_IF_ERROR(ib.TransformTo(c_desired, cms, pool));
+  }
+  return true;
+}
+
+}  // namespace tools
+}  // namespace jpegxl
+
+#endif  // TOOLS_HDR_IMAGE_UTILS_H_
diff --git a/tools/hdr/local_tone_map.cc b/tools/hdr/local_tone_map.cc
new file mode 100644 (file)
index 0000000..b6582a6
--- /dev/null
@@ -0,0 +1,541 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <jxl/cms.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "lib/extras/codec.h"
+#include "lib/extras/tone_mapping.h"
+#include "lib/jxl/convolve.h"
+#include "lib/jxl/enc_gamma_correct.h"
+#include "lib/jxl/image_bundle.h"
+#include "tools/args.h"
+#include "tools/cmdline.h"
+#include "tools/thread_pool_internal.h"
+
+namespace jxl {
+namespace {
+
+constexpr WeightsSeparable5 kPyramidFilter = {
+    {HWY_REP4(.375f), HWY_REP4(.25f), HWY_REP4(.0625f)},
+    {HWY_REP4(.375f), HWY_REP4(.25f), HWY_REP4(.0625f)}};
+
+template <typename Tin, typename Tout>
+void Subtract(const Image3<Tin>& image1, const Image3<Tin>& image2,
+              Image3<Tout>* out) {
+  const size_t xsize = image1.xsize();
+  const size_t ysize = image1.ysize();
+  JXL_CHECK(xsize == image2.xsize());
+  JXL_CHECK(ysize == image2.ysize());
+
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ysize; ++y) {
+      const Tin* const JXL_RESTRICT row1 = image1.ConstPlaneRow(c, y);
+      const Tin* const JXL_RESTRICT row2 = image2.ConstPlaneRow(c, y);
+      Tout* const JXL_RESTRICT row_out = out->PlaneRow(c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_out[x] = row1[x] - row2[x];
+      }
+    }
+  }
+}
+
+// Adds `what` of the size of `rect` to `to` in the position of `rect`.
+template <typename Tin, typename Tout>
+void AddTo(const Rect& rect, const Image3<Tin>& what, Image3<Tout>* to) {
+  const size_t xsize = what.xsize();
+  const size_t ysize = what.ysize();
+  JXL_ASSERT(xsize == rect.xsize());
+  JXL_ASSERT(ysize == rect.ysize());
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < ysize; ++y) {
+      const Tin* JXL_RESTRICT row_what = what.ConstPlaneRow(c, y);
+      Tout* JXL_RESTRICT row_to = rect.PlaneRow(to, c, y);
+      for (size_t x = 0; x < xsize; ++x) {
+        row_to[x] += row_what[x];
+      }
+    }
+  }
+}
+
+template <typename T>
+Plane<T> Product(const Plane<T>& a, const Plane<T>& b) {
+  Plane<T> c(a.xsize(), a.ysize());
+  for (size_t y = 0; y < a.ysize(); ++y) {
+    const T* const JXL_RESTRICT row_a = a.Row(y);
+    const T* const JXL_RESTRICT row_b = b.Row(y);
+    T* const JXL_RESTRICT row_c = c.Row(y);
+    for (size_t x = 0; x < a.xsize(); ++x) {
+      row_c[x] = row_a[x] * row_b[x];
+    }
+  }
+  return c;
+}
+
+// Expects sRGB input.
+// Will call consumer(x, y, contrast) for each pixel.
+template <typename Consumer>
+void Contrast(const jxl::Image3F& image, const Consumer& consumer,
+              ThreadPool* const pool) {
+  static constexpr WeightsSymmetric3 kLaplacianWeights = {
+      {HWY_REP4(-4)}, {HWY_REP4(1)}, {HWY_REP4(0)}};
+  ImageF grayscale(image.xsize(), image.ysize());
+  static constexpr float kLuminances[3] = {0.2126, 0.7152, 0.0722};
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const float* const JXL_RESTRICT input_rows[3] = {
+        image.PlaneRow(0, y), image.PlaneRow(1, y), image.PlaneRow(2, y)};
+    float* const JXL_RESTRICT row = grayscale.Row(y);
+
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      row[x] = LinearToSrgb8Direct(
+          kLuminances[0] * Srgb8ToLinearDirect(input_rows[0][x]) +
+          kLuminances[1] * Srgb8ToLinearDirect(input_rows[1][x]) +
+          kLuminances[2] * Srgb8ToLinearDirect(input_rows[2][x]));
+    }
+  }
+
+  ImageF laplacian(image.xsize(), image.ysize());
+  Symmetric3(grayscale, Rect(grayscale), kLaplacianWeights, pool, &laplacian);
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const float* const JXL_RESTRICT row = laplacian.ConstRow(y);
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      consumer(x, y, std::abs(row[x]));
+    }
+  }
+}
+
+template <typename Consumer>
+void Saturation(const jxl::Image3F& image, const Consumer& consumer) {
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const float* const JXL_RESTRICT rows[3] = {
+        image.PlaneRow(0, y), image.PlaneRow(1, y), image.PlaneRow(2, y)};
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      // TODO(sboukortt): experiment with other methods of computing the
+      // saturation, e.g. C*/L* in LUV/LCh.
+      const float mean = (1.f / 3) * (rows[0][x] + rows[1][x] + rows[2][x]);
+      const float deviations[3] = {rows[0][x] - mean, rows[1][x] - mean,
+                                   rows[2][x] - mean};
+      consumer(x, y,
+               std::sqrt((1.f / 3) * (deviations[0] * deviations[0] +
+                                      deviations[1] * deviations[1] +
+                                      deviations[2] * deviations[2])));
+    }
+  }
+}
+
+template <typename Consumer>
+void MidToneness(const jxl::Image3F& image, const float sigma,
+                 const Consumer& consumer) {
+  const float inv_sigma_squared = 1.f / (sigma * sigma);
+  const auto Gaussian = [inv_sigma_squared](const float x) {
+    return std::exp(-.5f * (x - .5f) * (x - .5f) * inv_sigma_squared);
+  };
+  for (size_t y = 0; y < image.ysize(); ++y) {
+    const float* const JXL_RESTRICT rows[3] = {
+        image.PlaneRow(0, y), image.PlaneRow(1, y), image.PlaneRow(2, y)};
+    for (size_t x = 0; x < image.xsize(); ++x) {
+      consumer(
+          x, y,
+          Gaussian(rows[0][x]) * Gaussian(rows[1][x]) * Gaussian(rows[2][x]));
+    }
+  }
+}
+
+ImageF ComputeWeights(const jxl::Image3F& image, const float contrast_weight,
+                      const float saturation_weight,
+                      const float midtoneness_weight,
+                      const float midtoneness_sigma, ThreadPool* const pool) {
+  ImageF log_weights(image.xsize(), image.ysize());
+  ZeroFillImage(&log_weights);
+
+  if (contrast_weight > 0) {
+    Contrast(
+        image,
+        [&log_weights, contrast_weight](const size_t x, const size_t y,
+                                        const float weight) {
+          log_weights.Row(y)[x] = contrast_weight * std::log(weight);
+        },
+        pool);
+  }
+
+  if (saturation_weight > 0) {
+    Saturation(image, [&log_weights, saturation_weight](
+                          const size_t x, const size_t y, const float weight) {
+      log_weights.Row(y)[x] += saturation_weight * std::log(weight);
+    });
+  }
+
+  if (midtoneness_weight > 0) {
+    MidToneness(image, midtoneness_sigma,
+                [&log_weights, midtoneness_weight](
+                    const size_t x, const size_t y, const float weight) {
+                  log_weights.Row(y)[x] +=
+                      midtoneness_weight * std::log(weight);
+                });
+  }
+
+  ImageF weights = std::move(log_weights);
+
+  for (size_t y = 0; y < weights.ysize(); ++y) {
+    float* const JXL_RESTRICT row = weights.Row(y);
+    for (size_t x = 0; x < weights.xsize(); ++x) {
+      row[x] = std::exp(row[x]);
+    }
+  }
+
+  return weights;
+}
+
+std::vector<ImageF> ComputeWeights(const std::vector<Image3F>& images,
+                                   const float contrast_weight,
+                                   const float saturation_weight,
+                                   const float midtoneness_weight,
+                                   const float midtoneness_sigma,
+                                   ThreadPool* const pool) {
+  std::vector<ImageF> weights;
+  weights.reserve(images.size());
+  for (const Image3F& image : images) {
+    if (image.xsize() != images.front().xsize() ||
+        image.ysize() != images.front().ysize()) {
+      return {};
+    }
+    weights.push_back(ComputeWeights(image, contrast_weight, saturation_weight,
+                                     midtoneness_weight, midtoneness_sigma,
+                                     pool));
+  }
+
+  std::vector<float*> rows(images.size());
+  for (size_t y = 0; y < images.front().ysize(); ++y) {
+    for (size_t i = 0; i < images.size(); ++i) {
+      rows[i] = weights[i].Row(y);
+    }
+    for (size_t x = 0; x < images.front().xsize(); ++x) {
+      float sum = 1e-9f;
+      for (size_t i = 0; i < images.size(); ++i) {
+        sum += rows[i][x];
+      }
+      const float ratio = 1.f / sum;
+      for (size_t i = 0; i < images.size(); ++i) {
+        rows[i][x] *= ratio;
+      }
+    }
+  }
+
+  return weights;
+}
+
+ImageF Downsample(const ImageF& image, ThreadPool* const pool) {
+  ImageF filtered(image.xsize(), image.ysize());
+  Separable5(image, Rect(image), kPyramidFilter, pool, &filtered);
+  ImageF result(DivCeil(image.xsize(), 2), DivCeil(image.ysize(), 2));
+  for (size_t y = 0; y < result.ysize(); ++y) {
+    const float* const JXL_RESTRICT filtered_row = filtered.ConstRow(2 * y);
+    float* const JXL_RESTRICT row = result.Row(y);
+    for (size_t x = 0; x < result.xsize(); ++x) {
+      row[x] = filtered_row[2 * x];
+    }
+  }
+  return result;
+}
+
+Image3F Downsample(const Image3F& image, ThreadPool* const pool) {
+  return Image3F(Downsample(image.Plane(0), pool),
+                 Downsample(image.Plane(1), pool),
+                 Downsample(image.Plane(2), pool));
+}
+
+Image3F PadImageMirror(const Image3F& in, const size_t xborder,
+                       const size_t yborder) {
+  size_t xsize = in.xsize();
+  size_t ysize = in.ysize();
+  Image3F out(xsize + 2 * xborder, ysize + 2 * yborder);
+  if (xborder > xsize || yborder > ysize) {
+    for (size_t c = 0; c < 3; c++) {
+      for (int32_t y = 0; y < static_cast<int32_t>(out.ysize()); y++) {
+        float* row_out = out.PlaneRow(c, y);
+        const float* row_in = in.PlaneRow(
+            c, Mirror(y - static_cast<int32_t>(yborder), in.ysize()));
+        for (int32_t x = 0; x < static_cast<int32_t>(out.xsize()); x++) {
+          int32_t xin = Mirror(x - static_cast<int32_t>(xborder), in.xsize());
+          row_out[x] = row_in[xin];
+        }
+      }
+    }
+    return out;
+  }
+  CopyImageTo(Rect(in), in, Rect(xborder, yborder, xsize, ysize), &out);
+  for (size_t c = 0; c < 3; c++) {
+    // Horizontal pad.
+    for (size_t y = 0; y < ysize; y++) {
+      for (size_t x = 0; x < xborder; x++) {
+        out.PlaneRow(c, y + yborder)[x] =
+            in.ConstPlaneRow(c, y)[xborder - x - 1];
+        out.PlaneRow(c, y + yborder)[x + xsize + xborder] =
+            in.ConstPlaneRow(c, y)[xsize - 1 - x];
+      }
+    }
+    // Vertical pad.
+    for (size_t y = 0; y < yborder; y++) {
+      memcpy(out.PlaneRow(c, y), out.ConstPlaneRow(c, 2 * yborder - 1 - y),
+             out.xsize() * sizeof(float));
+      memcpy(out.PlaneRow(c, y + ysize + yborder),
+             out.ConstPlaneRow(c, ysize + yborder - 1 - y),
+             out.xsize() * sizeof(float));
+    }
+  }
+  return out;
+}
+
+Image3F Upsample(const Image3F& image, const bool odd_width,
+                 const bool odd_height, ThreadPool* const pool) {
+  const Image3F padded = PadImageMirror(image, 1, 1);
+  Image3F upsampled(2 * padded.xsize(), 2 * padded.ysize());
+  ZeroFillImage(&upsampled);
+  for (int c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < padded.ysize(); ++y) {
+      const float* const JXL_RESTRICT padded_row = padded.ConstPlaneRow(c, y);
+      float* const JXL_RESTRICT row = upsampled.PlaneRow(c, 2 * y);
+      for (size_t x = 0; x < padded.xsize(); ++x) {
+        row[2 * x] = 4 * padded_row[x];
+      }
+    }
+  }
+  Image3F filtered(upsampled.xsize(), upsampled.ysize());
+  for (int c = 0; c < 3; ++c) {
+    Separable5(upsampled.Plane(c), Rect(upsampled), kPyramidFilter, pool,
+               &filtered.Plane(c));
+  }
+  Image3F result(2 * image.xsize() - (odd_width ? 1 : 0),
+                 2 * image.ysize() - (odd_height ? 1 : 0));
+  CopyImageTo(Rect(2, 2, result.xsize(), result.ysize()), filtered,
+              Rect(result), &result);
+  return result;
+}
+
+std::vector<ImageF> GaussianPyramid(ImageF image, int num_levels,
+                                    ThreadPool* pool) {
+  std::vector<ImageF> pyramid(num_levels);
+  for (int i = 0; i < num_levels - 1; ++i) {
+    ImageF downsampled = Downsample(image, pool);
+    pyramid[i] = std::move(image);
+    image = std::move(downsampled);
+  }
+  pyramid[num_levels - 1] = std::move(image);
+  return pyramid;
+}
+
+std::vector<Image3F> LaplacianPyramid(Image3F image, int num_levels,
+                                      ThreadPool* pool) {
+  std::vector<Image3F> pyramid(num_levels);
+  for (int i = 0; i < num_levels - 1; ++i) {
+    Image3F downsampled = Downsample(image, pool);
+    const bool odd_width = image.xsize() % 2 != 0;
+    const bool odd_height = image.ysize() % 2 != 0;
+    Subtract(image, Upsample(downsampled, odd_width, odd_height, pool), &image);
+    pyramid[i] = std::move(image);
+    image = std::move(downsampled);
+  }
+  pyramid[num_levels - 1] = std::move(image);
+  return pyramid;
+}
+
+Image3F ReconstructFromLaplacianPyramid(std::vector<Image3F> pyramid,
+                                        ThreadPool* const pool) {
+  Image3F result = std::move(pyramid.back());
+  pyramid.pop_back();
+  for (auto it = pyramid.rbegin(); it != pyramid.rend(); ++it) {
+    const bool odd_width = it->xsize() % 2 != 0;
+    const bool odd_height = it->ysize() % 2 != 0;
+    result = Upsample(result, odd_width, odd_height, pool);
+    AddTo(Rect(result), *it, &result);
+  }
+  return result;
+}
+
+// Exposure fusion algorithm as described in:
+// https://mericam.github.io/exposure_fusion/
+//
+// That is, given n images of identical size: for each pixel coordinate, one
+// weight per input image is computed, indicating how much each input image will
+// contribute to the result. There are therefore n weight maps, the sum of which
+// is 1 at every pixel.
+//
+// Those weights are then applied at various scales rather than directly at full
+// resolution. To understand how, it helps to familiarize oneself with Laplacian
+// and Gaussian pyramids, as described in "The Laplacian Pyramid as a Compact
+// Image Code" by P. Burt and E. Adelson:
+// http://persci.mit.edu/pub_pdfs/pyramid83.pdf
+//
+// A Gaussian pyramid of k levels is a sequence of k images in which the first
+// image is the original image and each following level is a low-pass-filtered
+// version of the previous one. A Laplacian pyramid is obtained from a Gaussian
+// pyramid by:
+//
+//   laplacian_pyramid[i] = gaussian_pyramid[i] − gaussian_pyramid[i + 1].
+//   (The last item of the Laplacian pyramid is just the last one from the
+//    Gaussian pyramid without subtraction.)
+//
+// From there, the original image can be reconstructed by adding all the images
+// from the Laplacian pyramid together. (If desired, the Gaussian pyramid can be
+// reconstructed as well by storing the cumulative sums starting from the end.)
+//
+// Having established that, the application of the weight images is done by
+// constructing a Laplacian pyramid for each input image, as well as a Gaussian
+// pyramid for each weight image, and then constructing a Laplacian pyramid such
+// that:
+//
+//   pyramid[i] = sum(laplacian_pyramids[j][i] .* weight_gaussian_pyramids[j][i]
+//                      for j in 1..n)
+//
+// And then reconstructing an image from the pyramid thus obtained.
+Image3F ExposureFusion(std::vector<Image3F> images, int num_levels,
+                       const float contrast_weight,
+                       const float saturation_weight,
+                       const float midtoneness_weight,
+                       const float midtoneness_sigma, ThreadPool* const pool) {
+  std::vector<ImageF> weights =
+      ComputeWeights(images, contrast_weight, saturation_weight,
+                     midtoneness_weight, midtoneness_sigma, pool);
+
+  std::vector<Image3F> pyramid(num_levels);
+  for (size_t i = 0; i < images.size(); ++i) {
+    const std::vector<ImageF> weight_pyramid =
+        GaussianPyramid(std::move(weights[i]), num_levels, pool);
+    const std::vector<Image3F> image_pyramid =
+        LaplacianPyramid(std::move(images[i]), num_levels, pool);
+
+    for (int k = 0; k < num_levels; ++k) {
+      Image3F product(Product(weight_pyramid[k], image_pyramid[k].Plane(0)),
+                      Product(weight_pyramid[k], image_pyramid[k].Plane(1)),
+                      Product(weight_pyramid[k], image_pyramid[k].Plane(2)));
+      if (pyramid[k].xsize() == 0) {
+        pyramid[k] = std::move(product);
+      } else {
+        AddTo(Rect(product), product, &pyramid[k]);
+      }
+    }
+  }
+
+  return ReconstructFromLaplacianPyramid(std::move(pyramid), pool);
+}
+
+}  // namespace
+}  // namespace jxl
+
+int main(int argc, const char** argv) {
+  jpegxl::tools::ThreadPoolInternal pool;
+
+  jpegxl::tools::CommandLineParser parser;
+  float max_nits = 0;
+  parser.AddOptionValue('m', "max_nits", "nits",
+                        "maximum luminance in the image", &max_nits,
+                        &jpegxl::tools::ParseFloat, 0);
+  float preserve_saturation = .1f;
+  parser.AddOptionValue(
+      's', "preserve_saturation", "0..1",
+      "to what extent to try and preserve saturation over luminance",
+      &preserve_saturation, &jpegxl::tools::ParseFloat, 0);
+  int64_t num_levels = -1;
+  parser.AddOptionValue('l', "num_levels", "1..",
+                        "number of levels in the pyramid", &num_levels,
+                        &jpegxl::tools::ParseInt64, 0);
+  float contrast_weight = 0.f;
+  parser.AddOptionValue('c', "contrast_weight", "0..",
+                        "importance of contrast when computing weights",
+                        &contrast_weight, &jpegxl::tools::ParseFloat, 0);
+  float saturation_weight = .2f;
+  parser.AddOptionValue('a', "saturation_weight", "0..",
+                        "importance of saturation when computing weights",
+                        &saturation_weight, &jpegxl::tools::ParseFloat, 0);
+  float midtoneness_weight = 1.f;
+  parser.AddOptionValue('t', "midtoneness_weight", "0..",
+                        "importance of \"midtoneness\" when computing weights",
+                        &midtoneness_weight, &jpegxl::tools::ParseFloat, 0);
+  float midtoneness_sigma = .2f;
+  parser.AddOptionValue('g', "midtoneness_sigma", "0..",
+                        "spread of the function that computes midtoneness",
+                        &midtoneness_sigma, &jpegxl::tools::ParseFloat, 0);
+  const char* input_filename = nullptr;
+  auto input_filename_option = parser.AddPositionalOption(
+      "input", true, "input image", &input_filename, 0);
+  const char* output_filename = nullptr;
+  auto output_filename_option = parser.AddPositionalOption(
+      "output", true, "output image", &output_filename, 0);
+
+  if (!parser.Parse(argc, argv)) {
+    fprintf(stderr, "See -h for help.\n");
+    return EXIT_FAILURE;
+  }
+
+  if (parser.HelpFlagPassed()) {
+    parser.PrintHelp();
+    return EXIT_SUCCESS;
+  }
+
+  if (!parser.GetOption(input_filename_option)->matched()) {
+    fprintf(stderr, "Missing input filename.\nSee -h for help.\n");
+    return EXIT_FAILURE;
+  }
+  if (!parser.GetOption(output_filename_option)->matched()) {
+    fprintf(stderr, "Missing output filename.\nSee -h for help.\n");
+    return EXIT_FAILURE;
+  }
+
+  jxl::CodecInOut image;
+  jxl::extras::ColorHints color_hints;
+  color_hints.Add("color_space", "RGB_D65_202_Rel_PeQ");
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jpegxl::tools::ReadFile(input_filename, &encoded));
+  JXL_CHECK(jxl::SetFromBytes(jxl::Bytes(encoded), color_hints, &image, &pool));
+
+  if (max_nits > 0) {
+    image.metadata.m.SetIntensityTarget(max_nits);
+  } else {
+    max_nits = image.metadata.m.IntensityTarget();
+  }
+
+  std::vector<jxl::Image3F> input_images;
+
+  if (max_nits <= 4 * jxl::kDefaultIntensityTarget) {
+    jxl::CodecInOut sRGB_image;
+    jxl::Image3F color(image.xsize(), image.ysize());
+    CopyImageTo(*image.Main().color(), &color);
+    sRGB_image.SetFromImage(std::move(color), image.Main().c_current());
+    JXL_CHECK(sRGB_image.Main().TransformTo(jxl::ColorEncoding::SRGB(),
+                                            *JxlGetDefaultCms(), &pool));
+    input_images.push_back(std::move(*sRGB_image.Main().color()));
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    const float target = std::ldexp(jxl::kDefaultIntensityTarget, 2 - i);
+    if (target >= max_nits) continue;
+    jxl::CodecInOut tone_mapped_image;
+    jxl::Image3F color(image.xsize(), image.ysize());
+    CopyImageTo(*image.Main().color(), &color);
+    tone_mapped_image.SetFromImage(std::move(color), image.Main().c_current());
+    tone_mapped_image.metadata.m.SetIntensityTarget(
+        image.metadata.m.IntensityTarget());
+    JXL_CHECK(jxl::ToneMapTo({0, target}, &tone_mapped_image, &pool));
+    JXL_CHECK(jxl::GamutMap(&tone_mapped_image, preserve_saturation, &pool));
+    JXL_CHECK(tone_mapped_image.Main().TransformTo(jxl::ColorEncoding::SRGB(),
+                                                   *JxlGetDefaultCms(), &pool));
+    input_images.push_back(std::move(*tone_mapped_image.Main().color()));
+  }
+
+  if (num_levels < 1) {
+    num_levels = jxl::FloorLog2Nonzero(std::min(image.xsize(), image.ysize()));
+  }
+
+  jxl::Image3F fused = jxl::ExposureFusion(
+      std::move(input_images), num_levels, contrast_weight, saturation_weight,
+      midtoneness_weight, midtoneness_sigma, &pool);
+
+  jxl::CodecInOut output;
+  output.SetFromImage(std::move(fused), jxl::ColorEncoding::SRGB());
+
+  JXL_CHECK(jxl::Encode(output, output_filename, &encoded, &pool));
+  JXL_CHECK(jpegxl::tools::WriteFile(output_filename, encoded));
+}
index 3b2125b..ea47a6b 100644 (file)
@@ -9,13 +9,13 @@
 #include "lib/extras/codec.h"
 #include "lib/extras/hlg.h"
 #include "lib/extras/tone_mapping.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/enc_color_management.h"
 #include "tools/args.h"
 #include "tools/cmdline.h"
+#include "tools/hdr/image_utils.h"
+#include "tools/thread_pool_internal.h"
 
 int main(int argc, const char** argv) {
-  jxl::ThreadPoolInternal pool;
+  jpegxl::tools::ThreadPoolInternal pool;
 
   jpegxl::tools::CommandLineParser parser;
   float max_nits = 0;
@@ -56,10 +56,14 @@ int main(int argc, const char** argv) {
   jxl::CodecInOut image;
   jxl::extras::ColorHints color_hints;
   color_hints.Add("color_space", "RGB_D65_202_Rel_PeQ");
-  JXL_CHECK(jxl::SetFromFile(input_filename, color_hints, &image, &pool));
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jpegxl::tools::ReadFile(input_filename, &encoded));
+  JXL_CHECK(jxl::SetFromBytes(jxl::Bytes(encoded), color_hints, &image, &pool));
   if (max_nits > 0) {
     image.metadata.m.SetIntensityTarget(max_nits);
   }
+  const jxl::Primaries original_primaries =
+      image.Main().c_current().GetPrimariesType();
   JXL_CHECK(jxl::ToneMapTo({0, 1000}, &image, &pool));
   JXL_CHECK(jxl::HlgInverseOOTF(&image.Main(), 1.2f, &pool));
   JXL_CHECK(jxl::GamutMap(&image, preserve_saturation, &pool));
@@ -70,11 +74,12 @@ int main(int argc, const char** argv) {
 
   jxl::ColorEncoding hlg;
   hlg.SetColorSpace(jxl::ColorSpace::kRGB);
-  hlg.primaries = jxl::Primaries::k2100;
-  hlg.white_point = jxl::WhitePoint::kD65;
-  hlg.tf.SetTransferFunction(jxl::TransferFunction::kHLG);
+  JXL_CHECK(hlg.SetPrimariesType(original_primaries));
+  JXL_CHECK(hlg.SetWhitePointType(jxl::WhitePoint::kD65));
+  hlg.Tf().SetTransferFunction(jxl::TransferFunction::kHLG);
   JXL_CHECK(hlg.CreateICC());
-  JXL_CHECK(image.TransformTo(hlg, jxl::GetJxlCms(), &pool));
+  JXL_CHECK(jpegxl::tools::TransformCodecInOutTo(image, hlg, &pool));
   image.metadata.m.color_encoding = hlg;
-  JXL_CHECK(jxl::EncodeToFile(image, output_filename, &pool));
+  JXL_CHECK(jxl::Encode(image, output_filename, &encoded, &pool));
+  JXL_CHECK(jpegxl::tools::WriteFile(output_filename, encoded));
 }
index c8a2395..cca43b1 100644 (file)
@@ -9,13 +9,13 @@
 #include "lib/extras/codec.h"
 #include "lib/extras/hlg.h"
 #include "lib/extras/tone_mapping.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/enc_color_management.h"
 #include "tools/args.h"
 #include "tools/cmdline.h"
+#include "tools/hdr/image_utils.h"
+#include "tools/thread_pool_internal.h"
 
 int main(int argc, const char** argv) {
-  jxl::ThreadPoolInternal pool;
+  jpegxl::tools::ThreadPoolInternal pool;
 
   jpegxl::tools::CommandLineParser parser;
   float target_nits = 0;
@@ -71,7 +71,9 @@ int main(int argc, const char** argv) {
   jxl::CodecInOut image;
   jxl::extras::ColorHints color_hints;
   color_hints.Add("color_space", "RGB_D65_202_Rel_HLG");
-  JXL_CHECK(jxl::SetFromFile(input_filename, color_hints, &image, &pool));
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jpegxl::tools::ReadFile(input_filename, &encoded));
+  JXL_CHECK(jxl::SetFromBytes(jxl::Bytes(encoded), color_hints, &image, &pool));
   // Ensures that conversions to linear by JxlCms will not apply the OOTF as we
   // apply it ourselves to control the subsequent gamut mapping.
   image.metadata.m.SetIntensityTarget(301);
@@ -82,13 +84,12 @@ int main(int argc, const char** argv) {
   image.metadata.m.SetIntensityTarget(target_nits);
 
   jxl::ColorEncoding c_out = image.metadata.m.color_encoding;
-  if (pq) {
-    c_out.tf.SetTransferFunction(jxl::TransferFunction::kPQ);
-  } else {
-    c_out.tf.SetTransferFunction(jxl::TransferFunction::k709);
-  }
+  jxl::cms::TransferFunction tf =
+      pq ? jxl::TransferFunction::kPQ : jxl::TransferFunction::kSRGB;
+  c_out.Tf().SetTransferFunction(tf);
   JXL_CHECK(c_out.CreateICC());
-  JXL_CHECK(image.TransformTo(c_out, jxl::GetJxlCms(), &pool));
+  JXL_CHECK(jpegxl::tools::TransformCodecInOutTo(image, c_out, &pool));
   image.metadata.m.color_encoding = c_out;
-  JXL_CHECK(jxl::EncodeToFile(image, output_filename, &pool));
+  JXL_CHECK(jxl::Encode(image, output_filename, &encoded, &pool));
+  JXL_CHECK(jpegxl::tools::WriteFile(output_filename, encoded));
 }
index a5e5af7..0d9f731 100644 (file)
@@ -7,12 +7,13 @@
 #include <stdlib.h>
 
 #include "lib/extras/codec.h"
-#include "lib/jxl/base/thread_pool_internal.h"
+#include "lib/jxl/image_bundle.h"
 #include "tools/args.h"
 #include "tools/cmdline.h"
+#include "tools/thread_pool_internal.h"
 
 int main(int argc, const char** argv) {
-  jxl::ThreadPoolInternal pool;
+  jpegxl::tools::ThreadPoolInternal pool;
 
   jpegxl::tools::CommandLineParser parser;
   const char* input_filename = nullptr;
@@ -42,8 +43,10 @@ int main(int argc, const char** argv) {
   }
 
   jxl::CodecInOut image;
-  JXL_CHECK(jxl::SetFromFile(input_filename, jxl::extras::ColorHints(), &image,
-                             &pool));
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jpegxl::tools::ReadFile(input_filename, &encoded));
+  JXL_CHECK(jxl::SetFromBytes(jxl::Bytes(encoded), jxl::extras::ColorHints(),
+                              &image, &pool));
 
   JXL_CHECK(image.xsize() == image.ysize() * image.ysize());
   const unsigned N = image.ysize();
index 1ef3823..67fea48 100644 (file)
@@ -8,13 +8,14 @@
 
 #include "lib/extras/codec.h"
 #include "lib/extras/tone_mapping.h"
-#include "lib/jxl/base/thread_pool_internal.h"
-#include "lib/jxl/enc_color_management.h"
 #include "tools/args.h"
 #include "tools/cmdline.h"
+#include "tools/file_io.h"
+#include "tools/hdr/image_utils.h"
+#include "tools/thread_pool_internal.h"
 
 int main(int argc, const char** argv) {
-  jxl::ThreadPoolInternal pool;
+  jpegxl::tools::ThreadPoolInternal pool;
 
   jpegxl::tools::CommandLineParser parser;
   float max_nits = 0;
@@ -69,7 +70,9 @@ int main(int argc, const char** argv) {
   jxl::CodecInOut image;
   jxl::extras::ColorHints color_hints;
   color_hints.Add("color_space", "RGB_D65_202_Rel_PeQ");
-  JXL_CHECK(jxl::SetFromFile(input_filename, color_hints, &image, &pool));
+  std::vector<uint8_t> encoded;
+  JXL_CHECK(jpegxl::tools::ReadFile(input_filename, &encoded));
+  JXL_CHECK(jxl::SetFromBytes(jxl::Bytes(encoded), color_hints, &image, &pool));
   if (max_nits > 0) {
     image.metadata.m.SetIntensityTarget(max_nits);
   }
@@ -77,13 +80,18 @@ int main(int argc, const char** argv) {
   JXL_CHECK(jxl::GamutMap(&image, preserve_saturation, &pool));
 
   jxl::ColorEncoding c_out = image.metadata.m.color_encoding;
-  if (pq) {
-    c_out.tf.SetTransferFunction(jxl::TransferFunction::kPQ);
-  } else {
-    c_out.tf.SetTransferFunction(jxl::TransferFunction::k709);
+  jxl::cms::TransferFunction tf =
+      pq ? jxl::TransferFunction::kPQ : jxl::TransferFunction::kSRGB;
+
+  if (jxl::extras::CodecFromPath(output_filename) == jxl::extras::Codec::kEXR) {
+    tf = jxl::TransferFunction::kLinear;
+    image.metadata.m.SetFloat16Samples();
   }
+  c_out.Tf().SetTransferFunction(tf);
+
   JXL_CHECK(c_out.CreateICC());
-  JXL_CHECK(image.TransformTo(c_out, jxl::GetJxlCms(), &pool));
+  JXL_CHECK(jpegxl::tools::TransformCodecInOutTo(image, c_out, &pool));
   image.metadata.m.color_encoding = c_out;
-  JXL_CHECK(jxl::EncodeToFile(image, output_filename, &pool));
+  JXL_CHECK(jxl::Encode(image, output_filename, &encoded, &pool));
+  JXL_CHECK(jpegxl::tools::WriteFile(output_filename, encoded));
 }
index 0af805c..410ece1 100644 (file)
@@ -6,7 +6,15 @@
 #include "lib/jxl/enc_icc_codec.h"
 #include "lib/jxl/icc_codec.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::PaddedBytes;
+
+#ifdef JXL_ICC_FUZZER_SLOW_TEST
+using ::jxl::BitReader;
+using ::jxl::Span;
+#endif
 
 int TestOneInput(const uint8_t* data, size_t size) {
 #if defined(JXL_ICC_FUZZER_ONLY_WRITE)
@@ -27,33 +35,32 @@ int TestOneInput(const uint8_t* data, size_t size) {
   // the ICC parsing.
   if (read) {
     // Reading parses the compressed format.
-    BitReader br(Span<const uint8_t>(data, size));
-    PaddedBytes result;
-    (void)ReadICC(&br, &result);
+    BitReader br(Bytes(data, size));
+    std::vector<uint8_t> result;
+    (void)jxl::test::ReadICC(&br, &result);
     (void)br.Close();
   } else {
     // Writing parses the original ICC profile.
     PaddedBytes icc;
     icc.assign(data, data + size);
     BitWriter writer;
-    AuxOut aux;
     // Writing should support any random bytestream so must succeed, make
     // fuzzer fail if not.
-    JXL_ASSERT(WriteICC(icc, &writer, 0, &aux));
+    JXL_ASSERT(jxl::WriteICC(icc, &writer, 0, nullptr));
   }
 #else  // JXL_ICC_FUZZER_SLOW_TEST
   if (read) {
     // Reading (unpredicting) parses the compressed format.
     PaddedBytes result;
-    (void)UnpredictICC(data, size, &result);
+    (void)jxl::UnpredictICC(data, size, &result);
   } else {
     // Writing (predicting) parses the original ICC profile.
     PaddedBytes result;
     // Writing should support any random bytestream so must succeed, make
     // fuzzer fail if not.
-    JXL_ASSERT(PredictICC(data, size, &result));
+    JXL_ASSERT(jxl::PredictICC(data, size, &result));
     PaddedBytes reconstructed;
-    JXL_ASSERT(UnpredictICC(result.data(), result.size(), &reconstructed));
+    JXL_ASSERT(jxl::UnpredictICC(result.data(), result.size(), &reconstructed));
     JXL_ASSERT(reconstructed.size() == size);
     JXL_ASSERT(memcmp(data, reconstructed.data(), size) == 0);
   }
@@ -61,8 +68,9 @@ int TestOneInput(const uint8_t* data, size_t size) {
   return 0;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  return jxl::TestOneInput(data, size);
+  return jpegxl::tools::TestOneInput(data, size);
 }
index 9335d94..deca6d7 100644 (file)
@@ -9,11 +9,13 @@
 #include <QByteArray>
 #include <QWidget>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 // Should be cached if possible.
 QByteArray GetMonitorIccProfile(const QWidget* widget);
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_ICC_DETECT_ICC_DETECT_H_
index abd4a95..421ac50 100644 (file)
@@ -5,10 +5,12 @@
 
 #include "tools/icc_detect/icc_detect.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 QByteArray GetMonitorIccProfile(const QWidget* const /*widget*/) {
   return QByteArray();
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 39ac5ee..f06e688 100644 (file)
@@ -10,7 +10,8 @@
 #include <memory>
 #include <type_traits>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 namespace {
 
@@ -61,4 +62,5 @@ QByteArray GetMonitorIccProfile(const QWidget* const widget) {
   return profile;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index be1209e..e67b30e 100644 (file)
@@ -3,17 +3,23 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// clang-format off
 #include "tools/icc_detect/icc_detect.h"
+// clang-format on
 
 #include <stdint.h>
 #include <stdlib.h>
 #include <xcb/xcb.h>
 
-#include <QX11Info>
-#include <algorithm>
 #include <memory>
 
-namespace jxl {
+// clang-format off
+#include <QApplication>
+#include <X11/Xlib.h>
+// clang-format on
+
+namespace jpegxl {
+namespace tools {
 
 namespace {
 
@@ -30,11 +36,17 @@ using XcbUniquePtr = std::unique_ptr<T, FreeDeleter>;
 
 QByteArray GetMonitorIccProfile(const QWidget* const widget) {
   Q_UNUSED(widget)
-  xcb_connection_t* const connection = QX11Info::connection();
+  auto* const qX11App =
+      qGuiApp->nativeInterface<QNativeInterface::QX11Application>();
+  if (qX11App == nullptr) {
+    return QByteArray();
+  }
+  xcb_connection_t* const connection = qX11App->connection();
   if (connection == nullptr) {
     return QByteArray();
   }
-  const int screen_number = QX11Info::appScreen();
+
+  const int screenNumber = DefaultScreen(qX11App->display());
 
   const xcb_intern_atom_cookie_t atomRequest =
       xcb_intern_atom(connection, /*only_if_exists=*/1,
@@ -51,7 +63,7 @@ QByteArray GetMonitorIccProfile(const QWidget* const widget) {
   for (xcb_screen_iterator_t it =
            xcb_setup_roots_iterator(xcb_get_setup(connection));
        it.rem; xcb_screen_next(&it)) {
-    if (i == screen_number) {
+    if (i == screenNumber) {
       screen = it.data;
       break;
     }
@@ -74,4 +86,5 @@ QByteArray GetMonitorIccProfile(const QWidget* const widget) {
       xcb_get_property_value_length(profile.get()));
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 440ef6e..7bdd6a7 100644 (file)
@@ -32,7 +32,14 @@ public class Decoder {
     return new ImageData(basicInfo.width, basicInfo.height, pixels, icc, pixelFormat);
   }
 
-  // TODO(eustas): accept byte-array as input.
+  public static StreamInfo decodeInfo(byte[] data) {
+    return decodeInfo(ByteBuffer.wrap(data));
+  }
+
+  public static StreamInfo decodeInfo(byte[] data, int offset, int length) {
+    return decodeInfo(ByteBuffer.wrap(data, offset, length));
+  }
+
   public static StreamInfo decodeInfo(Buffer data) {
     return DecoderJni.getBasicInfo(data, null);
   }
index 1b3847e..d61464e 100644 (file)
@@ -6,13 +6,11 @@
 #include "tools/jni/org/jpeg/jpegxl/wrapper/decoder_jni.h"
 
 #include <jni.h>
+#include <jxl/decode.h>
+#include <jxl/thread_parallel_runner.h>
 
 #include <cstdlib>
 
-#include "jxl/decode.h"
-#include "jxl/thread_parallel_runner.h"
-#include "lib/jxl/base/status.h"
-
 namespace {
 
 template <typename From, typename To>
@@ -34,11 +32,11 @@ bool BufferToSpan(JNIEnv* env, jobject buffer, uint8_t** data, size_t* size) {
   return StaticCast(env->GetDirectBufferCapacity(buffer), size);
 }
 
-int ToStatusCode(const jxl::Status& status) {
-  if (status) return 0;
-  if (status.IsFatalError()) return -1;
-  return 1;  // Non-fatal -> not enough input.
-}
+enum class Status { OK = 0, FATAL_ERROR = -1, NOT_ENOUGH_INPUT = 1 };
+
+bool IsOk(Status status) { return status == Status::OK; }
+
+#define FAILURE(M) Status::FATAL_ERROR
 
 constexpr const size_t kLastPixelFormat = 3;
 constexpr const size_t kNoPixelFormat = static_cast<size_t>(-1);
@@ -64,28 +62,27 @@ JxlPixelFormat ToPixelFormat(size_t pixel_format) {
   }
 }
 
-jxl::Status DoDecode(JNIEnv* env, jobject data_buffer, size_t* info_pixels_size,
-                     size_t* info_icc_size, JxlBasicInfo* info,
-                     size_t pixel_format, jobject pixels_buffer,
-                     jobject icc_buffer) {
-  if (data_buffer == nullptr) return JXL_FAILURE("No data buffer");
+Status DoDecode(JNIEnv* env, jobject data_buffer, size_t* info_pixels_size,
+                size_t* info_icc_size, JxlBasicInfo* info, size_t pixel_format,
+                jobject pixels_buffer, jobject icc_buffer) {
+  if (data_buffer == nullptr) return FAILURE("No data buffer");
 
   uint8_t* data = nullptr;
   size_t data_size = 0;
   if (!BufferToSpan(env, data_buffer, &data, &data_size)) {
-    return JXL_FAILURE("Failed to access data buffer");
+    return FAILURE("Failed to access data buffer");
   }
 
   uint8_t* pixels = nullptr;
   size_t pixels_size = 0;
   if (!BufferToSpan(env, pixels_buffer, &pixels, &pixels_size)) {
-    return JXL_FAILURE("Failed to access pixels buffer");
+    return FAILURE("Failed to access pixels buffer");
   }
 
   uint8_t* icc = nullptr;
   size_t icc_size = 0;
   if (!BufferToSpan(env, icc_buffer, &icc, &icc_size)) {
-    return JXL_FAILURE("Failed to access ICC buffer");
+    return FAILURE("Failed to access ICC buffer");
   }
 
   JxlDecoder* dec = JxlDecoderCreate(NULL);
@@ -105,80 +102,76 @@ jxl::Status DoDecode(JNIEnv* env, jobject data_buffer, size_t* info_pixels_size,
   auto status =
       JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner);
   if (status != JXL_DEC_SUCCESS) {
-    return JXL_FAILURE("Failed to set parallel runner");
+    return FAILURE("Failed to set parallel runner");
   }
   status = JxlDecoderSubscribeEvents(
       dec, JXL_DEC_BASIC_INFO | JXL_DEC_FULL_IMAGE | JXL_DEC_COLOR_ENCODING);
   if (status != JXL_DEC_SUCCESS) {
-    return JXL_FAILURE("Failed to subscribe for events");
+    return FAILURE("Failed to subscribe for events");
   }
   status = JxlDecoderSetInput(dec, data, data_size);
   if (status != JXL_DEC_SUCCESS) {
-    return JXL_FAILURE("Failed to set input");
+    return FAILURE("Failed to set input");
   }
   status = JxlDecoderProcessInput(dec);
   if (status == JXL_DEC_NEED_MORE_INPUT) {
-    return JXL_STATUS(jxl::StatusCode::kNotEnoughBytes, "Not enough input");
+    return Status::NOT_ENOUGH_INPUT;
   }
   if (status != JXL_DEC_BASIC_INFO) {
-    return JXL_FAILURE("Unexpected notification (want: basic info)");
+    return FAILURE("Unexpected notification (want: basic info)");
   }
   if (info_pixels_size) {
     JxlPixelFormat format = ToPixelFormat(pixel_format);
     status = JxlDecoderImageOutBufferSize(dec, &format, info_pixels_size);
     if (status != JXL_DEC_SUCCESS) {
-      return JXL_FAILURE("Failed to get pixels size");
+      return FAILURE("Failed to get pixels size");
     }
   }
   if (info) {
     status = JxlDecoderGetBasicInfo(dec, info);
     if (status != JXL_DEC_SUCCESS) {
-      return JXL_FAILURE("Failed to get basic info");
+      return FAILURE("Failed to get basic info");
     }
   }
   status = JxlDecoderProcessInput(dec);
   if (status != JXL_DEC_COLOR_ENCODING) {
-    return JXL_FAILURE("Unexpected notification (want: color encoding)");
+    return FAILURE("Unexpected notification (want: color encoding)");
   }
   if (info_icc_size) {
-    JxlPixelFormat format = ToPixelFormat(pixel_format);
-    status = JxlDecoderGetICCProfileSize(
-        dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, info_icc_size);
+    status = JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                         info_icc_size);
     if (status != JXL_DEC_SUCCESS) *info_icc_size = 0;
   }
   if (icc && icc_size > 0) {
-    JxlPixelFormat format = ToPixelFormat(pixel_format);
-    status = JxlDecoderGetColorAsICCProfile(
-        dec, &format, JXL_COLOR_PROFILE_TARGET_DATA, icc, icc_size);
+    status = JxlDecoderGetColorAsICCProfile(dec, JXL_COLOR_PROFILE_TARGET_DATA,
+                                            icc, icc_size);
     if (status != JXL_DEC_SUCCESS) {
-      return JXL_FAILURE("Failed to get ICC");
+      return FAILURE("Failed to get ICC");
     }
   }
   if (pixels) {
     JxlPixelFormat format = ToPixelFormat(pixel_format);
     status = JxlDecoderProcessInput(dec);
     if (status != JXL_DEC_NEED_IMAGE_OUT_BUFFER) {
-      return JXL_FAILURE("Unexpected notification (want: need out buffer)");
+      return FAILURE("Unexpected notification (want: need out buffer)");
     }
     status = JxlDecoderSetImageOutBuffer(dec, &format, pixels, pixels_size);
     if (status != JXL_DEC_SUCCESS) {
-      return JXL_FAILURE("Failed to set out buffer");
+      return FAILURE("Failed to set out buffer");
     }
     status = JxlDecoderProcessInput(dec);
     if (status != JXL_DEC_FULL_IMAGE) {
-      return JXL_FAILURE("Unexpected notification (want: full image)");
+      return FAILURE("Unexpected notification (want: full image)");
     }
     status = JxlDecoderProcessInput(dec);
     if (status != JXL_DEC_SUCCESS) {
-      return JXL_FAILURE("Unexpected notification (want: success)");
+      return FAILURE("Unexpected notification (want: success)");
     }
   }
 
-  return true;
+  return Status::OK;
 }
 
-#undef FAILURE
-
 }  // namespace
 
 #ifdef __cplusplus
@@ -196,18 +189,18 @@ Java_org_jpeg_jpegxl_wrapper_DecoderJni_nativeGetBasicInfo(
   size_t icc_size = 0;
   size_t pixel_format = 0;
 
-  jxl::Status status = true;
+  Status status = Status::OK;
 
-  if (status) {
+  if (IsOk(status)) {
     pixel_format = context[0];
     if (pixel_format == kNoPixelFormat) {
       // OK
     } else if (pixel_format > kLastPixelFormat) {
-      status = JXL_FAILURE("Unrecognized pixel format");
+      status = FAILURE("Unrecognized pixel format");
     }
   }
 
-  if (status) {
+  if (IsOk(status)) {
     bool want_output_size = (pixel_format != kNoPixelFormat);
     if (want_output_size) {
       status = DoDecode(
@@ -221,17 +214,17 @@ Java_org_jpeg_jpegxl_wrapper_DecoderJni_nativeGetBasicInfo(
     }
   }
 
-  if (status) {
+  if (IsOk(status)) {
     bool ok = true;
     ok &= StaticCast(info.xsize, context + 1);
     ok &= StaticCast(info.ysize, context + 2);
     ok &= StaticCast(pixels_size, context + 3);
     ok &= StaticCast(icc_size, context + 4);
     ok &= StaticCast(info.alpha_bits, context + 5);
-    if (!ok) status = JXL_FAILURE("Invalid value");
+    if (!ok) status = FAILURE("Invalid value");
   }
 
-  context[0] = ToStatusCode(status);
+  context[0] = static_cast<int>(status);
 
   env->SetIntArrayRegion(ctx, 0, 6, context);
 }
@@ -251,26 +244,28 @@ JNIEXPORT void JNICALL Java_org_jpeg_jpegxl_wrapper_DecoderJni_nativeGetPixels(
 
   size_t pixel_format = 0;
 
-  jxl::Status status = true;
+  Status status = Status::OK;
 
-  if (status) {
+  if (IsOk(status)) {
     // Unlike getBasicInfo, "no-pixel-format" is not supported.
     pixel_format = context[0];
     if (pixel_format > kLastPixelFormat) {
-      status = JXL_FAILURE("Unrecognized pixel format");
+      status = FAILURE("Unrecognized pixel format");
     }
   }
 
-  if (status) {
+  if (IsOk(status)) {
     status = DoDecode(env, data_buffer, /* info_pixels_size= */ nullptr,
                       /* info_icc_size= */ nullptr, /* info= */ nullptr,
                       pixel_format, pixels_buffer, icc_buffer);
   }
 
-  context[0] = ToStatusCode(status);
+  context[0] = static_cast<int>(status);
   env->SetIntArrayRegion(ctx, 0, 1, context);
 }
 
+#undef FAILURE
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/tools/jpegli_dec_fuzzer.cc b/tools/jpegli_dec_fuzzer.cc
new file mode 100644 (file)
index 0000000..12464c6
--- /dev/null
@@ -0,0 +1,212 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <setjmp.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <hwy/targets.h>
+#include <vector>
+
+#include "lib/jpegli/decode.h"
+
+namespace {
+
+// Externally visible value to ensure pixels are used in the fuzzer.
+int external_code = 0;
+
+template <typename It>
+void Consume(const It& begin, const It& end) {
+  for (auto it = begin; it < end; ++it) {
+    if (*it == 0) {
+      external_code ^= ~0;
+    } else {
+      external_code ^= *it;
+    }
+  }
+}
+
+// Options for the fuzzing
+struct FuzzSpec {
+  size_t chunk_size;
+  JpegliDataType output_type;
+  JpegliEndianness output_endianness;
+  int crop_output;
+};
+
+static constexpr uint8_t kFakeEoiMarker[2] = {0xff, 0xd9};
+static constexpr size_t kNumSourceBuffers = 4;
+
+class SourceManager {
+ public:
+  SourceManager(const uint8_t* data, size_t len, size_t max_chunk_size)
+      : data_(data), len_(len), max_chunk_size_(max_chunk_size) {
+    pub_.skip_input_data = skip_input_data;
+    pub_.resync_to_restart = jpegli_resync_to_restart;
+    pub_.term_source = term_source;
+    pub_.init_source = init_source;
+    pub_.fill_input_buffer = fill_input_buffer;
+    if (max_chunk_size_ == 0) max_chunk_size_ = len;
+    buffers_.resize(kNumSourceBuffers, std::vector<uint8_t>(max_chunk_size_));
+    Reset();
+  }
+
+  void Reset() {
+    pub_.next_input_byte = nullptr;
+    pub_.bytes_in_buffer = 0;
+    pos_ = 0;
+    chunk_idx_ = 0;
+  }
+
+ private:
+  jpeg_source_mgr pub_;
+  const uint8_t* data_;
+  size_t len_;
+  size_t chunk_idx_;
+  size_t pos_;
+  size_t max_chunk_size_;
+  std::vector<std::vector<uint8_t>> buffers_;
+
+  static void init_source(j_decompress_ptr cinfo) {}
+
+  static boolean fill_input_buffer(j_decompress_ptr cinfo) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (src->pos_ < src->len_) {
+      size_t remaining = src->len_ - src->pos_;
+      size_t chunk_size = std::min(remaining, src->max_chunk_size_);
+      size_t next_idx = ++src->chunk_idx_ % kNumSourceBuffers;
+      // Larger number of chunks causes fuzzer timuout.
+      if (src->chunk_idx_ >= (1u << 15)) {
+        chunk_size = remaining;
+        next_idx = src->buffers_.size();
+        src->buffers_.emplace_back(chunk_size);
+      }
+      uint8_t* next_buffer = src->buffers_[next_idx].data();
+      memcpy(next_buffer, src->data_ + src->pos_, chunk_size);
+      src->pub_.next_input_byte = next_buffer;
+      src->pub_.bytes_in_buffer = chunk_size;
+    } else {
+      src->pub_.next_input_byte = kFakeEoiMarker;
+      src->pub_.bytes_in_buffer = 2;
+      src->len_ += 2;
+    }
+    src->pos_ += src->pub_.bytes_in_buffer;
+    return TRUE;
+  }
+
+  static void skip_input_data(j_decompress_ptr cinfo, long num_bytes) {
+    auto src = reinterpret_cast<SourceManager*>(cinfo->src);
+    if (num_bytes <= 0) {
+      return;
+    }
+    if (src->pub_.bytes_in_buffer >= static_cast<size_t>(num_bytes)) {
+      src->pub_.bytes_in_buffer -= num_bytes;
+      src->pub_.next_input_byte += num_bytes;
+    } else {
+      src->pos_ += num_bytes - src->pub_.bytes_in_buffer;
+      src->pub_.bytes_in_buffer = 0;
+    }
+  }
+
+  static void term_source(j_decompress_ptr cinfo) {}
+};
+
+bool DecodeJpeg(const uint8_t* data, size_t size, size_t max_pixels,
+                const FuzzSpec& spec, std::vector<uint8_t>* pixels,
+                size_t* xsize, size_t* ysize) {
+  SourceManager src(data, size, spec.chunk_size);
+  jpeg_decompress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpegli_std_error(&jerr);
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = reinterpret_cast<void*>(&env);
+    cinfo.err->error_exit = [](j_common_ptr cinfo) {
+      jmp_buf* env = reinterpret_cast<jmp_buf*>(cinfo->client_data);
+      jpegli_destroy(cinfo);
+      longjmp(*env, 1);
+    };
+    cinfo.err->emit_message = [](j_common_ptr cinfo, int msg_level) {};
+    jpegli_create_decompress(&cinfo);
+    cinfo.src = reinterpret_cast<jpeg_source_mgr*>(&src);
+    jpegli_read_header(&cinfo, TRUE);
+    *xsize = cinfo.image_width;
+    *ysize = cinfo.image_height;
+    size_t num_pixels = *xsize * *ysize;
+    if (num_pixels > max_pixels) return false;
+    jpegli_set_output_format(&cinfo, spec.output_type, spec.output_endianness);
+    jpegli_start_decompress(&cinfo);
+    if (spec.crop_output) {
+      JDIMENSION xoffset = cinfo.output_width / 3;
+      JDIMENSION xsize_cropped = cinfo.output_width / 3;
+      jpegli_crop_scanline(&cinfo, &xoffset, &xsize_cropped);
+    }
+
+    size_t bytes_per_sample = jpegli_bytes_per_sample(spec.output_type);
+    size_t stride =
+        bytes_per_sample * cinfo.output_components * cinfo.output_width;
+    size_t buffer_size = *ysize * stride;
+    pixels->resize(buffer_size);
+    for (size_t y = 0; y < *ysize; ++y) {
+      JSAMPROW rows[] = {pixels->data() + y * stride};
+      jpegli_read_scanlines(&cinfo, rows, 1);
+    }
+    Consume(pixels->cbegin(), pixels->cend());
+    jpegli_finish_decompress(&cinfo);
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_decompress(&cinfo);
+  return success;
+}
+
+int TestOneInput(const uint8_t* data, size_t size) {
+  if (size < 4) return 0;
+  uint32_t flags = 0;
+  size_t used_flag_bits = 0;
+  memcpy(&flags, data + size - 4, 4);
+  size -= 4;
+
+  const auto getFlag = [&flags, &used_flag_bits](size_t max_value) {
+    size_t limit = 1;
+    while (limit <= max_value) {
+      limit <<= 1;
+      used_flag_bits++;
+      if (used_flag_bits > 32) abort();
+    }
+    uint32_t result = flags % limit;
+    flags /= limit;
+    return result % (max_value + 1);
+  };
+
+  FuzzSpec spec;
+  spec.output_type = static_cast<JpegliDataType>(getFlag(JPEGLI_TYPE_UINT16));
+  spec.output_endianness =
+      static_cast<JpegliEndianness>(getFlag(JPEGLI_BIG_ENDIAN));
+  uint32_t chunks = getFlag(15);
+  spec.chunk_size = chunks ? 1u << (chunks - 1) : 0;
+  spec.crop_output = getFlag(1);
+
+  std::vector<uint8_t> pixels;
+  size_t xsize, ysize;
+  size_t max_pixels = 1 << 21;
+
+  const auto targets = hwy::SupportedAndGeneratedTargets();
+  hwy::SetSupportedTargetsForTest(targets[getFlag(targets.size() - 1)]);
+  DecodeJpeg(data, size, max_pixels, spec, &pixels, &xsize, &ysize);
+  hwy::SetSupportedTargetsForTest(0);
+
+  return 0;
+}
+
+}  // namespace
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+  return TestOneInput(data, size);
+}
diff --git a/tools/jpegli_dec_fuzzer_corpus.cc b/tools/jpegli_dec_fuzzer_corpus.cc
new file mode 100644 (file)
index 0000000..0963e66
--- /dev/null
@@ -0,0 +1,365 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <setjmp.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#if defined(_WIN32) || defined(_WIN64)
+#include "third_party/dirent.h"
+#else
+#include <dirent.h>
+#include <unistd.h>
+#endif
+
+#include <algorithm>
+#include <iostream>
+#include <mutex>
+#include <random>
+#include <vector>
+
+#include "lib/jpegli/encode.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/random.h"
+#include "tools/file_io.h"
+#include "tools/thread_pool_internal.h"
+
+namespace {
+
+const size_t kMaxWidth = 50000;
+const size_t kMaxHeight = 50000;
+const size_t kMaxPixels = 20 * (1 << 20);  // 20 MP
+
+std::mutex stderr_mutex;
+
+std::vector<uint8_t> GetSomeTestImage(size_t xsize, size_t ysize,
+                                      size_t num_channels, uint16_t seed) {
+  // Cause more significant image difference for successive seeds.
+  jxl::Rng generator(seed);
+
+  // Returns random integer in interval [0, max_value)
+  auto rng = [&generator](size_t max_value) -> size_t {
+    return generator.UniformU(0, max_value);
+  };
+
+  // Dark background gradient color
+  uint16_t r0 = rng(32768);
+  uint16_t g0 = rng(32768);
+  uint16_t b0 = rng(32768);
+  uint16_t r1 = rng(32768);
+  uint16_t g1 = rng(32768);
+  uint16_t b1 = rng(32768);
+
+  // Circle with different color
+  size_t circle_x = rng(xsize);
+  size_t circle_y = rng(ysize);
+  size_t circle_r = rng(std::min(xsize, ysize));
+
+  // Rectangle with random noise
+  size_t rect_x0 = rng(xsize);
+  size_t rect_y0 = rng(ysize);
+  size_t rect_x1 = rng(xsize);
+  size_t rect_y1 = rng(ysize);
+  if (rect_x1 < rect_x0) std::swap(rect_x0, rect_y1);
+  if (rect_y1 < rect_y0) std::swap(rect_y0, rect_y1);
+
+  size_t num_pixels = xsize * ysize;
+  std::vector<uint8_t> pixels(num_pixels * num_channels);
+  // Create pixel content to test.
+  for (size_t y = 0; y < ysize; y++) {
+    for (size_t x = 0; x < xsize; x++) {
+      uint16_t r = r0 * (ysize - y - 1) / ysize + r1 * y / ysize;
+      uint16_t g = g0 * (ysize - y - 1) / ysize + g1 * y / ysize;
+      uint16_t b = b0 * (ysize - y - 1) / ysize + b1 * y / ysize;
+      // put some shape in there for visual debugging
+      if ((x - circle_x) * (x - circle_x) + (y - circle_y) * (y - circle_y) <
+          circle_r * circle_r) {
+        r = (65535 - x * y) ^ seed;
+        g = (x << 8) + y + seed;
+        b = (y << 8) + x * seed;
+      } else if (x > rect_x0 && x < rect_x1 && y > rect_y0 && y < rect_y1) {
+        r = rng(65536);
+        g = rng(65536);
+        b = rng(65536);
+      }
+      size_t i = (y * xsize + x) * num_channels;
+      pixels[i + 0] = (r >> 8);
+      if (num_channels == 3) {
+        pixels[i + 1] = (g >> 8);
+        pixels[i + 2] = (b >> 8);
+      }
+    }
+  }
+  return pixels;
+}
+
+// ImageSpec needs to be a packed struct to allow us to use the raw memory of
+// the struct for hashing to create a consistent id.
+#pragma pack(push, 1)
+struct ImageSpec {
+  bool Validate() const {
+    if (width > kMaxWidth || height > kMaxHeight ||
+        width * height > kMaxPixels) {
+      return false;
+    }
+    return true;
+  }
+
+  friend std::ostream& operator<<(std::ostream& o, const ImageSpec& spec) {
+    o << "ImageSpec<"
+      << "size=" << spec.width << "x" << spec.height
+      << " * chan=" << spec.num_channels << " q=" << spec.quality
+      << " p=" << spec.progressive_level << " r=" << spec.restart_interval
+      << ">";
+    return o;
+  }
+
+  void SpecHash(uint8_t hash[16]) const {
+    const uint8_t* from = reinterpret_cast<const uint8_t*>(this);
+    std::seed_seq hasher(from, from + sizeof(*this));
+    uint32_t* to = reinterpret_cast<uint32_t*>(hash);
+    hasher.generate(to, to + 4);
+  }
+
+  uint32_t width = 256;
+  uint32_t height = 256;
+  uint32_t num_channels = 3;
+  uint32_t quality = 90;
+  uint32_t sampling = 0x11111111;
+  uint32_t progressive_level = 2;
+  uint32_t restart_interval = 0;
+  uint32_t fraction = 100;
+  // The seed for the PRNG.
+  uint32_t seed = 7777;
+};
+#pragma pack(pop)
+static_assert(sizeof(ImageSpec) % 4 == 0, "Add padding to ImageSpec.");
+
+bool EncodeWithJpegli(const ImageSpec& spec, const std::vector<uint8_t>& pixels,
+                      std::vector<uint8_t>* compressed) {
+  uint8_t* buffer = nullptr;
+  unsigned long buffer_size = 0;
+  jpeg_compress_struct cinfo;
+  const auto try_catch_block = [&]() -> bool {
+    jpeg_error_mgr jerr;
+    jmp_buf env;
+    cinfo.err = jpegli_std_error(&jerr);
+    if (setjmp(env)) {
+      return false;
+    }
+    cinfo.client_data = reinterpret_cast<void*>(&env);
+    cinfo.err->error_exit = [](j_common_ptr cinfo) {
+      (*cinfo->err->output_message)(cinfo);
+      jmp_buf* env = reinterpret_cast<jmp_buf*>(cinfo->client_data);
+      jpegli_destroy(cinfo);
+      longjmp(*env, 1);
+    };
+    jpegli_create_compress(&cinfo);
+    jpegli_mem_dest(&cinfo, &buffer, &buffer_size);
+    cinfo.image_width = spec.width;
+    cinfo.image_height = spec.height;
+    cinfo.input_components = spec.num_channels;
+    cinfo.in_color_space = spec.num_channels == 1 ? JCS_GRAYSCALE : JCS_RGB;
+    jpegli_set_defaults(&cinfo);
+    jpegli_set_quality(&cinfo, spec.quality, TRUE);
+    uint32_t sampling = spec.sampling;
+    for (int c = 0; c < cinfo.num_components; ++c) {
+      cinfo.comp_info[c].h_samp_factor = sampling & 0xf;
+      cinfo.comp_info[c].v_samp_factor = (sampling >> 4) & 0xf;
+      sampling >>= 8;
+    }
+    jpegli_set_progressive_level(&cinfo, spec.progressive_level);
+    cinfo.restart_interval = spec.restart_interval;
+    jpegli_start_compress(&cinfo, TRUE);
+    size_t stride = cinfo.image_width * cinfo.input_components;
+    std::vector<uint8_t> row_bytes(stride);
+    for (size_t y = 0; y < cinfo.image_height; ++y) {
+      memcpy(&row_bytes[0], &pixels[y * stride], stride);
+      JSAMPROW row[] = {row_bytes.data()};
+      jpegli_write_scanlines(&cinfo, row, 1);
+    }
+    jpegli_finish_compress(&cinfo);
+    return true;
+  };
+  bool success = try_catch_block();
+  jpegli_destroy_compress(&cinfo);
+  if (success) {
+    buffer_size = buffer_size * spec.fraction / 100;
+    compressed->assign(buffer, buffer + buffer_size);
+  }
+  if (buffer) std::free(buffer);
+  return success;
+}
+
+bool GenerateFile(const char* output_dir, const ImageSpec& spec,
+                  bool regenerate, bool quiet) {
+  // Compute a checksum of the ImageSpec to name the file. This is just to keep
+  // the output of this program repeatable.
+  uint8_t checksum[16];
+  spec.SpecHash(checksum);
+  std::string hash_str(sizeof(checksum) * 2, ' ');
+  static const char* hex_chars = "0123456789abcdef";
+  for (size_t i = 0; i < sizeof(checksum); i++) {
+    hash_str[2 * i] = hex_chars[checksum[i] >> 4];
+    hash_str[2 * i + 1] = hex_chars[checksum[i] % 0x0f];
+  }
+  std::string output_fn = std::string(output_dir) + "/" + hash_str + ".jpg";
+
+  // Don't regenerate files if they already exist on disk to speed-up
+  // consecutive calls when --regenerate is not used.
+  struct stat st;
+  if (!regenerate && stat(output_fn.c_str(), &st) == 0 && S_ISREG(st.st_mode)) {
+    return true;
+  }
+
+  if (!quiet) {
+    std::unique_lock<std::mutex> lock(stderr_mutex);
+    std::cerr << "Generating " << spec << " as " << hash_str << std::endl;
+  }
+
+  uint8_t hash[16];
+  spec.SpecHash(hash);
+  std::mt19937 mt(spec.seed);
+
+  std::vector<uint8_t> pixels =
+      GetSomeTestImage(spec.width, spec.height, spec.num_channels, spec.seed);
+  std::vector<uint8_t> compressed;
+  JXL_CHECK(EncodeWithJpegli(spec, pixels, &compressed));
+
+  // Append 4 bytes with the flags used by jpegli_dec_fuzzer to select the
+  // decoding output.
+  std::uniform_int_distribution<> dis256(0, 255);
+  for (size_t i = 0; i < 4; ++i) {
+    compressed.push_back(dis256(mt));
+  }
+
+  if (!jpegxl::tools::WriteFile(output_fn, compressed)) {
+    return false;
+  }
+  if (!quiet) {
+    std::unique_lock<std::mutex> lock(stderr_mutex);
+    std::cerr << "Stored " << output_fn << " size: " << compressed.size()
+              << std::endl;
+  }
+
+  return true;
+}
+
+void Usage() {
+  fprintf(stderr,
+          "Use: fuzzer_corpus [-r] [-q] [-j THREADS] [output_dir]\n"
+          "\n"
+          "  -r Regenerate files if already exist.\n"
+          "  -q Be quiet.\n"
+          "  -j THREADS Number of parallel jobs to run.\n");
+}
+
+}  // namespace
+
+int main(int argc, const char** argv) {
+  const char* dest_dir = nullptr;
+  bool regenerate = false;
+  bool quiet = false;
+  size_t num_threads = std::thread::hardware_concurrency();
+  for (int optind = 1; optind < argc;) {
+    if (!strcmp(argv[optind], "-r")) {
+      regenerate = true;
+      optind++;
+    } else if (!strcmp(argv[optind], "-q")) {
+      quiet = true;
+      optind++;
+    } else if (!strcmp(argv[optind], "-j")) {
+      optind++;
+      if (optind < argc) {
+        num_threads = atoi(argv[optind++]);
+      } else {
+        fprintf(stderr, "-j needs an argument value.\n");
+        Usage();
+        return 1;
+      }
+    } else if (dest_dir == nullptr) {
+      dest_dir = argv[optind++];
+    } else {
+      fprintf(stderr, "Unknown parameter: \"%s\".\n", argv[optind]);
+      Usage();
+      return 1;
+    }
+  }
+  if (!dest_dir) {
+    dest_dir = "corpus";
+  }
+
+  struct stat st;
+  memset(&st, 0, sizeof(st));
+  if (stat(dest_dir, &st) != 0 || !S_ISDIR(st.st_mode)) {
+    fprintf(stderr, "Output path \"%s\" is not a directory.\n", dest_dir);
+    Usage();
+    return 1;
+  }
+
+  std::mt19937 mt(77777);
+
+  std::vector<std::pair<uint32_t, uint32_t>> image_sizes = {
+      {8, 8},     {32, 32},   {128, 128}, {10000, 1}, {10000, 2}, {1, 10000},
+      {2, 10000}, {555, 256}, {257, 513}, {512, 265}, {264, 520},
+  };
+  std::vector<uint32_t> sampling_ratios = {
+      0x11111111,  // 444
+      0x11111112,  // 422
+      0x11111121,  // 440
+      0x11111122,  // 420
+      0x11222211,  // luma subsampling
+  };
+
+  ImageSpec spec;
+  std::vector<ImageSpec> specs;
+  for (auto img_size : image_sizes) {
+    spec.width = img_size.first;
+    spec.height = img_size.second;
+    for (uint32_t num_channels : {1, 3}) {
+      spec.num_channels = num_channels;
+      for (uint32_t sampling : sampling_ratios) {
+        spec.sampling = sampling;
+        if (num_channels == 1 && sampling != 0x11111111) continue;
+        for (uint32_t restart : {0, 1, 1024}) {
+          spec.restart_interval = restart;
+          for (uint32_t prog_level : {0, 1, 2}) {
+            spec.progressive_level = prog_level;
+            for (uint32_t quality : {10, 90, 100}) {
+              spec.quality = quality;
+              for (uint32_t fraction : {10, 70, 100}) {
+                spec.fraction = fraction;
+                spec.seed = mt() % 777777;
+                if (!spec.Validate()) {
+                  if (!quiet) {
+                    std::cerr << "Skipping " << spec << std::endl;
+                  }
+                } else {
+                  specs.push_back(spec);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  jpegxl::tools::ThreadPoolInternal pool{num_threads};
+  const auto generate = [&specs, dest_dir, regenerate, quiet](
+                            const uint32_t task, size_t /* thread */) {
+    const ImageSpec& spec = specs[task];
+    GenerateFile(dest_dir, spec, regenerate, quiet);
+  };
+  if (!RunOnPool(&pool, 0, specs.size(), jxl::ThreadPool::NoInit, generate,
+                 "FuzzerCorpus")) {
+    std::cerr << "Error generating fuzzer corpus" << std::endl;
+    return 1;
+  }
+  std::cerr << "Finished generating fuzzer corpus" << std::endl;
+  return 0;
+}
index aa85ff8..92a0874 100644 (file)
@@ -3,17 +3,18 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/cms.h>
 #include <stdio.h>
 #include <string.h>
 
 #include <fstream>
 #include <iostream>
+#include <istream>
 #include <unordered_map>
 
-#include "lib/jxl/base/file_io.h"
+#include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_color_management.h"
-#include "lib/jxl/enc_file.h"
+#include "lib/jxl/enc_fields.h"
 #include "lib/jxl/enc_frame.h"
 #include "lib/jxl/enc_heuristics.h"
 #include "lib/jxl/modular/encoding/context_predict.h"
 #include "lib/jxl/modular/encoding/enc_ma.h"
 #include "lib/jxl/modular/encoding/encoding.h"
 #include "lib/jxl/splines.h"
+#include "lib/jxl/test_utils.h"  // TODO(eustas): cut this dependency
+#include "tools/file_io.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::BitWriter;
+using ::jxl::BlendMode;
+using ::jxl::CodecInOut;
+using ::jxl::CodecMetadata;
+using ::jxl::ColorCorrelationMap;
+using ::jxl::ColorEncoding;
+using ::jxl::ColorTransform;
+using ::jxl::CompressParams;
+using ::jxl::DefaultEncoderHeuristics;
+using ::jxl::FrameDimensions;
+using ::jxl::FrameInfo;
+using ::jxl::Image3F;
+using ::jxl::ImageF;
+using ::jxl::PaddedBytes;
+using ::jxl::PassesEncoderState;
+using ::jxl::Predictor;
+using ::jxl::PropertyDecisionNode;
+using ::jxl::QuantizedSpline;
+using ::jxl::Spline;
+using ::jxl::Splines;
+using ::jxl::Tree;
 
 namespace {
 struct SplineData {
@@ -196,7 +222,7 @@ bool ParseNode(F& tok, Tree& tree, SplineData& spline_data,
   } else if (t == "Alpha") {
     io.metadata.m.SetAlphaBits(io.metadata.m.bit_depth.bits_per_sample);
     ImageF alpha(W, H);
-    io.frames[0].SetAlpha(std::move(alpha), false);
+    io.frames[0].SetAlpha(std::move(alpha));
   } else if (t == "Bitdepth") {
     t = tok();
     size_t num = 0;
@@ -392,7 +418,7 @@ bool ParseNode(F& tok, Tree& tree, SplineData& spline_data,
 
 class Heuristics : public DefaultEncoderHeuristics {
  public:
-  bool CustomFixedTreeLossless(const jxl::FrameDimensions& frame_dim,
+  bool CustomFixedTreeLossless(const FrameDimensions& frame_dim,
                                Tree* tree) override {
     *tree = tree_;
     return true;
@@ -412,16 +438,24 @@ int JxlFromTree(const char* in, const char* out, const char* tree_out) {
   size_t width = 1024, height = 1024;
   int x0 = 0, y0 = 0;
   cparams.SetLossless();
+  cparams.responsive = false;
   cparams.resampling = 1;
   cparams.ec_resampling = 1;
   cparams.modular_group_size_shift = 3;
   CodecInOut io;
   int have_next = 0;
 
-  std::ifstream f(in);
+  std::istream* f = &std::cin;
+  std::ifstream file;
+
+  if (strcmp(in, "-")) {
+    file.open(in, std::ifstream::in);
+    f = &file;
+  }
+
   auto tok = [&f]() {
     std::string out;
-    f >> out;
+    *f >> out;
     return out;
   };
   if (!ParseNode(tok, tree, spline_data, cparams, width, height, io, have_next,
@@ -436,7 +470,7 @@ int JxlFromTree(const char* in, const char* out, const char* tree_out) {
   io.SetFromImage(std::move(image), ColorEncoding::SRGB());
   io.SetSize((width + x0) * cparams.resampling,
              (height + y0) * cparams.resampling);
-  io.metadata.m.color_encoding.DecideIfWantICC();
+  io.metadata.m.color_encoding.DecideIfWantICC(*JxlGetDefaultCms());
   cparams.options.zero_tokens = true;
   cparams.palette_colors = 0;
   cparams.channel_colors_pre_transform_percent = 0;
@@ -452,14 +486,14 @@ int JxlFromTree(const char* in, const char* out, const char* tree_out) {
   *metadata = io.metadata;
   JXL_RETURN_IF_ERROR(metadata->size.Set(io.xsize(), io.ysize()));
 
-  metadata->m.xyb_encoded = cparams.color_transform == ColorTransform::kXYB;
+  metadata->m.xyb_encoded = (cparams.color_transform == ColorTransform::kXYB);
 
-  JXL_RETURN_IF_ERROR(WriteHeaders(metadata.get(), &writer, nullptr));
+  JXL_RETURN_IF_ERROR(WriteCodestreamHeaders(metadata.get(), &writer, nullptr));
   writer.ZeroPadToByte();
 
   while (true) {
     PassesEncoderState enc_state;
-    enc_state.heuristics = make_unique<Heuristics>(tree);
+    enc_state.heuristics = jxl::make_unique<Heuristics>(tree);
     enc_state.shared.image_features.splines =
         SplinesFromSplineData(spline_data, enc_state.shared.cmap);
 
@@ -469,14 +503,16 @@ int JxlFromTree(const char* in, const char* out, const char* tree_out) {
 
     io.frames[0].origin.x0 = x0;
     io.frames[0].origin.y0 = y0;
+    info.clamp = false;
 
-    JXL_RETURN_IF_ERROR(EncodeFrame(cparams, info, metadata.get(), io.frames[0],
-                                    &enc_state, GetJxlCms(), nullptr, &writer,
-                                    nullptr));
+    JXL_RETURN_IF_ERROR(jxl::EncodeFrame(
+        cparams, info, metadata.get(), io.frames[0], &enc_state,
+        *JxlGetDefaultCms(), nullptr, &writer, nullptr));
     if (!have_next) break;
     tree.clear();
     spline_data.splines.clear();
     have_next = 0;
+    cparams.manual_noise.clear();
     if (!ParseNode(tok, tree, spline_data, cparams, width, height, io,
                    have_next, x0, y0)) {
       return 1;
@@ -488,19 +524,22 @@ int JxlFromTree(const char* in, const char* out, const char* tree_out) {
 
   compressed = std::move(writer).TakeBytes();
 
-  if (!WriteFile(compressed, out)) {
+  if (!WriteFile(out, compressed)) {
     fprintf(stderr, "Failed to write to \"%s\"\n", out);
     return 1;
   }
 
   return 0;
 }
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 int main(int argc, char** argv) {
-  if ((argc != 3 && argc != 4) || !strcmp(argv[1], argv[2])) {
+  if ((argc != 3 && argc != 4) ||
+      (strcmp(argv[1], "-") && !strcmp(argv[1], argv[2]))) {
     fprintf(stderr, "Usage: %s tree_in.txt out.jxl [tree_drawing]\n", argv[0]);
     return 1;
   }
-  return jxl::JxlFromTree(argv[1], argv[2], argc < 4 ? nullptr : argv[3]);
+  return jpegxl::tools::JxlFromTree(argv[1], argv[2],
+                                    argc < 4 ? nullptr : argv[3]);
 }
index d8d67e7..e7d23ee 100644 (file)
@@ -6,13 +6,12 @@
 // This example prints information from the main codestream header.
 
 #include <inttypes.h>
+#include <jxl/decode.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include "jxl/decode.h"
-
 int PrintBasicInfo(FILE* file, int verbose) {
   uint8_t* data = NULL;
   size_t data_size = 0;
@@ -90,7 +89,7 @@ int PrintBasicInfo(FILE* file, int verbose) {
       if (info.exponent_bits_per_sample) {
         printf("float (%d exponent bits) ", info.exponent_bits_per_sample);
       }
-      int cmyk = 0, alpha = 0;
+      int cmyk = 0;
       const char* const ec_type_names[] = {
           "Alpha",     "Depth",     "Spotcolor", "Selection", "Black",
           "CFA",       "Thermal",   "Reserved0", "Reserved1", "Reserved2",
@@ -105,17 +104,12 @@ int PrintBasicInfo(FILE* file, int verbose) {
           break;
         }
         if (extra.type == JXL_CHANNEL_BLACK) cmyk = 1;
-        if (extra.type == JXL_CHANNEL_ALPHA) alpha = 1;
       }
       if (info.num_color_channels == 1)
         printf("Grayscale");
       else {
         if (cmyk) {
-          printf("CMYK");
-          cmyk = 0;
-        } else if (alpha) {
-          printf("RGBA");
-          alpha = 0;
+          printf("CMY");
         } else {
           printf("RGB");
         }
@@ -126,15 +120,6 @@ int PrintBasicInfo(FILE* file, int verbose) {
           fprintf(stderr, "JxlDecoderGetExtraChannelInfo failed\n");
           break;
         }
-        if (extra.type == JXL_CHANNEL_BLACK && cmyk == 0) {
-          cmyk = 1;
-          continue;
-        }
-        if (extra.type == JXL_CHANNEL_ALPHA && alpha == 0) {
-          alpha = 1;
-          continue;
-        }
-
         printf("+%s", (extra.type < ec_type_names_size
                            ? ec_type_names[extra.type]
                            : "Unknown, please update your libjxl"));
@@ -229,14 +214,12 @@ int PrintBasicInfo(FILE* file, int verbose) {
         fprintf(stderr, "Invalid orientation\n");
       }
     } else if (status == JXL_DEC_COLOR_ENCODING) {
-      JxlPixelFormat format = {4, JXL_TYPE_FLOAT, JXL_LITTLE_ENDIAN, 0};
       printf("Color space: ");
 
       JxlColorEncoding color_encoding;
       if (JXL_DEC_SUCCESS ==
-          JxlDecoderGetColorAsEncodedProfile(dec, &format,
-                                             JXL_COLOR_PROFILE_TARGET_ORIGINAL,
-                                             &color_encoding)) {
+          JxlDecoderGetColorAsEncodedProfile(
+              dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL, &color_encoding)) {
         const char* const cs_string[4] = {"RGB", "Grayscale", "XYB", "Unknown"};
         const char* const wp_string[12] = {"", "D65", "Custom", "", "",  "",
                                            "", "",    "",       "", "E", "P3"};
@@ -280,8 +263,7 @@ int PrintBasicInfo(FILE* file, int verbose) {
         // instead.
         size_t profile_size;
         if (JXL_DEC_SUCCESS !=
-            JxlDecoderGetICCProfileSize(dec, &format,
-                                        JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+            JxlDecoderGetICCProfileSize(dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
                                         &profile_size)) {
           fprintf(stderr, "JxlDecoderGetICCProfileSize failed\n");
           continue;
@@ -292,10 +274,9 @@ int PrintBasicInfo(FILE* file, int verbose) {
           continue;
         }
         uint8_t* profile = (uint8_t*)malloc(profile_size);
-        if (JXL_DEC_SUCCESS !=
-            JxlDecoderGetColorAsICCProfile(dec, &format,
-                                           JXL_COLOR_PROFILE_TARGET_ORIGINAL,
-                                           profile, profile_size)) {
+        if (JXL_DEC_SUCCESS != JxlDecoderGetColorAsICCProfile(
+                                   dec, JXL_COLOR_PROFILE_TARGET_ORIGINAL,
+                                   profile, profile_size)) {
           fprintf(stderr, "JxlDecoderGetColorAsICCProfile failed\n");
           free(profile);
           continue;
@@ -326,11 +307,11 @@ int PrintBasicInfo(FILE* file, int verbose) {
       } else {
         printf("full image size");
       }
-
-      float ms = frame_header.duration * 1000.f *
-                 info.animation.tps_denominator / info.animation.tps_numerator;
-      total_duration += ms;
       if (info.have_animation) {
+        float ms = frame_header.duration * 1000.f *
+                   info.animation.tps_denominator /
+                   info.animation.tps_numerator;
+        total_duration += ms;
         printf(", duration: %.1f ms", ms);
         if (info.animation.have_timecodes) {
           printf(", time code: %X", frame_header.timecode);
index bb57c2d..f56a1fa 100644 (file)
@@ -7,9 +7,9 @@
 // This links against the shared libjpegxl library which doesn't expose any of
 // the internals of the jxl namespace.
 
-#include "jxl/decode.h"
+#include <jxl/decode.h>
 
-int main() {
+int main(void) {
   if (!JxlDecoderVersion()) return 1;
   JxlDecoder* dec = JxlDecoderCreate(NULL);
   if (!dec) return 1;
diff --git a/tools/optimizer/apply_simplex.py b/tools/optimizer/apply_simplex.py
new file mode 100755 (executable)
index 0000000..273305b
--- /dev/null
@@ -0,0 +1,111 @@
+#!/usr/bin/python
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""apply_simplex.py: Updates constants based on results of simplex search.
+
+To use this tool, the simplex search parameters must we wrapped in a bias(n)
+function call that returns the value of the VARn environment variable. The
+tool reads a text file containing the simplex definition that simplex_fork.py
+has written, and updates the target source files by substituting the bias(n)
+function calls with the (n+1)th coordinate of the simplex vector, and also
+simplifies these expressions by evaluating them to a sinlge floating point
+literal.
+
+The tool recognizes and evaluates the following expressions:
+  <constant> + bias(n),
+  <constant> * bias(n),
+  <constant> + <coeff> * bias(n).
+
+The --keep_bias command-line flag can be used to continue an aborted simplex
+search. This will keep the same bias(n) terms in the code, but would update the
+surronding constants.
+
+The --index_min and --index_max flags can be used to update only a subset of the
+bias(n) parameters.
+"""
+
+import argparse
+import re
+import sys
+
+def ParseSimplex(fn):
+    """Returns the simplex definition written by simplex_fork.py"""
+
+    with open(fn, "r") as f:
+        line = f.readline()
+        vec = eval(line)
+    return vec
+
+
+def PythonExpr(c_expr):
+    """Removes the f at the end of float literals"""
+
+    def repl(m):
+        return m.group(1)
+
+    return re.sub("(\d+)f", repl, c_expr)
+
+
+def UpdateSourceFile(fn, vec, keep_bias, id_min, id_max, minval):
+    """Updates expressions containing a bias(N) term."""
+
+    with open(fn, "r") as f:
+        lines_in = f.readlines()
+        lines_out = []
+        rbias = "(bias\((\d+)\))"
+        r = " -?\d+\.\d+f?( (\+|-|\*) (\d+\.\d+f? \* )?" + rbias + ")"
+        for line in lines_in:
+            line_out = line
+            x = re.search(r, line)
+            if x:
+                id = int(x.group(5))
+                if id >= id_min and id <= id_max:
+                    expr = re.sub(rbias, str(vec[id + 1]), x.group(0))
+                    val = eval(PythonExpr(expr))
+                    if minval and val < minval:
+                        val = minval
+                    expr_out = " " + str(val) + "f"
+                    if keep_bias:
+                        expr_out += x.group(1)
+                    line_out = re.sub(r, expr_out, line)
+            lines_out.append(line_out)
+
+    with open(fn, "w") as f:
+        f.writelines(lines_out)
+        f.close()
+
+
+def ApplySimplex(args):
+  """Main entry point of the program after parsing parameters."""
+
+  vec = ParseSimplex(args.simplex)
+  for fn in args.target:
+      UpdateSourceFile(fn, vec, args.keep_bias, args.index_min, args.index_max,
+                       args.minval)
+  return 0
+
+
+def main():
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument('target', type=str, nargs='+',
+                      help='source file(s) to update')
+  parser.add_argument('--simplex', default='best_simplex.txt',
+                      help='simplex to apply to the code')
+  parser.add_argument('--keep_bias', default=False, action='store_true',
+                      help='keep the bias term in the code, can be used to ' +
+                      'continue simplex search')
+  parser.add_argument('--index_min', type=int, default=0,
+                      help='start index of the simplex to apply')
+  parser.add_argument('--index_max', type=int, default=9999,
+                      help='last index of the simplex to apply')
+  parser.add_argument('--minval', type=float, default=None,
+                      help='apply a minimum to expression results')
+  args = parser.parse_args()
+  sys.exit(ApplySimplex(args))
+
+
+if __name__ == '__main__':
+  main()
index 20de4c9..f29e190 100755 (executable)
@@ -53,6 +53,7 @@ def Average(a, b):
 
 
 eval_hash = {}
+g_best_val = None
 
 def EvalCacheForget():
   global eval_hash
@@ -60,19 +61,18 @@ def EvalCacheForget():
 
 def RandomizedJxlCodecs():
   retval = []
-  minval = 0.5
-  maxval = 3.3
+  minval = 0.2
+  maxval = 9.3
   rangeval = maxval/minval
-  steps = 7
+  steps = 13
   for i in range(steps):
     mul = minval * rangeval**(float(i)/(steps - 1))
     mul *= 0.99 + 0.05 * random.random()
-    retval.append("jxl:epf2:d%.3f" % mul)
-  steps = 7
+    retval.append("jxl:d%.4f" % mul)
   for i in range(steps - 1):
     mul = minval * rangeval**(float(i+0.5)/(steps - 1))
     mul *= 0.99 + 0.05 * random.random()
-    retval.append("jxl:epf0:d%.3f" % mul)
+    retval.append("jxl:d%.4f" % mul)
   return ",".join(retval)
 
 g_codecs = RandomizedJxlCodecs()
@@ -87,6 +87,7 @@ def Eval(vec, binary_name, cached=True):
   """
   global eval_hash
   global g_codecs
+  global g_best_val
   key = ""
   # os.environ["BUTTERAUGLI_OPTIMIZE"] = "1"
   for i in range(300):
@@ -101,8 +102,8 @@ def Eval(vec, binary_name, cached=True):
   process = subprocess.Popen(
       (binary_name,
        '--input',
-       '/usr/local/google/home/jyrki/mix_corpus/*.png',
-       '--error_pnorm=3',
+       '/usr/local/google/home/jyrki/newcorpus/split/*.png',
+       '--error_pnorm=3.0',
        '--more_columns',
        '--codec', g_codecs),
       stdout=subprocess.PIPE,
@@ -122,20 +123,26 @@ def Eval(vec, binary_name, cached=True):
     sys.stdout.flush()
     if line[0:3] == b'jxl':
       bpp = line.split()[3]
-      dist_pnorm = line.split()[7]
+      dist_pnorm = line.split()[9]
+      dist_max = line.split()[6]
       vec[0] *= float(dist_pnorm) * float(bpp) / 16.0
-      #vec[0] *= (float(dist_max) * float(bpp) / 16.0) ** 0.2
+      #vec[0] *= (float(dist_max) * float(bpp) / 16.0) ** 0.01
       n += 1
       found_score = True
       distance = float(line.split()[0].split(b'd')[-1])
-      #faultybpp = 1.0 + 0.43 * ((float(bpp) * distance ** 0.74) - 1.57) ** 2
-      #vec[0] *= faultybpp
+      faultybpp = 1.0 + 0.43 * ((float(bpp) * distance ** 0.69) - 1.64) ** 2
+      vec[0] *= faultybpp
 
   print("eval: ", vec)
   if (vec[0] <= 0.0):
     vec[0] = 1e30
   if found_score:
     eval_hash[key] = vec[0]
+    if not g_best_val or vec[0] < g_best_val:
+      g_best_val = vec[0]
+      print("\nSaving best simplex\n")
+      with open("best_simplex.txt", "w") as f:
+        print(vec, file=f)
     return
   vec[0] = 1e33
   return
@@ -242,7 +249,7 @@ g_simplex = InitialSimplex(best, g_dim, g_amount * 0.33)
 best = g_simplex[0][:]
 
 for restarts in range(99999):
-  for ii in range(g_dim * 2):
+  for ii in range(g_dim * 5):
     g_simplex.sort()
     print("reflect", ii, g_simplex[0])
     Reflect(g_simplex, g_binary)
diff --git a/tools/optimizer/update_jpegli_global_scale.py b/tools/optimizer/update_jpegli_global_scale.py
new file mode 100755 (executable)
index 0000000..1a57c59
--- /dev/null
@@ -0,0 +1,103 @@
+#!/usr/bin/python
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+"""Script to update jpegli global scale after a change affecting quality.
+
+start as ./update_jpegli_global_scale.py build <corpus-dir>
+"""
+
+import os
+import re
+import subprocess
+import sys
+
+def SourceFileName():
+  return "lib/jpegli/quant.cc"
+
+def ScalePattern(scale_type):
+  return "constexpr float kGlobalScale" + scale_type + " = ";
+
+def CodecName(scale_type):
+  if scale_type == "YCbCr":
+    return "jpeg:enc-jpegli:q90"
+  elif scale_type == "XYB":
+    return "jpeg:enc-jpegli:xyb:q90"
+  else:
+    raise Exception("Unknown scale type %s" % scale_type)
+  
+def ReadGlobalScale(scale_type):
+  pattern = ScalePattern(scale_type)
+  with open(SourceFileName()) as f:
+    for line in f.read().splitlines():
+      if line.startswith(pattern):
+        return float(line[len(pattern):-2])
+  raise Exception("Global scale %s not found." % scale_type)
+  
+    
+def UpdateGlobalScale(scale_type, new_val):
+  pattern = ScalePattern(scale_type)
+  found_pattern = False
+  fdata = ""
+  with open(SourceFileName()) as f:
+    for line in f.read().splitlines():
+      if line.startswith(pattern):
+        fdata += pattern + "%.8ff;\n" % new_val
+        found_pattern = True
+      else:
+        fdata += line + "\n"
+  if not found_pattern:
+    raise Exception("Global scale %s not found." % scale_type)
+  with open(SourceFileName(), "w") as f:
+    f.write(fdata)
+    f.close()
+
+def EvalPnorm(build_dir, corpus_dir, codec):
+  compile_args = ["ninja", "-C", build_dir, "tools/benchmark_xl"]
+  try:
+    subprocess.check_output(compile_args)
+  except:
+    subprocess.check_call(compile_args)
+  process = subprocess.Popen(
+    (os.path.join(build_dir, "tools/benchmark_xl"),
+     "--input", os.path.join(corpus_dir, "*.png"),
+     "--codec", codec),
+    stdout=subprocess.PIPE,
+    stderr=subprocess.PIPE)
+  (out, err) = process.communicate(input=None)
+  for line in out.splitlines():
+    if line.startswith(codec):
+      return float(line.split()[8])
+  raise Exception("Unexpected benchmark output:\n%sstderr:\n%s" % (out, err))
+
+
+if len(sys.argv) != 3:
+  print("usage: ", sys.argv[0], "build-dir corpus-dir")
+  exit(1)
+
+build_dir = sys.argv[1]
+corpus_dir = sys.argv[2]
+    
+jpeg_pnorm = EvalPnorm(build_dir, corpus_dir, "jpeg:q90")
+
+print("Libjpeg pnorm: %.8f" % jpeg_pnorm)
+
+for scale_type in ["YCbCr", "XYB"]:
+  scale = ReadGlobalScale(scale_type)
+  best_scale = scale
+  best_rel_error = 100.0
+  for i in range(10):
+    jpegli_pnorm = EvalPnorm(build_dir, corpus_dir, CodecName(scale_type))
+    rel_error = abs(jpegli_pnorm / jpeg_pnorm - 1)
+    print("[%-5s] scale: %.8f  pnorm: %.8f  error: %.8f" %
+          (scale_type, scale, jpegli_pnorm, rel_error))
+    if rel_error < best_rel_error:
+      best_rel_error = rel_error
+      best_scale = scale
+    if rel_error < 0.0001:
+      break
+    scale = scale * jpeg_pnorm / jpegli_pnorm
+    UpdateGlobalScale(scale_type, scale)
+  UpdateGlobalScale(scale_type, best_scale)
index 7c78f0d..16fa99a 100644 (file)
@@ -3,10 +3,21 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/dec_ans.h"
+#include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/entropy_coder.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::ANSCode;
+using ::jxl::ANSSymbolReader;
+using ::jxl::BitReader;
+using ::jxl::BitReaderScopedCloser;
+using ::jxl::Bytes;
+using ::jxl::Status;
 
 int TestOneInput(const uint8_t* data, size_t size) {
   if (size < 2) return 0;
@@ -17,7 +28,7 @@ int TestOneInput(const uint8_t* data, size_t size) {
   std::vector<uint8_t> context_map;
   Status ret = true;
   {
-    BitReader br(Span<const uint8_t>(data, size));
+    BitReader br(Bytes(data, size));
     BitReaderScopedCloser br_closer(&br, &ret);
     ANSCode code;
     JXL_RETURN_IF_ERROR(
@@ -28,7 +39,7 @@ int TestOneInput(const uint8_t* data, size_t size) {
     const size_t maxreads = size * 8;
     size_t numreads = 0;
     int context = 0;
-    while (DivCeil(br.TotalBitsConsumed(), kBitsPerByte) < size &&
+    while (jxl::DivCeil(br.TotalBitsConsumed(), jxl::kBitsPerByte) < size &&
            numreads <= maxreads) {
       int code = ansreader.ReadHybridUint(context, &br, context_map);
       context = code % numContexts;
@@ -39,8 +50,9 @@ int TestOneInput(const uint8_t* data, size_t size) {
   return 0;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  return jxl::TestOneInput(data, size);
+  return jpegxl::tools::TestOneInput(data, size);
 }
similarity index 98%
rename from tools/bisector
rename to tools/scripts/bisector
index 2552045..b6a82d0 100755 (executable)
@@ -1,4 +1,10 @@
 #!/usr/bin/env python
+#
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
 r"""General-purpose bisector
 
 Prints a space-separated list of values to stdout:
diff --git a/tools/scripts/build_cleaner.py b/tools/scripts/build_cleaner.py
new file mode 100755 (executable)
index 0000000..0185fc5
--- /dev/null
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+
+"""build_cleaner.py: Update build files.
+
+This tool keeps certain parts of the build files up to date.
+"""
+
+import argparse
+import locale
+import os
+import re
+import subprocess
+import sys
+import tempfile
+
+
+HEAD = """# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# This file is generated, do not modify by manually.
+# Run `tools/scripts/build_cleaner.py --update` to regenerate it.
+"""
+
+
+def RepoFiles(src_dir):
+  """Return the list of files from the source git repository"""
+  git_bin = os.environ.get('GIT_BIN', 'git')
+  files = subprocess.check_output([git_bin, '-C', src_dir, 'ls-files'])
+  ret = files.decode(locale.getpreferredencoding()).splitlines()
+  ret.sort()
+  return ret
+
+
+def Check(condition, msg):
+  if not condition:
+    print(msg)
+    sys.exit(2)
+
+
+def ContainsFn(*parts):
+  return lambda path: any(part in path for part in parts)
+
+
+def HasPrefixFn(*prefixes):
+  return lambda path: any(path.startswith(prefix) for prefix in prefixes)
+
+
+def HasSuffixFn(*suffixes):
+  return lambda path: any(path.endswith(suffix) for suffix in suffixes)
+
+
+def Filter(src, fn):
+  yes_list = []
+  no_list = []
+  for item in src:
+    (yes_list if fn(item) else no_list).append(item)
+  return yes_list, no_list
+
+
+def SplitLibFiles(repo_files):
+  """Splits the library files into the different groups."""
+
+  srcs_base = 'lib/'
+  srcs, _ = Filter(repo_files, HasPrefixFn(srcs_base))
+  srcs = [path[len(srcs_base):] for path in srcs]
+  srcs, _ = Filter(srcs, HasSuffixFn('.cc', '.h', '.ui'))
+  srcs.sort()
+
+  # Let's keep Jpegli sources a bit separate for a while.
+  jpegli_srcs, srcs = Filter(srcs, HasPrefixFn('jpegli'))
+  # TODO(eustas): move to tools?
+  _, srcs = Filter(srcs, HasSuffixFn('gbench_main.cc'))
+
+  # First pick files scattered across directories.
+  tests, srcs = Filter(srcs, HasSuffixFn('_test.cc'))
+  jpegli_tests, jpegli_srcs = Filter(jpegli_srcs, HasSuffixFn('_test.cc'))
+  # TODO(eustas): move to separate list?
+  _, srcs = Filter(srcs, ContainsFn('testing.h'))
+  _, jpegli_srcs = Filter(jpegli_srcs, ContainsFn('testing.h'))
+  testlib_files, srcs = Filter(srcs, ContainsFn('test'))
+  jpegli_testlib_files, jpegli_srcs = Filter(jpegli_srcs, ContainsFn('test'))
+  jpegli_libjpeg_helper_files, jpegli_testlib_files = Filter(
+    jpegli_testlib_files, ContainsFn('libjpeg_test_util'))
+  gbench_sources, srcs = Filter(srcs, HasSuffixFn('_gbench.cc'))
+
+  extras_sources, srcs = Filter(srcs, HasPrefixFn('extras/'))
+  lib_srcs, srcs = Filter(srcs, HasPrefixFn('jxl/'))
+  public_headers, srcs = Filter(srcs, HasPrefixFn('include/jxl/'))
+  threads_sources, srcs = Filter(srcs, HasPrefixFn('threads/'))
+
+  Check(len(srcs) == 0, 'Orphan source files: ' + str(srcs))
+
+  base_sources, lib_srcs = Filter(lib_srcs, HasPrefixFn('jxl/base/'))
+
+  jpegli_wrapper_sources, jpegli_srcs = Filter(
+      jpegli_srcs, HasSuffixFn('libjpeg_wrapper.cc'))
+  jpegli_sources = jpegli_srcs
+
+  threads_public_headers, public_headers = Filter(
+      public_headers, ContainsFn('_parallel_runner'))
+
+  codec_names = ['apng', 'exr', 'gif', 'jpegli', 'jpg', 'jxl', 'npy', 'pgx',
+    'pnm']
+  codecs = {}
+  for codec in codec_names:
+    codec_sources, extras_sources = Filter(extras_sources, HasPrefixFn(
+      f'extras/dec/{codec}', f'extras/enc/{codec}'))
+    codecs[f'codec_{codec}_sources'] = codec_sources
+
+  # TODO(eustas): move to separate folder?
+  extras_for_tools_sources, extras_sources = Filter(extras_sources, ContainsFn(
+    '/codec', '/hlg', '/metrics', '/packed_image_convert', '/render_hdr',
+    '/tone_mapping'))
+
+  # Source files only needed by the encoder or by tools (including decoding
+  # tools), but not by the decoder library.
+  # TODO(eustas): investigate the status of codec_in_out.h
+  # TODO(eustas): rename butteraugli_wrapper.cc to butteraugli.cc?
+  # TODO(eustas): is it possible to make butteraugli more standalone?
+  enc_sources, lib_srcs = Filter(lib_srcs, ContainsFn('/enc_', '/butteraugli',
+    'jxl/encode.cc', 'jxl/encode_internal.h'
+  ))
+
+  # The remaining of the files are in the dec_library.
+  dec_jpeg_sources, dec_sources = Filter(lib_srcs, HasPrefixFn('jxl/jpeg/',
+    'jxl/decode_to_jpeg.cc', 'jxl/decode_to_jpeg.h'))
+  dec_box_sources, dec_sources = Filter(dec_sources, HasPrefixFn(
+    'jxl/box_content_decoder.cc', 'jxl/box_content_decoder.h'))
+  cms_sources, dec_sources = Filter(dec_sources, HasPrefixFn('jxl/cms/'))
+
+  # TODO(lode): further prune dec_srcs: only those files that the decoder
+  # absolutely needs, and or not only for encoding, should be listed here.
+
+  return codecs | {'base_sources': base_sources,
+    'cms_sources': cms_sources,
+    'dec_box_sources': dec_box_sources,
+    'dec_jpeg_sources': dec_jpeg_sources,
+    'dec_sources': dec_sources,
+    'enc_sources': enc_sources,
+    'extras_for_tools_sources': extras_for_tools_sources,
+    'extras_sources': extras_sources,
+    'gbench_sources': gbench_sources,
+    'jpegli_sources': jpegli_sources,
+    'jpegli_testlib_files': jpegli_testlib_files,
+    'jpegli_libjpeg_helper_files': jpegli_libjpeg_helper_files,
+    'jpegli_tests': jpegli_tests,
+    'jpegli_wrapper_sources' : jpegli_wrapper_sources,
+    'public_headers': public_headers,
+    'testlib_files': testlib_files,
+    'tests': tests,
+    'threads_public_headers': threads_public_headers,
+    'threads_sources': threads_sources,
+  }
+
+
+def MaybeUpdateFile(args, filename, new_text):
+  """Optionally replace file with new contents.
+
+  If args.update is set, it will update the file with the new contents,
+  otherwise it will return True when no changes were needed.
+  """
+  filepath = os.path.join(args.src_dir, filename)
+  with open(filepath, 'r') as f:
+    src_text = f.read()
+
+  if new_text == src_text:
+    return True
+
+  if args.update:
+    print('Updating %s' % filename)
+    with open(filepath, 'w') as f:
+      f.write(new_text)
+    return True
+  else:
+    prefix = os.path.basename(filename)
+    with tempfile.NamedTemporaryFile(mode='w', prefix=prefix) as new_file:
+      new_file.write(new_text)
+      new_file.flush()
+      subprocess.call(['diff', '-u', filepath, '--label', 'a/' + filename,
+        new_file.name, '--label', 'b/' + filename])
+    return False
+
+
+def FormatList(items, prefix, suffix):
+  return ''.join(f'{prefix}{item}{suffix}\n' for item in items)
+
+
+def FormatGniVar(name, var):
+  if type(var) is list:
+    contents = FormatList(var, '    "', '",')
+    return f'{name} = [\n{contents}]\n'
+  else:  # TODO(eustas): do we need scalar strings?
+    return f'{name} = {var}\n'
+
+
+def FormatCMakeVar(name, var):
+  if type(var) is list:
+    contents = FormatList(var, '  ', '')
+    return f'set({name}\n{contents})\n'
+  else:  # TODO(eustas): do we need scalar strings?
+    return f'set({name} {var})\n'
+
+
+def GetJpegLibVersion(src_dir):
+  with open(os.path.join(src_dir, 'CMakeLists.txt'), 'r') as f:
+    cmake_text = f.read()
+    m = re.search(r'set\(JPEGLI_LIBJPEG_LIBRARY_SOVERSION "([0-9]+)"',
+                  cmake_text)
+    version = m.group(1)
+    if len(version) == 1:
+      version += "0"
+    return version
+
+
+def BuildCleaner(args):
+  repo_files = RepoFiles(args.src_dir)
+
+  with open(os.path.join(args.src_dir, 'lib/CMakeLists.txt'), 'r') as f:
+    cmake_text = f.read()
+  version = {'major_version': '', 'minor_version': '', 'patch_version': ''}
+  for var in version.keys():
+    cmake_var = f'JPEGXL_{var.upper()}'
+    # TODO(eustas): use `cmake -L`
+    # Regexp:
+    #   set(_varname_ _capture_decimal_)
+    match = re.search(r'set\(' + cmake_var + r' ([0-9]+)\)', cmake_text)
+    version[var] = match.group(1)
+
+  version['jpegli_lib_version'] = GetJpegLibVersion(args.src_dir)
+
+  lists = SplitLibFiles(repo_files)
+
+  cmake_chunks = [HEAD]
+  cmake_parts = lists
+  for var in sorted(cmake_parts):
+    cmake_chunks.append(FormatCMakeVar(
+        'JPEGXL_INTERNAL_' + var.upper(), cmake_parts[var]))
+
+  gni_chunks = [HEAD]
+  gni_parts = version | lists
+  for var in sorted(gni_parts):
+    gni_chunks.append(FormatGniVar('libjxl_' + var, gni_parts[var]))
+
+  okay = [
+    MaybeUpdateFile(args, 'lib/jxl_lists.cmake', '\n'.join(cmake_chunks)),
+    MaybeUpdateFile(args, 'lib/jxl_lists.bzl', '\n'.join(gni_chunks)),
+  ]
+  return all(okay)
+
+
+def main():
+  parser = argparse.ArgumentParser(description=__doc__)
+  parser.add_argument('--src-dir',
+    default=os.path.realpath(os.path.join( os.path.dirname(__file__), '../..')),
+    help='path to the build directory')
+  parser.add_argument('--update', default=False, action='store_true',
+    help='update the build files instead of only checking')
+  args = parser.parse_args()
+  Check(BuildCleaner(args), 'Build files need update.')
+
+
+if __name__ == '__main__':
+  main()
similarity index 92%
rename from tools/build_stats.py
rename to tools/scripts/build_stats.py
index b1dc1ea..63265e2 100755 (executable)
@@ -20,6 +20,7 @@ import collections
 import itertools
 import json
 import os
+import platform
 import re
 import struct
 import subprocess
@@ -29,6 +30,7 @@ import tempfile
 # Ignore functions with stack size smaller than this value.
 MIN_STACK_SIZE = 32
 
+IS_OSX = (platform.system() == 'Darwin')
 
 Symbol = collections.namedtuple('Symbol', ['address', 'size', 'typ', 'name'])
 
@@ -55,7 +57,10 @@ RAM_SIZE = 'dbs'
 
 # u - symbols imported from some other library
 # a - absolute address symbols
-IGNORE_SYMBOLS = 'ua'
+# c - common symbol
+# i - indirect symbol
+# - - debugger symbol table entries
+IGNORE_SYMBOLS = 'uaci-'
 
 SIMD_NAMESPACES = [
     'N_SCALAR', 'N_WASM', 'N_NEON', 'N_PPC8', 'N_SSE4', 'N_AVX2', 'N_AVX3']
@@ -65,17 +70,30 @@ def LoadSymbols(filename):
   ret = []
   nmout = subprocess.check_output(['nm', '--format=posix', filename])
   for line in nmout.decode('utf-8').splitlines():
-    if line.rstrip().endswith(':'):
+    line = line.rstrip()
+    if len(line) == 0:
+      # OSX nm produces extra crlf at the end
+      continue
+    if line.endswith(':'):
       # Ignore object names.
       continue
+    line = re.sub(' +', ' ', line)
     # symbol_name, symbol_type, (optional) address, (optional) size
     symlist = line.rstrip().split(' ')
-    assert 2 <= len(symlist) <= 4
+    col_count = len(symlist)
+    assert 2 <= col_count <= 4
     ret.append(Symbol(
-        int(symlist[2], 16) if len(symlist) > 2 else None,
-        int(symlist[3], 16) if len(symlist) > 3 else None,
+        int(symlist[2], 16) if col_count > 2 else None,
+        int(symlist[3], 16) if col_count > 3 else None,
         symlist[1],
         symlist[0]))
+  if IS_OSX:
+    ret = sorted(ret, key=lambda sym: sym.address)
+    for i in range(len(ret) - 1):
+      size = ret[i + 1].address - ret[i].address
+      if size > (1 << 30):
+        continue
+      ret[i] = ret[i]._replace(size=size)
   return ret
 
 def LoadTargetCommand(target, build_dir):
@@ -145,8 +163,9 @@ def LoadStackSizes(filename, binutils=''):
   section, which can be done by compiling with -fstack-size-section in clang.
   """
   with tempfile.NamedTemporaryFile() as stack_sizes_sec:
+    objcopy = ['objcopy', 'gobjcopy'][IS_OSX]
     subprocess.check_call(
-        [binutils + 'objcopy', '-O', 'binary', '--only-section=.stack_sizes',
+        [binutils + objcopy, '-O', 'binary', '--only-section=.stack_sizes',
          '--set-section-flags', '.stack_sizes=alloc', filename,
          stack_sizes_sec.name])
     stack_sizes = stack_sizes_sec.read()
@@ -157,10 +176,11 @@ def LoadStackSizes(filename, binutils=''):
   #  dynamic stack allocations are not included.
 
   # Get the pointer format based on the ELF file.
+  objdump = ['objdump', 'gobjdump'][IS_OSX]
   output = subprocess.check_output(
-      [binutils + 'objdump', '-a', filename]).decode('utf-8')
+      [binutils + objdump, '-a', filename]).decode('utf-8')
   elf_format = re.search('file format (.*)$', output, re.MULTILINE).group(1)
-  if elf_format.startswith('elf64-little') or elf_format == 'elf64-x86-64':
+  if elf_format.startswith('elf64-little') or elf_format.endswith('-x86-64') or elf_format.endswith('-arm64'):
     pointer_fmt = '<Q'
   elif elf_format.startswith('elf32-little') or elf_format == 'elf32-i386':
     pointer_fmt = '<I'
@@ -234,7 +254,7 @@ def PrintStats(stats):
   print('%-32s %17s %17s' % ('Object name', 'Binary size', 'Static RAM size'))
   for name, bin_size, ram_size in table:
     print('%-32s %8d (%5.1f%%) %8d (%5.1f%%)' % (
-        name, bin_size, 100. * bin_size / mx_bin_size,
+        name, bin_size, (100. * bin_size / mx_bin_size) if mx_bin_size else 0,
         ram_size, (100. * ram_size / mx_ram_size) if mx_ram_size else 0))
   print()
 
similarity index 93%
rename from tools/check_author.py
rename to tools/scripts/check_author.py
index ae1c279..7e16859 100755 (executable)
@@ -73,6 +73,8 @@ def CheckAuthor(args):
     print("User %s <%s> not found, please add yourself to the AUTHORS file" % (
               args.name, args.email),
           file=sys.stderr)
+    print("Hint: to override author in PR run:\n"
+          "  git commit --amend --author=\"Your Name <ldap@corp.com>\" --no-edit")
 
   sorted_alphabetically = IndividualsInAlphabeticOrder(authors_path)
   if not sorted_alphabetically:
@@ -90,7 +92,7 @@ def main():
                       help='name of the commit author to check')
   parser.add_argument(
       '--source-dir',
-      default=os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
+      default=os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))),
       help='path to the source directory where the AUTHORS file is located')
   parser.add_argument('--dry-run', default=False, action='store_true',
                       help='Don\'t return an exit code in case of failure')
similarity index 89%
rename from tools/cjxl_bisect_bpp
rename to tools/scripts/cjxl_bisect_bpp
index d7a1066..13a908c 100755 (executable)
@@ -1,5 +1,10 @@
 #!/bin/sh
 #
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+#
 # Bisects JPEG XL encoding quality parameter to reach a given
 # target bits-per-pixel value.
 # (To be used directly, or as a template for tailored processing.)
similarity index 86%
rename from tools/cjxl_bisect_size
rename to tools/scripts/cjxl_bisect_size
index 9cd88ea..c0945d9 100755 (executable)
@@ -1,5 +1,10 @@
 #!/bin/sh
 #
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+#
 # Bisects JPEG XL encoding quality parameter to reach a given
 # target byte-size.
 # (To be used directly, or as a template for tailored processing.)
diff --git a/tools/scripts/jpegli_tools_test.sh b/tools/scripts/jpegli_tools_test.sh
new file mode 100644 (file)
index 0000000..96df3b0
--- /dev/null
@@ -0,0 +1,287 @@
+#!/bin/bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# End-to-end roundtrip tests for cjpegli and djpegli tools, and other linux
+# tools linked with the jpegli library.
+
+set -eux
+
+MYDIR=$(dirname $(realpath "$0"))
+JPEGXL_TEST_DATA_PATH="${MYDIR}/../../testdata"
+
+# Temporary files cleanup hooks.
+CLEANUP_FILES=()
+cleanup() {
+  if [[ ${#CLEANUP_FILES[@]} -ne 0 ]]; then
+    rm -rf "${CLEANUP_FILES[@]}"
+  fi
+}
+trap 'retcode=$?; { set +x; } 2>/dev/null; cleanup' INT TERM EXIT
+
+verify_ssimulacra2() {
+  local score="$("${ssimulacra2}" "${1}" "${2}")"
+  python3 -c "import sys; sys.exit(not ${score} >= ${3})"
+}
+
+verify_max_bpp() {
+  local infn="$1"
+  local jpgfn="$2"
+  local maxbpp="$3"
+  local size="$(wc -c "${jpgfn}" | cut -d' ' -f1)"
+  local pixels=$(( "$(identify "${infn}" | cut -d' ' -f3 | tr 'x' '*')" ))
+  python3 -c "import sys; sys.exit(not ${size} * 8 <= ${maxbpp} * ${pixels})"
+}
+
+# Test that jpeg files created with cjpegli can be decoded with normal djpeg.
+cjpegli_test() {
+  local infn="${JPEGXL_TEST_DATA_PATH}/$1"
+  local encargs="$2"
+  local minscore="$3"
+  local maxbpp="$4"
+  local jpgfn="$(mktemp -p "${tmpdir}")"
+  local outfn="$(mktemp -p "${tmpdir}").ppm"
+
+  "${cjpegli}" "${infn}" "${jpgfn}" $encargs
+  djpeg -outfile "${outfn}" "${jpgfn}"
+
+  verify_ssimulacra2 "${infn}" "${outfn}" "${minscore}"
+  verify_max_bpp "${infn}" "${jpgfn}" "${maxbpp}"
+}
+
+# Test full cjpegli/djpegli roundtrip.
+cjpegli_djpegli_test() {
+  local infn="${JPEGXL_TEST_DATA_PATH}/$1"
+  local encargs="$2"
+  local minscore="$3"
+  local maxbpp="$4"
+  local jpgfn="$(mktemp -p "${tmpdir}")"
+  local outfn="$(mktemp -p "${tmpdir}").png"
+
+  "${cjpegli}" "${infn}" "${jpgfn}" $encargs
+  "${djpegli}" "${jpgfn}" "${outfn}"
+
+  verify_ssimulacra2 "${infn}" "${outfn}" "${minscore}"
+  verify_max_bpp "${infn}" "${jpgfn}" "${maxbpp}"
+}
+
+# Test the --target_size command line argument of cjpegli.
+cjpegli_test_target_size() {
+  local infn="${JPEGXL_TEST_DATA_PATH}/$1"
+  local encargs="$2"
+  local target_size="$3"
+  local jpgfn="$(mktemp -p "$tmpdir")"
+
+  "${cjpegli}" "${infn}" "${jpgfn}" $encargs --target_size "${target_size}"
+  local size="$(wc -c "${jpgfn}" | cut -d' ' -f1)"
+  python3 -c "import sys; sys.exit(not ${target_size} * 0.996 <= ${size})"
+  python3 -c "import sys; sys.exit(not ${target_size} * 1.004 >= ${size})"
+}
+
+# Test that jpeg files created with cjpeg binary + jpegli library can be decoded
+# with normal libjpeg.
+cjpeg_test() {
+  local infn="${JPEGXL_TEST_DATA_PATH}/$1"
+  local encargs="$2"
+  local minscore="$3"
+  local maxbpp="$4"
+  local jpgfn="$(mktemp -p "$tmpdir")"
+  local outfn="$(mktemp -p "${tmpdir}").png"
+
+  LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+    cjpeg $encargs -outfile "${jpgfn}" "${infn}"
+  djpeg -outfile "${outfn}" "${jpgfn}"
+
+  verify_ssimulacra2 "${infn}" "${outfn}" "${minscore}"
+  verify_max_bpp "${infn}" "${jpgfn}" "${maxbpp}"
+}
+
+# Test decoding of jpeg files with the djpegli binary.
+djpegli_test() {
+  local infn="${JPEGXL_TEST_DATA_PATH}/$1"
+  local encargs="$2"
+  local minscore="$3"
+  local jpgfn="$(mktemp -p "$tmpdir")"
+
+  cjpeg $encargs -outfile "${jpgfn}" "${infn}"
+
+  # Test that disabling output works.
+  "${djpegli}" "${jpgfn}" --disable_output
+  for ext in png pgm ppm pfm pnm baz; do
+    "${djpegli}" "${jpgfn}" /foo/bar.$ext --disable_output
+  done
+
+  # Test decoding to PNG, PPM, PNM, PFM
+  for ext in png ppm pnm pfm; do
+    local outfn="$(mktemp -p "${tmpdir}").${ext}"
+    "${djpegli}" "${jpgfn}" "${outfn}" --num_reps 2
+    verify_ssimulacra2 "${infn}" "${outfn}" "${minscore}"
+  done
+
+  # Test decoding to PGM (for grayscale input)
+  if [[ "${infn: -6}" == ".g.png" ]]; then
+    local outfn="$(mktemp -p "${tmpdir}").pgm"
+    "${djpegli}" "${jpgfn}" "${outfn}" --quiet
+    verify_ssimulacra2 "${infn}" "${outfn}" "${minscore}"
+  fi
+
+  # Test decoding to 16 bit
+  for ext in png pnm; do
+    local outfn8="$(mktemp -p "${tmpdir}").8.${ext}"
+    local outfn16="$(mktemp -p "${tmpdir}").16.${ext}"
+    "${djpegli}" "${jpgfn}" "${outfn8}"
+    "${djpegli}" "${jpgfn}" "${outfn16}" --bitdepth 16
+    local score8="$("${ssimulacra2}" "${infn}" "${outfn8}")"
+    local score16="$("${ssimulacra2}" "${infn}" "${outfn16}")"
+    python3 -c "import sys; sys.exit(not ${score16} > ${score8})"
+  done
+}
+
+# Test decoding of jpeg files with the djpeg binary + jpegli library.
+djpeg_test() {
+  local infn="${JPEGXL_TEST_DATA_PATH}/$1"
+  local encargs="$2"
+  local minscore="$3"
+  local jpgfn="$(mktemp -p "$tmpdir")"
+
+  cjpeg $encargs -outfile "${jpgfn}" "${infn}"
+
+  # Test default behaviour.
+  local outfn="$(mktemp -p "${tmpdir}").pnm"
+  LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+    djpeg -outfile "${outfn}" "${jpgfn}"
+  verify_ssimulacra2 "${infn}" "${outfn}" "${minscore}"
+
+  # Test color quantization.
+  local outfn="$(mktemp -p "${tmpdir}").pnm"
+  LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+    djpeg -outfile "${outfn}" -colors 128 "${jpgfn}"
+  verify_ssimulacra2 "${infn}" "${outfn}" 48
+
+  local outfn="$(mktemp -p "${tmpdir}").pnm"
+  LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+    djpeg -outfile "${outfn}" -colors 128 -onepass -dither fs "${jpgfn}"
+  verify_ssimulacra2 "${infn}" "${outfn}" 30
+
+  local outfn="$(mktemp -p "${tmpdir}").pnm"
+  LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+    djpeg -outfile "${outfn}" -colors 128 -onepass -dither ordered "${jpgfn}"
+  verify_ssimulacra2 "${infn}" "${outfn}" 30
+
+  # Test -grayscale flag.
+  local outfn="$(mktemp -p "${tmpdir}").pgm"
+  LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+    djpeg -outfile "${outfn}" -grayscale "${jpgfn}"
+  local outfn2="$(mktemp -p "${tmpdir}").pgm"
+  convert "${infn}" -set colorspace Gray "${outfn2}"
+  # JPEG color conversion is in gamma-compressed space, so it will not match
+  # the correct grayscale version very well.
+  verify_ssimulacra2 "${outfn2}" "${outfn}" 60
+
+  # Test -rgb flag.
+  local outfn="$(mktemp -p "${tmpdir}").ppm"
+  LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+    djpeg -outfile "${outfn}" -rgb "${jpgfn}"
+  verify_ssimulacra2 "${infn}" "${outfn}" "${minscore}"
+
+  # Test -crop flag.
+  for geometry in 256x256+128+128 256x127+128+117; do
+    local outfn="$(mktemp -p "${tmpdir}").pnm"
+    LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+      djpeg -outfile "${outfn}" -crop "${geometry}" "${jpgfn}"
+    local outfn2="$(mktemp -p "${tmpdir}").pnm"
+    convert "${infn}" -crop "${geometry}" "${outfn2}"
+    verify_ssimulacra2 "${outfn2}" "${outfn}" "${minscore}"
+  done
+
+  # Test output scaling.
+  for scale in 1/4 3/8 1/2 5/8 9/8; do
+    local scalepct="$(python3 -c "print(100.0*${scale})")%"
+    local geometry=96x128+0+0
+    local outfn="$(mktemp -p "${tmpdir}").pnm"
+    LD_LIBRARY_PATH="${build_dir}/lib/jpegli:${LD_LIBRARY_PATH:-}" \
+      djpeg -outfile "${outfn}" -scale "${scale}" -crop "${geometry}" "${jpgfn}"
+    local outfn2="$(mktemp -p "${tmpdir}").pnm"
+    convert "${infn}" -scale "${scalepct}" -crop "${geometry}" "${outfn2}"
+    verify_ssimulacra2 "${outfn2}" "${outfn}" 80
+  done
+}
+
+main() {
+  local tmpdir=$(mktemp -d)
+  CLEANUP_FILES+=("${tmpdir}")
+
+  local build_dir="${1:-}"
+  if [[ -z "${build_dir}" ]]; then
+    build_dir=$(realpath "${MYDIR}/../../build")
+  fi
+
+  local cjpegli="${build_dir}/tools/cjpegli"
+  local djpegli="${build_dir}/tools/djpegli"
+  local ssimulacra2="${build_dir}/tools/ssimulacra2"
+  local rgb_in="jxl/flower/flower_small.rgb.png"
+  local gray_in="jxl/flower/flower_small.g.png"
+  local ppm_rgb="jxl/flower/flower_small.rgb.depth8.ppm"
+  local ppm_gray="jxl/flower/flower_small.g.depth8.pgm"
+
+  cjpegli_test "${rgb_in}" "" 88.5 1.7
+  cjpegli_test "${rgb_in}" "-q 80" 84 1.2
+  cjpegli_test "${rgb_in}" "-q 95" 91.5 2.4
+  cjpegli_test "${rgb_in}" "-d 0.5" 92 2.6
+  cjpegli_test "${rgb_in}" "--chroma_subsampling 420" 87 1.5
+  cjpegli_test "${rgb_in}" "--chroma_subsampling 440" 87 1.6
+  cjpegli_test "${rgb_in}" "--chroma_subsampling 422" 87 1.6
+  cjpegli_test "${rgb_in}" "--std_quant" 91 2.2
+  cjpegli_test "${rgb_in}" "--noadaptive_quantization" 88.5 1.85
+  cjpegli_test "${rgb_in}" "-p 1" 88.5 1.72
+  cjpegli_test "${rgb_in}" "-p 0" 88.5 1.75
+  cjpegli_test "${rgb_in}" "-p 0 --fixed_code" 88.5 1.8
+  cjpegli_test "${gray_in}" "" 92 1.4
+
+  cjpegli_test_target_size "${rgb_in}" "" 10000
+  cjpegli_test_target_size "${rgb_in}" "" 50000
+  cjpegli_test_target_size "${rgb_in}" "" 100000
+  cjpegli_test_target_size "${rgb_in}" "--chroma_subsampling 420" 20000
+  cjpegli_test_target_size "${rgb_in}" "--xyb" 20000
+  cjpegli_test_target_size "${rgb_in}" "-p 0 --fixed_code" 20000
+
+  cjpegli_test "jxl/flower/flower_small.rgb.depth8.ppm" "" 88.5 1.7
+  cjpegli_test "jxl/flower/flower_small.rgb.depth16.ppm" "" 89 1.7
+  cjpegli_test "jxl/flower/flower_small.g.depth8.pgm" "" 89 1.7
+  cjpegli_test "jxl/flower/flower_small.g.depth16.pgm" "" 89 1.7
+
+  cjpegli_djpegli_test "${rgb_in}" "" 89 1.7
+  cjpegli_djpegli_test "${rgb_in}" "--xyb" 87 1.5
+
+  djpegli_test "${ppm_rgb}" "-q 95" 92
+  djpegli_test "${ppm_rgb}" "-q 95 -sample 1x1" 93
+  djpegli_test "${ppm_gray}" "-q 95 -gray" 94
+
+  cjpeg_test "${ppm_rgb}" "" 89 1.9
+  cjpeg_test "${ppm_rgb}" "-optimize" 89 1.85
+  cjpeg_test "${ppm_rgb}" "-optimize -progressive" 89 1.8
+  cjpeg_test "${ppm_rgb}" "-sample 2x2" 87 1.65
+  cjpeg_test "${ppm_rgb}" "-sample 1x2" 88 1.75
+  cjpeg_test "${ppm_rgb}" "-sample 2x1" 88 1.75
+  cjpeg_test "${ppm_rgb}" "-grayscale" -50 1.45
+  cjpeg_test "${ppm_rgb}" "-rgb" 92 4.5
+  cjpeg_test "${ppm_rgb}" "-restart 1" 89 1.9
+  cjpeg_test "${ppm_rgb}" "-restart 1024B" 89 1.9
+  cjpeg_test "${ppm_rgb}" "-smooth 30" 88 1.75
+  cjpeg_test "${ppm_gray}" "-grayscale" 92 1.45
+  # The -q option works differently on v62 vs. v8 cjpeg binaries, so we have to
+  # have looser bounds than would be necessary if we sticked to a particular
+  # cjpeg version.
+  cjpeg_test "${ppm_rgb}" "-q 50" 76 0.95
+  cjpeg_test "${ppm_rgb}" "-q 80" 84 1.6
+  cjpeg_test "${ppm_rgb}" "-q 90" 89 2.35
+  cjpeg_test "${ppm_rgb}" "-q 100" 95 7.45
+
+  djpeg_test "${ppm_rgb}" "-q 95" 92
+  djpeg_test "${ppm_rgb}" "-q 95 -sample 1x1" 93
+  djpeg_test "${ppm_gray}" "-q 95 -gray" 94
+}
+
+main "$@"
diff --git a/tools/scripts/jxl-eval.sh b/tools/scripts/jxl-eval.sh
new file mode 100755 (executable)
index 0000000..138aac8
--- /dev/null
@@ -0,0 +1,124 @@
+#!/bin/bash
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+set -eu
+
+GSROOT="${GSROOT:-gs://jxl-quality}"
+URLROOT="${URLROOT:-https://storage.googleapis.com/jxl-quality}"
+BUILD_DIR="${BUILD_DIR:-./build}"
+BUILD_MODE="${BUILD_MODE:-opt}"
+DESC="${DESC:-exp}"
+
+build_libjxl() {
+  export BUILD_DIR="${BUILD_DIR}"
+  export SKIP_TEST=1
+  ./ci.sh "${BUILD_MODE}"
+}
+
+build_mozjpeg() {
+  if [[ ! -d "${HOME}/mozjpeg" ]]; then
+    (cd "${HOME}"
+     git clone https://github.com/mozilla/mozjpeg.git
+    )
+  fi
+  (cd "${HOME}/mozjpeg"
+   mkdir -p build
+   cmake -GNinja -B build
+   ninja -C build
+  )
+}
+
+download_corpus() {
+  local corpus="$1"
+  local localdir="${HOME}/corpora/${corpus}"
+  local remotedir="${GSROOT}/corpora/${corpus}"
+  if [[ ! -d "${localdir}" ]]; then
+    mkdir -p "${localdir}"
+  fi
+  gsutil -m rsync "${remotedir}" "${localdir}"
+}
+
+create_report() {
+  local corpus="$1"
+  local codec="$2"
+  shift 2
+  local rev="$(git rev-parse --short HEAD)"
+  local originals="${URLROOT}/corpora/${corpus}"
+  if git diff HEAD --quiet; then
+    local expid="${corpus}/${rev}/base"
+  else
+    local expid="${corpus}/${rev}/${DESC}"
+  fi
+  local output_dir="benchmark_results/${expid}"
+  local bucket="eval/${USER}/${expid}"
+  local indexhtml="index.$(echo ${codec} | tr ':' '_').html"
+  local url="${URLROOT}/${bucket}/${indexhtml}"
+  local use_decompressed="--save_decompressed --html_report_use_decompressed"
+  if [[ "${codec:0:4}" == "jpeg" ]]; then
+    use_decompressed="--nohtml_report_use_decompressed"
+  fi
+  (
+   cd "${BUILD_DIR}"
+   tools/benchmark_xl \
+     --output_dir "${output_dir}" \
+     --input "${HOME}/corpora/${corpus}/*.??g" \
+     --codec="${codec}" \
+     --save_compressed \
+     --write_html_report \
+     "${use_decompressed}" \
+     --originals_url="${originals}" \
+     $@
+   gsutil -m rsync "${output_dir}" "${GSROOT}/${bucket}"
+   echo "You can view evaluation results at:"
+   echo "${url}"
+  )
+}
+
+cmd_upload_corpus() {
+  local corpus="$1"
+  gsutil -m rsync "${HOME}/corpora/${corpus}" "${GSROOT}/corpora/${corpus}"
+}
+
+cmd_report() {
+  local corpus="$1"
+  local codec="$2"
+  if [[ "${codec}" == *","* ]]; then
+    echo "Multiple codecs are not allowed in html report"
+    exit 1
+  fi
+  download_corpus "${corpus}"
+  if [[ "${codec:0:4}" == "jpeg" ]]; then
+    build_mozjpeg
+    export LD_LIBRARY_PATH="${HOME}/mozjpeg/build:${LD_LIBRARY_PATH:-}"
+  fi
+  build_libjxl
+  create_report "$@"
+}
+
+main() {
+  local cmd="${1:-}"
+  if [[ -z "${cmd}" ]]; then
+    cat >&2 <<EOF
+Use: $0 CMD
+
+Where CMD is one of:
+ upload_corpus CORPUS
+   Upload the image corpus in $HOME/corpora/CORPUS to the cloud
+ report CORPUS CODEC
+   Build and run benchmark of codec CODEC on image corpus CORPUS and upload
+   the results to the cloud. If the codec is jpeg, the mozjpeg library will be
+   built and used through LD_LIBRARY_PATH
+EOF
+    echo "Usage $0 CMD"
+    exit 1
+  fi
+  cmd="cmd_${cmd}"
+  shift
+  set -x
+  "${cmd}" "$@"
+}
+
+main "$@"
similarity index 98%
rename from tools/ossfuzz-build.sh
rename to tools/scripts/ossfuzz-build.sh
index b5fbb45..7ab45b6 100755 (executable)
@@ -20,6 +20,7 @@ main() {
   build_args=(
     -G Ninja
     -DBUILD_TESTING=OFF
+    -DBUILD_SHARED_LIBS=OFF
     -DJPEGXL_ENABLE_BENCHMARK=OFF
     -DJPEGXL_ENABLE_DEVTOOLS=ON
     -DJPEGXL_ENABLE_EXAMPLES=OFF
similarity index 84%
rename from tools/progressive_sizes.sh
rename to tools/scripts/progressive_sizes.sh
index a1e808d..08d3079 100755 (executable)
@@ -16,8 +16,8 @@ cleanup() {
 trap cleanup EXIT
 
 
-CJXL=$(realpath $(dirname "$0"))/../build/tools/cjxl
-DJXL=$(realpath $(dirname "$0"))/../build/tools/djxl
+CJXL=$(realpath $(dirname "$0"))/../../build/tools/cjxl
+DJXL=$(realpath $(dirname "$0"))/../../build/tools/djxl
 
 ${CJXL} "$@" ${TMPDIR}/x.jxl &>/dev/null
 S1=$(${DJXL} ${TMPDIR}/x.jxl --print_read_bytes -s 1 2>&1 | grep 'Decoded' | grep -o '[0-9]*')
similarity index 51%
rename from tools/roundtrip_test.sh
rename to tools/scripts/roundtrip_test.sh
index 46b7756..b3bb300 100644 (file)
@@ -7,12 +7,10 @@
 # End-to-end roundtrip tests for cjxl and djxl tools.
 
 MYDIR=$(dirname $(realpath "$0"))
-JPEGXL_TEST_DATA_PATH="${MYDIR}/../testdata"
+JPEGXL_TEST_DATA_PATH="${MYDIR}/../../testdata"
 
 set -eux
 
-EMULATOR=${EMULATOR:-}
-
 # Temporary files cleanup hooks.
 CLEANUP_FILES=()
 cleanup() {
@@ -22,14 +20,20 @@ cleanup() {
 }
 trap 'retcode=$?; { set +x; } 2>/dev/null; cleanup' INT TERM EXIT
 
+roundtrip_lossless_pnm_test() {
+  local infn="${JPEGXL_TEST_DATA_PATH}/$1"
+  local jxlfn="$(mktemp -p "$tmpdir")"
+  local outfn="$(mktemp -p "$tmpdir").${infn: -3}"
+
+  "${encoder}" "${infn}" "${jxlfn}" -d 0 -e 1
+  "${decoder}" "${jxlfn}" "${outfn}"
+  diff "${infn}" "${outfn}"
+}
+
 roundtrip_test() {
   local infn="${JPEGXL_TEST_DATA_PATH}/$1"
   local encargs="$2"
   local maxdist="$3"
-  
-  local encoder="${EMULATOR} ${build_dir}/tools/cjxl"
-  local decoder="${EMULATOR} ${build_dir}/tools/djxl"
-  local comparator="${EMULATOR} ${build_dir}/tools/ssimulacra_main"
   local jxlfn="$(mktemp -p "$tmpdir")"
 
   "${encoder}" "${infn}" "${jxlfn}" $encargs
@@ -66,6 +70,28 @@ roundtrip_test() {
       local dist="$("${comparator}" "${infn}" "${outfn}")"
       python3 -c "import sys; sys.exit(not ${dist} <= ${maxdist})"
 
+      # Test decoding to 16 bit png.
+      "${decoder}" "${jxlfn}" "${outfn}" --bits_per_sample 16
+      local dist="$("${comparator}" "${infn}" "${outfn}")"
+      python3 -c "import sys; sys.exit(not ${dist} <= ${maxdist} + 0.0005)"
+
+      # Test decoding to pfm.
+      local outfn="$(mktemp -p "$tmpdir").pfm"
+      "${decoder}" "${jxlfn}" "${outfn}"
+      local dist="$("${comparator}" "${infn}" "${outfn}")"
+      python3 -c "import sys; sys.exit(not ${dist} <= ${maxdist})"
+
+      # Test decoding to ppm.
+      local outfn="$(mktemp -p "$tmpdir").ppm"
+      "${decoder}" "${jxlfn}" "${outfn}"
+      local dist="$("${comparator}" "${infn}" "${outfn}")"
+      python3 -c "import sys; sys.exit(not ${dist} <= ${maxdist})"
+
+      # Test decoding to 16 bit ppm.
+      "${decoder}" "${jxlfn}" "${outfn}" --bits_per_sample 16
+      local dist="$("${comparator}" "${infn}" "${outfn}")"
+      python3 -c "import sys; sys.exit(not ${dist} <= ${maxdist} + 0.0005)"
+
       # Test decoding to jpg.
       outfn="$(mktemp -p "$tmpdir").jpg"
       "${decoder}" "${jxlfn}" "${outfn}" --num_reps 2
@@ -83,9 +109,30 @@ main() {
     build_dir=$(realpath "${MYDIR}/../../build")
   fi
 
-  roundtrip_test "jxl/flower/flower.png" "-e 1" 0.02
-  roundtrip_test "jxl/flower/flower.png" "-e 1 -d 0.0" 0.0
+  local encoder="${build_dir}/tools/cjxl"
+  local decoder="${build_dir}/tools/djxl"
+  local comparator="${build_dir}/tools/ssimulacra_main"
+
+  roundtrip_test "jxl/flower/flower_small.rgb.png" "-e 1" 0.02
+  roundtrip_test "jxl/flower/flower_small.rgb.png" "-e 1 -d 0.0" 0.0
+  roundtrip_test "jxl/flower/flower_small.rgb.depth8.ppm" \
+                "-e 1 --streaming_input" 0.02
+  roundtrip_test "jxl/flower/flower_small.rgb.depth8.ppm" \
+                "-e 1 -d 0.0 --streaming_input" 0.0
+  roundtrip_test "jxl/flower/flower_small.rgb.depth8.ppm" \
+                "-e 1 --streaming_output" 0.02
+  roundtrip_test "jxl/flower/flower_small.rgb.depth8.ppm" \
+                "-e 1 -d 0.0 --streaming_input --streaming_output" 0.0
   roundtrip_test "jxl/flower/flower_cropped.jpg" "-e 1" 0.0
+
+  roundtrip_lossless_pnm_test "jxl/flower/flower_small.rgb.depth1.ppm"
+  roundtrip_lossless_pnm_test "jxl/flower/flower_small.g.depth1.pgm"
+  for i in `seq 2 16`; do
+      roundtrip_lossless_pnm_test "jxl/flower/flower_small.rgb.depth$i.ppm"
+      roundtrip_lossless_pnm_test "jxl/flower/flower_small.g.depth$i.pgm"
+      roundtrip_lossless_pnm_test "jxl/flower/flower_small.ga.depth$i.pam"
+      roundtrip_lossless_pnm_test "jxl/flower/flower_small.rgba.depth$i.pam"
+  done
 }
 
 main "$@"
diff --git a/tools/scripts/test_cost-arm64-lowprecision.zip b/tools/scripts/test_cost-arm64-lowprecision.zip
new file mode 100644 (file)
index 0000000..92045e2
Binary files /dev/null and b/tools/scripts/test_cost-arm64-lowprecision.zip differ
diff --git a/tools/scripts/test_cost-arm64.zip b/tools/scripts/test_cost-arm64.zip
new file mode 100644 (file)
index 0000000..0d196ed
Binary files /dev/null and b/tools/scripts/test_cost-arm64.zip differ
diff --git a/tools/scripts/test_cost-armhf.zip b/tools/scripts/test_cost-armhf.zip
new file mode 100644 (file)
index 0000000..988c96d
Binary files /dev/null and b/tools/scripts/test_cost-armhf.zip differ
diff --git a/tools/scripts/test_cost-i386.zip b/tools/scripts/test_cost-i386.zip
new file mode 100644 (file)
index 0000000..718789f
Binary files /dev/null and b/tools/scripts/test_cost-i386.zip differ
diff --git a/tools/scripts/transform_sources_list.py b/tools/scripts/transform_sources_list.py
new file mode 100644 (file)
index 0000000..1194fe0
--- /dev/null
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+import sys
+
+def find_key(entries : list[str], key: str) -> int:
+  prefix = f"{key.lower()}: "
+  for i in range(len(entries)):
+    if entries[i].lower().startswith(prefix):
+      return i
+  return -1
+
+def set_value(entries: list[str], key: str, value: str):
+  new_line = f'{key}: {value}'
+  # TODO(eustas): deal with repeated items
+  idx = find_key(entries, key)
+  if idx < 0:
+    entries.append(new_line)
+  else:
+    entries[idx] = new_line
+
+def transform_deb_822(archs):
+  sources_path = "/etc/apt/sources.list.d/debian.sources"
+  with open(sources_path) as f:
+    lines = [line.rstrip() for line in f]
+  lines.append('')
+  entries = []
+  entry = []
+  for line in lines:
+    if len(line) == 0:
+      if len(entry) > 0:
+        entries.append(entry)
+      entry = []
+    else:
+      entry.append(line)
+
+  new_entries = []
+  for entry in entries:
+    types_key = find_key(entry, "Types")
+    if types_key < 0:
+      continue
+    if "types: deb" != entry[types_key].lower():
+      continue
+    deb_entry = entry[:]
+    for arch in archs:
+      deb_entry.append(f"Architectures-Add: {arch}")
+    new_entries.append(deb_entry)
+    deb_src_entry = deb_entry[:]
+    set_value(deb_src_entry, "Types", "deb-src")
+    new_entries.append(deb_src_entry)
+
+  new_lines = []
+  for entry in new_entries:
+    if len(new_lines) > 0:
+      new_lines.append("")
+    new_lines.extend(entry)
+
+  with open(sources_path, "w") as f:
+    f.write('\n'.join(new_lines))
+
+def main():
+  if len(sys.argv) != 2:
+    print(f"Usage: {sys.argv[1]} ARCHS")
+    sys.exit(1)
+  archs_str = sys.argv[1]
+  archs = archs_str.split(',')
+  if True:
+    transform_deb_822(archs)
+  else:
+    sys.exit(1)
+
+if __name__ == '__main__':
+  main()
index 5eb9f75..0b95072 100644 (file)
@@ -7,27 +7,29 @@
 #include <stdint.h>
 
 #include "lib/extras/codec.h"
+#include "lib/extras/size_constraints.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/span.h"
-#include "lib/jxl/base/thread_pool_internal.h"
 #include "lib/jxl/codec_in_out.h"
+#include "tools/thread_pool_internal.h"
 
-namespace jxl {
+namespace {
 
 int TestOneInput(const uint8_t* data, size_t size) {
-  CodecInOut io;
-  io.constraints.dec_max_xsize = 1u << 16;
-  io.constraints.dec_max_ysize = 1u << 16;
-  io.constraints.dec_max_pixels = 1u << 22;
-  ThreadPoolInternal pool(0);
+  jxl::CodecInOut io;
+  jxl::SizeConstraints constraints;
+  constraints.dec_max_xsize = 1u << 16;
+  constraints.dec_max_ysize = 1u << 16;
+  constraints.dec_max_pixels = 1u << 22;
+  jpegxl::tools::ThreadPoolInternal pool(0);
 
-  (void)SetFromBytes(Span<const uint8_t>(data, size), &io, &pool);
+  (void)jxl::SetFromBytes(jxl::Bytes(data, size), &io, &pool, &constraints);
 
   return 0;
 }
 
-}  // namespace jxl
+}  // namespace
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  return jxl::TestOneInput(data, size);
+  return TestOneInput(data, size);
 }
index cdef814..d378d09 100644 (file)
@@ -5,6 +5,10 @@
 
 #include "tools/speed_stats.h"
 
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
 #include <inttypes.h>
 #include <math.h>
 #include <stddef.h>
@@ -54,21 +58,19 @@ bool SpeedStats::GetSummary(SpeedStats::Summary* s) {
     s->central_tendency = pow(product, 1.0 / (elapsed_.size() - 1));
     s->variability = 0.0;
     s->type = " geomean:";
-    return true;
+    if (isnormal(s->central_tendency)) return true;
   }
 
   // Else: median
   std::sort(elapsed_.begin(), elapsed_.end());
   s->central_tendency = elapsed_.data()[elapsed_.size() / 2];
-  std::vector<double> deviations(elapsed_.size());
+  double stdev = 0;
   for (size_t i = 0; i < elapsed_.size(); i++) {
-    deviations[i] = fabs(elapsed_[i] - s->central_tendency);
+    double diff = elapsed_[i] - s->central_tendency;
+    stdev += diff * diff;
   }
-  std::nth_element(deviations.begin(),
-                   deviations.begin() + deviations.size() / 2,
-                   deviations.end());
-  s->variability = deviations[deviations.size() / 2];
-  s->type = "median: ";
+  s->variability = sqrt(stdev);
+  s->type = " median:";
   return true;
 }
 
@@ -84,8 +86,14 @@ std::string SummaryStat(double value, const char* unit,
   const double value_min = value / s.max;
   const double value_max = value / s.min;
 
-  snprintf(stat_str, sizeof(stat_str), ",%s %.2f %s/s [%.2f, %.2f]", s.type,
-           value_tendency, unit, value_min, value_max);
+  char variability[20] = {'\0'};
+  if (s.variability != 0.0) {
+    const double stdev = value / s.variability;
+    snprintf(variability, sizeof(variability), " (stdev %.3f)", stdev);
+  }
+
+  snprintf(stat_str, sizeof(stat_str), ",%s %.3f %s/s [%.2f, %.2f]%s", s.type,
+           value_tendency, unit, value_min, value_max, variability);
   return stat_str;
 }
 
@@ -99,16 +107,11 @@ bool SpeedStats::Print(size_t worker_threads) {
   std::string mps_stats = SummaryStat(xsize_ * ysize_ * 1e-6, "MP", s);
   std::string mbs_stats = SummaryStat(file_size_ * 1e-6, "MB", s);
 
-  char variability[20] = {'\0'};
-  if (s.variability != 0.0) {
-    snprintf(variability, sizeof(variability), " (var %.2f)", s.variability);
-  }
-
   fprintf(stderr,
-          "%" PRIu64 " x %" PRIu64 "%s%s%s, %" PRIu64 " reps, %" PRIu64
+          "%" PRIu64 " x %" PRIu64 "%s%s, %" PRIu64 " reps, %" PRIu64
           " threads.\n",
           static_cast<uint64_t>(xsize_), static_cast<uint64_t>(ysize_),
-          mps_stats.c_str(), mbs_stats.c_str(), variability,
+          mps_stats.c_str(), mbs_stats.c_str(),
           static_cast<uint64_t>(elapsed_.size()),
           static_cast<uint64_t>(worker_threads));
   return true;
diff --git a/tools/ssimulacra2.cc b/tools/ssimulacra2.cc
new file mode 100644 (file)
index 0000000..5ddaab3
--- /dev/null
@@ -0,0 +1,492 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+SSIMULACRA 2
+Structural SIMilarity Unveiling Local And Compression Related Artifacts
+
+Perceptual metric developed by Jon Sneyers (Cloudinary) in July 2022,
+updated in April 2023.
+Design:
+- XYB color space (rescaled to a 0..1 range and with B-Y)
+- SSIM map (with correction: no double gamma correction)
+- 'blockiness/ringing' map (distorted has edges where original is smooth)
+- 'smoothing' map (distorted is smooth where original has edges)
+- error maps are computed at 6 scales (1:1 to 1:32) for each component (X,Y,B)
+- downscaling is done in linear RGB
+- for all 6*3*3=54 maps, two norms are computed: 1-norm (mean) and 4-norm
+- a weighted sum of these 54*2=108 norms leads to the final score
+- weights were tuned based on a large set of subjective scores
+  (CID22, TID2013, Kadid10k, KonFiG-IQA).
+*/
+
+#include "tools/ssimulacra2.h"
+
+#include <jxl/cms.h>
+#include <stdio.h>
+
+#include <cmath>
+
+#include "lib/jxl/base/printf_macros.h"
+#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/gauss_blur.h"
+#include "lib/jxl/image_ops.h"
+
+namespace {
+
+using jxl::Image3F;
+using jxl::ImageF;
+
+static const float kC2 = 0.0009f;
+static const int kNumScales = 6;
+
+Image3F Downsample(const Image3F& in, size_t fx, size_t fy) {
+  const size_t out_xsize = (in.xsize() + fx - 1) / fx;
+  const size_t out_ysize = (in.ysize() + fy - 1) / fy;
+  Image3F out(out_xsize, out_ysize);
+  const float normalize = 1.0f / (fx * fy);
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t oy = 0; oy < out_ysize; ++oy) {
+      float* JXL_RESTRICT row_out = out.PlaneRow(c, oy);
+      for (size_t ox = 0; ox < out_xsize; ++ox) {
+        float sum = 0.0f;
+        for (size_t iy = 0; iy < fy; ++iy) {
+          for (size_t ix = 0; ix < fx; ++ix) {
+            const size_t x = std::min(ox * fx + ix, in.xsize() - 1);
+            const size_t y = std::min(oy * fy + iy, in.ysize() - 1);
+            sum += in.PlaneRow(c, y)[x];
+          }
+        }
+        row_out[ox] = sum * normalize;
+      }
+    }
+  }
+  return out;
+}
+
+void Multiply(const Image3F& a, const Image3F& b, Image3F* mul) {
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t y = 0; y < a.ysize(); ++y) {
+      const float* JXL_RESTRICT in1 = a.PlaneRow(c, y);
+      const float* JXL_RESTRICT in2 = b.PlaneRow(c, y);
+      float* JXL_RESTRICT out = mul->PlaneRow(c, y);
+      for (size_t x = 0; x < a.xsize(); ++x) {
+        out[x] = in1[x] * in2[x];
+      }
+    }
+  }
+}
+
+// Temporary storage for Gaussian blur, reused for multiple images.
+class Blur {
+ public:
+  Blur(const size_t xsize, const size_t ysize)
+      : rg_(jxl::CreateRecursiveGaussian(1.5)), temp_(xsize, ysize) {}
+
+  void operator()(const ImageF& in, ImageF* JXL_RESTRICT out) {
+    jxl::ThreadPool* null_pool = nullptr;
+    FastGaussian(rg_, in, null_pool, &temp_, out);
+  }
+
+  Image3F operator()(const Image3F& in) {
+    Image3F out(in.xsize(), in.ysize());
+    operator()(in.Plane(0), &out.Plane(0));
+    operator()(in.Plane(1), &out.Plane(1));
+    operator()(in.Plane(2), &out.Plane(2));
+    return out;
+  }
+
+  // Allows reusing across scales.
+  void ShrinkTo(const size_t xsize, const size_t ysize) {
+    temp_.ShrinkTo(xsize, ysize);
+  }
+
+ private:
+  hwy::AlignedUniquePtr<jxl::RecursiveGaussian> rg_;
+  ImageF temp_;
+};
+
+double tothe4th(double x) {
+  x *= x;
+  x *= x;
+  return x;
+}
+void SSIMMap(const Image3F& m1, const Image3F& m2, const Image3F& s11,
+             const Image3F& s22, const Image3F& s12, double* plane_averages) {
+  const double onePerPixels = 1.0 / (m1.ysize() * m1.xsize());
+  for (size_t c = 0; c < 3; ++c) {
+    double sum1[2] = {0.0};
+    for (size_t y = 0; y < m1.ysize(); ++y) {
+      const float* JXL_RESTRICT row_m1 = m1.PlaneRow(c, y);
+      const float* JXL_RESTRICT row_m2 = m2.PlaneRow(c, y);
+      const float* JXL_RESTRICT row_s11 = s11.PlaneRow(c, y);
+      const float* JXL_RESTRICT row_s22 = s22.PlaneRow(c, y);
+      const float* JXL_RESTRICT row_s12 = s12.PlaneRow(c, y);
+      for (size_t x = 0; x < m1.xsize(); ++x) {
+        float mu1 = row_m1[x];
+        float mu2 = row_m2[x];
+        float mu11 = mu1 * mu1;
+        float mu22 = mu2 * mu2;
+        float mu12 = mu1 * mu2;
+        /* Correction applied compared to the original SSIM formula, which has:
+
+             luma_err = 2 * mu1 * mu2 / (mu1^2 + mu2^2)
+                      = 1 - (mu1 - mu2)^2 / (mu1^2 + mu2^2)
+
+           The denominator causes error in the darks (low mu1 and mu2) to weigh
+           more than error in the brights (high mu1 and mu2). This would make
+           sense if values correspond to linear luma. However, the actual values
+           are either gamma-compressed luma (which supposedly is already
+           perceptually uniform) or chroma (where weighing green more than red
+           or blue more than yellow does not make any sense at all). So it is
+           better to simply drop this denominator.
+        */
+        float num_m = 1.0 - (mu1 - mu2) * (mu1 - mu2);
+        float num_s = 2 * (row_s12[x] - mu12) + kC2;
+        float denom_s = (row_s11[x] - mu11) + (row_s22[x] - mu22) + kC2;
+
+        // Use 1 - SSIM' so it becomes an error score instead of a quality
+        // index. This makes it make sense to compute an L_4 norm.
+        double d = 1.0 - (num_m * num_s / denom_s);
+        d = std::max(d, 0.0);
+        sum1[0] += d;
+        sum1[1] += tothe4th(d);
+      }
+    }
+    plane_averages[c * 2] = onePerPixels * sum1[0];
+    plane_averages[c * 2 + 1] = sqrt(sqrt(onePerPixels * sum1[1]));
+  }
+}
+
+void EdgeDiffMap(const Image3F& img1, const Image3F& mu1, const Image3F& img2,
+                 const Image3F& mu2, double* plane_averages) {
+  const double onePerPixels = 1.0 / (img1.ysize() * img1.xsize());
+  for (size_t c = 0; c < 3; ++c) {
+    double sum1[4] = {0.0};
+    for (size_t y = 0; y < img1.ysize(); ++y) {
+      const float* JXL_RESTRICT row1 = img1.PlaneRow(c, y);
+      const float* JXL_RESTRICT row2 = img2.PlaneRow(c, y);
+      const float* JXL_RESTRICT rowm1 = mu1.PlaneRow(c, y);
+      const float* JXL_RESTRICT rowm2 = mu2.PlaneRow(c, y);
+      for (size_t x = 0; x < img1.xsize(); ++x) {
+        double d1 = (1.0 + std::abs(row2[x] - rowm2[x])) /
+                        (1.0 + std::abs(row1[x] - rowm1[x])) -
+                    1.0;
+
+        // d1 > 0: distorted has an edge where original is smooth
+        //         (indicating ringing, color banding, blockiness, etc)
+        double artifact = std::max(d1, 0.0);
+        sum1[0] += artifact;
+        sum1[1] += tothe4th(artifact);
+
+        // d1 < 0: original has an edge where distorted is smooth
+        //         (indicating smoothing, blurring, smearing, etc)
+        double detail_lost = std::max(-d1, 0.0);
+        sum1[2] += detail_lost;
+        sum1[3] += tothe4th(detail_lost);
+      }
+    }
+    plane_averages[c * 4] = onePerPixels * sum1[0];
+    plane_averages[c * 4 + 1] = sqrt(sqrt(onePerPixels * sum1[1]));
+    plane_averages[c * 4 + 2] = onePerPixels * sum1[2];
+    plane_averages[c * 4 + 3] = sqrt(sqrt(onePerPixels * sum1[3]));
+  }
+}
+
+/* Get all components in more or less 0..1 range
+   Range of Rec2020 with these adjustments:
+    X: 0.017223..0.998838
+    Y: 0.010000..0.855303
+    B: 0.048759..0.989551
+   Range of sRGB:
+    X: 0.204594..0.813402
+    Y: 0.010000..0.855308
+    B: 0.272295..0.938012
+   The maximum pixel-wise difference has to be <= 1 for the ssim formula to make
+   sense.
+*/
+void MakePositiveXYB(jxl::Image3F& img) {
+  for (size_t y = 0; y < img.ysize(); ++y) {
+    float* JXL_RESTRICT rowY = img.PlaneRow(1, y);
+    float* JXL_RESTRICT rowB = img.PlaneRow(2, y);
+    float* JXL_RESTRICT rowX = img.PlaneRow(0, y);
+    for (size_t x = 0; x < img.xsize(); ++x) {
+      rowB[x] = (rowB[x] - rowY[x]) + 0.55f;
+      rowX[x] = rowX[x] * 14.f + 0.42f;
+      rowY[x] += 0.01f;
+    }
+  }
+}
+
+void AlphaBlend(jxl::ImageBundle& img, float bg) {
+  for (size_t y = 0; y < img.ysize(); ++y) {
+    float* JXL_RESTRICT r = img.color()->PlaneRow(0, y);
+    float* JXL_RESTRICT g = img.color()->PlaneRow(1, y);
+    float* JXL_RESTRICT b = img.color()->PlaneRow(2, y);
+    const float* JXL_RESTRICT a = img.alpha()->Row(y);
+    for (size_t x = 0; x < img.xsize(); ++x) {
+      r[x] = a[x] * r[x] + (1.f - a[x]) * bg;
+      g[x] = a[x] * g[x] + (1.f - a[x]) * bg;
+      b[x] = a[x] * b[x] + (1.f - a[x]) * bg;
+    }
+  }
+}
+
+}  // namespace
+
+/*
+The final score is based on a weighted sum of 108 sub-scores:
+- for 6 scales (1:1 to 1:32, downsampled in linear RGB)
+- for 3 components (X, Y, B-Y, rescaled to 0..1 range)
+- using 2 norms (the 1-norm and the 4-norm)
+- over 3 error maps:
+    - SSIM' (SSIM without the spurious gamma correction term)
+    - "ringing" (distorted edges where there are no orig edges)
+    - "blurring" (orig edges where there are no distorted edges)
+
+The weights were obtained by running Nelder-Mead simplex search,
+optimizing to minimize MSE for the CID22 training set and to
+maximize Kendall rank correlation (and with a lower weight,
+also Pearson correlation) with the CID22 training set and the
+TID2013, Kadid10k and KonFiG-IQA datasets.
+Validation was done on the CID22 validation set.
+
+Final results after tuning (Kendall | Spearman | Pearson):
+   CID22:     0.6903 | 0.8805 | 0.8583
+   TID2013:   0.6590 | 0.8445 | 0.8471
+   KADID-10k: 0.6175 | 0.8133 | 0.8030
+   KonFiG(F): 0.7668 | 0.9194 | 0.9136
+*/
+double Msssim::Score() const {
+  double ssim = 0.0;
+  constexpr double weight[108] = {0.0,
+                                  0.0007376606707406586,
+                                  0.0,
+                                  0.0,
+                                  0.0007793481682867309,
+                                  0.0,
+                                  0.0,
+                                  0.0004371155730107379,
+                                  0.0,
+                                  1.1041726426657346,
+                                  0.00066284834129271,
+                                  0.00015231632783718752,
+                                  0.0,
+                                  0.0016406437456599754,
+                                  0.0,
+                                  1.8422455520539298,
+                                  11.441172603757666,
+                                  0.0,
+                                  0.0007989109436015163,
+                                  0.000176816438078653,
+                                  0.0,
+                                  1.8787594979546387,
+                                  10.94906990605142,
+                                  0.0,
+                                  0.0007289346991508072,
+                                  0.9677937080626833,
+                                  0.0,
+                                  0.00014003424285435884,
+                                  0.9981766977854967,
+                                  0.00031949755934435053,
+                                  0.0004550992113792063,
+                                  0.0,
+                                  0.0,
+                                  0.0013648766163243398,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  7.466890328078848,
+                                  0.0,
+                                  17.445833984131262,
+                                  0.0006235601634041466,
+                                  0.0,
+                                  0.0,
+                                  6.683678146179332,
+                                  0.00037724407979611296,
+                                  1.027889937768264,
+                                  225.20515300849274,
+                                  0.0,
+                                  0.0,
+                                  19.213238186143016,
+                                  0.0011401524586618361,
+                                  0.001237755635509985,
+                                  176.39317598450694,
+                                  0.0,
+                                  0.0,
+                                  24.43300999870476,
+                                  0.28520802612117757,
+                                  0.0004485436923833408,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  34.77906344483772,
+                                  44.835625328877896,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0008680556573291698,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0005313191874358747,
+                                  0.0,
+                                  0.00016533814161379112,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0004179171803251336,
+                                  0.0017290828234722833,
+                                  0.0,
+                                  0.0020827005846636437,
+                                  0.0,
+                                  0.0,
+                                  8.826982764996862,
+                                  23.19243343998926,
+                                  0.0,
+                                  95.1080498811086,
+                                  0.9863978034400682,
+                                  0.9834382792465353,
+                                  0.0012286405048278493,
+                                  171.2667255897307,
+                                  0.9807858872435379,
+                                  0.0,
+                                  0.0,
+                                  0.0,
+                                  0.0005130064588990679,
+                                  0.0,
+                                  0.00010854057858411537};
+
+  size_t i = 0;
+  char ch[] = "XYB";
+  const bool verbose = false;
+  for (size_t c = 0; c < 3; ++c) {
+    for (size_t scale = 0; scale < scales.size(); ++scale) {
+      for (size_t n = 0; n < 2; n++) {
+#ifdef SSIMULACRA2_OUTPUT_RAW_SCORES_FOR_WEIGHT_TUNING
+        printf("%.12f,%.12f,%.12f,", scales[scale].avg_ssim[c * 2 + n],
+               scales[scale].avg_edgediff[c * 4 + n],
+               scales[scale].avg_edgediff[c * 4 + 2 + n]);
+#endif
+        if (verbose) {
+          printf("%f from channel %c ssim, scale 1:%i, %" PRIuS
+                 "-norm (weight %f)\n",
+                 weight[i] * std::abs(scales[scale].avg_ssim[c * 2 + n]), ch[c],
+                 1 << scale, n * 3 + 1, weight[i]);
+        }
+        ssim += weight[i++] * std::abs(scales[scale].avg_ssim[c * 2 + n]);
+        if (verbose) {
+          printf("%f from channel %c ringing, scale 1:%i, %" PRIuS
+                 "-norm (weight %f)\n",
+                 weight[i] * std::abs(scales[scale].avg_edgediff[c * 4 + n]),
+                 ch[c], 1 << scale, n * 3 + 1, weight[i]);
+        }
+        ssim += weight[i++] * std::abs(scales[scale].avg_edgediff[c * 4 + n]);
+        if (verbose) {
+          printf(
+              "%f from channel %c blur, scale 1:%i, %" PRIuS
+              "-norm (weight %f)\n",
+              weight[i] * std::abs(scales[scale].avg_edgediff[c * 4 + n + 2]),
+              ch[c], 1 << scale, n * 3 + 1, weight[i]);
+        }
+        ssim +=
+            weight[i++] * std::abs(scales[scale].avg_edgediff[c * 4 + n + 2]);
+      }
+    }
+  }
+
+  ssim = ssim * 0.9562382616834844;
+  ssim = 2.326765642916932 * ssim - 0.020884521182843837 * ssim * ssim +
+         6.248496625763138e-05 * ssim * ssim * ssim;
+  if (ssim > 0) {
+    ssim = 100.0 - 10.0 * pow(ssim, 0.6276336467831387);
+  } else {
+    ssim = 100.0;
+  }
+  return ssim;
+}
+
+Msssim ComputeSSIMULACRA2(const jxl::ImageBundle& orig,
+                          const jxl::ImageBundle& dist, float bg) {
+  Msssim msssim;
+
+  jxl::Image3F img1(orig.xsize(), orig.ysize());
+  jxl::Image3F img2(img1.xsize(), img1.ysize());
+
+  jxl::ImageBundle orig2 = orig.Copy();
+  jxl::ImageBundle dist2 = dist.Copy();
+
+  if (orig.HasAlpha()) AlphaBlend(orig2, bg);
+  if (dist.HasAlpha()) AlphaBlend(dist2, bg);
+  orig2.ClearExtraChannels();
+  dist2.ClearExtraChannels();
+
+  JXL_CHECK(orig2.TransformTo(jxl::ColorEncoding::LinearSRGB(orig2.IsGray()),
+                              *JxlGetDefaultCms()));
+  JXL_CHECK(dist2.TransformTo(jxl::ColorEncoding::LinearSRGB(dist2.IsGray()),
+                              *JxlGetDefaultCms()));
+
+  jxl::ToXYB(orig2, nullptr, &img1, *JxlGetDefaultCms(), nullptr);
+  jxl::ToXYB(dist2, nullptr, &img2, *JxlGetDefaultCms(), nullptr);
+  MakePositiveXYB(img1);
+  MakePositiveXYB(img2);
+
+  Image3F mul(img1.xsize(), img1.ysize());
+  Blur blur(img1.xsize(), img1.ysize());
+
+  for (int scale = 0; scale < kNumScales; scale++) {
+    if (img1.xsize() < 8 || img1.ysize() < 8) {
+      break;
+    }
+    if (scale) {
+      orig2.SetFromImage(Downsample(*orig2.color(), 2, 2),
+                         jxl::ColorEncoding::LinearSRGB(orig2.IsGray()));
+      dist2.SetFromImage(Downsample(*dist2.color(), 2, 2),
+                         jxl::ColorEncoding::LinearSRGB(dist2.IsGray()));
+      img1.ShrinkTo(orig2.xsize(), orig2.ysize());
+      img2.ShrinkTo(orig2.xsize(), orig2.ysize());
+      jxl::ToXYB(orig2, nullptr, &img1, *JxlGetDefaultCms(), nullptr);
+      jxl::ToXYB(dist2, nullptr, &img2, *JxlGetDefaultCms(), nullptr);
+      MakePositiveXYB(img1);
+      MakePositiveXYB(img2);
+    }
+    mul.ShrinkTo(img1.xsize(), img1.ysize());
+    blur.ShrinkTo(img1.xsize(), img1.ysize());
+
+    Multiply(img1, img1, &mul);
+    Image3F sigma1_sq = blur(mul);
+
+    Multiply(img2, img2, &mul);
+    Image3F sigma2_sq = blur(mul);
+
+    Multiply(img1, img2, &mul);
+    Image3F sigma12 = blur(mul);
+
+    Image3F mu1 = blur(img1);
+    Image3F mu2 = blur(img2);
+
+    MsssimScale sscale;
+    SSIMMap(mu1, mu2, sigma1_sq, sigma2_sq, sigma12, sscale.avg_ssim);
+    EdgeDiffMap(img1, mu1, img2, mu2, sscale.avg_edgediff);
+    msssim.scales.push_back(sscale);
+  }
+  return msssim;
+}
+
+Msssim ComputeSSIMULACRA2(const jxl::ImageBundle& orig,
+                          const jxl::ImageBundle& distorted) {
+  return ComputeSSIMULACRA2(orig, distorted, 0.5f);
+}
diff --git a/tools/ssimulacra2.h b/tools/ssimulacra2.h
new file mode 100644 (file)
index 0000000..36d1193
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef TOOLS_SSIMULACRA2_H_
+#define TOOLS_SSIMULACRA2_H_
+
+#include <vector>
+
+#include "lib/jxl/image_bundle.h"
+
+struct MsssimScale {
+  double avg_ssim[3 * 2];
+  double avg_edgediff[3 * 4];
+};
+
+struct Msssim {
+  std::vector<MsssimScale> scales;
+
+  double Score() const;
+};
+
+// Computes the SSIMULACRA 2 score between reference image 'orig' and
+// distorted image 'distorted'. In case of alpha transparency, assume
+// a gray background if intensity 'bg' (in range 0..1).
+Msssim ComputeSSIMULACRA2(const jxl::ImageBundle &orig,
+                          const jxl::ImageBundle &distorted, float bg);
+Msssim ComputeSSIMULACRA2(const jxl::ImageBundle &orig,
+                          const jxl::ImageBundle &distorted);
+
+#endif  // TOOLS_SSIMULACRA2_H_
diff --git a/tools/ssimulacra2_main.cc b/tools/ssimulacra2_main.cc
new file mode 100644 (file)
index 0000000..758f188
--- /dev/null
@@ -0,0 +1,83 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <stdio.h>
+
+#include "lib/extras/codec.h"
+// TODO(eustas): we should, but we can't?
+// #include "lib/jxl/base/span.h"
+#include "tools/file_io.h"
+#include "tools/ssimulacra2.h"
+
+int PrintUsage(char** argv) {
+  fprintf(stderr, "Usage: %s orig.png distorted.png\n", argv[0]);
+  fprintf(stderr,
+          "Returns a score in range -inf..100, which correlates to subjective "
+          "visual quality:\n");
+  fprintf(stderr,
+          "     30 = low quality (p10 worst output of mozjpeg -quality 30)\n");
+  fprintf(stderr,
+          "     50 = medium quality (average output of cjxl -q 40 or mozjpeg "
+          "-quality 40,\n");
+  fprintf(stderr,
+          "                          p10 output of cjxl -q 50 or mozjpeg "
+          "-quality 60)\n");
+  fprintf(stderr,
+          "     70 = high quality (average output of cjxl -q 70 or mozjpeg "
+          "-quality 70,\n");
+  fprintf(stderr,
+          "                        p10 output of cjxl -q 75 or mozjpeg "
+          "-quality 80)\n");
+  fprintf(stderr,
+          "     90 = very high quality (impossible to distinguish from "
+          "original at 1:1,\n");
+  fprintf(stderr,
+          "                             average output of cjxl -q 90 or "
+          "mozjpeg -quality 90)\n");
+  return 1;
+}
+
+int main(int argc, char** argv) {
+  if (argc != 3) return PrintUsage(argv);
+
+  jxl::CodecInOut io[2];
+  const char* purpose[] = {"original", "distorted"};
+  for (size_t i = 0; i < 2; ++i) {
+    std::vector<uint8_t> encoded;
+    if (!jpegxl::tools::ReadFile(argv[1 + i], &encoded)) {
+      fprintf(stderr, "Could not load %s image: %s\n", purpose[i], argv[1 + i]);
+      return 1;
+    }
+    if (!jxl::SetFromBytes(jxl::Bytes(encoded), jxl::extras::ColorHints(),
+                           &io[i])) {
+      fprintf(stderr, "Could not decode %s image: %s\n", purpose[i],
+              argv[1 + i]);
+      return 1;
+    }
+    if (io[i].xsize() < 8 || io[i].ysize() < 8) {
+      fprintf(stderr, "Minimum image size is 8x8 pixels\n");
+      return 1;
+    }
+  }
+  jxl::CodecInOut& io1 = io[0];
+  jxl::CodecInOut& io2 = io[1];
+
+  if (io1.xsize() != io2.xsize() || io1.ysize() != io2.ysize()) {
+    fprintf(stderr, "Image size mismatch\n");
+    return 1;
+  }
+
+  if (!io1.Main().HasAlpha()) {
+    Msssim msssim = ComputeSSIMULACRA2(io1.Main(), io2.Main());
+    printf("%.8f\n", msssim.Score());
+  } else {
+    // in case of alpha transparency: blend against dark and bright backgrounds
+    // and return the worst of both scores
+    Msssim msssim0 = ComputeSSIMULACRA2(io1.Main(), io2.Main(), 0.1f);
+    Msssim msssim1 = ComputeSSIMULACRA2(io1.Main(), io2.Main(), 0.9f);
+    printf("%.8f\n", std::min(msssim0.Score(), msssim1.Score()));
+  }
+  return 0;
+}
index 5b48fe2..70538a5 100644 (file)
@@ -6,8 +6,12 @@
 #include <stdio.h>
 
 #include "lib/extras/codec.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/enc_color_management.h"
+// TODO(eustas): we should, but we can't?
+// #include "lib/jxl/base/span.h"
+#include <jxl/cms.h>
+
+#include "lib/jxl/image_bundle.h"
+#include "tools/file_io.h"
 #include "tools/ssimulacra.h"
 
 namespace ssimulacra {
@@ -33,26 +37,31 @@ int Run(int argc, char** argv) {
   }
   if (argc < input_arg + 2) return PrintUsage(argv);
 
-  jxl::CodecInOut io1;
-  jxl::CodecInOut io2;
-  JXL_CHECK(SetFromFile(argv[input_arg], jxl::extras::ColorHints(), &io1));
-  JXL_CHECK(SetFromFile(argv[input_arg + 1], jxl::extras::ColorHints(), &io2));
-  JXL_CHECK(io1.TransformTo(jxl::ColorEncoding::LinearSRGB(io1.Main().IsGray()),
-                            jxl::GetJxlCms()));
-  JXL_CHECK(io2.TransformTo(jxl::ColorEncoding::LinearSRGB(io2.Main().IsGray()),
-                            jxl::GetJxlCms()));
-
-  if (io1.xsize() != io2.xsize() || io1.ysize() != io2.ysize()) {
+  jxl::CodecInOut io[2];
+  for (size_t i = 0; i < 2; ++i) {
+    std::vector<uint8_t> encoded;
+    JXL_CHECK(jpegxl::tools::ReadFile(argv[input_arg + i], &encoded));
+    JXL_CHECK(jxl::SetFromBytes(jxl::Bytes(encoded), jxl::extras::ColorHints(),
+                                &io[i]));
+  }
+  jxl::ImageBundle& ib1 = io[0].Main();
+  jxl::ImageBundle& ib2 = io[1].Main();
+  JXL_CHECK(ib1.TransformTo(jxl::ColorEncoding::LinearSRGB(ib1.IsGray()),
+                            *JxlGetDefaultCms(), nullptr));
+  JXL_CHECK(ib2.TransformTo(jxl::ColorEncoding::LinearSRGB(ib2.IsGray()),
+                            *JxlGetDefaultCms(), nullptr));
+  jxl::Image3F& img1 = *ib1.color();
+  jxl::Image3F& img2 = *ib2.color();
+  if (img1.xsize() != img2.xsize() || img1.ysize() != img2.ysize()) {
     fprintf(stderr, "Image size mismatch\n");
     return 1;
   }
-  if (io1.xsize() < 8 || io1.ysize() < 8) {
+  if (img1.xsize() < 8 || img1.ysize() < 8) {
     fprintf(stderr, "Minimum image size is 8x8 pixels\n");
     return 1;
   }
 
-  Ssimulacra ssimulacra =
-      ComputeDiff(*io1.Main().color(), *io2.Main().color(), simple);
+  Ssimulacra ssimulacra = ComputeDiff(img1, img2, simple);
 
   if (verbose) {
     ssimulacra.PrintDetails();
diff --git a/tools/thread_pool_internal.h b/tools/thread_pool_internal.h
new file mode 100644 (file)
index 0000000..92a1176
--- /dev/null
@@ -0,0 +1,47 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef TOOLS_THREAD_POOL_INTERNAL_H_
+#define TOOLS_THREAD_POOL_INTERNAL_H_
+
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <stddef.h>
+
+#include <cmath>
+#include <thread>  // NOLINT
+
+#include "lib/jxl/base/data_parallel.h"
+
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::ThreadPool;
+
+// Helper class to pass an internal ThreadPool-like object using threads.
+class ThreadPoolInternal {
+ public:
+  // Starts the given number of worker threads and blocks until they are ready.
+  // "num_worker_threads" defaults to one per hyperthread. If zero, all tasks
+  // run on the main thread.
+  explicit ThreadPoolInternal(
+      size_t num_threads = std::thread::hardware_concurrency()) {
+    runner_ =
+        JxlThreadParallelRunnerMake(/* memory_manager */ nullptr, num_threads);
+    pool_.reset(new ThreadPool(JxlThreadParallelRunner, runner_.get()));
+  }
+
+  ThreadPoolInternal(const ThreadPoolInternal&) = delete;
+  ThreadPoolInternal& operator&(const ThreadPoolInternal&) = delete;
+  ThreadPool* operator&() { return pool_.get(); }
+
+ private:
+  JxlThreadParallelRunnerPtr runner_;
+  std::unique_ptr<ThreadPool> pool_;
+};
+
+}  // namespace tools
+}  // namespace jpegxl
+
+#endif  // TOOLS_THREAD_POOL_INTERNAL_H_
index 1ef08b2..6d78a05 100644 (file)
 #include "lib/jxl/modular/encoding/encoding.h"
 #include "lib/jxl/modular/transform/transform.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
+
+using ::jxl::BitReader;
+using ::jxl::BitReaderScopedCloser;
+using ::jxl::Bytes;
+using ::jxl::Channel;
+using ::jxl::GroupHeader;
+using ::jxl::Image;
+using ::jxl::ModularOptions;
+using ::jxl::pixel_type;
+using ::jxl::Rng;
+using ::jxl::Status;
+using ::jxl::Transform;
+using ::jxl::weighted::Header;
 
 namespace {
 void FillChannel(Channel& ch, Rng& rng) {
@@ -32,7 +46,7 @@ void AssertEq(T a, T b) {
 
 int TestOneInput(const uint8_t* data, size_t size) {
   static Status nevermind = true;
-  BitReader reader(Span<const uint8_t>(data, size));
+  BitReader reader(Bytes(data, size));
   BitReaderScopedCloser reader_closer(&reader, &nevermind);
 
   Rng rng(reader.ReadFixedBits<56>());
@@ -50,8 +64,8 @@ int TestOneInput(const uint8_t* data, size_t size) {
 
   size_t w_orig = static_cast<size_t>(reader.ReadFixedBits<16>());
   size_t h_orig = static_cast<size_t>(reader.ReadFixedBits<16>());
-  size_t w = DivCeil(w_orig, upsampling);
-  size_t h = DivCeil(h_orig, upsampling);
+  size_t w = jxl::DivCeil(w_orig, upsampling);
+  size_t h = jxl::DivCeil(h_orig, upsampling);
 
   if ((nb_chans == 2) || ((nb_chans + nb_extra) == 0) || (w * h == 0) ||
       ((w_orig * h_orig * (nb_chans + nb_extra)) > (1 << 23))) {
@@ -80,21 +94,22 @@ int TestOneInput(const uint8_t* data, size_t size) {
     Channel& ch = image.channel[c];
     ch.hshift = hshift[c];
     ch.vshift = vshift[c];
-    ch.shrink(DivCeil(w, 1 << hshift[c]), DivCeil(h, 1 << vshift[c]));
+    ch.shrink(jxl::DivCeil(w, 1 << hshift[c]), jxl::DivCeil(h, 1 << vshift[c]));
   }
 
   for (size_t ec = 0; ec < nb_extra; ec++) {
     Channel& ch = image.channel[ec + nb_chans];
     size_t ch_up = ec_upsampling[ec];
-    int up_level = CeilLog2Nonzero(ch_up) - CeilLog2Nonzero(upsampling);
-    ch.shrink(DivCeil(w_orig, ch_up), DivCeil(h_orig, ch_up));
+    int up_level =
+        jxl::CeilLog2Nonzero(ch_up) - jxl::CeilLog2Nonzero(upsampling);
+    ch.shrink(jxl::DivCeil(w_orig, ch_up), jxl::DivCeil(h_orig, ch_up));
     ch.hshift = ch.vshift = up_level;
   }
 
   GroupHeader header;
-  if (!Bundle::Read(&reader, &header)) return 0;
-  weighted::Header w_header;
-  if (!Bundle::Read(&reader, &w_header)) return 0;
+  if (!jxl::Bundle::Read(&reader, &header)) return 0;
+  Header w_header;
+  if (!jxl::Bundle::Read(&reader, &w_header)) return 0;
 
   // TODO(eustas): give it a try?
   if (!reader.AllReadsWithinBounds()) return 0;
@@ -122,16 +137,17 @@ int TestOneInput(const uint8_t* data, size_t size) {
     const Channel& ch = image.channel[c];
     AssertEq(ch.hshift, hshift[c]);
     AssertEq(ch.vshift, vshift[c]);
-    AssertEq(ch.w, DivCeil(w, 1 << hshift[c]));
-    AssertEq(ch.h, DivCeil(h, 1 << vshift[c]));
+    AssertEq(ch.w, jxl::DivCeil(w, 1 << hshift[c]));
+    AssertEq(ch.h, jxl::DivCeil(h, 1 << vshift[c]));
   }
 
   for (size_t ec = 0; ec < nb_extra; ec++) {
     const Channel& ch = image.channel[ec + nb_chans];
     size_t ch_up = ec_upsampling[ec];
-    int up_level = CeilLog2Nonzero(ch_up) - CeilLog2Nonzero(upsampling);
-    AssertEq(ch.w, DivCeil(w_orig, ch_up));
-    AssertEq(ch.h, DivCeil(h_orig, ch_up));
+    int up_level =
+        jxl::CeilLog2Nonzero(ch_up) - jxl::CeilLog2Nonzero(upsampling);
+    AssertEq(ch.w, jxl::DivCeil(w_orig, ch_up));
+    AssertEq(ch.h, jxl::DivCeil(h_orig, ch_up));
     AssertEq(ch.hshift, up_level);
     AssertEq(ch.vshift, up_level);
   }
@@ -139,8 +155,9 @@ int TestOneInput(const uint8_t* data, size_t size) {
   return 0;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
-  return jxl::TestOneInput(data, size);
+  return jpegxl::tools::TestOneInput(data, size);
 }
index 89f1320..e873bd1 100644 (file)
@@ -37,14 +37,14 @@ def convolution(pixels, kernel):
   `kernel`.
 
   Args:
-    pixels: A [heigth, width]- or [height, width, num_channels]-array
+    pixels: A [height, width]- or [height, width, num_channels]-array
     representing an image.
 
     kernel: A [upscaling_factor, upscaling_factor, kernel_size,
      kernel_size]-array used for the convolution.
 
   Returns:
-    A [upscaling_factor*heigth, upscaling_factor*width]- or
+    A [upscaling_factor*height, upscaling_factor*width]- or
     [upscaling_factor*height, upscaling_factor*width, num_channels]-array representing the
     convoluted upscaled image.
   """
index 7dbe5e3..2b25e26 100644 (file)
@@ -3,9 +3,9 @@
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
-find_package(Qt5 QUIET COMPONENTS Widgets)
-if (NOT Qt5_FOUND)
-  message(WARNING "Qt5 was not found. The directory viewer will not be built.")
+find_package(Qt6 QUIET COMPONENTS Widgets)
+if (NOT Qt6_FOUND)
+  message(WARNING "Qt6 was not found. The directory viewer will not be built.")
   return()
 endif ()
 
@@ -31,7 +31,7 @@ target_include_directories(viewer PRIVATE
   "${PROJECT_SOURCE_DIR}"
 )
 target_link_libraries(viewer
-  Qt5::Widgets
+  Qt6::Widgets
   icc_detect
   jxl
   jxl_threads
index 7fd35d8..b97a906 100644 (file)
@@ -5,18 +5,21 @@
 
 #include "tools/viewer/load_jxl.h"
 
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+#include <jxl/types.h>
 #include <stdint.h>
 
 #include <QElapsedTimer>
 #include <QFile>
 
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-#include "jxl/types.h"
+#define CMS_NO_REGISTER_KEYWORD 1
 #include "lcms2.h"
+#undef CMS_NO_REGISTER_KEYWORD
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 namespace {
 
@@ -97,12 +100,11 @@ QImage loadJxlImage(const QString& filename, const QByteArray& targetIccProfile,
   size_t icc_size;
   EXPECT_EQ(JXL_DEC_SUCCESS,
             JxlDecoderGetICCProfileSize(
-                dec.get(), &format, JXL_COLOR_PROFILE_TARGET_DATA, &icc_size));
+                dec.get(), JXL_COLOR_PROFILE_TARGET_DATA, &icc_size));
   std::vector<uint8_t> icc_profile(icc_size);
-  EXPECT_EQ(JXL_DEC_SUCCESS,
-            JxlDecoderGetColorAsICCProfile(
-                dec.get(), &format, JXL_COLOR_PROFILE_TARGET_DATA,
-                icc_profile.data(), icc_profile.size()));
+  EXPECT_EQ(JXL_DEC_SUCCESS, JxlDecoderGetColorAsICCProfile(
+                                 dec.get(), JXL_COLOR_PROFILE_TARGET_DATA,
+                                 icc_profile.data(), icc_profile.size()));
 
   std::vector<float> float_pixels(pixel_count * 4);
   EXPECT_EQ(JXL_DEC_NEED_IMAGE_OUT_BUFFER, JxlDecoderProcessInput(dec.get()));
@@ -135,40 +137,19 @@ QImage loadJxlImage(const QString& filename, const QByteArray& targetIccProfile,
   if (elapsed_ns != nullptr) *elapsed_ns = timer.nsecsElapsed();
 
   QImage result(info.xsize, info.ysize,
-#if QT_VERSION >= QT_VERSION_CHECK(5, 12, 0)
                 info.alpha_premultiplied ? QImage::Format_RGBA64_Premultiplied
-                                         : QImage::Format_RGBA64
-#else
-                info.alpha_premultiplied ? QImage::Format_ARGB32_Premultiplied
-                                         : QImage::Format_ARGB32
-#endif
-  );
+                                         : QImage::Format_RGBA64);
 
   for (int y = 0; y < result.height(); ++y) {
-#if QT_VERSION >= QT_VERSION_CHECK(5, 12, 0)
     QRgba64* const row = reinterpret_cast<QRgba64*>(result.scanLine(y));
-#else
-    QRgb* const row = reinterpret_cast<QRgb*>(result.scanLine(y));
-#endif
     const uint16_t* const data = uint16_pixels.data() + result.width() * y * 4;
     for (int x = 0; x < result.width(); ++x) {
-#if QT_VERSION >= QT_VERSION_CHECK(5, 6, 0)
       row[x] = qRgba64(data[4 * x + 0], data[4 * x + 1], data[4 * x + 2],
-                       data[4 * x + 3])
-#if QT_VERSION < QT_VERSION_CHECK(5, 12, 0)
-                   .toArgb32()
-#endif
-          ;
-#else
-      // Qt version older than 5.6 doesn't have a qRgba64.
-      row[x] = qRgba(data[4 * x + 0] * (255.f / 65535) + .5f,
-                     data[4 * x + 1] * (255.f / 65535) + .5f,
-                     data[4 * x + 2] * (255.f / 65535) + .5f,
-                     data[4 * x + 3] * (255.f / 65535) + .5f);
-#endif
+                       data[4 * x + 3]);
     }
   }
   return result;
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 594f646..85dc1a9 100644 (file)
 #include <QImage>
 #include <QString>
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 QImage loadJxlImage(const QString& filename, const QByteArray& targetIccProfile,
                     qint64* elapsed, bool* usedRequestedProfile = nullptr);
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_VIEWER_LOAD_JXL_H_
index d677888..1e80be3 100644 (file)
@@ -12,7 +12,7 @@ int main(int argc, char** argv) {
   QStringList arguments = application.arguments();
   arguments.removeFirst();
 
-  jxl::ViewerWindow window;
+  jpegxl::tools::ViewerWindow window;
   window.show();
 
   if (!arguments.empty()) {
index 530c2f0..6b5f912 100644 (file)
@@ -15,7 +15,8 @@
 #include "tools/icc_detect/icc_detect.h"
 #include "tools/viewer/load_jxl.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 namespace {
 
@@ -50,7 +51,7 @@ void ViewerWindow::loadFilesAndDirectories(QStringList entries) {
   filenames_.clear();
   QSet<QString> visited;
   for (const QString& entry : entries) {
-    recursivelyAddSubEntries(entry, &visited, &filenames_);
+    recursivelyAddSubEntries(QFileInfo(entry), &visited, &filenames_);
   }
 
   const bool several = filenames_.size() > 1;
@@ -127,4 +128,5 @@ void ViewerWindow::refreshImage() {
   }
 }
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
index 42de5bc..78aafb9 100644 (file)
@@ -12,7 +12,8 @@
 
 #include "tools/viewer/ui_viewer_window.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 
 class ViewerWindow : public QMainWindow {
   Q_OBJECT
@@ -36,6 +37,7 @@ class ViewerWindow : public QMainWindow {
   bool hasWarnedAboutMonitorProfile_ = false;
 };
 
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
 #endif  // TOOLS_VIEWER_VIEWER_WINDOW_H_
diff --git a/tools/wasm_demo/CMakeLists.txt b/tools/wasm_demo/CMakeLists.txt
new file mode 100644 (file)
index 0000000..0549e76
--- /dev/null
@@ -0,0 +1,64 @@
+if (NOT JPEGXL_ENABLE_TOOLS OR NOT EMSCRIPTEN)
+  return()
+endif()
+
+# WASM API facade.
+add_executable(jxl_decoder jxl_decoder.cc jxl_decompressor.cc no_png.cc)
+add_executable(jxl_decoder_for_test jxl_decoder.cc jxl_decompressor.cc no_png.cc)
+target_link_libraries(jxl_decoder jxl_extras-internal jxl_threads)
+target_link_libraries(jxl_decoder_for_test jxl_extras-internal jxl_threads)
+
+set(JXL_C_SYMBOLS
+  _free
+  _malloc
+)
+
+set(JXL_DECODER_SYMBOLS
+  _jxlCreateInstance
+  _jxlDestroyInstance
+  _jxlFlush
+  _jxlProcessInput
+)
+
+set(JXL_DECOMPRESSOR_SYMBOLS
+  _jxlDecompress
+  _jxlCleanup
+)
+
+set(JXL_MODULE_SYMBOLS ${JXL_C_SYMBOLS} ${JXL_DECODER_SYMBOLS} ${JXL_DECOMPRESSOR_SYMBOLS})
+
+list(JOIN JXL_MODULE_SYMBOLS ", " JXL_MODULE_EXPORTS)
+
+set(JXL_WASM_SITE_LINK_FLAGS " -O3 -s FILESYSTEM=0 --closure 1 -mnontrapping-fptoint")
+set(JXL_WASM_TEST_LINK_FLAGS " -O1 -s NODERAWFS=1 ")
+
+set(JXL_WASM_BASE_LINK_FLAGS "\
+  -s ALLOW_MEMORY_GROWTH=1 \
+  -s DISABLE_EXCEPTION_CATCHING=1 \
+  -s MODULARIZE=1 \
+  -s USE_PTHREADS=1 \
+  -s PTHREAD_POOL_SIZE=4 \
+")
+
+# libpng is used only by "decompressor"
+set(JXL_DECODER_LINK_FLAGS "${JXL_WASM_BASE_LINK_FLAGS} \
+  -s EXPORT_NAME=\"JxlDecoderModule\" \
+  -s \"EXPORTED_FUNCTIONS=[${JXL_MODULE_EXPORTS}]\" \
+")
+
+set_target_properties(jxl_decoder PROPERTIES LINK_FLAGS
+  "${JXL_DECODER_LINK_FLAGS} ${JXL_WASM_SITE_LINK_FLAGS}")
+
+set_target_properties(jxl_decoder_for_test PROPERTIES LINK_FLAGS
+  "${JXL_DECODER_LINK_FLAGS} ${JXL_WASM_TEST_LINK_FLAGS}")
+
+if (BUILD_TESTING)
+  add_test(
+    NAME test_wasm_jxl_decoder
+    COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR}
+            --no-experimental-fetch
+            ${CMAKE_CURRENT_SOURCE_DIR}/jxl_decoder_test.js
+  )
+  set_tests_properties(test_wasm_jxl_decoder PROPERTIES
+    ENVIRONMENT NODE_PATH=$<TARGET_FILE_DIR:jxl_decoder_for_test>)
+endif()  # BUILD_TESTING
diff --git a/tools/wasm_demo/README.md b/tools/wasm_demo/README.md
new file mode 100644 (file)
index 0000000..804cd35
--- /dev/null
@@ -0,0 +1,126 @@
+## WebAssembly demonstration
+
+This folder contains an example how to decode JPEG XL files on a web page using
+WASM engine.
+
+### One line demo
+
+The simplest way to get support of JXL images on the client side is simply to
+link one extra script (`<script src="service_worker.js">`) to the page.
+This script installs a `ServiceWorker` that:
+
+ - checks if the browser supports the JXL image format already
+ - if it is not, then advertise `image/jxl` as media format in image requests
+ - then, if the server responds with `image/jxl` content it gets decoded and
+   re-encoded to PNG on the fly
+
+Generally the message / data flow looks the following way:
+
+ - `Fetch API` receives a resource request from client page (e.g. when the HTML
+   engine discovers an `img` tag) and asks the `ServiceWorker` how to proceed
+ - the `ServiceWorker` alters the request and uses the `Fetch API`
+   to obtain data
+ - when data arrives, the `ServiceWorker` forwards it to the "client"
+   (the page) that initiated the resource request
+ - the client forwards the data to a worker (see `client_worker.js`) to avoid
+   processing in the "main loop" thread
+ - a worker does the actual decoding; to make it faster several additional
+   workers are spawned (to enable multi-threading in WASM module);
+   the decoded image is wrapped in non-compressed PNG format and sent back
+   to client
+ - the client relays image data to `ServiceWorker`
+ - the `ServiceWorker` passes data to `Fetch API` as a response to initial
+   resource request
+
+Despite the additional "hop" (client) in the flow, data is not copied every
+time but rather "transferred" between the participants.
+
+Demo page: `one_line_demo.html`. Extended demo, that also shows how long it
+took do decode images: `one_line_demo_with_console.html`.
+
+Page that shows "manual" decoding (and has benchmarking capabilities):
+`manual_decode_demo.html`.
+
+### Hosting
+
+To enable multi-threading some files should be served in a secure context (i.e.
+transferred over HTTPS) and executed in a "site-isolation" mode (controlled by
+COOP and COEP response headers).
+
+Unfortunately [GitHub Pages](https://pages.github.com/) does not allow setting
+response headers.
+
+[Netlify](https://www.netlify.com/) provides free, easy to setup and deploy
+platform for serving such demonstration sites. However, any other
+service provider / software that allows changing response headers could be
+employed as well.
+
+`netlify.toml` and `netlify/precompressed.ts` specify the serving rules.
+Namely, some requests get "upgraded" responses:
+
+ - if a request specifies that `brotli` compression is supported,
+   then precompressed entries are sent
+ - if a request specifies that `image/jxl` format is allowed,
+   then entries transcoded to JXL format are sent
+
+### How to build the demo
+
+`build_site.py` script takes care of JavaScript minification, template
+substitution and resource compression. Its arguments are:
+
+ - source path: site template directory (that contains this README file)
+ - binary path: build directory, that contains compiled WASM module
+ - output path
+
+To complete the site few more files are to be added to output directory:
+
+ - `image00.jpg`, `image01.png` demo images; will be shown if `ServiceWorker`
+   is not yet operable (fallback); to see those one could initiate
+   "hard page reload" (press Shift-(Ctrl|Cmd)-R)
+ - `image00.jpg.jxl`, `image01.png.jxl` demo images in JXL format
+ - `imageNN.jxl` images for "manual" decoding demo; NN is a number starting
+   form `00`
+ - `favicon.ico` is an optional site icon
+ - `index.html` is an optional site "home" page
+
+In the source code (`service_worker.js`) there are two compile-time constants
+that modify the behaviour of Service Worker:
+
+ - `FORCE_COP` flag allows rewriting responses to add COOP / COEP headers;
+   this is useful when it is difficult / impossible to setup response headers
+   otherwise (e.g. GitHub Pages)
+ - `FORCE_DECODING` flag activate JXL decoding when image response type has
+   `Content-Encoding` header set to `application/octet-stream`; this happens
+   when server does not know the JXL MIME-type
+
+One dependency that `build_site.py` requires is [uglifyjs](https://github.com/mishoo/UglifyJS), which can be installed with
+```
+npm install uglify-js -g
+```
+If you followed the [wasm build instructions](../../docs/building_wasm.md),
+assuming you are in the root level of the cloned libjxl repo a typical call to
+build the site would be
+```bash
+python3 ./tools/wasm_demo/build_site.py ./tools/wasm_demo/ ./build-wasm32/tools/wasm_demo/ /path/to/demo-site
+```
+Then you need to put your image files in the correct same place and are should be good to go.
+
+
+To summarize, using the wasm decoder together with a service workder amounts to adding
+```html
+<script src="service_worker.js"></script>
+```
+to your html and then putting the `service_worker.js` and `jxl_decoder.wasm` binary in directory where they can be read.
+
+
+It is not guaranteed, but somewhat fresh demo is hosted on
+`https://jxl-demo.netlify.app/`, e.g.:
+
+ - [one line demo](https://jxl-demo.netlify.app/one_line_demo_with_console.html)
+ - [one line demo with console](https://jxl-demo.netlify.app/one_line_demo.html)
+ - [manual decode demo](https://jxl-demo.netlify.app/manual_decode_demo.html?img=1&colorSpace=rec2100-pq&runBenchmark=30&wantSdr=false&displayNits=1500);
+   URL contains query parameters that control rendering and benchmarking options;
+   please note, that HDR canvas is often not enabled by default, it could be
+   enabled in some browsers via `about://flags/#enable-experimental-web-platform-features`
+ - [`service_worker.js`](https://jxl-demo.netlify.app/service_worker.js)
+ - [`jxl_decoder.wasm`](https://jxl-demo.netlify.app/jxl_decoder.wasm)
diff --git a/tools/wasm_demo/build_site.py b/tools/wasm_demo/build_site.py
new file mode 100644 (file)
index 0000000..f47fa97
--- /dev/null
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+import shutil
+import subprocess
+import sys
+
+from pathlib import Path
+
+BROTLIFY = False
+ZOPFLIFY = False
+LEAN = True
+NETLIFY = False
+
+REMOVE_SHEBANG = ['jxl_decoder.js']
+EMBED_BIN = [
+  'jxl_decoder.js',
+  'jxl_decoder.worker.js'
+]
+EMBED_SRC = ['client_worker.js']
+TEMPLATES = ['service_worker.js']
+COPY_BIN = ['jxl_decoder.wasm'] + [] if LEAN else EMBED_BIN
+COPY_SRC = [
+  'one_line_demo.html',
+  'one_line_demo_with_console.html',
+  'manual_decode_demo.html',
+] + [] if not NETLIFY else [
+  'netlify.toml',
+  'netlify'
+] + [] if LEAN else EMBED_SRC
+
+COMPRESS = COPY_BIN + COPY_SRC + TEMPLATES
+COMPRESSIBLE_EXT = ['.html', '.js', '.wasm']
+
+def escape_js(js):
+  return js.replace('\\', '\\\\').replace('\'', '\\\'')
+
+def remove_shebang(txt):
+  lines = txt.splitlines(True) # Keep line-breaks
+  if len(lines) > 0:
+    if lines[0].startswith('#!'):
+      lines = lines[1:]
+  return ''.join(lines)
+
+def compress(path):
+  name = path.name
+  compressible = any([name.endswith(ext) for ext in COMPRESSIBLE_EXT])
+  if not compressible:
+    print(f'Not compressing {name}')
+    return
+  print(f'Processing {name}')
+  orig_size = path.stat().st_size
+  if BROTLIFY:
+    cmd_brotli = ['brotli', '-Zfk', path.absolute()]
+    subprocess.run(cmd_brotli, check=True, stdout=sys.stdout, stderr=sys.stderr)
+    br_size = path.parent.joinpath(name + '.br').stat().st_size
+    print(f'  Brotli: {orig_size} -> {br_size}')
+  if ZOPFLIFY:
+    cmd_zopfli = ['zopfli', path.absolute()]
+    subprocess.run(cmd_zopfli, check=True, stdout=sys.stdout, stderr=sys.stderr)
+    gz_size = path.parent.joinpath(name + '.gz').stat().st_size
+    print(f'  Zopfli: {orig_size} -> {gz_size}')
+
+def check_util(name):
+  cmd = [name, '-h']
+  try:
+    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+  except:
+    print(f"NOTE: {name} not installed")
+    return False
+  return True
+
+def check_utils():
+  global BROTLIFY
+  BROTLIFY = BROTLIFY and check_util('brotli')
+  global ZOPFLIFY
+  ZOPFLIFY = ZOPFLIFY and check_util('zopfli')
+  if not check_util('uglifyjs'):
+    print("FAIL: uglifyjs is required to build a site")
+    sys.exit()
+
+def uglify(text, name):
+  cmd = ['uglifyjs', '-m', '-c']
+  ugly_result = subprocess.run(
+      cmd, capture_output=True, check=True, input=text, text=True)
+  ugly_text = ugly_result.stdout.strip()
+  print(f'Uglify {name}: {len(text)} -> {len(ugly_text)}')
+  return ugly_text
+
+if __name__ == "__main__":
+  if len(sys.argv) != 4:
+    print(f"Usage: python3 {sys.argv[0]} SRC_DIR BINARY_DIR OUTPUT_DIR")
+    exit(-1)
+  source_path = Path(sys.argv[1]) # CMake build dir
+  binary_path = Path(sys.argv[2]) # Site template dir
+  output_path = Path(sys.argv[3]) # Site output
+
+  check_utils()
+
+  for name in REMOVE_SHEBANG:
+    path = binary_path.joinpath(name)
+    text = path.read_text().strip()
+    path.write_text(remove_shebang(text))
+    remove_shebang
+
+  substitutes = {}
+
+  for name in EMBED_BIN:
+    key = '$' + name + '$'
+    path = binary_path.joinpath(name)
+    value = escape_js(uglify(path.read_text().strip(), name))
+    substitutes[key] = value
+
+  for name in EMBED_SRC:
+    key = '$' + name + '$'
+    path = source_path.joinpath(name)
+    value = escape_js(uglify(path.read_text().strip(), name))
+    substitutes[key] = value
+
+  for name in TEMPLATES:
+    print(f'Processing template {name}')
+    path = source_path.joinpath(name)
+    text = path.read_text().strip()
+    for key, value in substitutes.items():
+      text = text.replace(key, value)
+    #text = uglify(text, name)
+    output_path.joinpath(name).write_text(text)
+
+  for name in COPY_SRC:
+    path = source_path.joinpath(name)
+    if path.is_dir():
+      shutil.copytree(path, output_path.joinpath(
+          name).absolute(), dirs_exist_ok=True)
+    else:
+      shutil.copy(path, output_path.absolute())
+
+  # TODO(eustas): uglify
+  for name in COPY_BIN:
+    shutil.copy(binary_path.joinpath(name), output_path.absolute())
+
+  for name in COMPRESS:
+    compress(output_path.joinpath(name))
diff --git a/tools/wasm_demo/client_worker.js b/tools/wasm_demo/client_worker.js
new file mode 100644 (file)
index 0000000..5751b38
--- /dev/null
@@ -0,0 +1,99 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+let decoder = null;
+
+// Serialize work; plus postpone processing until decoder is ready.
+let jobs = [];
+
+const processJobs = () => {
+  // Decoder not yet loaded.
+  if (!decoder) {
+    return;
+  }
+
+  while (true) {
+    let job = null;
+    // Currently we do not do progressive; process only "inputComplete" jobs.
+    for (let i = 0; i < jobs.length; ++i) {
+      if (!jobs[i].inputComplete) {
+        continue;
+      }
+      job = jobs[i];
+      jobs[i] = jobs[jobs.length - 1];
+      jobs.pop();
+      break;
+    }
+    if (!job) {
+      return;
+    }
+    console.log('CW job: ' + job.uid);
+    const input = job.input;
+    let totalInputLength = 0;
+    for (let i = 0; i < input.length; i++) {
+      totalInputLength += input[i].length;
+    }
+
+    // TODO(eustas): persist to reduce fragmentation?
+    const buffer = decoder._malloc(totalInputLength);
+    // TODO(eustas): check OOM
+    let offset = 0;
+    for (let i = 0; i < input.length; ++i) {
+      decoder.HEAP8.set(input[i], buffer + offset);
+      offset += input[i].length;
+    }
+    let t0 = Date.now();
+    // TODO(eustas): check result
+    const result = decoder._jxlDecompress(buffer, totalInputLength);
+    let t1 = Date.now();
+    const msg = 'Decoded ' + job.url + ' in ' + (t1 - t0) + 'ms';
+    // console.log(msg);
+    decoder._free(buffer);
+    const outputLength = decoder.HEAP32[result >> 2];
+    const outputAddr = decoder.HEAP32[(result + 4) >> 2];
+    const output = new Uint8Array(outputLength);
+    const outputSrc = new Uint8Array(decoder.HEAP8.buffer);
+    output.set(outputSrc.slice(outputAddr, outputAddr + outputLength));
+    decoder._jxlCleanup(result);
+    const response = {uid: job.uid, data: output, msg: msg};
+    postMessage(response, [output.buffer]);
+  }
+};
+
+onmessage = function(event) {
+  const data = event.data;
+  console.log('CW received: ' + data.op);
+  if (data.op === 'decodeJxl') {
+    let job = null;
+    for (let i = 0; i < jobs.length; ++i) {
+      if (jobs[i].uid === data.uid) {
+        job = jobs[i];
+        break;
+      }
+    }
+    if (!job) {
+      job = {uid: data.uid, input: [], inputComplete: false, url: data.url};
+      jobs.push(job);
+    }
+    if (data.data) {
+      job.input.push(data.data);
+    } else {
+      job.inputComplete = true;
+    }
+    processJobs();
+  }
+};
+
+const onLoadJxlModule = (instance) => {
+  decoder = instance;
+  processJobs();
+};
+
+importScripts('jxl_decoder.js');
+const config = {
+  mainScriptUrlOrBlob: 'https://jxl-demo.netlify.app/jxl_decoder.js',
+  INITIAL_MEMORY: 16 * 1024 * 1024,
+};
+JxlDecoderModule(config).then(onLoadJxlModule);
similarity index 64%
rename from tools/jxl_emcc.cc
rename to tools/wasm_demo/jxl_decoder.cc
index c4c855a..633674c 100644 (file)
@@ -3,25 +3,25 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include "tools/wasm_demo/jxl_decoder.h"
+
+#include <jxl/decode.h>
+#include <jxl/decode_cxx.h>
+#include <jxl/thread_parallel_runner_cxx.h>
+
+#include <cstdio>
 #include <cstring>
 #include <memory>
 #include <vector>
 
-#include "jxl/decode.h"
-#include "jxl/decode_cxx.h"
-#include "jxl/thread_parallel_runner_cxx.h"
-
-#if !defined(__wasm__)
-#include "lib/jxl/base/file_io.h"
-#endif
+extern "C" {
 
 namespace {
 
-struct DecoderInstance {
-  uint32_t width = 0;
-  uint32_t height = 0;
-  uint8_t* pixels = nullptr;
-  uint32_t color_space = 0;
+struct DecoderInstancePrivate {
+  // Due to "Standard Layout" rules it is guaranteed that address of the entity
+  // and its first non-static member are the same.
+  DecoderInstance info;
 
   size_t pixels_size = 0;
   bool want_sdr;
@@ -35,26 +35,29 @@ struct DecoderInstance {
 
 }  // namespace
 
-extern "C" {
+DecoderInstance* jxlCreateInstance(bool want_sdr, uint32_t display_nits) {
+  DecoderInstancePrivate* self = new DecoderInstancePrivate();
 
-void* jxlCreateInstance(bool want_sdr, uint32_t display_nits) {
-  DecoderInstance* instance = new DecoderInstance();
-  instance->want_sdr = want_sdr;
-  instance->display_nits = display_nits;
+  if (!self) {
+    return nullptr;
+  }
+
+  self->want_sdr = want_sdr;
+  self->display_nits = display_nits;
   JxlDataType storageFormat = want_sdr ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16;
-  instance->format = {4, storageFormat, JXL_NATIVE_ENDIAN, 0};
-  instance->decoder = JxlDecoderMake(nullptr);
+  self->format = {4, storageFormat, JXL_NATIVE_ENDIAN, 0};
+  self->decoder = JxlDecoderMake(nullptr);
 
-  JxlDecoder* dec = instance->decoder.get();
+  JxlDecoder* dec = self->decoder.get();
 
   auto report_error = [&](uint32_t code, const char* text) {
     fprintf(stderr, "%s\n", text);
-    // instance->result = code;
-    return instance;
+    delete self;
+    return reinterpret_cast<DecoderInstance*>(code);
   };
 
-  instance->thread_pool = JxlThreadParallelRunnerMake(nullptr, 4);
-  void* runner = instance->thread_pool.get();
+  self->thread_pool = JxlThreadParallelRunnerMake(nullptr, 4);
+  void* runner = self->thread_pool.get();
 
   auto status =
       JxlDecoderSetParallelRunner(dec, JxlThreadParallelRunner, runner);
@@ -74,33 +77,32 @@ void* jxlCreateInstance(bool want_sdr, uint32_t display_nits) {
   if (JXL_DEC_SUCCESS != status) {
     return report_error(3, "JxlDecoderSetProgressiveDetail failed");
   }
-  return instance;
+  return &self->info;
 }
 
-void jxlDestroyInstance(void* opaque_instance) {
-  if (opaque_instance == nullptr) return;
-  DecoderInstance* instance =
-      reinterpret_cast<DecoderInstance*>(opaque_instance);
+void jxlDestroyInstance(DecoderInstance* instance) {
+  if (instance == nullptr) return;
+  DecoderInstancePrivate* self =
+      reinterpret_cast<DecoderInstancePrivate*>(instance);
   if (instance->pixels) {
     free(instance->pixels);
   }
-  delete instance;
+  delete self;
 }
 
-uint32_t jxlProcessInput(void* opaque_instance, const uint8_t* input,
+uint32_t jxlProcessInput(DecoderInstance* instance, const uint8_t* input,
                          size_t input_size) {
-  if (opaque_instance == nullptr) return static_cast<uint32_t>(-1);
-  DecoderInstance* instance =
-      reinterpret_cast<DecoderInstance*>(opaque_instance);
-  JxlDecoder* dec = instance->decoder.get();
+  if (instance == nullptr) return static_cast<uint32_t>(-1);
+  DecoderInstancePrivate* self =
+      reinterpret_cast<DecoderInstancePrivate*>(instance);
+  JxlDecoder* dec = self->decoder.get();
 
   auto report_error = [&](int code, const char* text) {
     fprintf(stderr, "%s\n", text);
-    // instance->result = code;
     return static_cast<uint32_t>(code);
   };
 
-  std::vector<uint8_t>& tail = instance->tail;
+  std::vector<uint8_t>& tail = self->tail;
   if (!tail.empty()) {
     tail.reserve(tail.size() + input_size);
     tail.insert(tail.end(), input, input + input_size);
@@ -152,8 +154,8 @@ uint32_t jxlProcessInput(void* opaque_instance, const uint8_t* input,
       }
       instance->width = info.xsize;
       instance->height = info.ysize;
-      status = JxlDecoderImageOutBufferSize(dec, &instance->format,
-                                            &instance->pixels_size);
+      status =
+          JxlDecoderImageOutBufferSize(dec, &self->format, &self->pixels_size);
       if (status != JXL_DEC_SUCCESS) {
         release_input();
         return report_error(-6, "JxlDecoderImageOutBufferSize failed");
@@ -162,15 +164,14 @@ uint32_t jxlProcessInput(void* opaque_instance, const uint8_t* input,
         release_input();
         return report_error(-7, "Tried to realloc pixels");
       }
-      instance->pixels =
-          reinterpret_cast<uint8_t*>(malloc(instance->pixels_size));
+      instance->pixels = reinterpret_cast<uint8_t*>(malloc(self->pixels_size));
     } else if (JXL_DEC_NEED_IMAGE_OUT_BUFFER == status) {
-      if (!instance->pixels) {
+      if (!self->info.pixels) {
         release_input();
         return report_error(-8, "Out buffer not allocated");
       }
-      status = JxlDecoderSetImageOutBuffer(
-          dec, &instance->format, instance->pixels, instance->pixels_size);
+      status = JxlDecoderSetImageOutBuffer(dec, &self->format, instance->pixels,
+                                           self->pixels_size);
       if (status != JXL_DEC_SUCCESS) {
         release_input();
         return report_error(-9, "JxlDecoderSetImageOutBuffer failed");
@@ -180,8 +181,8 @@ uint32_t jxlProcessInput(void* opaque_instance, const uint8_t* input,
       color_encoding.color_space = JXL_COLOR_SPACE_RGB;
       color_encoding.white_point = JXL_WHITE_POINT_D65;
       color_encoding.primaries =
-          instance->want_sdr ? JXL_PRIMARIES_SRGB : JXL_PRIMARIES_2100;
-      color_encoding.transfer_function = instance->want_sdr
+          self->want_sdr ? JXL_PRIMARIES_SRGB : JXL_PRIMARIES_2100;
+      color_encoding.transfer_function = self->want_sdr
                                              ? JXL_TRANSFER_FUNCTION_SRGB
                                              : JXL_TRANSFER_FUNCTION_PQ;
       color_encoding.rendering_intent = JXL_RENDERING_INTENT_PERCEPTUAL;
@@ -200,15 +201,15 @@ uint32_t jxlProcessInput(void* opaque_instance, const uint8_t* input,
   return 0;
 }
 
-uint32_t jxlFlush(void* opaque_instance) {
-  if (opaque_instance == nullptr) return static_cast<uint32_t>(-1);
-  DecoderInstance* instance =
-      reinterpret_cast<DecoderInstance*>(opaque_instance);
-  JxlDecoder* dec = instance->decoder.get();
+uint32_t jxlFlush(DecoderInstance* instance) {
+  if (instance == nullptr) return static_cast<uint32_t>(-1);
+  DecoderInstancePrivate* self =
+      reinterpret_cast<DecoderInstancePrivate*>(instance);
+  JxlDecoder* dec = self->decoder.get();
 
   auto report_error = [&](int code, const char* text) {
     fprintf(stderr, "%s\n", text);
-    // instance->result = code;
+    // self->result = code;
     return static_cast<uint32_t>(code);
   };
 
@@ -224,20 +225,4 @@ uint32_t jxlFlush(void* opaque_instance) {
   return 0;
 }
 
-#if !defined(__wasm__)
-int main(int argc, char* argv[]) {
-  std::vector<uint8_t> data;
-  JXL_RETURN_IF_ERROR(jxl::ReadFile(argv[1], &data));
-  fprintf(stderr, "File size: %d\n", (int)data.size());
-
-  void* instance = jxlCreateInstance(true, 100);
-  uint32_t status = jxlProcessInput(instance, data.data(), data.size());
-  fprintf(stderr, "Process result: %d\n", status);
-  jxlFlush(instance);
-  status = jxlProcessInput(instance, nullptr, 0);
-  fprintf(stderr, "Process result: %d\n", status);
-  jxlDestroyInstance(instance);
-}
-#endif
-
 }  // extern "C"
diff --git a/tools/wasm_demo/jxl_decoder.h b/tools/wasm_demo/jxl_decoder.h
new file mode 100644 (file)
index 0000000..ad6d88e
--- /dev/null
@@ -0,0 +1,48 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef TOOLS_WASM_DEMO_JXL_DECODER_H_
+#define TOOLS_WASM_DEMO_JXL_DECODER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+
+typedef struct DecoderInstance {
+  uint32_t width = 0;
+  uint32_t height = 0;
+  uint8_t* pixels = nullptr;
+
+  // The rest is opaque.
+} DecoderInstance;
+
+/*
+  Returns (as uint32_t):
+    0 - OOM
+    1 - JxlDecoderSetParallelRunner failed
+    2 - JxlDecoderSubscribeEvents failed
+    3 - JxlDecoderSetProgressiveDetail failed
+    >=4 - OK
+ */
+DecoderInstance* jxlCreateInstance(bool want_sdr, uint32_t display_nits);
+
+void jxlDestroyInstance(DecoderInstance* instance);
+
+/*
+  Returns (as uint32_t):
+    0 - OK (pixels are ready)
+    1 - ready to flush
+    2 - needs more input
+    >=3 - error
+ */
+uint32_t jxlProcessInput(DecoderInstance* instance, const uint8_t* input,
+                         size_t input_size);
+
+uint32_t jxlFlush(DecoderInstance* instance);
+
+}  // extern "C"
+
+#endif  // TOOLS_WASM_DEMO_JXL_DECODER_H_
diff --git a/tools/wasm_demo/jxl_decoder_test.js b/tools/wasm_demo/jxl_decoder_test.js
new file mode 100644 (file)
index 0000000..22dfa07
--- /dev/null
@@ -0,0 +1,140 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+function assertTrue(ok, msg) {
+  if (!ok) {
+    console.log('FAIL: ' + msg);
+    process.exit(1);
+  }
+}
+
+function runTest(testFn) {
+  console.log('Running ' + testFn.name);
+  testFn();
+  console.log('PASS');
+}
+
+let jxlModule;
+
+const isAddress = (v) => {
+  return (v >= 4) && ((v & (1 << 31)) === 0);
+};
+
+let splinesJxl = new Uint8Array([
+  0xff, 0x0a, 0xf8, 0x19, 0x10, 0x09, 0xd8, 0x63, 0x10, 0x00, 0xbc, 0x00,
+  0xa6, 0x19, 0x4a, 0xa3, 0x56, 0x8c, 0x94, 0x62, 0x24, 0x7d, 0x12, 0x72,
+  0x87, 0x00, 0x00, 0xda, 0xd4, 0xc9, 0xc1, 0xe2, 0x9e, 0x02, 0xb9, 0x37,
+  0x00, 0xfe, 0x07, 0x9a, 0x91, 0x08, 0xcd, 0xbf, 0xa1, 0xdc, 0x71, 0x36,
+  0x62, 0xc8, 0x97, 0x31, 0xc4, 0x3e, 0x58, 0x02, 0xc1, 0x01, 0x00
+]);
+
+let crossJxl = new Uint8Array([
+  0xff, 0x0a, 0x98, 0x10, 0x10, 0x50, 0x5c, 0x08, 0x08, 0x02, 0x01,
+  0x00, 0x98, 0x00, 0x4b, 0x18, 0x8b, 0x15, 0x00, 0xd4, 0x92, 0x62,
+  0xcc, 0x98, 0x91, 0x17, 0x08, 0x01, 0xe0, 0x92, 0xbc, 0x7e, 0xdf,
+  0xbf, 0xff, 0x50, 0xc0, 0x64, 0x35, 0xb0, 0x40, 0x1e, 0x24, 0xa9,
+  0xac, 0x38, 0xd9, 0x13, 0x1e, 0x85, 0x4a, 0x0d
+]);
+
+function testSdr() {
+  let decoder = jxlModule._jxlCreateInstance(
+      /* wantSdr */ true, /* displayNits */ 100);
+  assertTrue(isAddress(decoder), 'create decoder instance');
+  let encoded = splinesJxl;
+  let buffer = jxlModule._malloc(encoded.length);
+  jxlModule.HEAP8.set(encoded, buffer);
+
+  let result = jxlModule._jxlProcessInput(decoder, buffer, encoded.length);
+  assertTrue(result === 0, 'process input');
+
+  let w = jxlModule.HEAP32[decoder >> 2];
+  let h = jxlModule.HEAP32[(decoder + 4) >> 2];
+  let pixelData = jxlModule.HEAP32[(decoder + 8) >> 2];
+
+  assertTrue(pixelData, 'output allocated');
+  assertTrue(h === 320, 'output height');
+  assertTrue(w === 320, 'output width ');
+
+  jxlModule._jxlDestroyInstance(decoder);
+  jxlModule._free(buffer);
+}
+
+function testRegular() {
+  let decoder = jxlModule._jxlCreateInstance(
+      /* wantSdr */ false, /* displayNits */ 100);
+  assertTrue(isAddress(decoder), 'create decoder instance');
+  let encoded = splinesJxl;
+  let buffer = jxlModule._malloc(encoded.length);
+  jxlModule.HEAP8.set(encoded, buffer);
+
+  let result = jxlModule._jxlProcessInput(decoder, buffer, encoded.length);
+  assertTrue(result === 0, 'process input');
+
+  let w = jxlModule.HEAP32[decoder >> 2];
+  let h = jxlModule.HEAP32[(decoder + 4) >> 2];
+  let pixelData = jxlModule.HEAP32[(decoder + 8) >> 2];
+
+  assertTrue(pixelData, 'output allocated');
+  assertTrue(h === 320, 'output height');
+  assertTrue(w === 320, 'output width ');
+
+  jxlModule._jxlDestroyInstance(decoder);
+  jxlModule._free(buffer);
+}
+
+function testChunks() {
+  let decoder = jxlModule._jxlCreateInstance(
+      /* wantSdr */ false, /* displayNits */ 100);
+  assertTrue(isAddress(decoder), 'create decoder instance');
+  let encoded = splinesJxl;
+  let buffer = jxlModule._malloc(encoded.length);
+  jxlModule.HEAP8.set(encoded, buffer);
+
+  let part1_length = encoded.length >> 1;
+  let part2_length = encoded.length - part1_length;
+
+  let result = jxlModule._jxlProcessInput(decoder, buffer, part1_length);
+  assertTrue(result === 2, 'process first part');
+
+  result =
+      jxlModule._jxlProcessInput(decoder, buffer + part1_length, part2_length);
+  assertTrue(result === 0, 'process second part');
+
+  let w = jxlModule.HEAP32[decoder >> 2];
+  let h = jxlModule.HEAP32[(decoder + 4) >> 2];
+  let pixelData = jxlModule.HEAP32[(decoder + 8) >> 2];
+
+  assertTrue(pixelData, 'output allocated');
+  assertTrue(h === 320, 'output height');
+  assertTrue(w === 320, 'output width ');
+
+  jxlModule._jxlDestroyInstance(decoder);
+  jxlModule._free(buffer);
+}
+
+function testDecompress() {
+  let encoded = crossJxl;
+  let buffer = jxlModule._malloc(encoded.length);
+  jxlModule.HEAP8.set(encoded, buffer);
+
+  let output = jxlModule._jxlDecompress(buffer, encoded.length);
+  assertTrue(isAddress(output), 'decompress');
+
+  jxlModule._free(buffer);
+
+  let pngSize = jxlModule.HEAP32[output >> 2];
+  let px = 20 * 20;
+  assertTrue(pngSize >= 6 * px, 'png size');
+  assertTrue(pngSize <= 6 * px + 800, 'png size');
+
+  jxlModule._jxlCleanup(output);
+}
+
+require('jxl_decoder_for_test.js')().then(module => {
+  jxlModule = module;
+  let tests = [testSdr, testRegular, testChunks, testDecompress];
+  tests.forEach(runTest);
+  process.exit(0);
+});
diff --git a/tools/wasm_demo/jxl_decompressor.cc b/tools/wasm_demo/jxl_decompressor.cc
new file mode 100644 (file)
index 0000000..648e1ef
--- /dev/null
@@ -0,0 +1,117 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "tools/wasm_demo/jxl_decompressor.h"
+
+#include <jxl/thread_parallel_runner_cxx.h>
+
+#include <cstring>
+#include <memory>
+
+#include "lib/extras/dec/jxl.h"
+#include "tools/wasm_demo/no_png.h"
+
+extern "C" {
+
+namespace {
+
+struct DecompressorOutputPrivate {
+  // Due to "Standard Layout" rules it is guaranteed that address of the entity
+  // and its first non-static member are the same.
+  DecompressorOutput output;
+};
+
+void MaybeMakeCicp(const jxl::extras::PackedPixelFile& ppf,
+                   std::vector<uint8_t>* cicp) {
+  cicp->clear();
+  const JxlColorEncoding& clr = ppf.color_encoding;
+  uint8_t color_primaries = 0;
+  uint8_t transfer_function = static_cast<uint8_t>(clr.transfer_function);
+
+  if (clr.color_space != JXL_COLOR_SPACE_RGB) {
+    return;
+  }
+  if (clr.primaries == JXL_PRIMARIES_P3) {
+    if (clr.white_point == JXL_WHITE_POINT_D65) {
+      color_primaries = 12;
+    } else if (clr.white_point == JXL_WHITE_POINT_DCI) {
+      color_primaries = 11;
+    } else {
+      return;
+    }
+  } else if (clr.primaries != JXL_PRIMARIES_CUSTOM &&
+             clr.white_point == JXL_WHITE_POINT_D65) {
+    color_primaries = static_cast<uint8_t>(clr.primaries);
+  } else {
+    return;
+  }
+  if (clr.transfer_function == JXL_TRANSFER_FUNCTION_UNKNOWN ||
+      clr.transfer_function == JXL_TRANSFER_FUNCTION_GAMMA) {
+    return;
+  }
+
+  cicp->resize(4);
+  cicp->at(0) = color_primaries;    // Colour Primaries
+  cicp->at(1) = transfer_function;  // Transfer Function
+  cicp->at(2) = 0;                  // Matrix Coefficients
+  cicp->at(3) = 1;                  // Video Full Range Flag
+}
+
+}  // namespace
+
+DecompressorOutput* jxlDecompress(const uint8_t* input, size_t input_size) {
+  DecompressorOutputPrivate* self = new DecompressorOutputPrivate();
+
+  if (!self) {
+    return nullptr;
+  }
+
+  auto report_error = [&](uint32_t code, const char* text) {
+    fprintf(stderr, "%s\n", text);
+    delete self;
+    return reinterpret_cast<DecompressorOutput*>(code);
+  };
+
+  auto thread_pool = JxlThreadParallelRunnerMake(nullptr, 4);
+  void* runner = thread_pool.get();
+
+  jxl::extras::JXLDecompressParams dparams;
+  JxlPixelFormat format = {/* num_channels */ 3, JXL_TYPE_UINT16,
+                           JXL_BIG_ENDIAN, /* align */ 0};
+  dparams.accepted_formats.push_back(format);
+  dparams.runner = JxlThreadParallelRunner;
+  dparams.runner_opaque = runner;
+  jxl::extras::PackedPixelFile ppf;
+
+  if (!jxl::extras::DecodeImageJXL(input, input_size, dparams, nullptr, &ppf)) {
+    return report_error(1, "failed to decode jxl");
+  }
+
+  // Just 1-st frame.
+  const auto& image = ppf.frames[0].color;
+  std::vector<uint8_t> cicp;
+  MaybeMakeCicp(ppf, &cicp);
+  self->output.data = WrapPixelsToPng(
+      image.xsize, image.ysize, (format.data_type == JXL_TYPE_UINT16) ? 16 : 8,
+      /* has_alpha */ false, reinterpret_cast<const uint8_t*>(image.pixels()),
+      ppf.icc, cicp, &self->output.size);
+  if (!self->output.data) {
+    return report_error(2, "failed to encode png");
+  }
+
+  return &self->output;
+}
+
+void jxlCleanup(DecompressorOutput* output) {
+  if (output == nullptr) return;
+  DecompressorOutputPrivate* self =
+      reinterpret_cast<DecompressorOutputPrivate*>(output);
+  if (self->output.data) {
+    free(self->output.data);
+  }
+  delete self;
+}
+
+}  // extern "C"
diff --git a/tools/wasm_demo/jxl_decompressor.h b/tools/wasm_demo/jxl_decompressor.h
new file mode 100644 (file)
index 0000000..2ba16a0
--- /dev/null
@@ -0,0 +1,34 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef TOOLS_WASM_DEMO_JXL_DECOMPRESSOR_H_
+#define TOOLS_WASM_DEMO_JXL_DECOMPRESSOR_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+extern "C" {
+
+typedef struct DecompressorOutput {
+  uint32_t size = 0;
+  uint8_t* data = nullptr;
+
+  // The rest is opaque.
+} DecompressorOutput;
+
+/*
+  Returns (as uint32_t):
+    0 - OOM
+    1 - decoding JXL failed
+    2 - encoding PNG failed
+    >=4 - OK
+ */
+DecompressorOutput* jxlDecompress(const uint8_t* input, size_t input_size);
+
+void jxlCleanup(DecompressorOutput* output);
+
+}  // extern "C"
+
+#endif  // TOOLS_WASM_DEMO_JXL_DECOMPRESSOR_H_
diff --git a/tools/wasm_demo/manual_decode_demo.html b/tools/wasm_demo/manual_decode_demo.html
new file mode 100644 (file)
index 0000000..e11aed6
--- /dev/null
@@ -0,0 +1,340 @@
+<html>
+<head>
+  <link rel="icon" type="image/x-icon" href="favicon.ico">
+  <style>
+#log p {
+  margin: 0;
+}
+  </style>
+</head>
+<body>
+<div id="log" style="padding:2px; border: solid 1px #000; background-color: #ccc; margin:2px; height: 8em; font-family: monospace; overflow-y: auto; font-size: 8px;"></div>
+<script>
+// WASM module.
+let jxlModule = null;
+// Flag; if true, then HDR color space / 16 bit output is supported.
+let hdrCanvas = false;
+
+// Add message to "console".
+let addMessage = (text, color) => {
+  let log = document.getElementById('log');
+  let message = document.createElement('p');
+  message.style = 'color: ' + color + ';';
+  message.textContent = text;
+  log.append(message);
+  log.scrollTop = log.scrollHeight;
+}
+
+// Callback from WASM module when it becomes available.
+let onLoadJxlModule = (module) => {
+  jxlModule = module;
+  addMessage('WASM module loaded', 'black');
+  onJxlModuleReady();
+};
+
+// Check if multi-threading is supported (i.e. SharedArrayBuffer is allowed).
+let probeMutlithreading = () => {
+  try {
+    new SharedArrayBuffer();
+    return true;
+  } catch (ex) {
+    addMessage('Installing Service Worker, please wait...', 'orange');
+    return false;
+  }
+};
+
+// Check if HDR features are enabled.
+let probeHdr = () => {
+  addMessage('Probing HDR features', 'black');
+  try {
+    let tmpCanvas = document.createElement('canvas');
+    tmpCanvas.width = 1;
+    tmpCanvas.height = 1;
+    let ctx = tmpCanvas.getContext('2d', {colorSpace: 'rec2100-pq', pixelFormat: 'float16'});
+    // make it fail on firefox...
+    ctx.getContextAttributes();
+    addMessage('HDR canvas supported', 'green');
+    return true;
+  } catch (ex) {
+    addMessage(ex, 'red');
+    addMessage('Are Blink experiments enabled? about://flags/#enable-experimental-web-platform-features', 'blue');
+    return false;
+  }
+};
+
+// "main" method executed after page is loaded; all scripts are "synchronous" elements,
+// so it is guaranted that script elements are loaded and executed.
+let onDomContentLoaded = () => {
+  if (!probeMutlithreading()) return;
+  hdrCanvas = probeHdr();
+  JxlDecoderModule().then(onLoadJxlModule);
+};
+
+// Pass next chunk to decoder and interprets result.
+let processInput = (img, chunkLen) => {
+  let response = {
+    wantFlush: false,
+    copyPixels: false,
+    error: false,
+  }
+  do {
+    let t0 = performance.now();
+    let result = jxlModule._jxlProcessInput(img.decoder, img.buffer, chunkLen);
+    let t1 = performance.now();
+    let tProcessing = t1 - t0;
+    // addMessage('Processed chunk in ' + tProcessing + 'ms', 'blue');
+    img.totalProcessing += tProcessing;
+    // addMessage('Process result: ' + result, 'green');
+    if (result === 2) {
+      addMessage('Needs more input', 'gray');
+    } else if (result === 0) {
+      // addMessage('Image ready', 'gray');
+      response.wantFlush = false;
+      response.copyPixels = true;
+    } else if (result === 1) {
+      if (img.wantProgressive) {
+        addMessage('DC ready', 'gray');
+        response.wantFlush = true;
+        response.copyPixels = true;
+      } else {
+        // addMessage('Skipping DC flush', 'gray');
+        chunkLen = 0;
+        continue;
+      }
+    } else {
+      addMessage('Processing error', 'red');
+      img.broken = true;
+      response.error = true;
+      break;
+    }
+    break;
+  } while (true);
+  return response;
+}
+
+// Decode chunk and present results (dump to canvas).
+let processChunk = (img, chunkLen) => {
+  let result = processInput(img, chunkLen);
+  if (result.error) return;
+
+  if (result.wantFlush) {
+    let t2 = performance.now();
+    let flushResult = jxlModule._jxlFlush(img.decoder);
+    let t3 = performance.now();
+    let tFlushing = t3 - t2;
+    addMessage('Flush result: ' + flushResult, 'gray');
+    img.totalFlushing += tFlushing;
+  }
+
+  if (!result.copyPixels) return;
+
+  let w = jxlModule.HEAP32[img.decoder >> 2];
+  let h = jxlModule.HEAP32[(img.decoder + 4) >> 2];
+  let pixelData = jxlModule.HEAP32[(img.decoder + 8) >> 2];
+  if (!img.canvas) {
+    img.canvas = document.createElement('canvas');
+    img.canvas.width = w;
+    img.canvas.height = h;
+    img.canvas.style = 'width:100%';
+    // TODO(eustas): postpone until really flushed
+    document.body.appendChild(img.canvas);
+    let ctxOptions = {colorSpace: img.colorSpace, pixelFormat: 'float16'};
+    let pixelOptions = {colorSpace: img.colorSpace, storageFormat: 'uint16'};
+    if (img.wantSdr) {
+      ctxOptions = null;
+      pixelOptions = null;
+    }
+    img.canvasCtx = img.canvas.getContext('2d', ctxOptions);
+    img.pixels = img.canvasCtx.getImageData(0, 0, w, h, pixelOptions);
+  }
+
+  let src = null;
+  let start = pixelData;
+  if (img.wantSdr) {
+    src = new Uint8Array(jxlModule.HEAP8.buffer);
+  } else {
+    src = new Uint16Array(jxlModule.HEAP8.buffer);
+    start = start >> 1;
+  }
+  let end = start + w * h * 4;
+  img.pixels.data.set(src.slice(start, end));
+  img.canvasCtx.putImageData(img.pixels, 0, 0);
+};
+
+const BUF_LEN = 150 * 1024;
+
+// Image data cache for benchmarking.
+let fullImage = new Uint8Array(0);
+
+// Callback for fetch data.
+let onChunk = (img, chunk) => {
+  if (chunk.done) {
+    addMessage('Read finished | total processing: ' + img.totalProcessing.toFixed(1) + 'ms | total flushing ' + img.totalFlushing.toFixed(1) + 'ms', 'black');
+    cleanup(img);
+    img.onComplete(img);
+    return;
+  }
+  if (img.broken) return;
+
+  if (!img.decoder) {
+    let decoder = jxlModule._jxlCreateInstance(img.wantSdr, img.displayNits);
+    if (decoder < 4) {
+      img.broken = true;
+      cleanup(img);
+      addMessage('Failed to create decoder instance', 'red');
+      return;
+    }
+    img.decoder = decoder;
+    img.buffer = jxlModule._malloc(BUF_LEN);
+  }
+
+  // addMessage('Received chunk: ' + chunk.value.length, 'gray');
+  let newFullImage = new Uint8Array(fullImage.length + chunk.value.length);
+  newFullImage.set(fullImage);
+  newFullImage.set(chunk.value, fullImage.length);
+  fullImage = newFullImage;
+
+  let offset = 0;
+  while (offset < chunk.value.length) {
+    let delta = chunk.value.length - offset;
+    if (delta > BUF_LEN) delta = BUF_LEN;
+    jxlModule.HEAP8.set(chunk.value.slice(offset, offset + delta), img.buffer);
+    offset += delta;
+    processChunk(img, delta);
+    if (img.broken) {
+      return;
+    }
+  }
+
+  // Break the promise chain.
+  setTimeout(img.proceed, 0);
+};
+
+// Read next chunk; NB: used to break promise chain.
+let proceed = (img) => {
+  img.reader.read().then(img.onChunk, img.onReadError);
+};
+
+// Release (in-module) memory resources.
+let cleanup = (img) => {
+  if (img.decoder) {
+    jxlModule._jxlDestroyInstance(img.decoder);
+    img.decoder = 0;
+  }
+  if (img.buffer) {
+    jxlModule._free(img.buffer);
+    img.buffer = 0;
+  }
+};
+
+// Report error and cleanup.
+let onReadError = (img, error) => {
+  img.broken = true;
+  cleanup(img);
+  addMessage('Read failed: ' + error, 'red');
+};
+
+// On successful fetch start.
+let onResponse = (img, response) => {
+  if (!response.ok) {
+    addMessage('Fetch failed: ' + response.status + ' (' + response.statusText + ')');
+    return;
+  }
+  // Alas, not supported by fetch:
+  // let reader = response.body.getReader({mode: "byob"});
+  img.reader = response.body.getReader();
+
+  img.proceed();
+};
+
+// On image decoding completion.
+let onComplete = (img) => {
+  if (!img.runBenchmark) return;
+
+  let buffer = jxlModule._malloc(fullImage.length);
+  jxlModule.HEAP8.set(fullImage, buffer);
+  img.buffer = buffer;
+  let results = [];
+
+  for (let i = 0; i < img.runBenchmark; ++i) {
+    img.totalProcessing = 0;
+    img.decoder = jxlModule._jxlCreateInstance(img.wantSdr, img.displayNits);
+    processChunk(img, fullImage.length);
+    jxlModule._jxlDestroyInstance(img.decoder);
+    results.push(img.totalProcessing);
+    //addMessage('Decoding time: ' + img.totalProcessing + 'ms', 'black');
+  }
+
+  results.sort();
+  addMessage('Min decoding time: ' + results[0].toFixed(3) + 'ms', 'black');
+  addMessage('Median decoding time: ' + results[results.length >> 1].toFixed(3) + 'ms', 'black');
+  addMessage('Max decoding time: ' + results[results.length - 1].toFixed(3) + 'ms', 'black');
+
+  jxlModule._free(buffer);
+};
+
+// Fill cookie object template.
+let makeImg = () => {
+  return {
+    name: '',
+    colorSpace: 'rec2100-pq',
+    wantSdr: false,
+    displayNits: 100,
+    broken: false,
+    decoder: 0,
+    canvas: null,
+    canvasCtx: null,
+    pixels: null,
+    buffer: 0,
+    wantProgressive: false,
+    onlyDecode: false,
+    totalProcessing: 0,
+    totalFlushing: 0,
+    runBenchmark: 0,
+    onChunk: () => {},
+    onReadError: () => {},
+    proceed: () => {},
+    onComplete: () => {},
+  };
+}
+
+// Parse URL query and run image decoding / benchmarking.
+let onJxlModuleReady = () => {
+  let params = (new URL(document.location)).searchParams;
+  const images = ['image00.jxl', 'image01.jxl'];
+  let imgIdx = (params.get('img') | 0) % images.length;
+  let imgName = images[imgIdx];
+
+  let colorSpace = params.get('colorSpace') || 'srgb';
+  let wantSdr = params.get('wantSdr') == 'true';
+  let displayNits = parseInt(params.get('displayNits') || '0');
+  let runBenchmark = parseInt(params.get('runBenchmark') || '0');
+
+  if (!hdrCanvas) {
+    colorSpace = 'srgb-linear';
+    displayNits = displayNits || 100;
+    wantSdr = true;
+  }
+
+  addMessage('Color-space: "' + colorSpace + '", tone-map to SDR: ' + wantSdr + ', displayNits: ' + (displayNits || 'n/a'), 'black');
+
+  let img = makeImg();
+  img.name = imgName;
+  img.colorSpace = colorSpace;
+  img.wantSdr = wantSdr;
+  img.displayNits = displayNits;
+  img.onChunk = onChunk.bind(null, img);
+  img.onReadError = onReadError.bind(null, img);
+  img.proceed = proceed.bind(null, img);
+  img.onComplete = onComplete.bind(null, img);
+  img.runBenchmark = runBenchmark;
+
+  fetch(new Request(imgName, {cache: "no-store"})).then(onResponse.bind(null, img));
+};
+
+document.addEventListener('DOMContentLoaded', onDomContentLoaded);
+</script>
+
+<script src="jxl_decoder.js"></script>
+</body>
+</html>
diff --git a/tools/wasm_demo/netlify.toml b/tools/wasm_demo/netlify.toml
new file mode 100644 (file)
index 0000000..44d9d56
--- /dev/null
@@ -0,0 +1,19 @@
+# Copyright (c) the JPEG XL Project Authors. All rights reserved.
+#
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# We use "edge functions" feature to substitute response with pre-compressed
+# entries whenever those are available and browser supports Brotli or Gzip
+# content-encoding.
+[[edge_functions]]
+path = "/*"
+function = "precompressed"
+
+# Request browser "site-isolation" enabled.
+# This allows using "SharedArrayBuffers" required for multi-threaded WASM.
+[[headers]]
+for = "/*"
+  [headers.values]
+    Cross-Origin-Opener-Policy = "same-origin"
+    Cross-Origin-Embedder-Policy = "require-corp"
diff --git a/tools/wasm_demo/netlify/edge-functions/precompressed.ts b/tools/wasm_demo/netlify/edge-functions/precompressed.ts
new file mode 100644 (file)
index 0000000..c169432
--- /dev/null
@@ -0,0 +1,87 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+import type {Context} from 'netlify:edge';
+
+// This lambda is executed whenever request URL matches.
+export default async (request: Request, context: Context) => {
+  // Measure time for debugging purpose.
+  let t0 = Date.now();
+  // Get resource path (i.e. ignore query parameters).
+  let url = request.url.split('?')[0];
+  // Pick request headers; fallback to empty string if header is not set.
+  let acceptEncodingHeader = request.headers.get('Accept-Encoding') || '';
+  let acceptHeader = request.headers.get('Accept') || '';
+  let etag = request.headers.get('If-None-Match') || '';
+  // Roughly parse encodings list; this ignores "quality"; no modern browsers
+  // use it -> don't care.
+  let splitter = /[,;]/;
+  let supportedEncodings =
+      acceptEncodingHeader.split(splitter).map(v => v.trimStart());
+  let supportsBr = supportedEncodings.includes('br');
+  let supportedMedia = acceptHeader.split(splitter).map(v => v.trimStart());
+  let supportsJxl = supportedMedia.includes('image/jxl');
+  // Dump basic request info (we care about).
+  context.log(
+      'URL: ' + url + '; acceptEncodingHeader: ' + acceptEncodingHeader +
+      '; supportsBr: ' + supportsBr + '; supportsJxl: ' + supportsJxl +
+      '; etag: ' + etag);
+
+  // If browser does not support Brotli/Jxl - just process request normally.
+
+  if (!supportsBr && !supportsJxl) {
+    return;
+  }
+
+  // Jxl processing is higher priority, because images are (usually) transferred
+  // with 'identity' content encoding.
+  let isJxlWorkflow = supportsJxl;
+  let suffix = isJxlWorkflow ? '.jxl' : '.br';
+
+  // Request pre-compressed resource (with a suffix).
+  let response = await context.rewrite(url + suffix);
+  context.log('Response status: ' + response.status);
+  // First latency checkpoint (as we synchronously wait for resource fetch).
+  let t1 = Date.now();
+  // If pre-compressed resource does not exist - pass.
+  if (response.status == 404) {
+    return;
+  }
+  // Get resource ETag.
+  let responseEtag = response.headers.get('ETag') || '';
+  context.log('Response etag: ' + responseEtag);
+  // We rely on platform to check ETag; add debugging info just in case.
+  if (etag.length >= 4 && responseEtag === etag) {
+    console.log('Match; status: ' + response.status);
+  }
+  // Status 200 is regular "OK" - fetch resource; in such a case we need to
+  // craft response with the response contents.
+  // Status 3xx likely means "use cache"; pass response as is.
+  // Status 4xx is unlikely (404 has been already processed).
+  // Status 5xx is server error - nothing we could do around it.
+  if (response.status != 200) return response;
+  // Second time consuming operation - wait for resource contents.
+  let data = await response.arrayBuffer();
+  let fixedHeaders = new Headers(response.headers);
+
+  if (isJxlWorkflow) {
+    fixedHeaders.set('Content-Type', 'image/jxl');
+  } else {  // is Brotli workflow
+    // Set "Content-Type" based on resource suffix;
+    // otherwise browser will complain.
+    let contentEncoding = 'text/html; charset=UTF-8';
+    if (url.endsWith('.js')) {
+      contentEncoding = 'application/javascript';
+    } else if (url.endsWith('.wasm')) {
+      contentEncoding = 'application/wasm';
+    }
+    fixedHeaders.set('Content-Type', contentEncoding);
+    // Inform browser that data stream is compressed.
+    fixedHeaders.set('Content-Encoding', 'br');
+  }
+  let t2 = Date.now();
+  console.log('Timing: ' + (t1 - t0) + ' ' + (t2 - t1));
+  return new Response(data, {headers: fixedHeaders});
+};
diff --git a/tools/wasm_demo/no_png.cc b/tools/wasm_demo/no_png.cc
new file mode 100644 (file)
index 0000000..01527d3
--- /dev/null
@@ -0,0 +1,220 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "tools/wasm_demo/no_png.h"
+
+#include <array>
+#include <memory>
+
+extern "C" {
+
+namespace {
+
+static std::array<uint32_t, 256> makeCrc32Lut() {
+  std::array<uint32_t, 256> result;
+  for (uint32_t i = 0; i < 256; ++i) {
+    constexpr uint32_t poly = 0xEDB88320;
+    uint32_t v = i;
+    for (size_t i = 0; i < 8; ++i) {
+      uint32_t mask = ~((v & 1) - 1);
+      v = (v >> 1) ^ (poly & mask);
+    }
+    result[i] = v;
+  }
+  return result;
+}
+
+const std::array<uint32_t, 256> kCrc32Lut = makeCrc32Lut();
+
+const std::array<uint32_t, 8> kPngMagic = {137, 80, 78, 71, 13, 10, 26, 10};
+
+// No need to SIMDify it, only small blocks are actually checksummed.
+uint32_t CalculateCrc32(const uint8_t* start, const uint8_t* end) {
+  uint32_t result = ~0;
+  for (const uint8_t* data = start; data < end; ++data) {
+    result ^= *data;
+    result = (result >> 8) ^ kCrc32Lut[result & 0xFF];
+  }
+  return ~result;
+}
+
+void AdlerCopy(const uint8_t* src, uint8_t* dst, size_t length, uint32_t* s1,
+               uint32_t* s2) {
+  // TODO(eustas): SIMD-ify and use multithreading.
+
+  // Precondition: s1, s2 normalized; length <= 65535
+  uint32_t a = *s1;
+  uint32_t b = *s2;
+
+  for (size_t i = 0; i < length; ++i) {
+    const uint8_t v = src[i];
+    a += v;
+    b += a;
+    dst[i] = v;
+  }
+
+  // Postcondition: s1, s2 normalized.
+  *s1 = a % 65521;
+  *s2 = b % 65521;
+}
+
+constexpr size_t kMaxDeflateBlock = 65535;
+constexpr uint32_t kIhdrSize = 13;
+constexpr uint32_t kCicpSize = 4;
+
+void WriteU8(uint8_t*& dst, uint8_t value) { *(dst++) = value; }
+
+void WriteU16(uint8_t*& dst, uint16_t value) {
+  memcpy(dst, &value, 2);
+  dst += 2;
+}
+
+void WriteU32(uint8_t*& dst, uint32_t value) {
+  memcpy(dst, &value, 4);
+  dst += 4;
+}
+
+void WriteU32BE(uint8_t*& dst, uint32_t value) {
+  WriteU32(dst, __builtin_bswap32(value));
+}
+
+}  // namespace
+
+uint8_t* WrapPixelsToPng(size_t width, size_t height, size_t bit_depth,
+                         bool has_alpha, const uint8_t* input,
+                         const std::vector<uint8_t>& icc,
+                         const std::vector<uint8_t>& cicp,
+                         uint32_t* output_size) {
+  size_t row_size = width * (bit_depth / 8) * (3 + has_alpha);
+  size_t data_size = height * (row_size + 1);
+  size_t num_deflate_blocks =
+      (data_size + kMaxDeflateBlock - 1) / kMaxDeflateBlock;
+  size_t idat_size = data_size + num_deflate_blocks * 5 + 6;
+  // 64k is enough for everyone
+  bool has_iccp = !icc.empty() && (icc.size() <= kMaxDeflateBlock);
+  size_t iccp_size = 3 + icc.size() + 5 + 6;  // name + data + deflate-wrapping
+  bool has_cicp = (cicp.size() == kCicpSize);
+  size_t total_size = 0;
+  total_size += kPngMagic.size();
+  total_size += 12 + kIhdrSize;
+  total_size += has_cicp ? (kCicpSize + 12) : 0;
+  total_size += has_iccp ? (iccp_size + 12) : 0;
+  total_size += 12 + idat_size;
+  total_size += 12;  // IEND
+
+  uint8_t* output = static_cast<uint8_t*>(malloc(total_size));
+  if (!output) {
+    return nullptr;
+  }
+  uint8_t* dst = output;
+  *output_size = total_size;
+
+  for (size_t i = 0; i < kPngMagic.size(); ++i) {
+    *(dst++) = kPngMagic[i];
+  }
+
+  // IHDR
+  WriteU32BE(dst, kIhdrSize);
+  uint8_t* chunk_start = dst;
+  WriteU32(dst, 0x52444849);
+  WriteU32BE(dst, width);
+  WriteU32BE(dst, height);
+  WriteU8(dst, bit_depth);
+  WriteU8(dst, has_alpha ? 6 : 2);
+  WriteU8(dst, 0);  // compression: deflate
+  WriteU8(dst, 0);  // filters: standard
+  WriteU8(dst, 0);  // interlace: no
+  uint32_t crc32 = CalculateCrc32(chunk_start, dst);
+  WriteU32BE(dst, crc32);
+
+  if (has_cicp) {
+    // cICP
+    WriteU32BE(dst, kCicpSize);
+    uint8_t* chunk_start = dst;
+    WriteU32(dst, 0x50434963);
+    for (size_t i = 0; i < kCicpSize; ++i) {
+      WriteU8(dst, cicp[i]);
+    }
+    uint32_t crc32 = CalculateCrc32(chunk_start, dst);
+    WriteU32BE(dst, crc32);
+  }
+
+  if (has_iccp) {
+    // iCCP
+    WriteU32BE(dst, iccp_size);
+    uint8_t* chunk_start = dst;
+    WriteU32(dst, 0x50434369);
+    WriteU8(dst, '1');   // Profile name
+    WriteU8(dst, 0);     // NUL terminator
+    WriteU8(dst, 0);     // Compression method: deflate
+    WriteU8(dst, 0x08);  // CM = 8 (deflate), CINFO = 0 (window size = 2**(0+8))
+    WriteU8(dst, 29);    // FCHECK; (FCHECK + 256* CMF) % 31 = 0
+    uint32_t adler_s1 = 1;
+    uint32_t adler_s2 = 0;
+    WriteU8(dst, 1);  // btype = 00 (uncompressed), last
+    uint16_t block_size = static_cast<uint16_t>(icc.size());
+    WriteU16(dst, block_size);
+    WriteU16(dst, ~block_size);
+    AdlerCopy(icc.data(), dst, block_size, &adler_s1, &adler_s2);
+    dst += block_size;
+    uint32_t adler = (adler_s2 << 8) | adler_s1;
+    WriteU32BE(dst, adler);
+    uint32_t crc32 = CalculateCrc32(chunk_start, dst);
+    WriteU32BE(dst, crc32);
+  }
+
+  // IDAT
+  WriteU32BE(dst, idat_size);
+  WriteU32(dst, 0x54414449);
+  size_t offset = 0;
+  size_t bytes_to_next_row = 0;
+  uint32_t adler_s1 = 1;
+  uint32_t adler_s2 = 0;
+  WriteU8(dst, 0x08);  // CM = 8 (deflate), CINFO = 0 (window size = 2**(0+8))
+  WriteU8(dst, 29);    // FCHECK; (FCHECK + 256* CMF) % 31 = 0
+  for (size_t i = 0; i < num_deflate_blocks; ++i) {
+    size_t block_size = data_size - offset;
+    if (block_size > kMaxDeflateBlock) {
+      block_size = kMaxDeflateBlock;
+    }
+    bool is_last = ((i + 1) == num_deflate_blocks);
+    WriteU8(dst, is_last);  // btype = 00 (uncompressed)
+    offset += block_size;
+
+    WriteU16(dst, block_size);
+    WriteU16(dst, ~block_size);
+    while (block_size > 0) {
+      if (bytes_to_next_row == 0) {
+        WriteU8(dst, 0);  // filter: raw
+        adler_s2 += adler_s1;
+        bytes_to_next_row = row_size;
+        block_size--;
+        continue;
+      }
+      size_t bytes_to_copy = std::min(block_size, bytes_to_next_row);
+      AdlerCopy(input, dst, bytes_to_copy, &adler_s1, &adler_s2);
+      dst += bytes_to_copy;
+      input += bytes_to_copy;
+      block_size -= bytes_to_copy;
+      bytes_to_next_row -= bytes_to_copy;
+    }
+  }
+  // Fake Adler works well in Chrome; so let's not waste CPU cycles.
+  uint32_t adler = 0;  // (adler_s2 << 8) | adler_s1;
+  WriteU32BE(dst, adler);
+  WriteU32BE(dst, 0);  // Fake CRC32
+
+  // IEND
+  WriteU32BE(dst, 0);
+  chunk_start = dst;
+  WriteU32(dst, 0x444E4549);
+  // TODO(eustas): this is fixed value; precalculate?
+  crc32 = CalculateCrc32(chunk_start, dst);
+  WriteU32BE(dst, crc32);
+
+  return output;
+}
+
+}  // extern "C"
diff --git a/tools/wasm_demo/no_png.h b/tools/wasm_demo/no_png.h
new file mode 100644 (file)
index 0000000..1486c47
--- /dev/null
@@ -0,0 +1,24 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#ifndef TOOLS_WASM_DEMO_NO_PNG_H_
+#define TOOLS_WASM_DEMO_NO_PNG_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+extern "C" {
+
+uint8_t* WrapPixelsToPng(size_t width, size_t height, size_t bit_depth,
+                         bool has_alpha, const uint8_t* input,
+                         const std::vector<uint8_t>& icc,
+                         const std::vector<uint8_t>& cicp,
+                         uint32_t* output_size);
+
+}  // extern "C"
+
+#endif  // TOOLS_WASM_DEMO_NO_PNG_H_
diff --git a/tools/wasm_demo/one_line_demo.html b/tools/wasm_demo/one_line_demo.html
new file mode 100644 (file)
index 0000000..a2966ac
--- /dev/null
@@ -0,0 +1,20 @@
+<html>
+
+<head>
+  <link rel="icon" type="image/x-icon" href="favicon.ico" />
+  <script src="service_worker.js">
+/*
+ * Just load this script, et voila! It will install ServiceWorker to
+ * advertise image/jxl media type and decode responses.
+ * NB: if "addMessage" function is defined it will be used to report
+ * decoding times / problems.
+ */
+  </script>
+</head>
+
+<body>
+  <img src="image00.jxl" style="width:100%" />
+  <img src="image01.jxl" style="width:100%" />
+</body>
+
+</html>
diff --git a/tools/wasm_demo/one_line_demo_with_console.html b/tools/wasm_demo/one_line_demo_with_console.html
new file mode 100644 (file)
index 0000000..e2c52ae
--- /dev/null
@@ -0,0 +1,34 @@
+<html>
+
+<head>
+  <link rel="icon" type="image/x-icon" href="favicon.ico">
+  <script src="service_worker.js"></script>
+  <style>
+    #log p {
+      margin: 0;
+    }
+  </style>
+</head>
+
+<body>
+  <div id="log" style="padding:2px; border: solid 1px #000; background-color: #ccc; margin:2px; height: 8em; font-family: monospace; overflow-y: auto; font-size: 8px;"></div>
+  <script>
+    let addMessage = (text, color) => {
+      let log = document.getElementById('log');
+      let message = document.createElement('p');
+      message.style = 'color: ' + color + ';';
+      message.textContent = text;
+      log.append(message);
+      log.scrollTop = log.scrollHeight;
+    }
+  </script>
+
+<!-- Use those with capable server
+  <img src="image00.jpg" style="width:100%" />
+  <img src="image01.png" style="width:100%" />
+-->
+  <img src="image00.jxl" style="width:100%" />
+  <img src="image01.jxl" style="width:100%" />
+</body>
+
+</html>
diff --git a/tools/wasm_demo/service_worker.js b/tools/wasm_demo/service_worker.js
new file mode 100644 (file)
index 0000000..531e5c2
--- /dev/null
@@ -0,0 +1,317 @@
+// Copyright (c) the JPEG XL Project Authors. All rights reserved.
+//
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+ * ServiceWorker script.
+ *
+ * Multi-threading in WASM is currently implemented by the means of
+ * SharedArrayBuffer. Due to infamous vulnerabilities this feature is disabled
+ * unless site is running in "cross-origin isolated" mode.
+ * If there is not enough control over the server (e.g. when pages are hosted as
+ * "github pages") ServiceWorker is used to upgrade responses with corresponding
+ * headers.
+ *
+ * This script could be executed in 2 environments: HTML page or ServiceWorker.
+ * The environment is detected by the type of "window" reference.
+ *
+ * When this script is executed from HTML page then ServiceWorker is registered.
+ * Page reload might be necessary in some situations. By default it is done via
+ * `window.location.reload()`. However this can be altered by setting a
+ * configuration object `window.serviceWorkerConfig`. It's `doReload` property
+ * should be a replacement callable.
+ *
+ * When this script is executed from ServiceWorker then standard lifecycle
+ * event dispatchers are setup along with `fetch` interceptor.
+ */
+
+(() => {
+  // Set COOP/COEP headers for document/script responses; use when this can not
+  // be done on server side (e.g. GitHub Pages).
+  const FORCE_COP = true;
+  // Interpret 'content-type: application/octet-stream' as JXL; use when server
+  // does not set appropriate content type (e.g. GitHub Pages).
+  const FORCE_DECODING = true;
+  // Embedded (baked-in) responses for faster turn-around.
+  const EMBEDDED = {
+    'client_worker.js': '$client_worker.js$',
+    'jxl_decoder.js': '$jxl_decoder.js$',
+    'jxl_decoder.worker.js': '$jxl_decoder.worker.js$',
+  };
+
+  // Enable SharedArrayBuffer.
+  const setCopHeaders = (headers) => {
+    headers.set('Cross-Origin-Embedder-Policy', 'require-corp');
+    headers.set('Cross-Origin-Opener-Policy', 'same-origin');
+  };
+
+  // Inflight object: {clientId, uid, timestamp, controller}
+  const inflight = [];
+
+  // Generate (very likely) unique string.
+  const makeUid = () => {
+    return Math.random().toString(36).substring(2) +
+        Math.random().toString(36).substring(2);
+  };
+
+  // Make list (non-recursively) of transferable entities.
+  const gatherTransferrables = (...args) => {
+    const result = [];
+    for (let i = 0; i < args.length; ++i) {
+      if (args[i] && args[i].buffer) {
+        result.push(args[i].buffer);
+      }
+    }
+    return result;
+  };
+
+  // Serve items that are embedded in this service worker.
+  const maybeProcessEmbeddedResources = (event) => {
+    const url = event.request.url;
+    // Shortcut for baked-in scripts.
+    for (const [key, value] of Object.entries(EMBEDDED)) {
+      if (url.endsWith(key)) {
+        const headers = new Headers();
+        headers.set('Content-Type', 'application/javascript');
+        setCopHeaders(headers);
+
+        event.respondWith(new Response(value, {
+          status: 200,
+          statusText: 'OK',
+          headers: headers,
+        }));
+        return true;
+      }
+    }
+    return false;
+  };
+
+  // Decode JXL image response and serve it as a PNG image.
+  const wrapImageResponse = async (clientId, originalResponse) => {
+    // TODO(eustas): cache?
+    const client = await clients.get(clientId);
+    // Client is gone? Not our problem then.
+    if (!client) {
+      return originalResponse;
+    }
+
+    const inputStream = await originalResponse.body;
+    // Can't use "BYOB" for regular responses.
+    const reader = inputStream.getReader();
+
+    const inflightEntry = {
+      clientId: clientId,
+      uid: makeUid(),
+      timestamp: Date.now(),
+      inputStreamReader: reader,
+      outputStreamController: null
+    };
+    inflight.push(inflightEntry);
+
+    const outputStream = new ReadableStream({
+      start: (controller) => {
+        inflightEntry.outputStreamController = controller;
+      }
+    });
+
+    const onRead = (chunk) => {
+      const msg = {
+        op: 'decodeJxl',
+        uid: inflightEntry.uid,
+        url: originalResponse.url,
+        data: chunk.value || null
+      };
+      client.postMessage(msg, gatherTransferrables(msg.data));
+      if (!chunk.done) {
+        reader.read().then(onRead);
+      }
+    };
+    // const view = new SharedArrayBuffer(65536);
+    const view = new Uint8Array(65536);
+    reader.read(view).then(onRead);
+
+    let modifiedResponseHeaders = new Headers(originalResponse.headers);
+    modifiedResponseHeaders.delete('Content-Length');
+    modifiedResponseHeaders.set('Content-Type', 'image/png');
+    modifiedResponseHeaders.set('Server', 'ServiceWorker');
+    return new Response(outputStream, {headers: modifiedResponseHeaders});
+  };
+
+  // Check if response needs decoding; if so - do it.
+  const wrapImageRequest = async (clientId, request) => {
+    let modifiedRequestHeaders = new Headers(request.headers);
+    modifiedRequestHeaders.append('Accept', 'image/jxl');
+    let modifiedRequest =
+        new Request(request, {headers: modifiedRequestHeaders});
+    let originalResponse = await fetch(modifiedRequest);
+    let contentType = originalResponse.headers.get('Content-Type');
+
+    let isJxlResponse = (contentType === 'image/jxl');
+    if (FORCE_DECODING && contentType === 'application/octet-stream') {
+      isJxlResponse = true;
+    }
+    if (isJxlResponse) {
+      return wrapImageResponse(clientId, originalResponse);
+    }
+
+    return originalResponse;
+  };
+
+  const reportError = (err) => {
+    // console.error(err);
+  };
+
+  const upgradeResponse = (response) => {
+    if (response.status === 0) {
+      return response;
+    }
+
+    const newHeaders = new Headers(response.headers);
+    setCopHeaders(newHeaders);
+
+    return new Response(response.body, {
+      status: response.status,
+      statusText: response.statusText,
+      headers: newHeaders,
+    });
+  };
+
+  // Process fetch request; either bypass, or serve embedded resource,
+  // or upgrade.
+  const onFetch = async (event) => {
+    const clientId = event.clientId;
+    const request = event.request;
+
+    // Pass direct cached resource requests.
+    if (request.cache === 'only-if-cached' && request.mode !== 'same-origin') {
+      return;
+    }
+
+    // Serve backed resources.
+    if (maybeProcessEmbeddedResources(event)) {
+      return;
+    }
+
+    // Notify server we are JXL-capable.
+    if (request.destination === 'image') {
+      let accept = request.headers.get('Accept');
+      // Only if browser does not support JXL.
+      if (accept.indexOf('image/jxl') === -1) {
+        event.respondWith(wrapImageRequest(clientId, request));
+      }
+      return;
+    }
+
+    if (FORCE_COP) {
+      event.respondWith(
+          fetch(event.request).then(upgradeResponse).catch(reportError));
+    }
+  };
+
+  // Serve decoded bytes.
+  const onMessage = (event) => {
+    const data = event.data;
+    const uid = data.uid;
+    let inflightEntry = null;
+    for (let i = 0; i < inflight.length; ++i) {
+      if (inflight[i].uid === uid) {
+        inflightEntry = inflight[i];
+        break;
+      }
+    }
+    if (!inflightEntry) {
+      console.log('Ooops, not found: ' + uid);
+      return;
+    }
+    inflightEntry.outputStreamController.enqueue(data.data);
+    inflightEntry.outputStreamController.close();
+  };
+
+  // This method is "main" for service worker.
+  const serviceWorkerMain = () => {
+    // https://v8.dev/blog/wasm-code-caching
+    // > Every web site must perform at least one full compilation of a
+    // > WebAssembly module — use workers to hide that from your users.
+    // TODO(eustas): not 100% reliable, investigate why
+    self['JxlDecoderLeak'] =
+        WebAssembly.compileStreaming(fetch('jxl_decoder.wasm'));
+
+    // ServiceWorker lifecycle.
+    self.addEventListener('install', () => {
+      return self.skipWaiting();
+    });
+    self.addEventListener(
+        'activate', (event) => event.waitUntil(self.clients.claim()));
+    self.addEventListener('message', onMessage);
+    // Intercept some requests.
+    self.addEventListener('fetch', onFetch);
+  };
+
+  // Service workers does not support multi-threading; that is why decoding is
+  // relayed back to "client" (document / window).
+  const prepareClient = () => {
+    const clientWorker = new Worker('client_worker.js');
+    clientWorker.onmessage = (event) => {
+      const data = event.data;
+      if (typeof addMessage !== 'undefined') {
+        if (data.msg) {
+          addMessage(data.msg, 'blue');
+        }
+      }
+      navigator.serviceWorker.controller.postMessage(
+          data, gatherTransferrables(data.data));
+    };
+
+    // Forward ServiceWorker requests to "Client" worker.
+    navigator.serviceWorker.addEventListener('message', (event) => {
+      clientWorker.postMessage(
+          event.data, gatherTransferrables(event.data.data));
+    });
+  };
+
+  // Executed in HTML page environment.
+  const maybeRegisterServiceWorker = () => {
+    const config = {
+      log: console.log,
+      error: console.error,
+      requestReload: (msg) => window.location.reload(),
+      ...window.serviceWorkerConfig  // add overrides
+    }
+
+    if (!window.isSecureContext) {
+      config.log('Secure context is required for this ServiceWorker.');
+      return;
+    }
+
+    const nav = navigator;  // Explicitly capture navigator object.
+    const onServiceWorkerRegistrationSuccess = (registration) => {
+      config.log('Service Worker registered', registration.scope);
+      if (!registration.active || !nav.serviceWorker.controller) {
+        config.requestReload(
+            'Reload to allow Service Worker process all requests');
+      }
+    };
+
+    const onServiceWorkerRegistrationFailure = (err) => {
+      config.error('Service Worker failed to register:', err);
+    };
+
+    navigator.serviceWorker.register(window.document.currentScript.src)
+        .then(
+            onServiceWorkerRegistrationSuccess,
+            onServiceWorkerRegistrationFailure);
+  };
+
+  const pageMain = () => {
+    maybeRegisterServiceWorker();
+    prepareClient();
+  };
+
+  // Detect environment and run corresponding "main" method.
+  if (typeof window === 'undefined') {
+    serviceWorkerMain();
+  } else {
+    pageMain();
+  }
+})();
index 1ce4882..e2afd56 100644 (file)
@@ -3,6 +3,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include <jxl/cms.h>
 #include <stdio.h>
 
 #include <utility>
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/color_management.h"
-#include "lib/jxl/enc_color_management.h"
 #include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 
-namespace jxl {
+namespace jpegxl {
+namespace tools {
 namespace {
 
+using ::jxl::CodecInOut;
+using ::jxl::ColorEncoding;
+using ::jxl::Image3F;
+using ::jxl::ImageBundle;
+using ::jxl::ThreadPool;
+
 void PrintXybRange() {
   Image3F linear(1u << 16, 257);
   for (int b = 0; b < 256; ++b) {
@@ -43,7 +49,7 @@ void PrintXybRange() {
   const ImageBundle& ib = io.Main();
   ThreadPool* null_pool = nullptr;
   Image3F opsin(ib.xsize(), ib.ysize());
-  (void)ToXYB(ib, null_pool, &opsin, GetJxlCms());
+  (void)jxl::ToXYB(ib, null_pool, &opsin, *JxlGetDefaultCms());
   for (size_t c = 0; c < 3; ++c) {
     float minval = 1e10f;
     float maxval = -1e10f;
@@ -75,6 +81,7 @@ void PrintXybRange() {
 }
 
 }  // namespace
-}  // namespace jxl
+}  // namespace tools
+}  // namespace jpegxl
 
-int main() { jxl::PrintXybRange(); }
+int main() { jpegxl::tools::PrintXybRange(); }